In [0]:
train_file_path = "/FileStore/tables/gold_train.parquet"
test_file_path = "/FileStore/tables/gold_test.parquet"

In [0]:
train = spark.read.format("parquet").load(train_file_path)
test = spark.read.format("parquet").load(test_file_path)

In [0]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator,  MulticlassClassificationEvaluator

lr = LogisticRegression(featuresCol='features',
                      labelCol='Attrition_index')

lr_model = lr.fit(train)

In [0]:
# Predictions
predictions = lr_model.transform(test)
predictions.select("features","Attrition", 'Attrition_index', "prediction", "probability").show(5)

# Evaluation
binary_evaluator = BinaryClassificationEvaluator(labelCol="Attrition_index", rawPredictionCol="prediction", metricName="areaUnderROC")

# Initialize MulticlassClassificationEvaluator for metrics like Accuracy, Precision, Recall
multi_evaluator = MulticlassClassificationEvaluator(labelCol="Attrition_index", predictionCol="prediction")




+--------------------+---------+---------------+----------+--------------------+
|            features|Attrition|Attrition_index|prediction|         probability|
+--------------------+---------+---------------+----------+--------------------+
|(26,[0,1,2,3,4,5,...|   Stayed|            0.0|       0.0|[0.64530855809774...|
|(26,[0,1,2,3,4,5,...|   Stayed|            0.0|       0.0|[0.93547049474102...|
|(26,[0,1,2,3,4,5,...|     Left|            1.0|       1.0|[0.43991746387523...|
|[-1.6830448420997...|   Stayed|            0.0|       1.0|[0.40473876303161...|
|[-0.8578298986642...|   Stayed|            0.0|       0.0|[0.91825702865186...|
+--------------------+---------+---------------+----------+--------------------+
only showing top 5 rows



In [0]:
# Calculate Area Under ROC (AUC)
auc = binary_evaluator.evaluate(predictions, {binary_evaluator.metricName: "areaUnderROC"})
print(f"Area Under ROC (AUC): {auc}")

# Calculate Accuracy
accuracy = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "accuracy"})
print(f"Accuracy: {accuracy}")


Area Under ROC (AUC): 0.7260020533002525
Accuracy: 0.7268926056338029


In [0]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier# checking for features importance 

# Define the RandomForest model
rf = RandomForestClassifier(featuresCol="features", labelCol="Attrition_index")




# Fit the model
rf_model = rf.fit(train)

In [0]:
# Get feature importances
importances = rf_model.featureImportances.toArray()
feature_columns = ['scaled_numerical_feature_vector',
 'Work-Life Balance_index',
 'Job Satisfaction_index',
 'Performance Rating_index',
 'Education Level_index',
 'Job Level_index',
 'Company Size_index',
 'Company Reputation_index',
 'Employee Recognition_index',
 'Gender_onehot',
 'Job Role_onehot',
 'Overtime_onehot',
 'Marital Status_onehot',
 'Remote Work_onehot',
 'Leadership Opportunities_onehot',
 'Innovation Opportunities_onehot']
# Convert importances to a list of tuples with each feature name and its importance as float
importances_list = [(name, float(importance)) for name, importance in zip(feature_columns, importances)]

# Create the DataFrame with explicit schema definition
from pyspark.sql.types import StructType, StructField, StringType, FloatType
schema = StructType([
    StructField("Feature", StringType(), True),
    StructField("Importance", FloatType(), True)
])

# Create DataFrame with the defined schema
feature_importance_df = spark.createDataFrame(importances_list, schema=schema)

# Show feature importance sorted by descending order
feature_importance_df.orderBy("Importance", ascending=False).show()

+--------------------+------------+
|             Feature|  Importance|
+--------------------+------------+
|     Overtime_onehot|  0.30360973|
|Company Reputatio...| 0.061705243|
|Performance Ratin...|   0.0381825|
|     Job Role_onehot| 0.022714224|
|Education Level_i...| 0.021211961|
|Innovation Opport...| 0.016206777|
|Work-Life Balance...| 0.008545652|
|  Remote Work_onehot| 0.007681143|
|     Job Level_index|0.0069579277|
|scaled_numerical_...|0.0058900337|
|Employee Recognit...|  0.00501718|
|Job Satisfaction_...| 0.004442644|
|       Gender_onehot|0.0035444451|
|  Company Size_index|0.0025570716|
|Leadership Opport...|0.0010700992|
|Marital Status_on...|5.1556166E-5|
+--------------------+------------+

