# Logistic Regression on cluster_1_updated_data


## Importing the libraries

In [81]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime

## Importing the dataset

In [82]:
# pyspark_df = spark.table("sd_bdc_demo.k_cluster_logistic_regression.2_5_am_workday_cluster_1_updated_data")
# pyspark_df.display()
# df = pyspark_df.toPandas()

In [83]:
df = pd.read_csv("2_5_am_workday_cluster_1_updated_data/2_5_am_workday_cluster_1_updated_data.csv")


In [84]:
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [85]:
print(df.columns)

Index(['Current_Role', 'Zip_Code', 'Employee_ID', 'Employee_Annual_Salary',
       'Monthly_Medical_contribution', 'Monthly_Dental_Contribution',
       'Monthly_Vision_Contribution', 'Bonus', 'Years_Of_Service',
       'Department', 'Gender', 'Employee_HR_rate', 'Hours_per_week',
       'Years_Since_Last_Promotion', 'age', 'left'],
      dtype='object')


In [86]:
print(df.head())

             Current_Role  Zip_Code  Employee_ID  Employee_Annual_Salary  \
0         Design Director     78701           72                  215200   
1         Design Director     78702           73                  217600   
2         Design Director     78702           75                  218200   
3          Director of HR      2108           76                  220000   
4  Director of Operations     98106           77                  211000   

   Monthly_Medical_contribution  Monthly_Dental_Contribution  \
0                           180                           45   
1                           180                           45   
2                           180                           45   
3                           180                           45   
4                           180                           45   

   Monthly_Vision_Contribution  Bonus  Years_Of_Service       Department  \
0                           23  48635               9.2           Design   
1     

In [87]:
print(df.shape)

(16, 16)


## Encoding categorical data

### Encoding the Independent Variable

In [88]:
df_encoded = pd.get_dummies(df, columns=['Current_Role', 'Department', 'Gender'], drop_first=False).astype(int)


In [89]:
print(df_encoded.head())

   Zip_Code  Employee_ID  Employee_Annual_Salary  \
0     78701           72                  215200   
1     78702           73                  217600   
2     78702           75                  218200   
3      2108           76                  220000   
4     98106           77                  211000   

   Monthly_Medical_contribution  Monthly_Dental_Contribution  \
0                           180                           45   
1                           180                           45   
2                           180                           45   
3                           180                           45   
4                           180                           45   

   Monthly_Vision_Contribution  Bonus  Years_Of_Service  Employee_HR_rate  \
0                           23  48635                 9                69   
1                           23  49613                 9               150   
2                           23  49859                 9               1

In [90]:
print(df_encoded.shape)

(16, 34)


## Split two Data Frame for Independent Features & Dependent Columns

In [91]:
df_independent = df_encoded.drop(columns=['left'])
X = df_independent
df_dependent = df_encoded[['left']]
y = df_dependent

In [92]:
print(df_independent.head())


   Zip_Code  Employee_ID  Employee_Annual_Salary  \
0     78701           72                  215200   
1     78702           73                  217600   
2     78702           75                  218200   
3      2108           76                  220000   
4     98106           77                  211000   

   Monthly_Medical_contribution  Monthly_Dental_Contribution  \
0                           180                           45   
1                           180                           45   
2                           180                           45   
3                           180                           45   
4                           180                           45   

   Monthly_Vision_Contribution  Bonus  Years_Of_Service  Employee_HR_rate  \
0                           23  48635                 9                69   
1                           23  49613                 9               150   
2                           23  49859                 9               1

In [93]:
print(df_independent.shape)

(16, 33)


In [94]:
print(df_dependent.head())

   left
0     0
1     1
2     0
3     1
4     1


In [95]:
print(df_dependent.shape)

(16, 1)


## Splitting the dataset into the Training set and Test set

In [96]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [97]:
print(X_train.head())

    Zip_Code  Employee_ID  Employee_Annual_Salary  \
6      94104           79                  155100   
10     78704          117                  179500   
4      98106           77                  211000   
1      78702           73                  217600   
14     90010          126                  209800   

    Monthly_Medical_contribution  Monthly_Dental_Contribution  \
6                            155                           40   
10                           155                           40   
4                            180                           45   
1                            180                           45   
14                           180                           45   

    Monthly_Vision_Contribution  Bonus  Years_Of_Service  Employee_HR_rate  \
6                            21  24661                 7                71   
10                           21  34015                 7                62   
4                            23  46948                 8

In [98]:
print(X_train.shape)

(12, 33)


In [99]:
print(X_test.head())

    Zip_Code  Employee_ID  Employee_Annual_Salary  \
3       2108           76                  220000   
13     10010          121                  209200   
7      10010          103                  153750   
2      78702           75                  218200   

    Monthly_Medical_contribution  Monthly_Dental_Contribution  \
3                            180                           45   
13                           180                           45   
7                            155                           40   
2                            180                           45   

    Monthly_Vision_Contribution  Bonus  Years_Of_Service  Employee_HR_rate  \
3                            23  50600                11                76   
13                           23  46233                 8               128   
7                            21  18066                 7               108   
2                            23  49859                 9               110   

    Hours_per_wee

In [100]:
print(X_test.shape)

(4, 33)


In [101]:
print(y_train.head())

    left
6      0
10     0
4      1
1      1
14     0


In [102]:
print(y_train.shape)

(12, 1)


In [103]:
print(y_test.head())

    left
3      1
13     0
7      0
2      0


In [104]:
print(y_test.shape)

(4, 1)


## Feature Scaling

In [105]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# List of columns to scale
columns_to_scale = [
    'Employee_Annual_Salary', 'Monthly_Medical_contribution',
    'Monthly_Dental_Contribution', 'Monthly_Vision_Contribution', 'Bonus',
    'Years_Of_Service', 'Employee_HR_rate', 'Hours_per_week',
    'Years_Since_Last_Promotion', 'age'
]

# Fit sc on training data and transform both training and test sets
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

X_train_scaled[columns_to_scale] = sc.fit_transform(X_train_scaled[columns_to_scale])
X_test_scaled[columns_to_scale] = sc.transform(X_test_scaled[columns_to_scale])


In [106]:
print(X_train_scaled)

    Zip_Code  Employee_ID  Employee_Annual_Salary  \
6      94104           79               -2.249036   
10     78704          117               -0.963872   
4      98106           77                0.695252   
1      78702           73                1.042878   
14     90010          126                0.632047   
0      78701           72                0.916469   
15     94103          149                1.169288   
9      98102          116               -0.410831   
8      98106          115               -0.463501   
12     98102          119               -0.410831   
11     10010          118               -0.779525   
5      10010           78                0.821662   

    Monthly_Medical_contribution  Monthly_Dental_Contribution  \
6                           -1.0                         -1.0   
10                          -1.0                         -1.0   
4                            1.0                          1.0   
1                            1.0                  

In [107]:
print(X_test_scaled)

    Zip_Code  Employee_ID  Employee_Annual_Salary  \
3       2108           76                1.169288   
13     10010          121                0.600445   
7      10010          103               -2.320141   
2      78702           75                1.074481   

    Monthly_Medical_contribution  Monthly_Dental_Contribution  \
3                            1.0                          1.0   
13                           1.0                          1.0   
7                           -1.0                         -1.0   
2                            1.0                          1.0   

    Monthly_Vision_Contribution     Bonus  Years_Of_Service  Employee_HR_rate  \
3                           1.0  1.167188          2.329336         -0.753454   
13                          1.0  0.634895         -0.562254          0.919697   
7                          -1.0 -2.798381         -1.526117          0.276177   
2                           1.0  1.076868          0.401610          0.340529   

  

### Removing `Employee ID` & `Zip Code` in `X_train_scaled` & `X_test_scaled`

In [108]:
print(type(X_train_scaled))

<class 'pandas.core.frame.DataFrame'>


In [109]:
X_train_scaled_new = X_train_scaled.drop(columns=['Employee_ID', 'Zip_Code'])
print(X_train_scaled_new)

    Employee_Annual_Salary  Monthly_Medical_contribution  \
6                -2.249036                          -1.0   
10               -0.963872                          -1.0   
4                 0.695252                           1.0   
1                 1.042878                           1.0   
14                0.632047                           1.0   
0                 0.916469                           1.0   
15                1.169288                           1.0   
9                -0.410831                          -1.0   
8                -0.463501                          -1.0   
12               -0.410831                          -1.0   
11               -0.779525                          -1.0   
5                 0.821662                           1.0   

    Monthly_Dental_Contribution  Monthly_Vision_Contribution     Bonus  \
6                          -1.0                         -1.0 -1.994517   
10                         -1.0                         -1.0 -0.854358 

In [110]:
print(type(X_test_scaled))

<class 'pandas.core.frame.DataFrame'>


In [111]:
X_test_scaled_new = X_test_scaled.drop(columns=['Employee_ID', 'Zip_Code'])
print(X_test_scaled_new)

    Employee_Annual_Salary  Monthly_Medical_contribution  \
3                 1.169288                           1.0   
13                0.600445                           1.0   
7                -2.320141                          -1.0   
2                 1.074481                           1.0   

    Monthly_Dental_Contribution  Monthly_Vision_Contribution     Bonus  \
3                           1.0                          1.0  1.167188   
13                          1.0                          1.0  0.634895   
7                          -1.0                         -1.0 -2.798381   
2                           1.0                          1.0  1.076868   

    Years_Of_Service  Employee_HR_rate  Hours_per_week  \
3           2.329336         -0.753454             0.0   
13         -0.562254          0.919697             0.0   
7          -1.526117          0.276177             0.0   
2           0.401610          0.340529             0.0   

    Years_Since_Last_Promotion       

## Training the Model

In [112]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [113]:
training_timestamp = datetime.now().strftime("%m/%d/%Y %H:%M")
print(training_timestamp)
model.fit(X_train_scaled_new, y_train.values.ravel())

06/19/2025 19:33


## Prediction

In [114]:
y_pred = model.predict(X_test_scaled_new)

In [115]:
print(y_pred)

[1 0 0 1]


### Saving the output as new table

In [116]:
# Get Employee_IDs from X_test
employee_ids = X_test_scaled['Employee_ID'].values

In [117]:
print("employee_ids shape:", employee_ids.shape)
print("y_test shape:", y_test.shape)
print("y_pred shape:", y_pred.shape)


employee_ids shape: (4,)
y_test shape: (4, 1)
y_pred shape: (4,)


In [118]:
print(y_test)

    left
3      1
13     0
7      0
2      0


In [119]:
# Flatten y_test to 1D
y_test = y_test.values.ravel()

In [120]:
print("employee_ids shape:", employee_ids.shape)
print("y_test shape:", y_test.shape)
print("y_pred shape:", y_pred.shape)

employee_ids shape: (4,)
y_test shape: (4,)
y_pred shape: (4,)


In [121]:
print(y_test)

[1 0 0 0]


### get top 3 features

In [122]:
top_indices = np.argsort(model.coef_[0])[::-1][:3]


In [123]:
print(top_indices)

[ 8  5 26]


In [124]:
top_features = X_train_scaled_new.columns[top_indices].tolist()

In [125]:
results_df = pd.DataFrame({
    'Employee_ID': employee_ids,
    'Actual': y_test,
    'Predicted': y_pred,
    'Cluster_label': 1,
    'primary_attrition_feature': top_features[0],
    'secondary_attrition_feature': top_features[1],
    'tertiary_attrition_feature': top_features[2]
})




In [126]:
# Create the folder if it doesn't exist
os.makedirs("5_5_am_workday_log_reg_prediction_on_cluster_1_data", exist_ok=True)

# Save the Excel file inside the folder
results_df.to_csv("5_5_am_workday_log_reg_prediction_on_cluster_1_data/5_5_am_workday_log_reg_prediction_on_cluster_1_data.csv", index=False)


In [127]:
# spark_df = spark.createDataFrame(results_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.5_5_am_workday_log_reg_prediction_on_cluster_1_data")

## Accuracy

In [128]:
model.score(X_test_scaled_new,y_test)

0.75

In [129]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))


Confusion Matrix:
 [[2 1]
 [0 1]]
Accuracy: 0.75
Precision: 0.5
Recall: 1.0
F1 Score: 0.6666666666666666


### saving the metrics

In [130]:
Accuracy = accuracy_score(y_test, y_pred)
Precision = precision_score(y_test, y_pred)
Recall = recall_score(y_test, y_pred) 
F1_Score = f1_score(y_test, y_pred)

# Create a DataFrame
# metrics_df = pd.DataFrame({
#     "Metric": ["Accuracy", "Precision", "Recall", "F1_Score"],
#     "Value": [Accuracy, Precision, Recall, F1_Score]
# })


In [131]:
# Model Run Id
model_type_abbr = "lr"
project_name = "attrition_prediction"
timestamp_str = datetime.now().strftime("%Y%m%d_%H%M%S")
model_run_id = f"{model_type_abbr}_{project_name}_{timestamp_str}"

# Model Type
model_type = "logistic regression"

# model version 
version_file = "3_6_am_workday_log_reg_metrics_data.csv"
if os.path.exists(version_file):
    versions_df = pd.read_csv(version_file)
    max_version = versions_df[versions_df["model_type"] == model_type]["model_version"].max()
    model_version = int(max_version) + 1
else:
    model_version = 1

# training_timestamp
training_timestamp = training_timestamp

# dataset size
dataset_size = X_test_scaled_new.shape[0]

# feature count
feature_count = X_test_scaled_new.shape[1] 

# ==== Create metrics DataFrame ====
metrics_df = pd.DataFrame([{
    "model_run_id": model_run_id,
    "model_type": model_type,
    "model_version": model_version,
    "training_timestamp": training_timestamp,
    "dataset_size": dataset_size,
    "feature_count": feature_count,
    "classification_accuracy": Accuracy,
    "Precision": Precision,
    "Recall": Recall,
    "F1_Score": F1_Score
}])




In [132]:
# Directory & file path
folder = "5_6_am_workday_log_reg_metrics_data"
os.makedirs(folder, exist_ok=True)

# Full path to the CSV file
version_file = os.path.join(folder, "5_6_am_workday_log_reg_metrics_data.csv")

# Append the full metrics row to the CSV log (create file if it doesn't exist)
metrics_df.to_csv(
    version_file,
    mode='a',
    header=not os.path.exists(version_file),
    index=False
)

In [133]:
# # Create the folder if it doesn't exist
# os.makedirs("5_6_am_workday_log_reg_metrics_data", exist_ok=True)

# # Save the Excel file inside the folder
# metrics_df.to_csv("5_6_am_workday_log_reg_metrics_data/5_6_am_workday_log_reg_metrics_data.csv", index=False)


In [134]:
# spark_df = spark.createDataFrame(metrics_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.5_6_am_workday_log_reg_metrics_data")

## Bias & Weights

In [135]:
print(model.coef_)       # β₁, β₂, ..., βn
print(model.intercept_)  # β₀

[[ 0.16603365 -0.0132156  -0.0132156  -0.0132156   0.24339353  0.71466824
   0.23390572  0.          1.18908022  0.02795965 -0.0140812   0.
   0.28634735 -0.26132578 -0.03033514  0.          0.0709957  -0.04561407
  -0.14171672  0.15317962  0.         -0.10309106  0.08544447 -0.05969528
   0.27928466  0.          0.28634735 -0.4030425  -0.10309106 -0.18069019
   0.18049337]]
[-0.57116721]


In [136]:
len(model.coef_.ravel())

31

In [137]:
max(model.coef_.ravel())

np.float64(1.1890802173916384)

In [138]:
feature_weights = pd.Series(model.coef_[0], index=X_train_scaled_new.columns)
print(feature_weights)


Employee_Annual_Salary                      0.166034
Monthly_Medical_contribution               -0.013216
Monthly_Dental_Contribution                -0.013216
Monthly_Vision_Contribution                -0.013216
Bonus                                       0.243394
Years_Of_Service                            0.714668
Employee_HR_rate                            0.233906
Hours_per_week                              0.000000
Years_Since_Last_Promotion                  1.189080
age                                         0.027960
Current_Role_Design Director               -0.014081
Current_Role_Director of HR                 0.000000
Current_Role_Director of Operations         0.286347
Current_Role_Director of Product           -0.261326
Current_Role_Engineering Manager           -0.030335
Current_Role_Lead Product Manager           0.000000
Current_Role_Principal Engineer             0.070996
Current_Role_Principal Product Designer    -0.045614
Current_Role_Principal Product Manager     -0.

### Saving All Feature & Weights

In [139]:
weights_df = pd.DataFrame({
    'Feature': X_train_scaled_new.columns,
    'Weight': model.coef_[0] 
})

print(weights_df)

                                     Feature    Weight
0                     Employee_Annual_Salary  0.166034
1               Monthly_Medical_contribution -0.013216
2                Monthly_Dental_Contribution -0.013216
3                Monthly_Vision_Contribution -0.013216
4                                      Bonus  0.243394
5                           Years_Of_Service  0.714668
6                           Employee_HR_rate  0.233906
7                             Hours_per_week  0.000000
8                 Years_Since_Last_Promotion  1.189080
9                                        age  0.027960
10              Current_Role_Design Director -0.014081
11               Current_Role_Director of HR  0.000000
12       Current_Role_Director of Operations  0.286347
13          Current_Role_Director of Product -0.261326
14          Current_Role_Engineering Manager -0.030335
15         Current_Role_Lead Product Manager  0.000000
16           Current_Role_Principal Engineer  0.070996
17   Curre

In [140]:
# Create the folder if it doesn't exist
os.makedirs("5_1_am_workday_all_feature_vs_weights_on_cluster_1_data", exist_ok=True)

# Save the Excel file inside the folder
weights_df.to_csv("5_1_am_workday_all_feature_vs_weights_on_cluster_1_data/5_1_am_workday_all_feature_vs_weights_on_cluster_1_data.csv", index=False)

In [141]:
# spark_df = spark.createDataFrame(weights_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.5_1_am_workday_all_feature_vs_weights_on_cluster_1_data")

### Saving Current_Role & Weights

In [142]:
# pyspark_df = spark.table("sd_bdc_demo.k_cluster_logistic_regression.5_1_am_workday_all_feature_vs_weights_on_cluster_1_data")
# pyspark_df.display()
# all_feature_df = pyspark_df.toPandas()

In [143]:
all_feature_df = pd.read_csv("5_1_am_workday_all_feature_vs_weights_on_cluster_1_data/5_1_am_workday_all_feature_vs_weights_on_cluster_1_data.csv")

In [144]:
print(all_feature_df.head())

                        Feature    Weight
0        Employee_Annual_Salary  0.166034
1  Monthly_Medical_contribution -0.013216
2   Monthly_Dental_Contribution -0.013216
3   Monthly_Vision_Contribution -0.013216
4                         Bonus  0.243394


In [145]:
all_feature_df_Current_Role = all_feature_df[all_feature_df['Feature'].str.startswith('Current_Role_')]
print(all_feature_df_Current_Role)


                                     Feature    Weight
10              Current_Role_Design Director -0.014081
11               Current_Role_Director of HR  0.000000
12       Current_Role_Director of Operations  0.286347
13          Current_Role_Director of Product -0.261326
14          Current_Role_Engineering Manager -0.030335
15         Current_Role_Lead Product Manager  0.000000
16           Current_Role_Principal Engineer  0.070996
17   Current_Role_Principal Product Designer -0.045614
18    Current_Role_Principal Product Manager -0.141717
19  Current_Role_Principal Software Engineer  0.153180
20             Current_Role_Product Director  0.000000
21               Current_Role_Sales Director -0.103091
22           Current_Role_Technical Director  0.085444


In [146]:
Current_Role_df = all_feature_df_Current_Role.copy()

In [147]:
Current_Role_df['Feature'] = Current_Role_df['Feature'].str.removeprefix('Current_Role_')


In [148]:
print(Current_Role_df)

                        Feature    Weight
10              Design Director -0.014081
11               Director of HR  0.000000
12       Director of Operations  0.286347
13          Director of Product -0.261326
14          Engineering Manager -0.030335
15         Lead Product Manager  0.000000
16           Principal Engineer  0.070996
17   Principal Product Designer -0.045614
18    Principal Product Manager -0.141717
19  Principal Software Engineer  0.153180
20             Product Director  0.000000
21               Sales Director -0.103091
22           Technical Director  0.085444


In [149]:
# Create the folder if it doesn't exist
os.makedirs("5_2_am_workday_current_role_vs_weights_on_cluster_1_data", exist_ok=True)

# Save the Excel file inside the folder
Current_Role_df.to_csv("5_2_am_workday_current_role_vs_weights_on_cluster_1_data/5_2_am_workday_current_role_vs_weights_on_cluster_1_data.csv", index=False)

In [150]:
# spark_df = spark.createDataFrame(Current_Role_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.5_2_am_workday_current_role_vs_weights_on_cluster_1_data")

### Saving Department & Weights

In [151]:
all_feature_df_Department = all_feature_df[all_feature_df['Feature'].str.startswith('Department_')]
print(all_feature_df_Department)


                       Feature    Weight
23           Department_Design -0.059695
24      Department_Engineering  0.279285
25  Department_Human Resources  0.000000
26       Department_Operations  0.286347
27          Department_Product -0.403042
28            Department_Sales -0.103091


In [152]:
Department_df = all_feature_df_Department.copy()

In [153]:
Department_df['Feature'] = Department_df['Feature'].str.removeprefix('Department_')

In [154]:
# Create the folder if it doesn't exist
os.makedirs("5_3_am_workday_department_vs_weights_on_cluster_1_data", exist_ok=True)

# Save the Excel file inside the folder
Department_df.to_csv("5_3_am_workday_department_vs_weights_on_cluster_1_data/5_3_am_workday_department_vs_weights_on_cluster_1_data.csv", index=False)

In [155]:
# spark_df = spark.createDataFrame(Department_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.5_3_am_workday_department_vs_weights_on_cluster_1_data")

### Saving Gender & Weights

In [156]:
all_feature_df_Gender = all_feature_df[all_feature_df['Feature'].str.startswith('Gender_')]
print(all_feature_df_Gender)


          Feature    Weight
29  Gender_Female -0.180690
30    Gender_Male  0.180493


In [157]:
Gender_df = all_feature_df_Gender.copy()

In [158]:
Gender_df['Feature'] = Gender_df['Feature'].str.removeprefix('Gender_')

In [159]:
# Create the folder if it doesn't exist
os.makedirs("5_4_am_workday_gender_vs_weights_on_cluster_1_data", exist_ok=True)

# Save the Excel file inside the folder
Gender_df.to_csv("5_4_am_workday_gender_vs_weights_on_cluster_1_data/5_4_am_workday_gender_vs_weights_on_cluster_1_data.csv", index=False)

In [160]:
# spark_df = spark.createDataFrame(Gender_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.5_4_am_workday_gender_vs_weights_on_cluster_1_data")