# Logistic Regression on cluster_2_updated_data


## Importing the libraries

In [372]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [373]:
# pyspark_df = spark.table("sd_bdc_demo.k_cluster_logistic_regression.2_6_am_workday_cluster_2_updated_data")
# pyspark_df.display()
# df = pyspark_df.toPandas()

In [374]:
df = pd.read_csv("2_6_am_workday_cluster_2_updated_data/2_6_am_workday_cluster_2_updated_data.csv")


In [375]:
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [376]:
print(df.columns)

Index(['Current_Role', 'Zip_Code', 'Employee_ID', 'Employee_Annual_Salary',
       'Monthly_Medical_contribution', 'Monthly_Dental_Contribution',
       'Monthly_Vision_Contribution', 'Bonus', 'Years_Of_Service',
       'Department', 'Gender', 'Employee_HR_rate', 'Hours_per_week',
       'Years_Since_Last_Promotion', 'age', 'left'],
      dtype='object')


In [377]:
print(df.head())

               Current_Role  Zip_Code  Employee_ID  Employee_Annual_Salary  \
0   Chief Operating Officer      2101           61                  300000   
1  Chief Technology Officer     98102           62                  300000   
2          VP of Technology     98109          156                  300000   

   Monthly_Medical_contribution  Monthly_Dental_Contribution  \
0                           230                           55   
1                           230                           55   
2                           230                           55   

   Monthly_Vision_Contribution  Bonus  Years_Of_Service Department  Gender  \
0                           27  90000              15.2  Executive  Female   
1                           27  90000              14.7  Executive    Male   
2                           27  90000              12.3  Executive  Female   

   Employee_HR_rate  Hours_per_week  Years_Since_Last_Promotion  age  left  
0                79              40     

In [378]:
print(df.shape)

(3, 16)


## Encoding categorical data

### Encoding the Independent Variable

In [379]:
df_encoded = pd.get_dummies(df, columns=['Current_Role', 'Department', 'Gender'], drop_first=False).astype(int)


In [380]:
print(df_encoded.head())

   Zip_Code  Employee_ID  Employee_Annual_Salary  \
0      2101           61                  300000   
1     98102           62                  300000   
2     98109          156                  300000   

   Monthly_Medical_contribution  Monthly_Dental_Contribution  \
0                           230                           55   
1                           230                           55   
2                           230                           55   

   Monthly_Vision_Contribution  Bonus  Years_Of_Service  Employee_HR_rate  \
0                           27  90000                15                79   
1                           27  90000                14                70   
2                           27  90000                12                67   

   Hours_per_week  Years_Since_Last_Promotion  age  left  \
0              40                           5   41     1   
1              40                           4   40     0   
2              40                           3

In [381]:
print(df_encoded.shape)

(3, 19)


## Split two Data Frame for Independent Features & Dependent Columns

In [382]:
df_independent = df_encoded.drop(columns=['left'])
X = df_independent
df_dependent = df_encoded[['left']]
y = df_dependent

In [383]:
print(df_independent.head())


   Zip_Code  Employee_ID  Employee_Annual_Salary  \
0      2101           61                  300000   
1     98102           62                  300000   
2     98109          156                  300000   

   Monthly_Medical_contribution  Monthly_Dental_Contribution  \
0                           230                           55   
1                           230                           55   
2                           230                           55   

   Monthly_Vision_Contribution  Bonus  Years_Of_Service  Employee_HR_rate  \
0                           27  90000                15                79   
1                           27  90000                14                70   
2                           27  90000                12                67   

   Hours_per_week  Years_Since_Last_Promotion  age  \
0              40                           5   41   
1              40                           4   40   
2              40                           3   35   

   Curre

In [384]:
print(df_independent.shape)

(3, 18)


In [385]:
print(df_dependent.head())

   left
0     1
1     0
2     0


In [386]:
print(df_dependent.shape)

(3, 1)


## Splitting the dataset into the Training set and Test set

In [387]:
from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [388]:
# Separate the two classes
df_class_0 = df_encoded[df_encoded['left'] == 0]
df_class_1 = df_encoded[df_encoded['left'] == 1]

# Take 1 row from each class for training
df_train = pd.concat([df_class_0.iloc[:1], df_class_1])
df_test = df_encoded.drop(df_train.index)

# Split into X and y
X_train = df_train.drop(columns='left')
y_train = df_train[['left']]

X_test = df_test.drop(columns='left')
y_test = df_test[['left']]

# (Optional) Reset index for clean display
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

# Check outputs
print("X_train:\n", X_train)
print("y_train:\n", y_train)
print("X_test:\n", X_test)
print("y_test:\n", y_test)

X_train:
    Zip_Code  Employee_ID  Employee_Annual_Salary  \
0     98102           62                  300000   
1      2101           61                  300000   

   Monthly_Medical_contribution  Monthly_Dental_Contribution  \
0                           230                           55   
1                           230                           55   

   Monthly_Vision_Contribution  Bonus  Years_Of_Service  Employee_HR_rate  \
0                           27  90000                14                70   
1                           27  90000                15                79   

   Hours_per_week  Years_Since_Last_Promotion  age  \
0              40                           4   40   
1              40                           5   41   

   Current_Role_Chief Operating Officer  \
0                                     0   
1                                     1   

   Current_Role_Chief Technology Officer  Current_Role_VP of Technology  \
0                                      1

In [389]:
print(X_train.head())

   Zip_Code  Employee_ID  Employee_Annual_Salary  \
0     98102           62                  300000   
1      2101           61                  300000   

   Monthly_Medical_contribution  Monthly_Dental_Contribution  \
0                           230                           55   
1                           230                           55   

   Monthly_Vision_Contribution  Bonus  Years_Of_Service  Employee_HR_rate  \
0                           27  90000                14                70   
1                           27  90000                15                79   

   Hours_per_week  Years_Since_Last_Promotion  age  \
0              40                           4   40   
1              40                           5   41   

   Current_Role_Chief Operating Officer  \
0                                     0   
1                                     1   

   Current_Role_Chief Technology Officer  Current_Role_VP of Technology  \
0                                      1          

In [390]:
print(X_train.shape)

(2, 18)


In [391]:
print(X_test.head())

   Zip_Code  Employee_ID  Employee_Annual_Salary  \
0     98109          156                  300000   

   Monthly_Medical_contribution  Monthly_Dental_Contribution  \
0                           230                           55   

   Monthly_Vision_Contribution  Bonus  Years_Of_Service  Employee_HR_rate  \
0                           27  90000                12                67   

   Hours_per_week  Years_Since_Last_Promotion  age  \
0              40                           3   35   

   Current_Role_Chief Operating Officer  \
0                                     0   

   Current_Role_Chief Technology Officer  Current_Role_VP of Technology  \
0                                      0                              1   

   Department_Executive  Gender_Female  Gender_Male  
0                     1              1            0  


In [392]:
print(X_test.shape)

(1, 18)


In [393]:
print(y_train.head())

   left
0     0
1     1


In [394]:
print(y_train.shape)

(2, 1)


In [395]:
print(y_test.head())

   left
0     0


In [396]:
print(y_test.shape)

(1, 1)


## Feature Scaling

In [397]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# List of columns to scale
columns_to_scale = [
    'Employee_Annual_Salary', 'Monthly_Medical_contribution',
    'Monthly_Dental_Contribution', 'Monthly_Vision_Contribution', 'Bonus',
    'Years_Of_Service', 'Employee_HR_rate', 'Hours_per_week',
    'Years_Since_Last_Promotion', 'age'
]

# Fit sc on training data and transform both training and test sets
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

X_train_scaled[columns_to_scale] = sc.fit_transform(X_train_scaled[columns_to_scale])
X_test_scaled[columns_to_scale] = sc.transform(X_test_scaled[columns_to_scale])


In [398]:
print(X_train_scaled)

   Zip_Code  Employee_ID  Employee_Annual_Salary  \
0     98102           62                     0.0   
1      2101           61                     0.0   

   Monthly_Medical_contribution  Monthly_Dental_Contribution  \
0                           0.0                          0.0   
1                           0.0                          0.0   

   Monthly_Vision_Contribution  Bonus  Years_Of_Service  Employee_HR_rate  \
0                          0.0    0.0              -1.0              -1.0   
1                          0.0    0.0               1.0               1.0   

   Hours_per_week  Years_Since_Last_Promotion  age  \
0             0.0                        -1.0 -1.0   
1             0.0                         1.0  1.0   

   Current_Role_Chief Operating Officer  \
0                                     0   
1                                     1   

   Current_Role_Chief Technology Officer  Current_Role_VP of Technology  \
0                                      1          

In [399]:
print(X_test_scaled)

   Zip_Code  Employee_ID  Employee_Annual_Salary  \
0     98109          156                     0.0   

   Monthly_Medical_contribution  Monthly_Dental_Contribution  \
0                           0.0                          0.0   

   Monthly_Vision_Contribution  Bonus  Years_Of_Service  Employee_HR_rate  \
0                          0.0    0.0              -5.0         -1.666667   

   Hours_per_week  Years_Since_Last_Promotion   age  \
0             0.0                        -3.0 -11.0   

   Current_Role_Chief Operating Officer  \
0                                     0   

   Current_Role_Chief Technology Officer  Current_Role_VP of Technology  \
0                                      0                              1   

   Department_Executive  Gender_Female  Gender_Male  
0                     1              1            0  


### Removing `Employee ID` & `Zip Code` in `X_train_scaled` & `X_test_scaled`

In [400]:
print(type(X_train_scaled))

<class 'pandas.core.frame.DataFrame'>


In [401]:
X_train_scaled_new = X_train_scaled.drop(columns=['Employee_ID', 'Zip_Code'])
print(X_train_scaled_new)

   Employee_Annual_Salary  Monthly_Medical_contribution  \
0                     0.0                           0.0   
1                     0.0                           0.0   

   Monthly_Dental_Contribution  Monthly_Vision_Contribution  Bonus  \
0                          0.0                          0.0    0.0   
1                          0.0                          0.0    0.0   

   Years_Of_Service  Employee_HR_rate  Hours_per_week  \
0              -1.0              -1.0             0.0   
1               1.0               1.0             0.0   

   Years_Since_Last_Promotion  age  Current_Role_Chief Operating Officer  \
0                        -1.0 -1.0                                     0   
1                         1.0  1.0                                     1   

   Current_Role_Chief Technology Officer  Current_Role_VP of Technology  \
0                                      1                              0   
1                                      0                    

In [402]:
print(type(X_test_scaled))

<class 'pandas.core.frame.DataFrame'>


In [403]:
X_test_scaled_new = X_test_scaled.drop(columns=['Employee_ID', 'Zip_Code'])
print(X_test_scaled_new)

   Employee_Annual_Salary  Monthly_Medical_contribution  \
0                     0.0                           0.0   

   Monthly_Dental_Contribution  Monthly_Vision_Contribution  Bonus  \
0                          0.0                          0.0    0.0   

   Years_Of_Service  Employee_HR_rate  Hours_per_week  \
0              -5.0         -1.666667             0.0   

   Years_Since_Last_Promotion   age  Current_Role_Chief Operating Officer  \
0                        -3.0 -11.0                                     0   

   Current_Role_Chief Technology Officer  Current_Role_VP of Technology  \
0                                      0                              1   

   Department_Executive  Gender_Female  Gender_Male  
0                     1              1            0  


## Training the Model

In [404]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [405]:
model.fit(X_train_scaled_new, y_train.values.ravel())

## Prediction

In [406]:
y_pred = model.predict(X_test_scaled_new)

In [407]:
print(y_pred)

[0]


### Saving the output as new table

In [408]:
# Get Employee_IDs from X_test
employee_ids = X_test_scaled['Employee_ID'].values

In [409]:
print("employee_ids shape:", employee_ids.shape)
print("y_test shape:", y_test.shape)
print("y_pred shape:", y_pred.shape)


employee_ids shape: (1,)
y_test shape: (1, 1)
y_pred shape: (1,)


In [410]:
print(y_test)

   left
0     0


In [411]:
# Flatten y_test to 1D
y_test = y_test.values.ravel()

In [412]:
print("employee_ids shape:", employee_ids.shape)
print("y_test shape:", y_test.shape)
print("y_pred shape:", y_pred.shape)

employee_ids shape: (1,)
y_test shape: (1,)
y_pred shape: (1,)


In [413]:
print(y_test)

[0]


### get top 3 features

In [414]:
top_indices = np.argsort(model.coef_[0])[::-1][:3]


In [415]:
print(top_indices)

[9 8 5]


In [416]:
top_features = X_train_scaled_new.columns[top_indices].tolist()

In [417]:
results_df = pd.DataFrame({
    'Employee_ID': employee_ids,
    'Actual': y_test,
    'Predicted': y_pred,
    'primary_attrition_feature': top_features[0],
    'secondary_attrition_feature': top_features[1],
    'tertiary_attrition_feature': top_features[2]
})




In [418]:
# Create the folder if it doesn't exist
os.makedirs("6_5_am_workday_log_reg_prediction_on_cluster_2_data", exist_ok=True)

# Save the Excel file inside the folder
results_df.to_csv("6_5_am_workday_log_reg_prediction_on_cluster_2_data/6_5_am_workday_log_reg_prediction_on_cluster_2_data.csv", index=False)


In [419]:
# spark_df = spark.createDataFrame(results_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.6_5_am_workday_log_reg_prediction_on_cluster_2_data")

## Accuracy

In [420]:
model.score(X_test_scaled_new,y_test)

1.0

In [421]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))


Confusion Matrix:
 [[1]]
Accuracy: 1.0
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Bias & Weights

In [422]:
print(model.coef_)       # β₁, β₂, ..., βn
print(model.intercept_)  # β₀

[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  3.26701625e-01  3.26701625e-01  0.00000000e+00
   3.26701625e-01  3.26701625e-01  1.63350813e-01 -1.63350813e-01
   0.00000000e+00 -5.02219350e-18  1.63350813e-01 -1.63350813e-01]]
[-6.59343577e-19]


In [423]:
len(model.coef_.ravel())

16

In [424]:
max(model.coef_.ravel())

np.float64(0.32670162526907587)

In [425]:
feature_weights = pd.Series(model.coef_[0], index=X_train_scaled_new.columns)
print(feature_weights)


Employee_Annual_Salary                   0.000000e+00
Monthly_Medical_contribution             0.000000e+00
Monthly_Dental_Contribution              0.000000e+00
Monthly_Vision_Contribution              0.000000e+00
Bonus                                    0.000000e+00
Years_Of_Service                         3.267016e-01
Employee_HR_rate                         3.267016e-01
Hours_per_week                           0.000000e+00
Years_Since_Last_Promotion               3.267016e-01
age                                      3.267016e-01
Current_Role_Chief Operating Officer     1.633508e-01
Current_Role_Chief Technology Officer   -1.633508e-01
Current_Role_VP of Technology            0.000000e+00
Department_Executive                    -5.022194e-18
Gender_Female                            1.633508e-01
Gender_Male                             -1.633508e-01
dtype: float64


### Saving All Feature & Weights

In [426]:
weights_df = pd.DataFrame({
    'Feature': X_train_scaled_new.columns,
    'Weight': model.coef_[0] 
})

print(weights_df)

                                  Feature        Weight
0                  Employee_Annual_Salary  0.000000e+00
1            Monthly_Medical_contribution  0.000000e+00
2             Monthly_Dental_Contribution  0.000000e+00
3             Monthly_Vision_Contribution  0.000000e+00
4                                   Bonus  0.000000e+00
5                        Years_Of_Service  3.267016e-01
6                        Employee_HR_rate  3.267016e-01
7                          Hours_per_week  0.000000e+00
8              Years_Since_Last_Promotion  3.267016e-01
9                                     age  3.267016e-01
10   Current_Role_Chief Operating Officer  1.633508e-01
11  Current_Role_Chief Technology Officer -1.633508e-01
12          Current_Role_VP of Technology  0.000000e+00
13                   Department_Executive -5.022194e-18
14                          Gender_Female  1.633508e-01
15                            Gender_Male -1.633508e-01


In [427]:
# Create the folder if it doesn't exist
os.makedirs("6_1_am_workday_all_feature_vs_weights_on_cluster_2_data", exist_ok=True)

# Save the Excel file inside the folder
weights_df.to_csv("6_1_am_workday_all_feature_vs_weights_on_cluster_2_data/6_1_am_workday_all_feature_vs_weights_on_cluster_2_data.csv", index=False)

In [428]:
# spark_df = spark.createDataFrame(weights_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.6_1_am_workday_all_feature_vs_weights_on_cluster_2_data")

### Saving Current_Role & Weights

In [429]:
# pyspark_df = spark.table("sd_bdc_demo.k_cluster_logistic_regression.6_1_am_workday_all_feature_vs_weights_on_cluster_2_data")
# pyspark_df.display()
# all_feature_df = pyspark_df.toPandas()

In [430]:
all_feature_df = pd.read_csv("6_1_am_workday_all_feature_vs_weights_on_cluster_2_data/6_1_am_workday_all_feature_vs_weights_on_cluster_2_data.csv")

In [431]:
print(all_feature_df.head())

                        Feature  Weight
0        Employee_Annual_Salary     0.0
1  Monthly_Medical_contribution     0.0
2   Monthly_Dental_Contribution     0.0
3   Monthly_Vision_Contribution     0.0
4                         Bonus     0.0


In [432]:
all_feature_df_Current_Role = all_feature_df[all_feature_df['Feature'].str.startswith('Current_Role_')]
print(all_feature_df_Current_Role)


                                  Feature    Weight
10   Current_Role_Chief Operating Officer  0.163351
11  Current_Role_Chief Technology Officer -0.163351
12          Current_Role_VP of Technology  0.000000


In [433]:
Current_Role_df = all_feature_df_Current_Role.copy()

In [434]:
Current_Role_df['Feature'] = Current_Role_df['Feature'].str.removeprefix('Current_Role_')


In [435]:
print(Current_Role_df)

                     Feature    Weight
10   Chief Operating Officer  0.163351
11  Chief Technology Officer -0.163351
12          VP of Technology  0.000000


In [436]:
# Create the folder if it doesn't exist
os.makedirs("6_2_am_workday_current_role_vs_weights_on_cluster_2_data", exist_ok=True)

# Save the Excel file inside the folder
Current_Role_df.to_csv("6_2_am_workday_current_role_vs_weights_on_cluster_2_data/6_2_am_workday_current_role_vs_weights_on_cluster_2_data.csv", index=False)

In [437]:
# spark_df = spark.createDataFrame(Current_Role_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.6_2_am_workday_current_role_vs_weights_on_cluster_2_data")

### Saving Department & Weights

In [438]:
all_feature_df_Department = all_feature_df[all_feature_df['Feature'].str.startswith('Department_')]
print(all_feature_df_Department)


                 Feature        Weight
13  Department_Executive -5.022194e-18


In [439]:
Department_df = all_feature_df_Department.copy()

In [440]:
Department_df['Feature'] = Department_df['Feature'].str.removeprefix('Department_')

In [441]:
# Create the folder if it doesn't exist
os.makedirs("6_3_am_workday_department_vs_weights_on_cluster_2_data", exist_ok=True)

# Save the Excel file inside the folder
Department_df.to_csv("6_3_am_workday_department_vs_weights_on_cluster_2_data/6_3_am_workday_department_vs_weights_on_cluster_2_data.csv", index=False)

In [442]:
# spark_df = spark.createDataFrame(Department_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.6_3_am_workday_department_vs_weights_on_cluster_2_data")

### Saving Gender & Weights

In [443]:
all_feature_df_Gender = all_feature_df[all_feature_df['Feature'].str.startswith('Gender_')]
print(all_feature_df_Gender)


          Feature    Weight
14  Gender_Female  0.163351
15    Gender_Male -0.163351


In [444]:
Gender_df = all_feature_df_Gender.copy()

In [445]:
Gender_df['Feature'] = Gender_df['Feature'].str.removeprefix('Gender_')

In [446]:
# Create the folder if it doesn't exist
os.makedirs("6_4_am_workday_gender_vs_weights_on_cluster_2_data", exist_ok=True)

# Save the Excel file inside the folder
Gender_df.to_csv("6_4_am_workday_gender_vs_weights_on_cluster_2_data/6_4_am_workday_gender_vs_weights_on_cluster_2_data.csv", index=False)

In [447]:
# spark_df = spark.createDataFrame(Gender_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.k_cluster_logistic_regression.6_4_am_workday_gender_vs_weights_on_cluster_2_data")