# 0. Import libraries and load dataset

In [1]:
import numpy as np
import pandas as pd

import sys

In [2]:
# Classification models
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score

## Paths

In [3]:
# Check whether the code is running in Google Colab or in a local environment
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
  # Mount Google Drive to access datasets
  from google.colab import drive, userdata
  gdrive_string = '/content/drive'
  drive.mount(gdrive_string)

  dataset_path = f'{gdrive_string}/MyDrive/BPI DATA Wave 2025/Potential Datasets'
  employee_df = pd.read_csv(f'{dataset_path}/banking_employee_data.csv')
else:
  # Local environment setup
  dataset_path = '../Datasets'
  employee_df = pd.read_csv(f'{dataset_path}/banking_employee_data.csv')

In [4]:
employee_df.head()

Unnamed: 0,Department,Gender,Job_Title,Employee_ID,Age,Hire_Date,Years_At_Company,Education_Level,Performance_Score,Monthly_Salary,...,Overtime_Hours,Sick_Days,Remote_Work_Frequency,Team_Size,Training_Hours,Promotions,Employee_Satisfaction_Score,Resigned,Resignation_Date,Hiring_Age
0,Retail Banking,Male,Branch Manager,1,60,1988-01-01,37.6,Bachelor,3.22,149436,...,0,5,4,2,19,4,3.11,False,,22
1,Retail Banking,Female,Branch Manager,2,60,1988-01-01,37.6,Bachelor,3.7,220296,...,3,2,3,8,41,4,4.07,False,,22
2,Compliance,Male,Compliance Officer,3,60,1988-01-02,37.6,Bachelor,3.62,161636,...,0,5,2,6,10,3,3.19,False,,22
3,Risk Management,Male,Risk Manager,4,60,1988-01-04,37.59,Master,3.28,228415,...,13,6,3,3,32,2,2.59,False,,22
4,Compliance,Female,Compliance Officer,5,60,1988-01-04,37.59,Bachelor,3.31,165926,...,13,6,4,21,16,2,3.18,False,,22


In [5]:
employee_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 22 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Department                   100000 non-null  object 
 1   Gender                       100000 non-null  object 
 2   Job_Title                    100000 non-null  object 
 3   Employee_ID                  100000 non-null  int64  
 4   Age                          100000 non-null  int64  
 5   Hire_Date                    100000 non-null  object 
 6   Years_At_Company             100000 non-null  float64
 7   Education_Level              100000 non-null  object 
 8   Performance_Score            100000 non-null  float64
 9   Monthly_Salary               100000 non-null  int64  
 10  Work_Hours_Per_Week          100000 non-null  int64  
 11  Projects_Handled             100000 non-null  int64  
 12  Overtime_Hours               100000 non-null  int64  
 13  

# 1. Modeling

In [6]:
# Convert Hire_Date to datetime and extract features
employee_df['Hire_Date'] = pd.to_datetime(employee_df['Hire_Date'])
employee_df['Hire_Year'] = employee_df['Hire_Date'].dt.year
employee_df['Hire_Month'] = employee_df['Hire_Date'].dt.month
employee_df['Hire_Day'] = employee_df['Hire_Date'].dt.day

In [7]:
# Set target
target = "Resigned"
cols_to_drop = ["Employee_ID", "Hire_Date", "Resignation_Date", target]

X = employee_df.drop(columns=cols_to_drop)
y = employee_df[target].astype('int')

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

# Create a preprocessor
preprocessor = ColumnTransformer(
  transformers=[
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
  ],
  remainder='passthrough'
)

## XGBoost

In [8]:
# Create a pipeline for XGBoost
pipe_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
])

pipe_xgb.fit(X_train, y_train)

y_pred_xgb = pipe_xgb.predict(X_test)
y_prob_xgb = pipe_xgb.predict_proba(X_test)[:, 1]

print("Metrics of XGBoost:")
print(f'Accuracy: {accuracy_score(y_test, y_pred_xgb)}')
print(f'ROC AUC: {roc_auc_score(y_test, y_prob_xgb)}')
print(f'F1 Score: {f1_score(y_test, y_pred_xgb)}')
print(f'Training Accuracy: {accuracy_score(y_train, pipe_xgb.predict(X_train))}')
print(f'\nClassification Report:\n {classification_report(y_test, y_pred_xgb)}')
print(f'Confusion Matrix:\n {confusion_matrix(y_test, y_pred_xgb)}')

Parameters: { "use_label_encoder" } are not used.



Metrics of XGBoost:
Accuracy: 0.99195
ROC AUC: 0.9979332856290378
F1 Score: 0.9821527546835163
Training Accuracy: 0.9950125

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99     15417
           1       1.00      0.97      0.98      4583

    accuracy                           0.99     20000
   macro avg       0.99      0.98      0.99     20000
weighted avg       0.99      0.99      0.99     20000

Confusion Matrix:
 [[15409     8]
 [  153  4430]]


## Random Forest

In [9]:
pipe_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

pipe_rf.fit(X_train, y_train)

y_pred_rf = pipe_rf.predict(X_test)
y_prob_rf = pipe_rf.predict_proba(X_test)[:, 1]

print("Metrics of Random Forest:")
print(f'Accuracy: {accuracy_score(y_test, y_pred_rf)}')
print(f'ROC AUC: {roc_auc_score(y_test, y_prob_rf)}')
print(f'F1 Score: {f1_score(y_test, y_pred_rf)}')
print(f'Training Accuracy: {accuracy_score(y_train, pipe_rf.predict(X_train))}')
print(f'\nClassification Report:\n {classification_report(y_test, y_pred_rf)}')
print(f'Confusion Matrix:\n {confusion_matrix(y_test, y_pred_rf)}')

Metrics of Random Forest:
Accuracy: 0.9439
ROC AUC: 0.9825837499038124
F1 Score: 0.8659498207885304
Training Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.99      0.96     15417
           1       0.96      0.79      0.87      4583

    accuracy                           0.94     20000
   macro avg       0.95      0.89      0.92     20000
weighted avg       0.94      0.94      0.94     20000

Confusion Matrix:
 [[15254   163]
 [  959  3624]]


## Logistic Regression

In [10]:
pipe_logreg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

pipe_logreg.fit(X_train, y_train)

y_pred_logreg = pipe_logreg.predict(X_test)
y_prob_logreg = pipe_logreg.predict_proba(X_test)[:, 1]

print("Metrics of Logistic Regression:")
print(f'Accuracy: {accuracy_score(y_test, y_pred_logreg)}')
print(f'ROC AUC: {roc_auc_score(y_test, y_prob_logreg)}')
print(f'F1 Score: {f1_score(y_test, y_pred_logreg)}')
print(f'Training Accuracy: {accuracy_score(y_train, pipe_logreg.predict(X_train))}')
print(f'\nClassification Report:\n {classification_report(y_test, y_pred_logreg)}')
print(f'Confusion Matrix:\n {confusion_matrix(y_test, y_pred_logreg)}')

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Metrics of Logistic Regression:
Accuracy: 0.87515
ROC AUC: 0.9121992576126925
F1 Score: 0.6944818304172274
Training Accuracy: 0.8709625

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.95      0.92     15417
           1       0.79      0.62      0.69      4583

    accuracy                           0.88     20000
   macro avg       0.84      0.79      0.81     20000
weighted avg       0.87      0.88      0.87     20000

Confusion Matrix:
 [[14665   752]
 [ 1745  2838]]
