In [57]:
import pandas as pd

from category_encoders import OrdinalEncoder
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier



In [58]:
df = pd.read_csv('../data/clean_HR_Analytics.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,...,3,4,1,6,3,3,2,2,2,2


In [59]:
# Vertical split 
target = "Attrition"
X = df.drop(columns=target)
y = df[target]

In [60]:
label = LabelEncoder()
y_encoded = label.fit_transform(y)

ordinal = OrdinalEncoder()
X_encoded = ordinal.fit_transform(X)

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

In [11]:
smote = SMOTE()
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)

In [71]:
def train_model(clf, params):
    pipeline = make_pipeline(
        # StandardScaler(),
        clf
    )
    # model = clf


    model = GridSearchCV(
        pipeline,
        param_grid=params,
        cv=5,
        n_jobs=-1,
        verbose=1
    )


    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)

    print("Training Accuracy: ", round(train_acc, 2))
    print("Test Accuracy: ", round(test_acc, 2))

    cm = confusion_matrix(y_test, y_pred)
    print("---confusion matrix----")
    print(cm)

    return model

In [46]:
clf = LogisticRegression()
# train_model(clf)

In [70]:
clf = LogisticRegression(max_iter=10000)
train_model(clf)

Training Accuracy:  0.86
Test Accuracy:  0.87
---confusion matrix----
[[247   8]
 [ 31   8]]


LogisticRegression(max_iter=10000)

#### RandomForest

In [38]:
clf = RandomForestClassifier()
params = {
    "randomforestclassifier__n_estimators": range(30, 100, 10),
    "randomforestclassifier__max_depth": range(10, 50, 5)
}

rf_model = train_model(clf=clf, params=params)

Fitting 5 folds for each of 56 candidates, totalling 280 fits
Training Accuracy:  1.0
Test Accuracy:  0.86
---confusion matrix----
[[242  13]
 [ 29  10]]


In [48]:
clf = RandomForestClassifier()
params = {
    "randomforestclassifier__n_estimators": range(30, 100, 10),
    "randomforestclassifier__max_depth": range(10, 50, 5)
}

rf_model = train_model(clf=clf, params=params)

Fitting 5 folds for each of 56 candidates, totalling 280 fits
Training Accuracy:  1.0
Test Accuracy:  0.86
---confusion matrix----
[[242  13]
 [ 28  11]]


In [72]:
clf = RandomForestClassifier()
params = {
    "randomforestclassifier__n_estimators": range(30, 100, 10),
    "randomforestclassifier__max_depth": range(10, 50, 5)
}

rf_model = train_model(clf=clf, params=params)

Fitting 5 folds for each of 56 candidates, totalling 280 fits
Training Accuracy:  1.0
Test Accuracy:  0.87
---confusion matrix----
[[252   3]
 [ 35   4]]


#### GradientBoosting

In [40]:
clf = GradientBoostingClassifier()
params = {
    "gradientboostingclassifier__max_depth": range(2, 5, 1),
    "gradientboostingclassifier__n_estimators": range(20, 40, 5),
}

gd_model = train_model(clf=clf, params=params)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Training Accuracy:  0.94
Test Accuracy:  0.85
---confusion matrix----
[[233  22]
 [ 23  16]]


In [51]:
clf = GradientBoostingClassifier()
params = {
    "gradientboostingclassifier__max_depth": range(2, 5, 1),
    "gradientboostingclassifier__n_estimators": range(20, 40, 5),
}

gd_model = train_model(clf=clf, params=params)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Training Accuracy:  0.94
Test Accuracy:  0.84
---confusion matrix----
[[233  22]
 [ 24  15]]


In [73]:
clf = GradientBoostingClassifier()
params = {
    "gradientboostingclassifier__max_depth": range(2, 5, 1),
    "gradientboostingclassifier__n_estimators": range(20, 40, 5),
}

gd_model = train_model(clf=clf, params=params)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Training Accuracy:  0.9
Test Accuracy:  0.87
---confusion matrix----
[[252   3]
 [ 35   4]]


#### AdaBoostClassifier

In [41]:
clf = AdaBoostClassifier(random_state=42)
params = {
    "adaboostclassifier__n_estimators": range(30, 100, 10)
}

ab_model = train_model(clf=clf, params=params)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
Training Accuracy:  0.91
Test Accuracy:  0.83
---confusion matrix----
[[221  34]
 [ 17  22]]


In [53]:
clf = AdaBoostClassifier(random_state=42)
params = {
    "adaboostclassifier__n_estimators": range(30, 100, 10)
}
ab_model = train_model(clf=clf, params=params)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
Training Accuracy:  0.91
Test Accuracy:  0.83
---confusion matrix----
[[221  34]
 [ 17  22]]


In [74]:
clf = AdaBoostClassifier(random_state=42)
params = {
    "adaboostclassifier__n_estimators": range(30, 100, 10)
}
ab_model = train_model(clf=clf, params=params)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
Training Accuracy:  0.91
Test Accuracy:  0.86
---confusion matrix----
[[242  13]
 [ 29  10]]


#### XGBoostClassifier

In [42]:
clf = XGBClassifier()

params = {
    'xgbclassifier__n_estimators': range(100, 300, 50),
    'xgbclassifier__max_depth': range(3, 8, 2),
    'xgbclassifier__learning_rate': [0.1, 0.01, 0.001]
}

xg_model = train_model(clf=clf, params=params)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Training Accuracy:  1.0
Test Accuracy:  0.86
---confusion matrix----
[[241  14]
 [ 28  11]]


In [55]:
clf = XGBClassifier()

params = {
    'xgbclassifier__n_estimators': range(100, 300, 50),
    'xgbclassifier__max_depth': range(3, 8, 2),
    'xgbclassifier__learning_rate': [0.1, 0.01, 0.001]
}

xg_model = train_model(clf=clf, params=params)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Training Accuracy:  1.0
Test Accuracy:  0.86
---confusion matrix----
[[241  14]
 [ 28  11]]


In [75]:
clf = XGBClassifier()

params = {
    'xgbclassifier__n_estimators': range(100, 300, 50),
    'xgbclassifier__max_depth': range(3, 8, 2),
    'xgbclassifier__learning_rate': [0.1, 0.01, 0.001]
}

xg_model = train_model(clf=clf, params=params)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Training Accuracy:  0.94
Test Accuracy:  0.87
---confusion matrix----
[[250   5]
 [ 34   5]]
