In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import scale

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

In [2]:
df = pd.read_csv(r'data/dataset_e1.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,0.15,1102,0.206,1,2,0.147,1,1,...,3,1,0,8,0,1,6,4,0,5
1,49,0,0.249,279,0.138,8,1,0.147,1,2,...,4,4,1,10,3,3,10,7,1,7
2,37,1,0.15,1373,0.138,2,2,0.134,1,4,...,3,2,0,7,3,3,0,0,0,0
3,33,0,0.249,1392,0.138,3,4,0.147,1,5,...,3,3,0,8,3,3,8,7,3,0
4,27,0,0.15,591,0.138,2,1,0.136,1,7,...,3,4,1,6,3,3,2,2,2,2


In [3]:
y = df['Attrition']
X = df.drop(['Attrition'], axis = 1)
X = scale(X)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, stratify = y)

In [5]:
os = SMOTE()
X_train_os, y_train_os = os.fit_resample(X_train, y_train)

In [6]:
def model_metrics(y_test, y_pred):
    print('Accuracy:\t', accuracy_score(y_test, y_pred).round(3))
    print('Precision:\t', precision_score(y_test, y_pred).round(3))
    print('Recall:\t\t', recall_score(y_test, y_pred).round(3))
    print('F1 score:\t', f1_score(y_test, y_pred).round(3))
    print('ROC AUC score:\t', roc_auc_score(y_test, y_pred).round(3))

In [7]:
model_list = []
model_list.append(LogisticRegression(random_state = 1))
model_list.append(RandomForestClassifier(random_state = 1))
model_list.append(GradientBoostingClassifier(random_state = 1))
model_list.append(XGBClassifier(random_state = 1))

# DEFAULT MODELS - no oversampling

In [8]:
for model in model_list:
    print(model)
    model.fit(X_train, y_train)
    print('Train')
    model_metrics(y_train, model.predict(X_train))
    print('')
    print('Test')
    model_metrics(y_test, model.predict(X_test))
    print('')
    print('')

LogisticRegression(random_state=1)
Train
Accuracy:	 0.89
Precision:	 0.782
Recall:		 0.444
F1 score:	 0.566
ROC AUC score:	 0.71

Test
Accuracy:	 0.886
Precision:	 0.689
Recall:		 0.525
F1 score:	 0.596
ROC AUC score:	 0.74


RandomForestClassifier(random_state=1)
Train
Accuracy:	 1.0
Precision:	 1.0
Recall:		 1.0
F1 score:	 1.0
ROC AUC score:	 1.0

Test
Accuracy:	 0.867
Precision:	 0.857
Recall:		 0.203
F1 score:	 0.329
ROC AUC score:	 0.598


GradientBoostingClassifier(random_state=1)
Train
Accuracy:	 0.963
Precision:	 0.993
Recall:		 0.775
F1 score:	 0.871
ROC AUC score:	 0.887

Test
Accuracy:	 0.867
Precision:	 0.647
Recall:		 0.373
F1 score:	 0.473
ROC AUC score:	 0.667


XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_chi

# DEFAULT MODELS - with oversampling

In [9]:
for model in model_list:
    print(model)
    model.fit(X_train_os, y_train_os)
    print('Train')
    model_metrics(y_train_os, model.predict(X_train_os))
    print('')
    print('Test')
    model_metrics(y_test, model.predict(X_test))
    print('')
    print('')

LogisticRegression(random_state=1)
Train
Accuracy:	 0.809
Precision:	 0.794
Recall:		 0.834
F1 score:	 0.814
ROC AUC score:	 0.809

Test
Accuracy:	 0.739
Precision:	 0.363
Recall:		 0.831
F1 score:	 0.505
ROC AUC score:	 0.776


RandomForestClassifier(random_state=1)
Train
Accuracy:	 1.0
Precision:	 1.0
Recall:		 1.0
F1 score:	 1.0
ROC AUC score:	 1.0

Test
Accuracy:	 0.864
Precision:	 0.61
Recall:		 0.424
F1 score:	 0.5
ROC AUC score:	 0.686


GradientBoostingClassifier(random_state=1)
Train
Accuracy:	 0.97
Precision:	 0.994
Recall:		 0.946
F1 score:	 0.969
ROC AUC score:	 0.97

Test
Accuracy:	 0.88
Precision:	 0.667
Recall:		 0.508
F1 score:	 0.577
ROC AUC score:	 0.73


XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missi

In [10]:
from sklearn.model_selection import GridSearchCV

In [11]:
params_1 = {'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
            'C' : [0.001, 0.01, 0.1, 1, 10, 100],
            'max_iter': [100, 200, 500]}
model_1 = LogisticRegression(random_state=1)
grid = GridSearchCV(model, params_1, cv = 3)

In [12]:
grid.fit(X_train_os, y_train_os)

GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=0.5, booster=None,
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0, gpu_id=-1,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.300000012,
                                     max_delta_step=0, max_depth=6,
                                     min_child_weight=1, missing=nan,
                                     monotone_constraints=None,
                                     n_estimators=100, n_jobs=0,
                                     num_parallel_tree=1, random_state=1,
                                     reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, subsample=1,
                                     tree_method=None,
                                     

In [13]:
grid.best_params_

{'C': 0.001, 'max_iter': 100, 'penalty': 'l1'}

In [14]:
print('Train')
model_metrics(y_train_os, grid.predict(X_train_os))
print('')
print('Test')
model_metrics(y_test, grid.predict(X_test))

Train
Accuracy:	 1.0
Precision:	 1.0
Recall:		 1.0
F1 score:	 1.0
ROC AUC score:	 1.0

Test
Accuracy:	 0.864
Precision:	 0.605
Recall:		 0.441
F1 score:	 0.51
ROC AUC score:	 0.693


In [15]:
params_2 = {'learning_rate' : [0.001, 0.01, 0.1, 0.5],
            'n_estimators' : [50, 100, 200],
            'max_depth': [3, 4, 5]}
model_2 = GradientBoostingClassifier(random_state=1)
grid_2 = GridSearchCV(model, params_2, cv = 3)

In [16]:
grid_2.fit(X_train_os, y_train_os)

GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=0.5, booster=None,
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0, gpu_id=-1,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.300000012,
                                     max_delta_step=0, max_depth=6,
                                     min_child_weight=1, missing=nan,
                                     monotone_constraints=None,
                                     n_estimators=100, n_jobs=0,
                                     num_parallel_tree=1, random_state=1,
                                     reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, subsample=1,
                                     tree_method=None,
                                     

In [17]:
grid_2.best_params_

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50}

In [18]:
print('Train')
model_metrics(y_train_os, grid_2.predict(X_train_os))
print('')
print('Test')
model_metrics(y_test, grid_2.predict(X_test))

Train
Accuracy:	 0.982
Precision:	 0.994
Recall:		 0.969
F1 score:	 0.981
ROC AUC score:	 0.982

Test
Accuracy:	 0.861
Precision:	 0.58
Recall:		 0.492
F1 score:	 0.532
ROC AUC score:	 0.712
