In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [2]:
# Read the dataset from a path relative to the notebook's working directory,
# then preview the first rows to sanity-check the columns.
csv_path = r'data/dataset_e1.csv'
df = pd.read_csv(csv_path)
df.head(n = 5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,0.15,1102,0.206,1,2,0.147,1,1,...,3,1,0,8,0,1,6,4,0,5
1,49,0,0.249,279,0.138,8,1,0.147,1,2,...,4,4,1,10,3,3,10,7,1,7
2,37,1,0.15,1373,0.138,2,2,0.134,1,4,...,3,2,0,7,3,3,0,0,0,0
3,33,0,0.249,1392,0.138,3,4,0.147,1,5,...,3,3,0,8,3,3,8,7,3,0
4,27,0,0.15,591,0.138,2,1,0.136,1,7,...,3,4,1,6,3,3,2,2,2,2


In [3]:
y = df['Attrition']
X = df.drop(['Attrition'], axis = 1)
X = scale(X)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [5]:
# Balance the classes by oversampling the TRAINING split only; the test split
# is left untouched so the metrics reflect the real class distribution.
# random_state pins the synthetic samples SMOTE generates -- without it every
# run produces a different training set and therefore different metrics.
# NOTE: `os` is the oversampler here, not the stdlib module; the name is kept
# so any existing reference to it keeps working.
os = SMOTE(random_state = 1)
X_train, y_train = os.fit_resample(X_train, y_train)

In [6]:
def model_run(model):
    """Fit ``model`` on the training split and print its test-split scores.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` variables. Prints the estimator itself, then accuracy,
    precision, recall and F1 computed on the test predictions, followed by
    an empty line. Nothing is returned.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # (label, scoring function) pairs, in the order they are reported.
    report = (
        ('Accuracy:', accuracy_score),
        ('Precision:', precision_score),
        ('Recall:', recall_score),
        ('F1 score:', f1_score),
    )

    print(model)
    for label, metric in report:
        print(label, metric(y_test, predictions))
    print('')


In [7]:
# Candidate classifiers to compare. Each one uses the library defaults except
# for a fixed random_state so repeated runs give the same fitted model.
model_list = [
    LogisticRegression(random_state = 1),
    RandomForestClassifier(random_state = 1),
    GradientBoostingClassifier(random_state = 1),
    XGBClassifier(random_state = 1),
]

# DEFAULT MODELS

In [8]:
# Fit and score every candidate in model_list, in order; model_run prints the
# estimator followed by its accuracy, precision, recall and F1 on the test split.
for model in model_list:
    model_run(model)

LogisticRegression(random_state=1)
Accuracy: 0.7527173913043478
Precision: 0.4049586776859504
Recall: 0.7205882352941176
F1 score: 0.5185185185185185

RandomForestClassifier(random_state=1)
Accuracy: 0.845108695652174
Precision: 0.7037037037037037
Recall: 0.27941176470588236
F1 score: 0.4

GradientBoostingClassifier(random_state=1)
Accuracy: 0.8532608695652174
Precision: 0.6666666666666666
Recall: 0.4117647058823529
F1 score: 0.509090909090909

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method=None, validate_para