In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
import xgboost as xgb

In [25]:
# Load the raw employee-attrition dataset; the relative path is resolved
# against the notebook's working directory.
df = pd.read_csv('employee-attrition.csv')

In [26]:
# Lower-case every column label so later cells can rely on a single spelling
# (e.g. `attrition` rather than `Attrition`).
df.columns = df.columns.map(str.lower)

Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeecount,employeenumber,...,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [28]:
# Names of all object-dtype (string) columns.
categorical_columns = df.dtypes[df.dtypes == 'object'].index.tolist()

# Normalise the string values: lower-case first, then swap spaces for
# underscores, so categories such as "Life Sciences" become "life_sciences".
for col in categorical_columns:
    lowered = df[col].str.lower()
    df[col] = lowered.str.replace(' ', '_')

In [29]:
# Display the frame to check the normalised column names and string values.
df

Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeecount,employeenumber,...,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager
0,41,yes,travel_rarely,1102,sales,1,2,life_sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,no,travel_frequently,279,research_&_development,8,1,life_sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,yes,travel_rarely,1373,research_&_development,2,2,other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,no,travel_frequently,1392,research_&_development,3,4,life_sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,no,travel_rarely,591,research_&_development,2,1,medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,no,travel_frequently,884,research_&_development,23,2,medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,no,travel_rarely,613,research_&_development,6,1,medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,no,travel_rarely,155,research_&_development,4,3,life_sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,no,travel_frequently,1023,sales,2,3,medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [30]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split reproducible.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=11)
# Split the remaining 80% again: 25% of it (20% of the whole dataset) becomes
# the validation set, giving a 60/20/20 train/val/test partition.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=11)

In [31]:
# Replace the shuffled row labels of each split with a fresh 0..n-1 index
# (the old labels are discarded, not kept as a column).
df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

In [32]:
# Binary targets as NumPy arrays: 1 where attrition == 'yes', 0 otherwise,
# one array per split (in the same row order as the corresponding frame).
y_train, y_val, y_test, y_full_train = (
    (split.attrition == 'yes').astype('int').values
    for split in (df_train, df_val, df_test, df_full_train)
)

In [33]:
# Remove the target column from every feature frame so it cannot be used as
# an input feature. This mutates the frames in place, so re-running the cell
# raises KeyError because the column is already gone.
for split in (df_train, df_val, df_test, df_full_train):
    del split['attrition']

In [34]:
# Convert each feature frame into a list of {column: value} dicts, one dict
# per row, which is the input format DictVectorizer consumes.
dict_train, dict_val, dict_full_train = (
    split.to_dict(orient='records')
    for split in (df_train, df_val, df_full_train)
)

In [35]:
# Vectorizer that turns the per-row dicts into a dense feature matrix
# (string fields are one-hot encoded, numeric fields are passed through).
dv = DictVectorizer(sparse=False)

# Fit on the training split only. X_train is paired with y_train in the
# training cell, so its rows must come from df_train; fitting on
# dict_full_train would give a matrix with more rows than y_train has labels
# and would also leak the validation rows into training.
X_train = dv.fit_transform(dict_train)
# The validation set is only transformed with the vocabulary learned above.
X_val = dv.transform(dict_val)

In [36]:
# Feature names in the column order produced by the vectorizer (not used
# further in this cell).
features = dv.get_feature_names_out()

# Booster configuration: shallow trees with a moderate learning rate,
# logistic objective for the binary target, AUC as the reported metric.
xgb_params = dict(
    eta=0.1,
    max_depth=3,
    min_child_weight=1,
    objective='binary:logistic',
    eval_metric='auc',
    nthread=8,
    seed=1,
)

# Wrap the matrices and labels in XGBoost's native container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Datasets whose metric is reported during training, with their display names.
watchlist = [(dtrain, 'train'), (dval, 'val')]

# Train for 170 rounds, printing the train/val AUC every 10 rounds.
model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=170,
    evals=watchlist,
    verbose_eval=10,
)

[0]	train-auc:0.76941	val-auc:0.69678
[10]	train-auc:0.86264	val-auc:0.78584
[20]	train-auc:0.91646	val-auc:0.79746
[30]	train-auc:0.93890	val-auc:0.80956
[40]	train-auc:0.96166	val-auc:0.81860
[50]	train-auc:0.97474	val-auc:0.82134
[60]	train-auc:0.98121	val-auc:0.82588
[70]	train-auc:0.98655	val-auc:0.82580
[80]	train-auc:0.99019	val-auc:0.83199
[90]	train-auc:0.99139	val-auc:0.83175
[100]	train-auc:0.99320	val-auc:0.83207
[110]	train-auc:0.99458	val-auc:0.83246
[120]	train-auc:0.99591	val-auc:0.83011
[130]	train-auc:0.99688	val-auc:0.83442
[140]	train-auc:0.99829	val-auc:0.83262
[150]	train-auc:0.99910	val-auc:0.82972
[160]	train-auc:0.99940	val-auc:0.83402
[169]	train-auc:0.99971	val-auc:0.83598


In [37]:
# Score the validation DMatrix with the trained booster; the model was trained
# with the 'binary:logistic' objective, so these are presumably positive-class
# probabilities rather than hard labels.
y_pred_xgb = model.predict(dval)

In [38]:
# ROC AUC of the booster's scores against the validation labels.
roc_auc_score(y_val, y_pred_xgb)

0.8359821498473342

In [39]:
# NOTE(review): this cell repeats the previous cell's AUC computation verbatim
# and yields the same value; it is a candidate for removal.
roc_auc_score(y_val, y_pred_xgb)

0.8359821498473342

In [40]:
import bentoml

In [41]:
# Persist the booster in the local BentoML model store under the name
# 'employee_attrition'. The fitted DictVectorizer is bundled as a custom
# object so the same preprocessing can be loaded alongside the model.
bento_custom_objects = {'dictVectorizer': dv}
bentoml.xgboost.save_model(
    'employee_attrition', model, custom_objects=bento_custom_objects
)

Model(tag="employee_attrition:fiyrdzk52c3f2ia6", path="C:\Users\ivtpr\bentoml\models\employee_attrition\fiyrdzk52c3f2ia6\")

In [20]:
# Inspect one training record as a sample payload.
# NOTE(review): the recorded output carries execution count 20 and shows only
# six keys, so it appears to come from an earlier run with fewer columns;
# re-run after the cells above to refresh it.
dict_train[100]

{'businesstravel': 'travel_rarely',
 'department': 'sales',
 'educationfield': 'marketing',
 'gender': 'female',
 'jobrole': 'sales_executive',
 'age': 43}