In [42]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import pandas_profiling
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import patsy


from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

%config InlineBackend.figure_format = 'svg'

In [43]:
# Load the pickled DataFrame stored one directory above the notebook and preview
# its first rows.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
df = pd.read_pickle('../emp_df_clean')
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,RandD,accounting,hr,management,marketing,product_mng,support,technical,left
0,0.38,0.53,2,157,3,0,0,1,0,0,0,0,0,0,0,0,1
1,0.8,0.86,5,262,6,0,0,2,0,0,0,0,0,0,0,0,1
2,0.11,0.88,7,272,4,0,0,2,0,0,0,0,0,0,0,0,1
3,0.72,0.87,5,223,5,0,0,1,0,0,0,0,0,0,0,0,1
4,0.37,0.52,2,159,3,0,0,1,0,0,0,0,0,0,0,0,1


## Feature Engineering Before Normalization

### Feature 1 - last_evaluation * number_project

In [44]:
# Interaction term: evaluation score multiplied by the number of projects.
df['int_term_1'] = df['last_evaluation'] * df['number_project']

In [45]:
# First-pass target/feature split: the target is 'left'; the features drop the
# target and the two columns that were multiplied into int_term_1.
# NOTE(review): y and X are reassigned by the patsy.dmatrices cell further down
# before any visible cell reads these values.
y = df['left']
X = df.drop(columns=['left', 'last_evaluation', 'number_project'])

- Drop duplicate rows before the train/test split so the same record cannot end up in both the training and the test data

In [46]:
# Keep only rows that are not flagged as repeats of an earlier row, then report
# the size of what remains.
df_no_dupelicates = df[~df.duplicated()]
df_no_dupelicates.shape

(11989, 18)

In [51]:
# List the column labels of the de-duplicated frame.
df_no_dupelicates.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_monthly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'salary', 'RandD', 'accounting', 'hr',
       'management', 'marketing', 'product_mng', 'support', 'technical',
       'left', 'int_term_1'],
      dtype='object')

In [52]:
def patsy_names(df, dependent_var, *excluded_cols):
    '''
    Generates the R style formula for statsmodels (patsy) given
    the dataframe, dependent variable and optional excluded columns
    as strings.

    Parameters
    ----------
    df : DataFrame whose column labels become the formula terms.
    dependent_var : label of the column placed on the left of '~'.
    *excluded_cols : labels of columns to leave out of the right-hand side.
        Listing the same label more than once is allowed.

    Returns
    -------
    str of the form 'y ~ a + b + c', with predictors in column order.
    Labels that are not plain Python identifiers (spaces, punctuation,
    keywords) are wrapped in patsy's Q('...') so the formula still parses.

    Raises
    ------
    ValueError if dependent_var or any excluded column is not in df.
    '''
    import keyword  # local import: only needed for the identifier check below

    def _term(label):
        # Plain identifiers go into the formula verbatim; anything else must be
        # quoted or patsy would try to parse it as an expression.
        if not isinstance(label, str):
            return str(label)
        if label.isidentifier() and not keyword.iskeyword(label):
            return label
        return 'Q(%r)' % label

    df_columns = list(df.columns.values)

    if dependent_var not in df_columns:
        raise ValueError('dependent variable %r is not a column of df' % (dependent_var,))

    unknown = [col for col in excluded_cols if col not in df_columns]
    if unknown:
        raise ValueError('excluded columns not found in df: %r' % (unknown,))

    dropped = {dependent_var, *excluded_cols}
    predictors = [col for col in df_columns if col not in dropped]
    return _term(dependent_var) + ' ~ ' + ' + '.join(_term(col) for col in predictors)

In [53]:
# Preview the generated formula with 'left' as the dependent variable and no
# columns excluded.
patsy_names(df_no_dupelicates, 'left')

'left ~ satisfaction_level + last_evaluation + number_project + average_monthly_hours + time_spend_company + Work_accident + promotion_last_5years + salary + RandD + accounting + hr + management + marketing + product_mng + support + technical + int_term_1'

In [54]:
# Build the design matrices from the de-duplicated frame. The formula is
# generated by patsy_names() instead of being pasted in by hand, so it cannot
# drift out of sync with the frame's columns.
formula = patsy_names(df_no_dupelicates, 'left')
y, X = patsy.dmatrices(formula, data=df_no_dupelicates, return_type="dataframe")

# dmatrices returns the target as a one-column DataFrame; scikit-learn expects a
# 1-D target (and warns about column vectors), so keep that column as a Series.
y = y.iloc[:, 0]

In [55]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [56]:
# Standardize the features: the scaler is fitted on the training split only and
# the same transform is then applied to the test split.
# np.asarray (rather than .values) keeps this cell re-runnable: after the first
# run X_train / X_test are already ndarrays, which have no .values attribute.
scaler = StandardScaler()
X_train = scaler.fit_transform(np.asarray(X_train))
X_test = scaler.transform(np.asarray(X_test))

In [57]:
# Column labels of the design matrix X (X_train / X_test are plain arrays after
# scaling, so the names are only available here).
X.columns

Index(['Intercept', 'satisfaction_level', 'last_evaluation', 'number_project',
       'average_monthly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'salary', 'RandD', 'accounting', 'hr',
       'management', 'marketing', 'product_mng', 'support', 'technical',
       'int_term_1'],
      dtype='object')

KNN

In [58]:
# TODO: find out why f1 score gives knn n of 1
# Candidate neighbourhood sizes: the odd numbers from 1 through 19.
ks = range(1,20,2)
param_grid = [{'n_neighbors': ks}]

# 5-fold cross-validated search over n_neighbors, ranked by ROC AUC and run on
# all available cores.
knn = KNeighborsClassifier()
knn_grid = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
knn_results = knn_grid.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed:   17.7s remaining:    3.9s
[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed:   17.7s remaining:    1.1s


KeyboardInterrupt: 

In [None]:
# n_neighbors value of the best-scoring candidate in the KNN search.
knn_results.best_params_

In [None]:
# Mean cross-validated ROC AUC of the best KNN candidate.
knn_results.best_score_

SVM

In [None]:
# Kernels to compare in the SVM search.
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
param_grid = {'kernel': kernel}

# probability=True is kept because predict_proba is called on the refit best
# estimator when the ROC curves are drawn. random_state pins the shuffling SVC
# uses for its probability estimates, so results are repeatable across runs.
# NOTE(review): probability=True makes every fit in the search noticeably slower.
svc = SVC(probability=True, random_state=42)
svc_grid = GridSearchCV(svc, param_grid, cv=5, scoring='roc_auc', 
                       verbose=10, n_jobs=-1)
svc_results = svc_grid.fit(X_train, y_train)

In [None]:
# SVC refit with the best-scoring kernel from the search.
svc_results.best_estimator_

In [None]:
# Kernel chosen by the SVM search.
svc_results.best_params_

In [None]:
# Mean cross-validated ROC AUC of the best SVM candidate.
svc_results.best_score_

Logistic Regression

In [None]:
# Search both regularisation types; the liblinear solver supports l1 and l2.
# (The list previously read ['l2', 'l2'], which fitted every C value twice with
# the same penalty and never tried l1.)
penalty = ['l1', 'l2']
# 100 inverse-regularisation strengths, log-spaced from 1e-3 to 1e1.
C = np.logspace(-3, 1, 100)
param_grid = dict(C=C, penalty=penalty)

# 5-fold cross-validated search ranked by ROC AUC, run on all available cores.
logistic = linear_model.LogisticRegression(solver='liblinear', max_iter=10000)
logistic_grid = GridSearchCV(logistic, param_grid, cv=5, scoring='roc_auc', verbose=10, n_jobs=-1)
logistic_grid.fit(X_train, y_train)

In [None]:
# Logistic regression refit with the best-scoring C / penalty combination.
logistic_grid.best_estimator_

In [None]:
# Mean cross-validated ROC AUC of the best logistic-regression candidate.
logistic_grid.best_score_

Random Forest

In [None]:
# Random-forest search space: 3 * 4 * 4 * 4 = 192 combinations, each evaluated
# with 5-fold cross-validation.
param_grid = {
    'bootstrap': [True],
    'max_depth': [50, 100, None],
    'max_features': ['sqrt'], 
    'min_samples_leaf': [1, 2, 5, 10],
    'min_samples_split': [2, 3, 5, 10],
    'n_estimators': [100, 200, 400, 1000]
}

# Seed the forest (same seed as the train/test split) so the bootstrap samples
# and feature sub-sampling - and therefore the search result - are repeatable.
rf = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='roc_auc', verbose=10, n_jobs=-1)
rf_grid.fit(X_train, y_train)

In [None]:
# Random forest refit with the best-scoring hyperparameter combination.
rf_grid.best_estimator_

In [None]:
# Mean cross-validated ROC AUC of the best random-forest candidate.
rf_grid.best_score_

In [None]:
# Baseline classifier with DummyClassifier's default strategy, fitted on the
# training split.
# NOTE(review): the baseline is fitted here but not scored in the cells shown;
# the default strategy also depends on the scikit-learn version - confirm.
dummy = DummyClassifier()
dummy.fit(X_train, y_train)

In [None]:
# Report the best mean cross-validated ROC AUC of each search, one line per model.
for message, search in [
    ('Best ROC_AUC for knn: %0.4f', knn_grid),
    ('Best ROC_AUC for logit: %0.4f', logistic_grid),
    ('Best ROC_AUC for svm: %0.4f', svc_grid),
    ('Best ROC_AUC for rf: %0.4f', rf_grid),
]:
    print(message % search.best_score_)

In [None]:
# from sklearn.calibration import CalibratedClassifierCV
# clf = CalibratedClassifierCV(svc)
# clf.fit(X_train, y_train)

In [None]:
# clf.predict_proba(X_test)

In [None]:
# ROC curves on the held-out test split for every tuned model, plus a weighted
# blend of the logistic-regression and random-forest probabilities.
model_list = [knn_grid.best_estimator_, 
              logistic_grid.best_estimator_, 
              svc_grid.best_estimator_, 
              rf_grid.best_estimator_,
              'ensemble']

model_name = ['knn', 'logit', 'svm', 'random_forest', 'ensemble']

# Blend weights for logit (w1) and random forest (w2). They are normalised by
# their sum below so the blended score stays a valid probability in [0, 1];
# the ROC curve itself is unaffected by that rescaling.
w1 = 0.10
w2 = 0.80

fig, ax = plt.subplots(figsize=(10,8))
for name, model in zip(model_name, model_list):
    if isinstance(model, str):
        # The 'ensemble' entry is a placeholder, not an estimator.
        y_pred = (w1*logistic_grid.best_estimator_.predict_proba(X_test)[:,1] 
                  + w2*rf_grid.best_estimator_.predict_proba(X_test)[:,1]) / (w1 + w2)
    else:
        # Probability of the positive class (second predict_proba column).
        y_pred = model.predict_proba(X_test)[:,1]
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
    roc_auc = metrics.auc(fpr, tpr)
    ax.plot(fpr, tpr, label=(name + ' AUC = %0.4f' % roc_auc))

# Chance line for reference (unlabelled, so it stays out of the legend).
ax.plot([0, 1], [0, 1],'r--')
ax.legend(loc='lower right')
ax.set_title('Receiver Operating Characteristic')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()

### Final dataset

In [13]:
# df = pd.read_pickle('../emp_df_clean')
# df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,RandD,accounting,hr,management,marketing,product_mng,support,technical,left
0,0.38,0.53,2,157,3,0,0,1,0,0,0,0,0,0,0,0,1
1,0.8,0.86,5,262,6,0,0,2,0,0,0,0,0,0,0,0,1
2,0.11,0.88,7,272,4,0,0,2,0,0,0,0,0,0,0,0,1
3,0.72,0.87,5,223,5,0,0,1,0,0,0,0,0,0,0,0,1
4,0.37,0.52,2,159,3,0,0,1,0,0,0,0,0,0,0,0,1


In [14]:
# df.drop(columns=['last_evaluation', 'number_project'], inplace=True)

In [15]:
# df.to_pickle('emp_final_df')

In [16]:
# df.corr()

Unnamed: 0,satisfaction_level,average_monthly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,RandD,accounting,hr,management,marketing,product_mng,support,technical,left
satisfaction_level,1.0,-0.020048,-0.100866,0.058697,0.025605,0.050022,0.006615,-0.028649,-0.012841,0.007172,0.005715,0.006919,0.009185,-0.009345,-0.388375
average_monthly_hours,-0.020048,1.0,0.127755,-0.010143,-0.003544,-0.002242,-0.001177,0.000524,-0.010783,0.000834,-0.00821,-0.005494,-0.002444,0.013638,0.071287
time_spend_company,-0.100866,0.127755,1.0,0.00212,0.067433,0.048715,-0.021116,0.003909,-0.022194,0.115436,0.012096,-0.003919,-0.030111,-0.027991,0.144822
Work_accident,0.058697,-0.010143,0.00212,1.0,0.039245,0.009247,0.017167,-0.012836,-0.015649,0.011242,0.011367,0.001246,0.012079,-0.00607,-0.154622
promotion_last_5years,0.025605,-0.003544,0.067433,0.039245,1.0,0.098119,0.021268,-0.004852,-0.001531,0.128087,0.049253,-0.037288,-0.035605,-0.035799,-0.061788
salary,0.050022,-0.002242,0.048715,0.009247,0.098119,1.0,0.0028,0.012759,0.004599,0.156665,0.011599,-0.007669,-0.029888,-0.01863,-0.157898
RandD,0.006615,-0.001177,-0.021116,0.017167,0.021268,0.0028,1.0,-0.054629,-0.05357,-0.049274,-0.057965,-0.059525,-0.098315,-0.110755,-0.046596
accounting,-0.028649,0.000524,0.003909,-0.012836,-0.004852,0.012759,-0.054629,1.0,-0.052848,-0.04861,-0.057183,-0.058723,-0.096989,-0.109262,0.015201
hr,-0.012841,-0.010783,-0.022194,-0.015649,-0.001531,0.004599,-0.05357,-0.052848,1.0,-0.047667,-0.056075,-0.057584,-0.095109,-0.107143,0.028249
management,0.007172,0.000834,0.115436,0.011242,0.128087,0.156665,-0.049274,-0.04861,-0.047667,1.0,-0.051578,-0.052966,-0.087482,-0.098551,-0.046035
