In [1]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import pandas_profiling
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import patsy


from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_recall_curve, make_scorer,fbeta_score, confusion_matrix
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE


%config InlineBackend.figure_format = 'svg'

In [2]:
df = pd.read_pickle('emp_df_final')
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,RandD,accounting,hr,management,marketing,product_mng,support,technical,left,int_term_1
0,0.38,0.53,2,157,3,0,0,1,0,0,0,0,0,0,0,0,1,1.06
1,0.8,0.86,5,262,6,0,0,2,0,0,0,0,0,0,0,0,1,4.3
2,0.11,0.88,7,272,4,0,0,2,0,0,0,0,0,0,0,0,1,6.16
3,0.72,0.87,5,223,5,0,0,1,0,0,0,0,0,0,0,0,1,4.35
4,0.37,0.52,2,159,3,0,0,1,0,0,0,0,0,0,0,0,1,1.04


### F-beta Scoring

In [5]:
# function to give columns names in patsy format
def patsy_names(df, dependent_var, *excluded_cols):
    '''
    Generates the R style formula for statsmodels (patsy) given
    the dataframe, dependent variable and optional excluded columns
    as strings
    '''
    df_columns = list(df.columns.values)
    df_columns.remove(dependent_var)
    for col in excluded_cols:
        df_columns.remove(col)
    return dependent_var + ' ~ ' + ' + '.join(df_columns)

In [6]:
patsy_names(df, 'left')

'left ~ satisfaction_level + last_evaluation + number_project + average_monthly_hours + time_spend_company + Work_accident + promotion_last_5years + salary + RandD + accounting + hr + management + marketing + product_mng + support + technical + int_term_1'

In [7]:
y, X = patsy.dmatrices('left ~ satisfaction_level + last_evaluation + number_project + average_monthly_hours + time_spend_company + Work_accident + promotion_last_5years + salary + RandD + accounting + hr + management + marketing + product_mng + support + technical + int_term_1', data=df, return_type="dataframe")

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
scaler = StandardScaler()

In [10]:
X_train_scl = X_train.drop(columns=['Intercept', 'RandD', 'accounting', 'hr',
       'management', 'marketing', 'product_mng', 'support', 'Work_accident',
       'promotion_last_5years', 'technical'])

In [11]:
X_train_scaled = scaler.fit_transform(X_train_scl)

In [12]:
X_train_scaled = pd.DataFrame(X_train_scaled)

In [13]:
X_train = pd.merge(X_train_scaled, X_train.drop(columns=['satisfaction_level', 'last_evaluation', 'number_project',
       'average_monthly_hours', 'time_spend_company', 'salary', 'int_term_1']).reset_index(drop=True), left_index=True, right_index=True)

In [14]:
X_test_scl = X_test.drop(columns=['Intercept', 'RandD', 'accounting', 'hr',
       'management', 'marketing', 'product_mng', 'support', 'Work_accident',
       'promotion_last_5years', 'technical'])

In [15]:
X_test_scaled = scaler.fit_transform(X_test_scl)

In [16]:
X_test_scaled = pd.DataFrame(X_test_scaled)

In [17]:
X_test = pd.merge(X_test_scaled, X_test.drop(columns=['satisfaction_level', 'last_evaluation', 'number_project',
       'average_monthly_hours', 'time_spend_company', 'salary', 'int_term_1']).reset_index(drop=True), left_index=True, right_index=True)

In [18]:
rf = RandomForestClassifier(bootstrap=True, max_depth=100, max_features='sqrt', min_samples_leaf=2, 
                            min_samples_split=3, n_estimators=400)

In [19]:
model = rf.fit(X_train, np.array(y_train))

  """Entry point for launching an IPython kernel.


In [20]:
rf.predict(X_train)

array([0., 0., 0., ..., 0., 0., 1.])

In [21]:
f1_score(y_train,rf.predict(X_train))

0.9570032573289902

In [22]:
fbeta_score(y_test, rf.predict(X_test), average='macro', beta=2)

0.9661377363864136

In [23]:
y_probs = rf.predict_proba(X_test)[:,1] #the probability that my model is predicting
fbetas = []
thresholds = []
for t in range(0,100,5):
   y_pred = pd.Series(y_probs).apply(lambda x: 0 if x<t/100 else 1) 
   fbetas.append(fbeta_score(y_test, y_pred, average='macro', beta=2))
   thresholds.append(t/100)
print(np.argmax(fbetas)) #the index of the best recall
print(thresholds[np.argmax(fbetas)]) #the best threshold for your model

9
0.45


  'precision', 'predicted', average, warn_for)


In [35]:
fbetas[9]

0.968303741417343

## Column names and model for flask:

In [59]:
column_names = list(df.columns.drop('left'))

In [62]:
with open('column_names.pkl', 'wb') as f:
    pickle.dump(column_names, f)

In [28]:
filename = 'final_retention_model.sav'
pickle.dump(model, open(filename, 'wb'))