In [59]:
# importing libraries
import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import classification_report, make_scorer, confusion_matrix
import precision_recall_cutoff
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
import cost_function

# defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'webster-data445-bucket'
bucket = s3.Bucket(bucket_name)

# defining the csv file
file_key = 'turnover_train.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading the csv file
train = pd.read_csv(file_content_stream)
# dropping missing values
train = train.dropna()
train.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales,salary,left
0,0.92,0.95,6,239,4,0,0,sales,medium,0
1,0.88,0.89,4,254,5,0,0,sales,low,1
2,0.66,0.93,5,253,5,0,0,product_mng,low,1
3,0.46,0.45,2,172,2,1,0,management,low,0
4,0.88,0.75,5,152,3,0,0,hr,high,0


In [42]:
# defining the csv file
file_key = 'turnover_test.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading the csv file
test = pd.read_csv(file_content_stream)
# dropping missing values
test = test.dropna()
test.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales,salary,left
0,0.77,0.91,5,261,6,0,0,sales,medium,1
1,0.17,0.45,2,119,3,0,0,sales,medium,0
2,0.54,0.58,3,169,2,1,0,technical,high,0
3,0.56,0.73,3,226,3,0,0,RandD,medium,0
4,0.88,0.71,5,255,3,0,0,support,medium,0


In [43]:
# defining the csv file
file_key = 'turnover_val.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading the csv file
validation = pd.read_csv(file_content_stream)
# dropping missing values
validation = validation.dropna()
validation.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales,salary,left
0,0.83,0.5,5,274,3,0,0,accounting,low,0
1,0.8,0.72,3,271,2,0,0,product_mng,medium,0
2,0.21,0.39,2,118,4,0,0,sales,medium,0
3,0.85,0.87,5,246,5,1,0,sales,medium,1
4,0.46,0.53,2,147,3,0,0,sales,low,1


In [44]:
# changing sales to dummy variable
train = pd.concat([train.drop(columns = ['sales']), pd.get_dummies(train['sales'])], axis = 1)
test = pd.concat([test.drop(columns = ['sales']), pd.get_dummies(test['sales'])], axis = 1)
validation = pd.concat([validation.drop(columns = ['sales']), pd.get_dummies(validation['sales'])], axis = 1)

# changing salary to dummy variable
train = pd.concat([train, pd.get_dummies(train['salary'])], axis = 1)
test = pd.concat([test, pd.get_dummies(test['salary'])], axis = 1)
validation = pd.concat([validation, pd.get_dummies(validation['salary'])], axis = 1)

In [45]:
# creating interactions from the decision tree
train['interaction_1'] = np.where((train['satisfaction_level'] <= 0.465) & (train['number_project'] <= 2.5) & (train['last_evaluation'] <= 0.575), 1, 0)
train['interaction_2'] = np.where((train['satisfaction_level'] <= 0.465) & (train['number_project'] >= 2.5) & (train['satisfaction_level'] <= 0.115), 1, 0)
train['interaction_3'] = np.where((train['satisfaction_level'] >= 0.465) & (train['time_spend_company'] <= 4.5) & (train['average_montly_hours'] <= 290.5), 1, 0)

# creating interactions from the decision tree
test['interaction_1'] = np.where((test['satisfaction_level'] <= 0.465) & (test['number_project'] <= 2.5) & (test['last_evaluation'] <= 0.575), 1, 0)
test['interaction_2'] = np.where((test['satisfaction_level'] <= 0.465) & (test['number_project'] >= 2.5) & (test['satisfaction_level'] <= 0.115), 1, 0)
test['interaction_3'] = np.where((test['satisfaction_level'] >= 0.465) & (test['time_spend_company'] <= 4.5) & (test['average_montly_hours'] <= 290.5), 1, 0)

# creating interactions from the decision tree
validation['interaction_1'] = np.where((validation['satisfaction_level'] <= 0.465) & (validation['number_project'] <= 2.5) & (validation['last_evaluation'] <= 0.575), 1, 0)
validation['interaction_2'] = np.where((validation['satisfaction_level'] <= 0.465) & (validation['number_project'] >= 2.5) & (validation['satisfaction_level'] <= 0.115), 1, 0)
validation['interaction_3'] = np.where((validation['satisfaction_level'] >= 0.465) & (validation['time_spend_company'] <= 4.5) & (validation['average_montly_hours'] <= 290.5), 1, 0)


In [47]:
# defining input and target variables
X_train = train[['interaction_3', 'interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]
Y_train = train['left']

X_test = test[['interaction_3', 'interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]
Y_test = test['left']

X_val = validation[['interaction_3', 'interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]
Y_val = validation['left']

# scaling data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
X_val = scaler.fit_transform(X_val)

## Random Forest

In [None]:
# defining parameter grid
RF_param_grid = {'n_estimators': [100, 300, 500],
                 'min_samples_split': [10, 15],
                 'min_samples_leaf': [5, 7],
                 'max_depth' : [3, 5, 7]}

# defining customized scoring function
my_score_function = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

# performing grid search
RF_grid_search = GridSearchCV(RandomForestClassifier(), RF_param_grid, cv = 3, scoring = my_score_function, 
                              n_jobs = -1).fit(X_train, Y_train)

RF_md = RF_grid_search.best_estimator_

In [60]:
# predicting on validation and test
RF_val_pred = RF_md.predict_proba(X_val)[:, 1]
RF_test_pred = RF_md.predict_proba(X_test)[:, 1]

# identifying the optimal cutoff values
opt_cutoff = cost_function_cutoff(Y_val, RF_val_pred)

# changing likelihoods to labels
RF_label = np.where(RF_test_pred < opt_cutoff, 0, 1)

# computing confusion matrix
X = confusion_matrix(Y_test, RF_label)
print(X)
print('The cost of the RF model is:', -1500*X[1, 0] - 1000*X[0, 1] + 500*X[1, 1])

[[1119   24]
 [  39  318]]
The cost of the RF model is: 76500


## Support Vector Machine



In [None]:
# defining hyper parameters
SVM_param_grid = {'kernel': ['rbf', 'poly', 'sigmoid'],
                  'C': [0.01, 0.1, 1, 10],
                  'gamma': [0.01, 0.1, 1]}

# performing grid search
svm_grid_search = GridSearchCV(SVC(probability = True), SVM_param_grid, cv = 3, scoring = my_score_function, 
                              n_jobs = -1).fit(X_train, Y_train)

svm_md = svm_grid_search.best_estimator_

In [64]:
# predicting on validation and test
svm_val_pred = svm_md.predict_proba(X_val)[:, 1]
svm_test_pred = svm_md.predict_proba(X_test)[:, 1]

# identifying the optimal cutoff values
opt_cutoff = cost_function_cutoff(Y_val, svm_val_pred)

# changing likelihoods to labels
svm_label = np.where(svm_test_pred < opt_cutoff, 0, 1)

# computing confusion matrix
X = confusion_matrix(Y_test, svm_label)
print(X)
print('The cost of the SVM model is:', -1500*X[1, 0] - 1000*X[0, 1] + 500*X[1, 1])

[[1093   50]
 [ 124  233]]
The cost of the SVM model is: -119500
