In [1]:
# importing libraries
import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import precision_recall_cutoff

# defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'webster-data445-bucket'
bucket = s3.Bucket(bucket_name)

# defining the csv file
file_key = 'turnover.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading the csv file
turnover = pd.read_csv(file_content_stream)
# dropping missing values
turnover = turnover.dropna()
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [2]:
# changing sales to dummy variable
turnover = pd.concat([turnover.drop(columns = ['sales']), pd.get_dummies(turnover['sales'])], axis = 1)

# changing salary to dummy variable
turnover = pd.concat([turnover, pd.get_dummies(turnover['salary'])], axis = 1)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium
0,0.38,0.53,2,157,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0


In [3]:
# creating interactions from the decision tree
turnover['interaction_1'] = np.where((turnover['satisfaction_level'] <= 0.465) & (turnover['number_project'] <= 2.5) & (turnover['last_evaluation'] <= 0.575), 1, 0)
turnover['interaction_2'] = np.where((turnover['satisfaction_level'] <= 0.465) & (turnover['number_project'] >= 2.5) & (turnover['satisfaction_level'] <= 0.115), 1, 0)
turnover['interaction_3'] = np.where((turnover['satisfaction_level'] >= 0.465) & (turnover['time_spend_company'] <= 4.5) & (turnover['average_montly_hours'] <= 290.5), 1, 0)
turnover.head()


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium,interaction_1,interaction_2,interaction_3
0,0.38,0.53,2,157,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0
1,0.8,0.86,5,262,6,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
2,0.11,0.88,7,272,4,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0
3,0.72,0.87,5,223,5,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
4,0.37,0.52,2,159,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0


In [4]:
# defining input and target variables
X = turnover.drop(columns = ['left', 'salary'], axis = 1)
Y = turnover['left']

# splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

In [6]:
# defining the list to store results
results = []

for i in range(0, 10):
    
    # splitting the data 
    X_training, X_testing, Y_training, Y_testing = train_test_split(X_train, Y_train, test_size = 0.2, stratify = Y_train)
    
    # building the random forest model
    RF = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_training, Y_training)
    
    # extracting and appending importances
    results.append(RF.feature_importances_)
    
# changing to data frame
results = pd.DataFrame(results)
results.columns = X.columns
results

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium,interaction_1,interaction_2,interaction_3
0,0.1959,0.033175,0.086901,0.043161,0.064343,0.006124,0.000463,2.8e-05,0.000179,7e-06,8.7e-05,0.000465,6e-06,3.7e-05,3.1e-05,4.7e-05,0.000211,0.002472,0.00441,0.000508,0.210678,0.118267,0.232501
1,0.175317,0.035731,0.105903,0.050256,0.075096,0.010844,0.000898,3.3e-05,0.000446,1.4e-05,8.8e-05,0.000203,1.2e-05,2.7e-05,6.2e-05,2.3e-05,0.000162,0.004007,0.005318,0.000818,0.195947,0.101246,0.237549
2,0.183768,0.038392,0.093974,0.06752,0.068785,0.008136,0.000998,7e-05,0.00028,4.6e-05,0.000125,0.00017,5e-06,1.7e-05,4.3e-05,2.9e-05,0.000175,0.003824,0.007362,0.001349,0.177281,0.116577,0.231074
3,0.187279,0.040402,0.090804,0.048602,0.080536,0.01057,0.000675,3.2e-05,0.00043,5.2e-05,0.000125,0.000187,5.1e-05,1e-05,9.3e-05,6.3e-05,0.000143,0.005044,0.004776,0.001349,0.193636,0.104011,0.23113
4,0.167576,0.031397,0.107811,0.052947,0.067334,0.008764,0.000843,1.8e-05,0.000259,1.2e-05,0.000158,0.00014,1.3e-05,1.8e-05,9.7e-05,7.2e-05,0.000238,0.004872,0.004778,0.000996,0.184659,0.098343,0.268656
5,0.178257,0.040559,0.104397,0.053957,0.070301,0.008266,0.00082,7.4e-05,0.000164,1.4e-05,6.9e-05,0.000246,1.1e-05,4.2e-05,6.3e-05,6.8e-05,0.000127,0.003593,0.006115,0.000706,0.187414,0.110668,0.234067
6,0.164038,0.043691,0.105169,0.054431,0.076696,0.009389,0.000625,3e-05,0.000221,6.8e-05,8.7e-05,0.000234,1.6e-05,7.4e-05,2.5e-05,4.3e-05,0.00025,0.003163,0.004959,0.000896,0.189848,0.109062,0.236985
7,0.164648,0.035785,0.101512,0.058,0.070143,0.007866,0.000576,3.1e-05,0.000343,3.4e-05,9e-05,0.000178,6e-06,6e-05,7.3e-05,4.2e-05,0.000144,0.004535,0.005471,0.0007,0.20115,0.108787,0.239825
8,0.17143,0.039103,0.093414,0.053792,0.06516,0.008872,0.000806,2.6e-05,0.000303,1.5e-05,0.000151,0.000223,3.6e-05,3.8e-05,2.9e-05,3.6e-05,0.000235,0.005332,0.004787,0.000825,0.183899,0.131889,0.2396
9,0.19197,0.03609,0.10315,0.049448,0.073336,0.009704,0.001164,3.5e-05,0.000302,8e-06,7.5e-05,0.000208,1.6e-05,4.9e-05,2.8e-05,7.8e-05,0.000283,0.005526,0.006413,0.000633,0.179325,0.114528,0.227629


In [None]:
# computing averages and sorting by importance
results = pd.DataFrame(results.apply(np.mean, axis = 0))
results = pd.DataFrame({'Feature':results.index, 'Importance':results[0].values})
results = results.sort_values(by = 'Importance', ascending = False)
results

In [14]:
# defining inputs
X_train_1 = X_train[['interaction_3', 'interaction_1', 'satisfaction_level', 'number_project', 'time_spend_company']]
X_test_1 = X_test[['interaction_3', 'interaction_1', 'satisfaction_level', 'number_project', 'time_spend_company']]

# building random forest model with top 5
RF = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train_1, Y_train)

# predicting on test
RF_pred = RF.predict_proba(X_test_1)[:, 1]

# predicting the labels
RF_labels = precision_recall_cutoff.precision_recall_cutoff(Y_test, RF_pred)

# computing classification report
print(classification_report(Y_test, RF_labels))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      2286
           1       0.91      0.89      0.90       714

    accuracy                           0.95      3000
   macro avg       0.94      0.93      0.94      3000
weighted avg       0.95      0.95      0.95      3000



In [15]:
# defining inputs
X_train_2 = X_train[['interaction_3', 'interaction_1', 'satisfaction_level', 'number_project', 'time_spend_company', 'average_montly_hours']]
X_test_2 = X_test[['interaction_3', 'interaction_1', 'satisfaction_level', 'number_project', 'time_spend_company', 'average_montly_hours']]

# building random forest model with top 5
RF = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train_2, Y_train)

# predicting on test
RF_pred = RF.predict_proba(X_test_2)[:, 1]

# predicting the labels
RF_labels = precision_recall_cutoff.precision_recall_cutoff(Y_test, RF_pred)

# computing classification report
print(classification_report(Y_test, RF_labels))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97      2286
           1       0.93      0.91      0.92       714

    accuracy                           0.96      3000
   macro avg       0.95      0.94      0.95      3000
weighted avg       0.96      0.96      0.96      3000



In [None]:
# I would choose the second model because it has a slightly higher precision, recall, accuracy, and f1-score. 