In [373]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind
from sqlalchemy import create_engine
from scipy.stats.mstats import winsorize
from scipy.stats import boxcox
from scipy.stats import jarque_bera
from scipy.stats import normaltest
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from statsmodels.tools.eval_measures import mse, rmse
from wordcloud import WordCloud
import statsmodels.api as sm
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest

from sklearn import tree


from IPython.display import Image


import pydotplus
from sklearn import ensemble

import warnings

%matplotlib inline
sns.set()

warnings.filterwarnings('ignore')

In [61]:
#assign data frames
employees_df = pd.read_csv('Data/IMB_HR.csv')

In [62]:
#examine the data frame

print(
    employees_df.head(),
    employees_df.info(),
    employees_df.describe()
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
Age                         1470 non-null int64
Attrition                   1470 non-null object
BusinessTravel              1470 non-null object
DailyRate                   1470 non-null int64
Department                  1470 non-null object
DistanceFromHome            1470 non-null int64
Education                   1470 non-null int64
EducationField              1470 non-null object
EmployeeCount               1470 non-null int64
EmployeeNumber              1470 non-null int64
EnvironmentSatisfaction     1470 non-null int64
Gender                      1470 non-null object
HourlyRate                  1470 non-null int64
JobInvolvement              1470 non-null int64
JobLevel                    1470 non-null int64
JobRole                     1470 non-null object
JobSatisfaction             1470 non-null int64
MaritalStatus               1470 non-null object
MonthlyIncome         

In [63]:
#Check for missing values

missing_values_ratios = (employees_df.isnull().sum()/employees_df.isnull().count())
missing_values_ratios.sort_values(ascending=False).head()

YearsWithCurrManager    0.0
EmployeeCount           0.0
JobLevel                0.0
JobInvolvement          0.0
HourlyRate              0.0
dtype: float64

In [64]:
#Make a numeric version of the target to check for correlation
Attrition_numeric = []

for idx, val in enumerate(employees_df['Attrition']):
    if val == 'Yes':
        Attrition_numeric.append(1)
    else:
        Attrition_numeric.append(0)
        
employees_df['Attrition_numeric'] = Attrition_numeric

In [65]:
#check the correlation of the numerical variables with the target
np.abs(employees_df[employees_df.select_dtypes(['int64', 'float64']).columns].iloc[:,1:].corr().loc[:,"Attrition_numeric"]).sort_values(ascending=False).head(20)



Attrition_numeric           1.000000
TotalWorkingYears           0.171063
JobLevel                    0.169105
YearsInCurrentRole          0.160545
MonthlyIncome               0.159840
YearsWithCurrManager        0.156199
StockOptionLevel            0.137145
YearsAtCompany              0.134392
JobInvolvement              0.130016
JobSatisfaction             0.103481
EnvironmentSatisfaction     0.103369
DistanceFromHome            0.077924
WorkLifeBalance             0.063939
TrainingTimesLastYear       0.059478
DailyRate                   0.056652
RelationshipSatisfaction    0.045872
NumCompaniesWorked          0.043494
YearsSinceLastPromotion     0.033019
Education                   0.031373
MonthlyRate                 0.015170
Name: Attrition_numeric, dtype: float64

In [66]:
#Check the mutual correlation of the potential features 
np.abs(employees_df[['Attrition_numeric', 'TotalWorkingYears', 'JobLevel', 'YearsInCurrentRole', 'MonthlyIncome',
            'YearsWithCurrManager', 'StockOptionLevel', 'YearsAtCompany', 'JobInvolvement', 'JobSatisfaction',
            'EnvironmentSatisfaction']].corr())

Unnamed: 0,Attrition_numeric,TotalWorkingYears,JobLevel,YearsInCurrentRole,MonthlyIncome,YearsWithCurrManager,StockOptionLevel,YearsAtCompany,JobInvolvement,JobSatisfaction,EnvironmentSatisfaction
Attrition_numeric,1.0,0.171063,0.169105,0.160545,0.15984,0.156199,0.137145,0.134392,0.130016,0.103481,0.103369
TotalWorkingYears,0.171063,1.0,0.782208,0.460365,0.772893,0.459188,0.010136,0.628133,0.005533,0.020185,0.002693
JobLevel,0.169105,0.782208,1.0,0.389447,0.9503,0.375281,0.013984,0.534739,0.01263,0.001944,0.001212
YearsInCurrentRole,0.160545,0.460365,0.389447,1.0,0.363818,0.714365,0.050818,0.758754,0.008717,0.002305,0.018007
MonthlyIncome,0.15984,0.772893,0.9503,0.363818,1.0,0.344079,0.005408,0.514285,0.015271,0.007157,0.006259
YearsWithCurrManager,0.156199,0.459188,0.375281,0.714365,0.344079,1.0,0.024698,0.769212,0.025976,0.027656,0.004999
StockOptionLevel,0.137145,0.010136,0.013984,0.050818,0.005408,0.024698,1.0,0.015058,0.021523,0.01069,0.003432
YearsAtCompany,0.134392,0.628133,0.534739,0.758754,0.514285,0.769212,0.015058,1.0,0.021355,0.003803,0.001458
JobInvolvement,0.130016,0.005533,0.01263,0.008717,0.015271,0.025976,0.021523,0.021355,1.0,0.021476,0.008278
JobSatisfaction,0.103481,0.020185,0.001944,0.002305,0.007157,0.027656,0.01069,0.003803,0.021476,1.0,0.006784


## PCA

In [256]:
# Normalize the data so that all variables have a mean of 0 and standard deviation
# of 1.
X = StandardScaler().fit_transform(employees_df.select_dtypes(['int64', 'float64']))
Y = employees_df.Attrition

Setting up the parameter grid for tuning PCA.

In [268]:
n_comps = np.arange(1, 28)

In [269]:
param_grid_pca = [{'pca__n_components':n_comps}]

In [270]:
pipe_tree_pca = make_pipeline(PCA())

Tuning PCA with GridSearchCV.  We are using GridSearchCV's default scorer.

In [378]:
gs_pca = GridSearchCV(pipe_tree_pca, param_grid=param_grid_pca, cv=10)

#gs_pca.get_params().keys()

In [379]:
gs_pca.fit(X,Y)
print(gs_pca.best_params_)

{'pca__n_components': 27}


Processing PCA with the GridSearchCV results.

In [380]:
sklearn_pca = PCA(n_components=27)
X_sklearn = sklearn_pca.fit_transform(X)

print(
    'The percentage of total variance in the dataset explained by each',
    'component from Sklearn PCA.\n',
    sklearn_pca.explained_variance_ratio_
)

The percentage of total variance in the dataset explained by each component from Sklearn PCA.
 [1.88042137e-01 7.34520080e-02 7.02270863e-02 5.14844780e-02
 4.83408293e-02 4.52917697e-02 4.30758500e-02 4.20705631e-02
 4.09523849e-02 4.01786845e-02 3.89980308e-02 3.81817909e-02
 3.69360101e-02 3.62244852e-02 3.56063511e-02 3.47964591e-02
 3.12089921e-02 2.74407054e-02 2.11831775e-02 2.03729096e-02
 1.11922618e-02 9.02699533e-03 8.12643880e-03 5.65405718e-03
 1.93554403e-03 3.70847727e-32 9.68079635e-34]


## Decision Tree Model

Splitting the data into training and testing sets.

In [352]:
X_train, X_test, Y_train, Y_test = train_test_split(
    X_sklearn, Y, test_size = 0.20, random_state = 1
)

Setting up the parameters for tuning the decision tree model.

In [353]:
depths = np.arange(1, 21)
features = [1, 2, 3]

In [354]:
param_grid_dt = [{'decisiontreeclassifier__max_depth':depths,
               'decisiontreeclassifier__max_features':features}]

In [355]:
pipe_tree_dt = make_pipeline(tree.DecisionTreeClassifier(criterion='entropy'))

Tuning the decision tree model with GridSearchCV using the default scorer.

In [356]:
gs_dt = GridSearchCV(pipe_tree_dt, param_grid=param_grid_dt, cv=10)

#gs.get_params().keys()

In [357]:
gs_dt.fit(X_train,Y_train)
print(gs_dt.best_params_)

{'decisiontreeclassifier__max_depth': 16, 'decisiontreeclassifier__max_features': 3}


Building the decision tree model using the best GridSearchCV parameters.

In [360]:
#Initialize and train our Decision Tree.
decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=3,
    max_depth=16
)

decision_tree.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=16,
            max_features=3, max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [361]:
#Setting the predictions for analysis
Y_pred = decision_tree.fit(X_train, Y_train).predict(X_test)

Scores for the training set:

In [364]:
print(cross_val_score(decision_tree, X_train, Y_train, cv=10))
print('The 10-fold cross validation average for the training set is ', 
      cross_val_score(decision_tree, X_train, Y_train, cv=10).mean())

[0.94915254 0.89830508 0.86440678 0.94067797 0.96610169 0.99152542
 0.94915254 0.88034188 0.93162393 0.93103448]
The 10-fold cross validation average for the training set is  0.9158543012283514


Scores for the testing set:

In [365]:
print(cross_val_score(decision_tree, X_test, Y_test, cv=10))
print('The 10-fold cross validation average for the testing set  is ', 
      cross_val_score(decision_tree, X_test, Y_test, cv=10).mean())

[0.76666667 0.9        0.8        0.86666667 0.73333333 0.8
 0.86206897 0.96551724 0.78571429 0.78571429]
The 10-fold cross validation average for the testing set  is  0.9045648604269292


The classification report shows that our decision tree model is better at predicting if an individual is staying at the company than if an individual is leaving the company, but it still much better than a dummy model.

In [366]:
print(classification_report(Y_test, Y_pred, labels=None, target_names=None, 
                      sample_weight=None, digits=2, output_dict=False
                     ))

              precision    recall  f1-score   support

          No       0.96      0.99      0.98       236
         Yes       0.96      0.84      0.90        58

   micro avg       0.96      0.96      0.96       294
   macro avg       0.96      0.92      0.94       294
weighted avg       0.96      0.96      0.96       294



In [367]:
confusion_matrix(Y_test, Y_pred, labels=None, sample_weight=None)

array([[234,   2],
       [  9,  49]])

## Random Forest Classifier Model

In [368]:
# Initialize and train our Random Forest Classifier.
rfc = ensemble.RandomForestClassifier()
rfc.fit(X_train,Y_train)

Y_preds_rfc=rfc.predict(X_test)

The classification report and cross val scores show that the random forest model is stronger by several points than the decision tree model.

In [369]:
print(classification_report(Y_test, Y_preds_rfc, labels=None, target_names=None, 
                      sample_weight=None, digits=2, output_dict=False
                     ))

              precision    recall  f1-score   support

          No       0.98      1.00      0.99       236
         Yes       1.00      0.91      0.95        58

   micro avg       0.98      0.98      0.98       294
   macro avg       0.99      0.96      0.97       294
weighted avg       0.98      0.98      0.98       294



In [370]:
print(cross_val_score(rfc, X_train, Y_train, cv=10))
print('The 10-fold cross validation average for the training set is ', 
      cross_val_score(rfc, X_train, Y_train, cv=10).mean())

[0.97457627 0.98305085 0.97457627 0.97457627 0.98305085 0.98305085
 0.96610169 0.98290598 0.98290598 1.        ]
The 10-fold cross validation average for the training set is  0.9889467098263124


In [371]:
print(cross_val_score(rfc, X_test, Y_test, cv=10))
print('The 10-fold cross validation average for the testing set  is ', 
      cross_val_score(rfc, X_test, Y_test, cv=10).mean())

[1.         0.86666667 0.96666667 1.         0.96666667 1.
 1.         1.         0.96428571 0.89285714]
The 10-fold cross validation average for the testing set  is  0.9552463054187191


In [372]:
confusion_matrix(Y_test, Y_preds_rfc, labels=None, sample_weight=None)

array([[236,   0],
       [  5,  53]])

## Conclusion

The random forest classifier model performed best by several points, with only 5 out of 294 predictions incorrect. In comparision, the decision tree model had 11 incorrect predictions out of the 294 points.