In [105]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import matplotlib.pylab as plb
import matplotlib as mpl
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
import numpy as np

In [106]:
#Loading the data into a variable called train_data 
train_data = pd.read_csv('HR_comma_sep.csv') 
train_data.head() 

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [107]:
train_data = train_data.rename(columns={'satisfaction_level': 'satisfaction', 
                        'last_evaluation': 'LatestEvaluation',
                        'number_project': 'Projects',
                        'average_montly_hours': 'AvgWorkingHours',
                        'time_spend_company': 'TimeAtCompany',
                        'Work_accident': 'Accident',
                        'promotion_last_5years': 'Promotions',
                        'sales' : 'Department',
                        'left' : 'Turnover'
                        })
train_data.head()

Unnamed: 0,satisfaction,LatestEvaluation,Projects,AvgWorkingHours,TimeAtCompany,Accident,Turnover,Promotions,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [108]:
train_data["salary"] = train_data["salary"].replace(['low' , 'medium' , 'high'] , [0 , 1 , 2])

train_data['Department'].replace(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
        'IT', 'product_mng', 'marketing', 'RandD'], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], inplace = True)

train_data.head()

Unnamed: 0,satisfaction,LatestEvaluation,Projects,AvgWorkingHours,TimeAtCompany,Accident,Turnover,Promotions,Department,salary
0,0.38,0.53,2,157,3,0,1,0,0,0
1,0.8,0.86,5,262,6,0,1,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0
4,0.37,0.52,2,159,3,0,1,0,0,0


In [109]:
FirstAttribute = train_data['Turnover']
train_data.drop(labels=['Turnover'], axis=1,inplace = True)
train_data.insert(0, 'Turnover', FirstAttribute)
train_data.head()

Unnamed: 0,Turnover,satisfaction,LatestEvaluation,Projects,AvgWorkingHours,TimeAtCompany,Accident,Promotions,Department,salary
0,1,0.38,0.53,2,157,3,0,0,0,0
1,1,0.8,0.86,5,262,6,0,0,0,1
2,1,0.11,0.88,7,272,4,0,0,0,1
3,1,0.72,0.87,5,223,5,0,0,0,0
4,1,0.37,0.52,2,159,3,0,0,0,0


In [110]:
#Data Segregation into training and testing
from sklearn.model_selection import train_test_split
labels = train_data.pop('Turnover')
TrainData, TestData, TrainLabel, TestLabel = train_test_split(train_data, labels, test_size = 0.1, random_state = 10)

In [111]:
TrainData.head()

Unnamed: 0,satisfaction,LatestEvaluation,Projects,AvgWorkingHours,TimeAtCompany,Accident,Promotions,Department,salary
9292,0.18,0.96,5,208,6,0,0,3,1
9307,0.68,0.44,5,165,3,0,0,9,1
4017,0.69,0.91,4,186,3,0,0,4,0
14501,0.4,0.46,2,128,3,0,0,5,0
9243,0.14,0.63,6,215,5,0,0,0,0


In [112]:
TrainLabel.head()

9292     0
9307     0
4017     0
14501    1
9243     0
Name: Turnover, dtype: int64

In [113]:
TestData.head()

Unnamed: 0,satisfaction,LatestEvaluation,Projects,AvgWorkingHours,TimeAtCompany,Accident,Promotions,Department,salary
13982,0.65,0.4,5,125,4,0,0,0,0
822,0.41,0.5,2,128,3,0,0,5,0
13751,0.44,0.41,3,125,7,0,0,0,1
9656,0.93,0.44,5,190,5,0,0,3,1
13497,0.76,0.82,4,170,6,0,0,0,1


In [114]:
TestLabel.head()

13982    0
822      1
13751    0
9656     0
13497    0
Name: Turnover, dtype: int64

In [115]:
from  sklearn.ensemble import RandomForestClassifier

In [116]:
# For Tuning of RFC , max_depth=10, min_samples_split=2, n_estimators = 100, random_state = 1
RFC = RandomForestClassifier()
RFmodel = RFC.fit(TrainData,TrainLabel)
AccuracyTrain=RFmodel.score(TrainData,TrainLabel)
AccuracyTest=RFmodel.score(TestData,TestLabel)
print("Accuracy of HR Train Data using Random Forest Classifier is :",AccuracyTrain*100)
print("Accuracy of HR Test Data using Random Forest Classifier is :",AccuracyTest*100)

Accuracy of HR Train Data using Random Forest Classifier is : 99.8592488332
Accuracy of HR Test Data using Random Forest Classifier is : 98.8


In [117]:
#To list the importance of features

f = list(RFmodel.feature_importances_)
print("Extracting the Impact of each feature :")
print('Satisfaction:',f[0]*100,'%\n''LatestEvaluation:',f[1]*100,'%\n''Projects:',f[2]*100,'%\n''AvgWorkingHours:',f[3]*100,'%\n''TimeAtCompany:',f[4]*100,'%\n''Accident:',f[5]*100,'%\n''Promotions:',f[6]*100,'%\n''Department:',f[7]*100,'%\n''Salary:',f[8]*100,'%\n')

Extracting the Impact of each feature :
Satisfaction: 36.6369108212 %
LatestEvaluation: 12.4537542581 %
Projects: 12.0537710816 %
AvgWorkingHours: 15.4773267277 %
TimeAtCompany: 20.8885424932 %
Accident: 0.419748918598 %
Promotions: 0.0814809417559 %
Department: 1.20623850761 %
Salary: 0.782226250185 %



We Generally Assume that Salary is the main criteria influencing the turnover.Our analysis of the feautures proves it wrong, showing that there are many more features impacting the turnover majorly.

Now,Lets consider the features which are having a high impact on the turnover and the training the model again with RandomForestClassifier. To say it in jargon terms, we are assigning higher weights to features with a greater impact. 

In [118]:
TrainData2=TrainData

In [119]:
TrainData2.head()

Unnamed: 0,satisfaction,LatestEvaluation,Projects,AvgWorkingHours,TimeAtCompany,Accident,Promotions,Department,salary
9292,0.18,0.96,5,208,6,0,0,3,1
9307,0.68,0.44,5,165,3,0,0,9,1
4017,0.69,0.91,4,186,3,0,0,4,0
14501,0.4,0.46,2,128,3,0,0,5,0
9243,0.14,0.63,6,215,5,0,0,0,0


In [120]:
TrainData2.pop('salary')

9292     1
9307     1
4017     0
14501    0
9243     0
259      1
13882    1
14842    0
6718     1
3768     2
9968     1
8787     0
14763    0
667      0
701      1
1717     0
9158     1
10921    0
477      0
8514     0
5674     0
5406     0
7048     0
8034     1
2129     1
4645     1
4231     1
9417     0
12825    1
11328    1
        ..
8036     1
13512    0
574      1
7290     1
11627    1
3416     1
2102     1
2443     1
239      0
4452     0
5648     0
14707    1
10742    1
12328    0
14826    1
6400     1
9289     0
9224     1
10234    1
10141    0
1520     1
4829     0
10201    1
9372     0
7291     1
11633    2
1344     0
12815    0
7293     1
1289     0
Name: salary, Length: 13499, dtype: int64

In [121]:
TrainData2.pop('Promotions')

9292     0
9307     0
4017     0
14501    0
9243     0
259      0
13882    0
14842    0
6718     0
3768     0
9968     0
8787     0
14763    0
667      0
701      0
1717     0
9158     0
10921    0
477      0
8514     0
5674     0
5406     0
7048     0
8034     0
2129     0
4645     0
4231     0
9417     0
12825    0
11328    0
        ..
8036     0
13512    0
574      0
7290     0
11627    0
3416     0
2102     0
2443     0
239      0
4452     0
5648     0
14707    0
10742    0
12328    0
14826    0
6400     0
9289     0
9224     0
10234    0
10141    0
1520     0
4829     0
10201    0
9372     0
7291     0
11633    0
1344     0
12815    0
7293     0
1289     0
Name: Promotions, Length: 13499, dtype: int64

In [122]:
TrainData2.pop('Accident')

9292     0
9307     0
4017     0
14501    0
9243     0
259      0
13882    0
14842    0
6718     0
3768     0
9968     0
8787     0
14763    0
667      0
701      0
1717     0
9158     1
10921    1
477      0
8514     0
5674     0
5406     0
7048     0
8034     0
2129     1
4645     0
4231     0
9417     0
12825    0
11328    0
        ..
8036     1
13512    0
574      0
7290     1
11627    1
3416     0
2102     0
2443     0
239      0
4452     0
5648     0
14707    0
10742    0
12328    0
14826    0
6400     0
9289     1
9224     0
10234    0
10141    0
1520     0
4829     0
10201    1
9372     0
7291     0
11633    0
1344     0
12815    1
7293     0
1289     0
Name: Accident, Length: 13499, dtype: int64

In [123]:
TrainData2.head()

Unnamed: 0,satisfaction,LatestEvaluation,Projects,AvgWorkingHours,TimeAtCompany,Department
9292,0.18,0.96,5,208,6,3
9307,0.68,0.44,5,165,3,9
4017,0.69,0.91,4,186,3,4
14501,0.4,0.46,2,128,3,5
9243,0.14,0.63,6,215,5,0


In [124]:
TestData2=TestData

In [125]:
TestData2.pop('salary')

13982    0
822      0
13751    1
9656     1
13497    1
1570     0
13024    0
6110     0
3201     2
9359     1
6630     1
14371    0
10182    0
10240    1
5732     0
2902     0
3601     0
9033     1
1793     1
10407    0
3821     1
2938     0
11561    1
2370     1
12889    2
9948     0
3586     0
6592     2
1328     0
7021     1
        ..
10216    0
3377     1
12102    1
10948    1
1138     0
9960     0
11148    0
14797    0
835      0
218      0
3866     0
8444     0
13711    0
14672    1
13745    0
762      1
1932     1
12657    2
14307    0
7032     1
3579     2
627      0
892      0
1986     1
8785     0
5946     1
285      0
7058     1
6277     0
9179     0
Name: salary, Length: 1500, dtype: int64

In [126]:
TestData2.pop('Accident')

13982    0
822      0
13751    0
9656     0
13497    0
1570     0
13024    0
6110     0
3201     0
9359     0
6630     0
14371    0
10182    1
10240    0
5732     0
2902     0
3601     0
9033     0
1793     0
10407    0
3821     0
2938     1
11561    0
2370     0
12889    0
9948     0
3586     0
6592     0
1328     0
7021     1
        ..
10216    0
3377     0
12102    0
10948    0
1138     0
9960     0
11148    0
14797    0
835      1
218      0
3866     0
8444     1
13711    0
14672    0
13745    0
762      0
1932     0
12657    0
14307    0
7032     0
3579     1
627      1
892      0
1986     0
8785     1
5946     0
285      0
7058     0
6277     0
9179     0
Name: Accident, Length: 1500, dtype: int64

In [127]:
TestData2.pop('Promotions')

13982    0
822      0
13751    0
9656     0
13497    0
1570     0
13024    0
6110     0
3201     0
9359     0
6630     0
14371    0
10182    0
10240    0
5732     0
2902     0
3601     0
9033     0
1793     0
10407    0
3821     0
2938     0
11561    0
2370     0
12889    0
9948     0
3586     0
6592     0
1328     0
7021     0
        ..
10216    0
3377     0
12102    0
10948    0
1138     0
9960     0
11148    0
14797    0
835      0
218      0
3866     0
8444     0
13711    0
14672    0
13745    0
762      0
1932     0
12657    0
14307    0
7032     0
3579     0
627      0
892      0
1986     0
8785     0
5946     0
285      0
7058     0
6277     0
9179     0
Name: Promotions, Length: 1500, dtype: int64

In [128]:
TestData2.head()

Unnamed: 0,satisfaction,LatestEvaluation,Projects,AvgWorkingHours,TimeAtCompany,Department
13982,0.65,0.4,5,125,4,0
822,0.41,0.5,2,128,3,5
13751,0.44,0.41,3,125,7,0
9656,0.93,0.44,5,190,5,3
13497,0.76,0.82,4,170,6,0


In [129]:
# For Tuning of RFC , max_depth=10, min_samples_split=2, n_estimators = 100, random_state = 1
RFC2 = RandomForestClassifier()
RFmodel2 = RFC2.fit(TrainData2,TrainLabel)
AccuracyTrain=RFmodel2.score(TrainData2,TrainLabel)
AccuracyTest=RFmodel2.score(TestData2,TestLabel)
print("Accuracy of HR Train Data using Random Forest Classifier is :",AccuracyTrain*100)
print("Accuracy of HR Test Data using Random Forest Classifier is :",AccuracyTest*100)

Accuracy of HR Train Data using Random Forest Classifier is : 99.8740647455
Accuracy of HR Test Data using Random Forest Classifier is : 98.8666666667


In [130]:
#To list the importance of features
f2 = list(RFmodel2.feature_importances_)
print("Extracting the Impact of each feature after feature reduction:")
print('Satisfaction:',f2[0]*100,'%\n''LatestEvaluation:',f2[1]*100,'%\n''Projects:',f2[2]*100,'%\n''AvgWorkingHours:',f2[3]*100,'%\n''TimeAtCompany:',f2[4]*100,'%\n''Department:',f2[5]*100,'%\n')

Extracting the Impact of each feature after feature reduction:
Satisfaction: 29.1004497613 %
LatestEvaluation: 12.0696977024 %
Projects: 21.5855858663 %
AvgWorkingHours: 16.7303431972 %
TimeAtCompany: 19.3286328075 %
Department: 1.18529066527 %



As we have observed, after the feature reduction, there is an increase in accuracy by a scale of 0.06%. The increase is not a significant optimization, but the data of the dropped features is omitted, thus reducing the amount of data and processing of the data. 