In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Loading the attrition data

dataset = pd.read_csv('general_data.csv')

dataset.shape

In [133]:
#preprocessing the data

dataset.isnull().sum()

Age                        0
Attrition                  0
BusinessTravel             0
Department                 0
DistanceFromHome           0
Education                  0
EducationField             0
EmployeeCount              0
EmployeeID                 0
Gender                     0
JobLevel                   0
JobRole                    0
MaritalStatus              0
MonthlyIncome              0
NumCompaniesWorked         0
Over18                     0
PercentSalaryHike          0
StandardHours              0
StockOptionLevel           0
TotalWorkingYears          0
TrainingTimesLastYear      0
YearsAtCompany             0
YearsSinceLastPromotion    0
YearsWithCurrManager       0
dtype: int64

In [132]:
#replacing null values in the NumCompaniesWorked & TotalWorkingYears column with 0 values assuming that the employees have not worked with any other companies

nullhand = np.where(dataset['NumCompaniesWorked'].isnull(),0,dataset['NumCompaniesWorked'])

dataset['NumCompaniesWorked'] = nullhand

dataset['TotalWorkingYears'] = dataset['TotalWorkingYears'].fillna(value=0)

In [None]:
dataset.head()

In [134]:
# replacing text categorical features attrition ,gender,Department,BusinessTravel,EducationField with binary values 

label_encoder = preprocessing.LabelEncoder()


dataset['Attrition'] = label_encoder.fit_transform(dataset['Attrition'])
dataset['Gender'] = label_encoder.fit_transform(dataset['Gender'])
dataset['Department'] = label_encoder.fit_transform(dataset['Department'])
dataset['BusinessTravel'] = label_encoder.fit_transform(dataset['BusinessTravel'])
dataset['EducationField'] = label_encoder.fit_transform(dataset['EducationField'])
dataset['BusinessTravel'] = label_encoder.fit_transform(dataset['BusinessTravel'])
dataset['MaritalStatus'] = label_encoder.fit_transform(dataset['MaritalStatus'])

In [135]:
pd.set_option('display.max_columns',100)

dataset.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,0,2,2,6,2,1,1,1,0,1,Healthcare Representative,1,131160,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,1,1,1,10,1,1,1,2,0,1,Research Scientist,2,41890,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,0,1,1,17,4,4,1,3,1,4,Sales Executive,1,193280,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,0,0,1,2,5,1,1,4,1,3,Human Resources,1,83210,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,0,2,1,10,1,3,1,5,1,1,Sales Executive,2,23420,4.0,Y,12,8,2,9.0,2,6,0,4


In [136]:
features = dataset.columns

In [165]:
#considering only the numerical values & removing 'Attrition' variable since it will considered as y dependent feature

num_features = [features for features in dataset.columns if dataset[features].dtypes != 'O']


num_features.remove('Attrition')
num_features.remove('EmployeeID')
num_features.remove('EmployeeCount')

In [118]:
#for identifying the null values

np.where(np.isnan(dataset[num_features]))

(array([  23,  137,  308,  574, 1517, 2367, 3120, 3818, 4409], dtype=int64),
 array([16, 16, 16, 16, 16, 16, 16, 16, 16], dtype=int64))

In [300]:
# assigning features

x = dataset[num_features]

y = dataset['Attrition']

In [169]:
#initializing RF algorithm

rf_model = RandomForestClassifier(n_estimators=1000,max_features=2,oob_score=True)

In [301]:
rf_model.fit(X = x,y = y)

RandomForestClassifier(max_features=2, n_estimators=1000, oob_score=True)

In [302]:
print("OOB_Accuracy: ",rf_model.oob_score_)

OOB_Accuracy:  0.9997732426303855


In [303]:
for feature,imp in zip(num_features,rf_model.feature_importances_):
    print(feature,imp)

Age 0.1033365735299083
BusinessTravel 0.02909628528753729
Department 0.027769056878527848
DistanceFromHome 0.07439903675801236
Education 0.04355034014891464
EducationField 0.04277571573117485
Gender 0.019383071189340084
JobLevel 0.04087241963581014
MaritalStatus 0.04158374635199692
MonthlyIncome 0.10150267026411525
NumCompaniesWorked 0.05873827296173947
PercentSalaryHike 0.06953193830909139
StandardHours 0.0
StockOptionLevel 0.03577079213887573
TotalWorkingYears 0.0896018155308224
TrainingTimesLastYear 0.048244008824567565
YearsAtCompany 0.07200878878608925
YearsSinceLastPromotion 0.04557526184848265
YearsWithCurrManager 0.05626020582499388


In [246]:
#building the desicion tree

predictors = pd.DataFrame([dataset['TotalWorkingYears'],dataset['Age'],dataset['MonthlyIncome']]).T

In [248]:
dc_model = tree.DecisionTreeClassifier(max_depth=6)

dc_model

DecisionTreeClassifier(max_depth=6)

In [249]:
dc_model.fit(predictors,y = dataset['Attrition'])

DecisionTreeClassifier(max_depth=6)

In [250]:
#genrating output to a file

with open('DtreeAttirtion.dot','w')as f:
    f = tree.export_graphviz(dc_model,feature_names=['TotalWorkingYears','Age','MonthlyIncome'],out_file=f)

In [306]:
dc_model.score(X=predictors,y = dataset['Attrition'])

0.8668934240362812

In [273]:
#sample data for testing purpose

pred = X_test.tail(50)

In [274]:
test_pred = dc_model.predict(X=pred)

In [282]:
test_pred

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0], dtype=int64)

In [283]:
pred.head()

Unnamed: 0,TotalWorkingYears,Age,MonthlyIncome
4388,13.0,33.0,51470.0
3193,13.0,36.0,59150.0
525,10.0,41.0,52570.0
3235,5.0,40.0,20730.0
2796,12.0,34.0,50630.0


In [280]:
dict = pd.DataFrame({"MonthlyIncome" : pred['MonthlyIncome'],"Attrition" : test_pred})

In [285]:
dict.head(10)

Unnamed: 0,MonthlyIncome,Attrition
4388,51470.0,0
3193,59150.0,0
525,52570.0,0
3235,20730.0,0
2796,50630.0,0
3578,38940.0,0
4254,99240.0,1
1492,21480.0,0
3215,51260.0,0
3177,22930.0,0


In [297]:
dataset.iloc[[3235]]

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
3235,40,0,2,2,9,3,2,1,3236,1,2,Research Scientist,2,20730,0.0,Y,14,8,0,5.0,3,4,2,3
