In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn import preprocessing
from sklearn .ensemble import RandomForestClassifier

In [3]:
df=pd.read_csv("general_data.csv")
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


In [4]:
df.drop(columns=['EmployeeCount','EmployeeID'],inplace=True)

In [5]:
df.isnull().sum().sum()

28

In [6]:
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'Gender', 'JobLevel', 'JobRole',
       'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked', 'Over18',
       'PercentSalaryHike', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4410 entries, 0 to 4409
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      4410 non-null   int64  
 1   Attrition                4410 non-null   object 
 2   BusinessTravel           4410 non-null   object 
 3   Department               4410 non-null   object 
 4   DistanceFromHome         4410 non-null   int64  
 5   Education                4410 non-null   int64  
 6   EducationField           4410 non-null   object 
 7   Gender                   4410 non-null   object 
 8   JobLevel                 4410 non-null   int64  
 9   JobRole                  4410 non-null   object 
 10  MaritalStatus            4410 non-null   object 
 11  MonthlyIncome            4410 non-null   int64  
 12  NumCompaniesWorked       4391 non-null   float64
 13  Over18                   4410 non-null   object 
 14  PercentSalaryHike       

In [8]:
rf_model=RandomForestClassifier(n_estimators=1000,max_features=2,oob_score=True)

In [9]:
label_encoder=preprocessing.LabelEncoder()
Attrition_coder=df.apply(label_encoder.fit_transform)

In [10]:
features=['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'Gender', 'JobLevel', 'JobRole',
       'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked', 'Over18',
       'PercentSalaryHike', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany',
       'YearsSinceLastPromotion', 'YearsWithCurrManager']

In [11]:
rf_model.fit(X=Attrition_coder[features],y=Attrition_coder["Attrition"])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=2,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [12]:
print("OOB Accuracy:")
print(rf_model.oob_score_)

OOB Accuracy:
1.0


In [13]:
for features,imp in zip(features,rf_model.feature_importances_):
    print(features,imp)

Age 0.04192631710321384
Attrition 0.5941114448410414
BusinessTravel 0.011701893811269359
Department 0.010810234360957762
DistanceFromHome 0.02512530739595338
Education 0.015128765393289344
EducationField 0.016560364668340027
Gender 0.006425308293159444
JobLevel 0.014230392811224811
JobRole 0.020821755715071038
MaritalStatus 0.019485047034353695
MonthlyIncome 0.03560265186690543
NumCompaniesWorked 0.0222418927707781
Over18 0.0
PercentSalaryHike 0.02388677624140973
StandardHours 0.0
StockOptionLevel 0.012667587300539405
TotalWorkingYears 0.039756474010866405
TrainingTimesLastYear 0.01753621380846397
YearsAtCompany 0.031007517598622
YearsSinceLastPromotion 0.016193856471148824
YearsWithCurrManager 0.024780198503392022


In [19]:
decisionTreeClassifier=tree.DecisionTreeClassifier(max_depth=8)

In [20]:
draw_attrition =pd.DataFrame([attrition_encoded['Age'],attrition_encoded['MonthlyIncome'],
                           attrition_encoded['TotalWorkingYears'],attrition_encoded['DistanceFromHome'],
                           attrition_encoded['YearsAtCompany'],attrition_encoded['PercentSalaryHike'],
                           attrition_encoded['NumCompaniesWorked'],attrition_encoded['JobRole'],
                           attrition_encoded['YearsWithCurrManager']]).T

In [23]:
decisionTreeClassifier.fit(X=draw_attrition,y=attrition_encoded["Attrition"])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=8, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [24]:
with open("Attr_Model.dot",'w') as f:

    f=tree.export_graphviz(decisionTreeClassifier,feature_names=["Age","MonthlyIncome","TotalWorkingYears",
                                                     "DistanceFromHome","YearsAtCompany","PercentSalaryHike",
                                                     "NumCompaniesWorked","JobRole","YearsWithCurrManager"],out_file=f);

In [27]:
print("Classification Accuracy with the features  is :", decisionTreeClassifier.score(X=draw_attrition,y=attrition_encoded["Attrition"]))

Classification Accuracy with the features  is : 0.9018140589569161
