In [1]:
# Import modules
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

In [2]:
# Read csv file and display sample data
employee = pd.read_csv('Employee-Attrition.csv')
employee.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
# Remove uncecessary columns and code binary ones Yes = 1 | No = 0
employee["Attrition"] = employee["Attrition"].eq('Yes').mul(1)
employee["OverTime"] = employee["OverTime"].eq('Yes').mul(1)
employee["Gender"] = employee["Gender"].eq('Female').mul(1)
# Remove unwanted columns
employee = employee.drop(["Over18","StandardHours"], axis = 1)
employee.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,3,1,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,4,1,10,3,3,10,7,1,7
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,3,2,0,7,3,3,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,3,0,8,3,3,8,7,3,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,3,4,1,6,3,3,2,2,2,2


In [4]:
# change columns to numbers in order to analyze the data
import numpy as np
# Select those categorical columns to map themo to dummy values
encoder = employee[["BusinessTravel", "Department","EducationField","JobRole","MaritalStatus"]]
dum_employee = pd.get_dummies(encoder, prefix="DUM")
# join with original data
employee = employee.join(dum_employee,lsuffix="EMP")


In [5]:
print(employee.head())
# Remove categorical columns. We already have them with dummy values
employee = employee.drop(["BusinessTravel","Department","EducationField","JobRole","MaritalStatus"], axis = 1)
print(employee.head())
print(employee.columns)

Age  Attrition     BusinessTravel  DailyRate              Department  \
0   41          1      Travel_Rarely       1102                   Sales   
1   49          0  Travel_Frequently        279  Research & Development   
2   37          1      Travel_Rarely       1373  Research & Development   
3   33          0  Travel_Frequently       1392  Research & Development   
4   27          0      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0                 1          2  Life Sciences              1               1   
1                 8          1  Life Sciences              1               2   
2                 2          2          Other              1               4   
3                 3          4  Life Sciences              1               5   
4                 2          1        Medical              1               7   

   ...  DUM_Laboratory Technician  DUM_Manager  DUM_Manufacturing Direc

In [6]:
# Y -> outcome
target = employee["Attrition"]
target_names = ["Yes", "No"]

In [7]:
# x -> variables
# Get all columns except the one we want to predict
data = employee.drop(["Attrition"], axis=1)
feature_names = data.columns
print(feature_names)
data.head()

Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'DUM_Non-Travel', 'DUM_Travel_Frequently',
       'DUM_Travel_Rarely', 'DUM_Human Resources',
       'DUM_Research & Development', 'DUM_Sales', 'DUM_Human Resources',
       'DUM_Life Sciences', 'DUM_Marketing', 'DUM_Medical', 'DUM_Other',
       'DUM_Technical Degree', 'DUM_Healthcare Representative',
       'DUM_Human Resources', 'DUM_Laboratory Technician', 'DUM_Manager',
       'DUM_Manufacturing Director', 'DUM_Research Director',
    

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,...,DUM_Laboratory Technician,DUM_Manager,DUM_Manufacturing Director,DUM_Research Director,DUM_Research Scientist,DUM_Sales Executive,DUM_Sales Representative,DUM_Divorced,DUM_Married,DUM_Single
0,41,1102,1,2,1,1,2,1,94,3,...,0,0,0,0,0,1,0,0,0,1
1,49,279,8,1,1,2,3,0,61,2,...,0,0,0,0,1,0,0,0,1,0
2,37,1373,2,2,1,4,4,0,92,2,...,1,0,0,0,0,0,0,0,0,1
3,33,1392,3,4,1,5,4,1,56,3,...,0,0,0,0,1,0,0,0,1,0
4,27,591,2,1,1,7,1,0,40,3,...,1,0,0,0,0,0,0,0,1,0


In [8]:
# Split the data using train_test_split
from sklearn.model_selection import train_test_split
# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(data,target, random_state=10)

In [15]:
# Create, fit, and score a Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
# Generate forest models from 100 to 1,000 each with 100 increase in estimators
# i.e. model 1 = 100, model 2 = 200, model 3 = 300, etc
for i in range(100,1000,100):
    print("================================")
    rf = RandomForestClassifier(n_estimators=i)
    rf = rf.fit(X_train, y_train)
    # print scores for each model
    print(f"Estimators: {i}")
    print(f"Train score: {rf.score(X_train,y_train)}")
    print(f"Test score: {rf.score(X_test,y_test)}")



Estimators: 100
Train score: 1.0
Test score: 0.8369565217391305
Estimators: 200
Train score: 1.0
Test score: 0.8369565217391305
Estimators: 300
Train score: 1.0
Test score: 0.8369565217391305
Estimators: 400
Train score: 1.0
Test score: 0.8396739130434783
Estimators: 500
Train score: 1.0
Test score: 0.8396739130434783
Estimators: 600
Train score: 1.0
Test score: 0.8396739130434783
Estimators: 700
Train score: 1.0
Test score: 0.8396739130434783
Estimators: 800
Train score: 1.0
Test score: 0.8396739130434783
Estimators: 900
Train score: 1.0
Test score: 0.8396739130434783


In [29]:
import math
importances = rf.feature_importances_
# sort features by importance (here, we have results for last model)
sorted (zip((importances*100),feature_names),reverse = True)

[(6.8814280874652605, 'MonthlyIncome'),
 (5.3882320890606215, 'Age'),
 (4.965774111854274, 'DailyRate'),
 (4.808785563236375, 'EmployeeNumber'),
 (4.629289735925777, 'TotalWorkingYears'),
 (4.606174246898445, 'HourlyRate'),
 (4.525713658110499, 'MonthlyRate'),
 (4.492123788509902, 'OverTime'),
 (4.309212122319542, 'DistanceFromHome'),
 (4.023484343248683, 'YearsAtCompany'),
 (3.4551987041226084, 'PercentSalaryHike'),
 (3.0145727877362685, 'NumCompaniesWorked'),
 (2.8674719897628935, 'JobSatisfaction'),
 (2.794410289548614, 'EnvironmentSatisfaction'),
 (2.751403305129681, 'YearsWithCurrManager'),
 (2.7361093856523797, 'StockOptionLevel'),
 (2.439302727031132, 'TrainingTimesLastYear'),
 (2.3817922538826184, 'YearsInCurrentRole'),
 (2.381371037864502, 'WorkLifeBalance'),
 (2.3069331153805277, 'JobInvolvement'),
 (2.2729711120861924, 'YearsSinceLastPromotion'),
 (2.2261626760611417, 'RelationshipSatisfaction'),
 (2.085786830462156, 'JobLevel'),
 (1.8123844246688898, 'Education'),
 (1.79489