In [1]:
# Import modules
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

In [2]:
# Read csv file and display sample data
employee = pd.read_csv('Employee-Attrition.csv')
employee.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
# Remove uncecessary columns and code binary ones Yes = 1 | No = 0
employee["Attrition"] = employee["Attrition"].eq('Yes').mul(1)
employee["OverTime"] = employee["OverTime"].eq('Yes').mul(1)
employee["Gender"] = employee["Gender"].eq('Female').mul(1)
# Remove unwanted columns
employee = employee.drop(["Over18","StandardHours"], axis = 1)
employee.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,3,1,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,4,1,10,3,3,10,7,1,7
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,3,2,0,7,3,3,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,3,0,8,3,3,8,7,3,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,3,4,1,6,3,3,2,2,2,2


In [4]:
# change columns to numbers in order to analyze the data
import numpy as np
# Select those categorical columns to map themo to dummy values
encoder = employee[["BusinessTravel", "Department","EducationField","JobRole","MaritalStatus"]]
dum_employee = pd.get_dummies(encoder, prefix="DUM")
# join with original data
employee = employee.join(dum_employee,lsuffix="EMP")


In [5]:
print(employee.head())
# Remove categorical columns. We already have them with dummy values
employee = employee.drop(["BusinessTravel","Department","EducationField","JobRole","MaritalStatus"], axis = 1)
print(employee.head())
print(employee.columns)

Age  Attrition     BusinessTravel  DailyRate              Department  \
0   41          1      Travel_Rarely       1102                   Sales   
1   49          0  Travel_Frequently        279  Research & Development   
2   37          1      Travel_Rarely       1373  Research & Development   
3   33          0  Travel_Frequently       1392  Research & Development   
4   27          0      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0                 1          2  Life Sciences              1               1   
1                 8          1  Life Sciences              1               2   
2                 2          2          Other              1               4   
3                 3          4  Life Sciences              1               5   
4                 2          1        Medical              1               7   

   ...  DUM_Laboratory Technician  DUM_Manager  DUM_Manufacturing Direc

In [6]:
# Y -> outcome
target = employee["Attrition"]
target_names = ["Yes", "No"]

In [7]:
# x -> variables
# Get all columns except the one we want to predict
data = employee.drop(["Attrition"], axis=1)
feature_names = data.columns
print(feature_names)
data.head()

Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'DUM_Non-Travel', 'DUM_Travel_Frequently',
       'DUM_Travel_Rarely', 'DUM_Human Resources',
       'DUM_Research & Development', 'DUM_Sales', 'DUM_Human Resources',
       'DUM_Life Sciences', 'DUM_Marketing', 'DUM_Medical', 'DUM_Other',
       'DUM_Technical Degree', 'DUM_Healthcare Representative',
       'DUM_Human Resources', 'DUM_Laboratory Technician', 'DUM_Manager',
       'DUM_Manufacturing Director', 'DUM_Research Director',
    

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,...,DUM_Laboratory Technician,DUM_Manager,DUM_Manufacturing Director,DUM_Research Director,DUM_Research Scientist,DUM_Sales Executive,DUM_Sales Representative,DUM_Divorced,DUM_Married,DUM_Single
0,41,1102,1,2,1,1,2,1,94,3,...,0,0,0,0,0,1,0,0,0,1
1,49,279,8,1,1,2,3,0,61,2,...,0,0,0,0,1,0,0,0,1,0
2,37,1373,2,2,1,4,4,0,92,2,...,1,0,0,0,0,0,0,0,0,1
3,33,1392,3,4,1,5,4,1,56,3,...,0,0,0,0,1,0,0,0,1,0
4,27,591,2,1,1,7,1,0,40,3,...,1,0,0,0,0,0,0,0,1,0


In [8]:
# Split the data using train_test_split
from sklearn.model_selection import train_test_split
# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(data,target, random_state=10)

In [9]:
# Create, fit, and score a Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
# Generate forest models from 100 to 1,000 each with 100 increase in estimators
# i.e. model 1 = 100, model 2 = 200, model 3 = 300, etc
for i in range(100,1000,100):
    print("================================")
    rf = RandomForestClassifier(n_estimators=i)
    rf = rf.fit(X_train, y_train)
    # print scores for each model
    print(f"Estimators: {i}")
    print(f"Train score: {rf.score(X_train,y_train)}")
    print(f"Test score: {rf.score(X_test,y_test)}")



Estimators: 100
Train score: 1.0
Test score: 0.8396739130434783
Estimators: 200
Train score: 1.0
Test score: 0.842391304347826
Estimators: 300
Train score: 1.0
Test score: 0.8396739130434783
Estimators: 400
Train score: 1.0
Test score: 0.8396739130434783
Estimators: 500
Train score: 1.0
Test score: 0.8396739130434783
Estimators: 600
Train score: 1.0
Test score: 0.8369565217391305
Estimators: 700
Train score: 1.0
Test score: 0.8396739130434783
Estimators: 800
Train score: 1.0
Test score: 0.8396739130434783
Estimators: 900
Train score: 1.0
Test score: 0.8369565217391305


In [10]:
import math
importances = rf.feature_importances_
# sort features by importance (here, we have results for last model)
sorted (zip((importances),feature_names),reverse = True)

[(0.06815175626635145, 'MonthlyIncome'),
 (0.05377991097512753, 'Age'),
 (0.04854710666869295, 'DailyRate'),
 (0.04801178575996509, 'MonthlyRate'),
 (0.04756647362951484, 'EmployeeNumber'),
 (0.04685884320629231, 'OverTime'),
 (0.04617280944529408, 'TotalWorkingYears'),
 (0.04537610810422373, 'HourlyRate'),
 (0.04247772621040197, 'DistanceFromHome'),
 (0.03986306767194321, 'YearsAtCompany'),
 (0.03338787943598456, 'PercentSalaryHike'),
 (0.031111403761068582, 'NumCompaniesWorked'),
 (0.02859157618464612, 'JobSatisfaction'),
 (0.02839209010856265, 'EnvironmentSatisfaction'),
 (0.027429044589073116, 'StockOptionLevel'),
 (0.025981248565689837, 'YearsWithCurrManager'),
 (0.02514250607248738, 'TrainingTimesLastYear'),
 (0.02379179760469061, 'WorkLifeBalance'),
 (0.0233721846683969, 'JobInvolvement'),
 (0.022925766562066872, 'YearsInCurrentRole'),
 (0.022530993070701735, 'RelationshipSatisfaction'),
 (0.02212334407714227, 'YearsSinceLastPromotion'),
 (0.019731311160327995, 'JobLevel'),
 (0.

In [11]:
# Number 2 try out
features = pd.read_csv('Employee-Attrition.csv')
features.head(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [12]:
print('The shape of our features is:', features.shape)


The shape of our features is: (1470, 35)


In [13]:
features.describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,2.912925,1.0,1024.865306,2.721769,65.891156,2.729932,2.063946,...,2.712245,80.0,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,403.5091,8.106864,1.024165,0.0,602.024335,1.093082,20.329428,0.711561,1.10694,...,1.081209,0.0,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,1.0,491.25,2.0,48.0,2.0,1.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,1.0,1020.5,3.0,66.0,3.0,2.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,1.0,1555.75,4.0,83.75,3.0,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,5.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


In [14]:
# Remove uncecessary columns and code binary ones Yes = 1 | No = 0
features["Attrition"] = features["Attrition"].eq('Yes').mul(1)
features["OverTime"] = features["OverTime"].eq('Yes').mul(1)
features["Gender"] = features["Gender"].eq('Female').mul(1)

# Remove columns
features = features.drop(["Over18","StandardHours","EmployeeNumber","EmployeeCount","MaritalStatus","EducationField","Department","RelationshipSatisfaction","JobRole","BusinessTravel","JobLevel","PerformanceRating","Gender","StockOptionLevel","TrainingTimesLastYear","DailyRate","HourlyRate","MonthlyRate"], axis = 1)


In [15]:
# One-hot encode the data using pandas get_dummies
features = pd.get_dummies(features)
# Display the first 5 rows of the last 12 columns
features.iloc[:,5:].head(5)

Unnamed: 0,JobInvolvement,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,OverTime,PercentSalaryHike,TotalWorkingYears,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,3,4,5993,8,1,11,8,1,6,4,0,5
1,2,2,5130,1,0,23,10,3,10,7,1,7
2,2,3,2090,6,1,15,7,3,0,0,0,0
3,3,3,2909,1,1,11,8,3,8,7,3,0
4,3,2,3468,9,0,12,6,3,2,2,2,2


In [16]:
features.columns

Index(['Age', 'Attrition', 'DistanceFromHome', 'Education',
       'EnvironmentSatisfaction', 'JobInvolvement', 'JobSatisfaction',
       'MonthlyIncome', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'TotalWorkingYears', 'WorkLifeBalance', 'YearsAtCompany',
       'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [17]:
# Use numpy to convert to arrays
import numpy as np
# Labels are the values we want to predict
labels = np.array(features['Attrition'])
# Remove the labels from the features
# axis 1 refers to the columns
features= features.drop('Attrition', axis = 1)
# Saving feature names for later use
feature_list = list(features.columns)
# Convert to numpy array
features = np.array(features)

In [18]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.80, random_state = 42)

In [19]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (294, 16)
Training Labels Shape: (294,)
Testing Features Shape: (1176, 16)
Testing Labels Shape: (1176,)


In [20]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(train_features, train_labels);

In [21]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)
# Calculate the absolute errors
errors = abs(predictions - test_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))

Mean Absolute Error: 0.24


In [22]:
# Get numerical feature importances
importances = list(rf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Age                  Importance: 0.15
Variable: MonthlyIncome        Importance: 0.11
Variable: OverTime             Importance: 0.11
Variable: NumCompaniesWorked   Importance: 0.1
Variable: DistanceFromHome     Importance: 0.09
Variable: PercentSalaryHike    Importance: 0.07
Variable: TotalWorkingYears    Importance: 0.06
Variable: EnvironmentSatisfaction Importance: 0.04
Variable: YearsAtCompany       Importance: 0.04
Variable: YearsInCurrentRole   Importance: 0.04
Variable: YearsWithCurrManager Importance: 0.04
Variable: Education            Importance: 0.03
Variable: JobInvolvement       Importance: 0.03
Variable: JobSatisfaction      Importance: 0.03
Variable: WorkLifeBalance      Importance: 0.03
Variable: YearsSinceLastPromotion Importance: 0.03


#Variable: Age                  Importance: 0.11
Variable: OverTime             Importance: 0.1
Variable: MonthlyIncome        Importance: 0.07
Variable: NumCompaniesWorked   Importance: 0.07
Variable: DailyRate            Importance: 0.05
Variable: DistanceFromHome     Importance: 0.05
Variable: HourlyRate           Importance: 0.05
Variable: MonthlyRate          Importance: 0.04
Variable: TotalWorkingYears    Importance: 0.04
Variable: EmployeeNumber       Importance: 0.03
Variable: PercentSalaryHike    Importance: 0.03
Variable: YearsInCurrentRole   Importance: 0.03
Variable: EnvironmentSatisfaction Importance: 0.02
Variable: JobSatisfaction      Importance: 0.02
Variable: RelationshipSatisfaction Importance: 0.02
Variable: StockOptionLevel     Importance: 0.02
Variable: TrainingTimesLastYear Importance: 0.02
Variable: WorkLifeBalance      Importance: 0.02
Variable: YearsAtCompany       Importance: 0.02
Variable: YearsSinceLastPromotion Importance: 0.02
Variable: YearsWithCurrManager Importance: 0.02
Variable: MaritalStatus_Single Importance: 0.02
Variable: Education            Importance: 0.01
Variable: Gender               Importance: 0.01
Variable: JobInvolvement       Importance: 0.01
Variable: JobLevel             Importance: 0.01
Variable: BusinessTravel_Travel_Frequently Importance: 0.01
Variable: Department_Sales     Importance: 0.01
Variable: EducationField_Life Sciences Importance: 0.01
Variable: EducationField_Marketing Importance: 0.01
Variable: EducationField_Medical Importance: 0.01
Variable: JobRole_Healthcare Representative Importance: 0.01
Variable: JobRole_Sales Representative Importance: 0.01
Variable: EmployeeCount        Importance: 0.0
Variable: PerformanceRating    Importance: 0.0
Variable: StandardHours        Importance: 0.0
Variable: BusinessTravel_Non-Travel Importance: 0.0
Variable: BusinessTravel_Travel_Rarely Importance: 0.0
Variable: Department_Human Resources Importance: 0.0
Variable: Department_Research & Development Importance: 0.0
Variable: EducationField_Human Resources Importance: 0.0
Variable: EducationField_Other Importance: 0.0
Variable: EducationField_Technical Degree Importance: 0.0
Variable: JobRole_Human Resources Importance: 0.0
Variable: JobRole_Laboratory Technician Importance: 0.0
Variable: JobRole_Manager      Importance: 0.0
Variable: JobRole_Manufacturing Director Importance: 0.0
Variable: JobRole_Research Director Importance: 0.0
Variable: JobRole_Research Scientist Importance: 0.0
Variable: JobRole_Sales Executive Importance: 0.0
Variable: MaritalStatus_Divorced Importance: 0.0
Variable: MaritalStatus_Married Importance: 0.0
Variable: Over18_Y             Importance: 0.0

Variable: Age                  Importance: 0.13
Variable: OverTime             Importance: 0.1
Variable: MonthlyIncome        Importance: 0.08
Variable: NumCompaniesWorked   Importance: 0.07
Variable: DailyRate            Importance: 0.06
Variable: DistanceFromHome     Importance: 0.06
Variable: HourlyRate           Importance: 0.06
Variable: MonthlyRate          Importance: 0.06
Variable: TotalWorkingYears    Importance: 0.05
Variable: PercentSalaryHike    Importance: 0.04
Variable: EnvironmentSatisfaction Importance: 0.03
Variable: StockOptionLevel     Importance: 0.03
Variable: TrainingTimesLastYear Importance: 0.03
Variable: YearsAtCompany       Importance: 0.03
Variable: YearsInCurrentRole   Importance: 0.03
Variable: Education            Importance: 0.02
Variable: JobInvolvement       Importance: 0.02
Variable: JobSatisfaction      Importance: 0.02
Variable: WorkLifeBalance      Importance: 0.02
Variable: YearsSinceLastPromotion Importance: 0.02
Variable: YearsWithCurrManager Importance: 0.02
Variable: Gender               Importance: 0.01
Variable: JobLevel             Importance: 0.01
Variable: PerformanceRating    Importance: 0.0

Variable: Age                  Importance: 0.13
Variable: OverTime             Importance: 0.1
Variable: MonthlyIncome        Importance: 0.08
Variable: NumCompaniesWorked   Importance: 0.07
Variable: DailyRate            Importance: 0.06
Variable: DistanceFromHome     Importance: 0.06
Variable: HourlyRate           Importance: 0.06
Variable: MonthlyRate          Importance: 0.06
Variable: PercentSalaryHike    Importance: 0.05
Variable: TotalWorkingYears    Importance: 0.05
Variable: EnvironmentSatisfaction Importance: 0.03
Variable: StockOptionLevel     Importance: 0.03
Variable: TrainingTimesLastYear Importance: 0.03
Variable: YearsAtCompany       Importance: 0.03
Variable: YearsInCurrentRole   Importance: 0.03
Variable: Education            Importance: 0.02
Variable: JobInvolvement       Importance: 0.02
Variable: JobSatisfaction      Importance: 0.02
Variable: WorkLifeBalance      Importance: 0.02
Variable: YearsSinceLastPromotion Importance: 0.02
Variable: YearsWithCurrManager Importance: 0.02

Variable: Age                  Importance: 0.13
Variable: OverTime             Importance: 0.11
Variable: MonthlyIncome        Importance: 0.09
Variable: NumCompaniesWorked   Importance: 0.08
Variable: DailyRate            Importance: 0.07
Variable: DistanceFromHome     Importance: 0.07
Variable: HourlyRate           Importance: 0.07
Variable: MonthlyRate          Importance: 0.06
Variable: PercentSalaryHike    Importance: 0.05
Variable: TotalWorkingYears    Importance: 0.05
Variable: EnvironmentSatisfaction Importance: 0.03
Variable: YearsAtCompany       Importance: 0.03
Variable: YearsInCurrentRole   Importance: 0.03
Variable: YearsSinceLastPromotion Importance: 0.03
Variable: Education            Importance: 0.02
Variable: JobInvolvement       Importance: 0.02
Variable: JobSatisfaction      Importance: 0.02
Variable: WorkLifeBalance      Importance: 0.02
Variable: YearsWithCurrManager Importance: 0.02