In [1]:
# Import the modules
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib import pyplot
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
HR_data_df = pd.read_csv('Resources/HR_Employee_Attrition.csv')

# Review the DataFrame
HR_data_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
# Check column datatypes
HR_data_df.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeNumber               int64
EnvironmentSatisfaction      int64
Gender                      object
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears   

In [4]:
# Check null values
HR_data_df.isnull().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [5]:
# Count distribution of attrition values
# sns.countplot(x='Attrition', data=HR_data_df)

In [6]:
HR_data_df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [7]:
#Drop columns that is not important feature
HR_data_df.drop(['BusinessTravel', 'DailyRate', 'Department', 'Education', 'EducationField', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyRate', 'Over18', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsSinceLastPromotion'], axis="columns", inplace=True)

#Convert attrition column from categorical to numeric
categorical_col = []
for column in HR_data_df.columns:
    if HR_data_df[column].dtype == object and len(HR_data_df[column].unique()) <= 50:
        categorical_col.append(column)
        
HR_data_df['Attrition'] = HR_data_df.Attrition.astype("category").cat.codes

In [8]:
#Convert Overtime column from categorical to numeric
categorical_col_1 = []
for column in HR_data_df.columns:
    if HR_data_df[column].dtype == object and len(HR_data_df[column].unique()) <= 50:
        categorical_col_1.append(column)
        
HR_data_df['OverTime'] = HR_data_df.Attrition.astype("category").cat.codes

In [9]:
# Attrition columns is convert from yes/no to 1/0
HR_data_df.head()

Unnamed: 0,Age,Attrition,DistanceFromHome,MonthlyIncome,NumCompaniesWorked,OverTime,PercentSalaryHike,TotalWorkingYears,YearsAtCompany,YearsInCurrentRole,YearsWithCurrManager
0,41,1,1,5993,8,1,11,8,6,4,5
1,49,0,8,5130,1,0,23,10,10,7,7
2,37,1,2,2090,6,1,15,7,0,0,0
3,33,0,3,2909,1,0,11,8,8,7,0
4,27,0,2,3468,9,0,12,6,2,2,2


In [10]:
categorical_col.remove('Attrition')

In [11]:
# Transform categorical data into dummies
from sklearn.preprocessing import LabelEncoder

label = LabelEncoder()
for column in categorical_col:
    HR_data_df[column] = label.fit_transform(HR_data_df[column])

In [12]:
HR_data_df.head()

Unnamed: 0,Age,Attrition,DistanceFromHome,MonthlyIncome,NumCompaniesWorked,OverTime,PercentSalaryHike,TotalWorkingYears,YearsAtCompany,YearsInCurrentRole,YearsWithCurrManager
0,41,1,1,5993,8,1,11,8,6,4,5
1,49,0,8,5130,1,0,23,10,10,7,7
2,37,1,2,2090,6,1,15,7,0,0,0
3,33,0,3,2909,1,0,11,8,8,7,0
4,27,0,2,3468,9,0,12,6,2,2,2


In [13]:
# Train test split
from sklearn.model_selection import train_test_split

X = HR_data_df.drop('Attrition', axis=1)
y = HR_data_df.Attrition

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [14]:
# Define and print report for accuracy score, confusion matrix and classification report
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        pred = clf.predict(X_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n")
        
    elif train==False:
        pred = clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("Test Result:\n================================================")        
        print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

In [69]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=10, max_leaf_nodes=2, criterion='entropy')
rf_clf.fit(X_train, y_train)

print_score(rf_clf, X_train, y_train, X_test, y_test, train=False)
print_score(rf_clf, X_train, y_train, X_test, y_test, train=True)

Test Result:
Accuracy Score: 91.61%
_______________________________________________
CLASSIFICATION REPORT:
                    0          1  accuracy   macro avg  weighted avg
precision    0.911271   1.000000    0.9161    0.955635      0.923544
recall       1.000000   0.393443    0.9161    0.696721      0.916100
f1-score     0.953576   0.564706    0.9161    0.759141      0.899787
support    380.000000  61.000000    0.9161  441.000000    441.000000
_______________________________________________
Confusion Matrix: 
 [[380   0]
 [ 37  24]]

Train Result:
Accuracy Score: 91.16%
_______________________________________________
CLASSIFICATION REPORT:
                    0           1  accuracy    macro avg  weighted avg
precision    0.903602    1.000000  0.911565     0.951801      0.920090
recall       1.000000    0.482955  0.911565     0.741477      0.911565
f1-score     0.949360    0.651341  0.911565     0.800351      0.898387
support    853.000000  176.000000  0.911565  1029.000000   1029.