In [1]:
# It allows you to handle the warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Importing necessary packages
import pandas as pd   #It is used for importing the dataset and managing dataset
import numpy as np    #It is used because it contains mathematical tools
# Importing packages of matplotlib and seaborn for making graph
import matplotlib.pyplot as plt   
from matplotlib import rcParams
rcParams['figure.figsize'] = 5,4
import seaborn as sb
sb.set_style('whitegrid')
%matplotlib inline  

In [3]:
# Importing the data file
data = pd.read_excel('INX_Future_Inc_Employee_Performance_CDS_Project2_Data_V1.8.xls')
data.head()

Unnamed: 0,EmpNumber,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
0,E1001000,32,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,10,3,...,4,10,2,2,10,7,0,8,No,3
1,E1001006,47,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,14,4,...,4,20,2,3,7,7,1,7,No,3
2,E1001007,40,Male,Life Sciences,Married,Sales,Sales Executive,Travel_Frequently,5,4,...,3,20,2,3,18,13,1,12,No,4
3,E1001009,41,Male,Human Resources,Divorced,Human Resources,Manager,Travel_Rarely,10,4,...,2,23,2,2,21,6,12,6,No,3
4,E1001010,60,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,16,4,...,4,10,1,3,2,2,2,2,No,3


# Data Cleaning or Data Mugging or Data Wrangling

In [4]:
# Looking for missing data
data.isnull().sum()

EmpNumber                       0
Age                             0
Gender                          0
EducationBackground             0
MaritalStatus                   0
EmpDepartment                   0
EmpJobRole                      0
BusinessTravelFrequency         0
DistanceFromHome                0
EmpEducationLevel               0
EmpEnvironmentSatisfaction      0
EmpHourlyRate                   0
EmpJobInvolvement               0
EmpJobLevel                     0
EmpJobSatisfaction              0
NumCompaniesWorked              0
OverTime                        0
EmpLastSalaryHikePercent        0
EmpRelationshipSatisfaction     0
TotalWorkExperienceInYears      0
TrainingTimesLastYear           0
EmpWorkLifeBalance              0
ExperienceYearsAtThisCompany    0
ExperienceYearsInCurrentRole    0
YearsSinceLastPromotion         0
YearsWithCurrManager            0
Attrition                       0
PerformanceRating               0
dtype: int64

# Encoding the Categorical values using LabelEncoder

In [5]:
# Label Encoder is used to convert Categorical values into numerical values
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
data.EmpNumber = enc.fit_transform(data.EmpNumber)
data.Gender = enc.fit_transform(data.Gender)
data.EducationBackground = enc.fit_transform(data.EducationBackground)
data.MaritalStatus = enc.fit_transform(data.MaritalStatus)
data.EmpDepartment = enc.fit_transform(data.EmpDepartment)
data.EmpJobRole = enc.fit_transform(data.EmpJobRole)
data.BusinessTravelFrequency = enc.fit_transform(data.BusinessTravelFrequency)
data.OverTime = enc.fit_transform(data.OverTime)
data.Attrition = enc.fit_transform(data.Attrition)

# Selecting Predictor variables (X) and the Target Variables (Y)

In [6]:
# Here I am selecting the important features using RecursiveFeatureElimination 
X = data.loc[:,['EmpNumber', 'Age', 'MaritalStatus', 'EmpDepartment', 'EmpJobRole',
       'DistanceFromHome', 'EmpEducationLevel', 'EmpEnvironmentSatisfaction',
       'EmpHourlyRate', 'NumCompaniesWorked', 'EmpLastSalaryHikePercent',
       'EmpRelationshipSatisfaction', 'TotalWorkExperienceInYears',
       'TrainingTimesLastYear', 'EmpWorkLifeBalance',
       'ExperienceYearsAtThisCompany', 'ExperienceYearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager']]
y = data.PerformanceRating
X.head()

Unnamed: 0,EmpNumber,Age,MaritalStatus,EmpDepartment,EmpJobRole,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,EmpHourlyRate,NumCompaniesWorked,EmpLastSalaryHikePercent,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,0,32,2,5,13,10,3,4,55,1,12,4,10,2,2,10,7,0,8
1,1,47,2,5,13,14,4,4,42,2,12,4,20,2,3,7,7,1,7
2,2,40,1,5,13,5,4,4,48,5,21,3,20,2,3,18,13,1,12
3,3,41,0,3,8,10,4,2,73,3,15,2,23,2,2,21,6,12,6
4,4,60,2,5,13,16,4,1,84,8,14,4,10,1,3,2,2,2,2


# Splitting the Dataset into Training set and Test Set

In [7]:
# Importing package of train test split
from sklearn.model_selection import train_test_split

In [8]:
# Splitting Train and test set
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=10)

# Implementing Machine Learning Models

 Having performed some exploratory data analysis and simple feature engineering as well as having ensured that all categorical values are encoded, we are now ready to proceed onto building our models.

# 1.Random Forest Classifier using Grid Search

In [9]:
# Training the model
from sklearn.ensemble import RandomForestClassifier
# importing the GridSearch Package
from sklearn.model_selection import GridSearchCV

model = RandomForestClassifier()

parameters = [{'min_samples_split':[2,3,4,5],'criterion':['gini','entropy'],'min_samples_leaf':[1,2,3]}]

grid_modle=GridSearchCV(RandomForestClassifier(),parameters )
grid_modle.fit(X_train,y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'min_samples_split': [2, 3, 4, 5], 'criterion': ['gini', 'entropy'], 'min_samples_leaf': [1, 2, 3]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [10]:
grid_modle.best_score_

0.9273809523809524

In [11]:
# Predicting the model
y_predict = grid_modle.predict(X_test)

In [12]:
# Finding accuracy, precision, recall and confusion matrix
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
print(accuracy_score(y_test,y_predict))
print(classification_report(y_test,y_predict))

0.9277777777777778
              precision    recall  f1-score   support

           2       0.95      0.89      0.92        63
           3       0.93      0.98      0.95       264
           4       0.83      0.61      0.70        33

   micro avg       0.93      0.93      0.93       360
   macro avg       0.90      0.82      0.86       360
weighted avg       0.93      0.93      0.92       360



In [13]:
confusion_matrix(y_test,y_predict)

array([[ 56,   7,   0],
       [  2, 258,   4],
       [  1,  12,  20]], dtype=int64)

# 2. Extreme Gradient Boosting Classifier

In [15]:
# Here i am installing xgboost classifier
!pip install xgboost



In [16]:
from xgboost import XGBClassifier

In [17]:
model_xgb = XGBClassifier()
model_xgb.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [18]:
y_predict_xgb = model_xgb.predict(X_test)

In [19]:
# Finding accuracy, precision, recall and confusion matrix
print(accuracy_score(y_test,y_predict_xgb))
print(classification_report(y_test,y_predict_xgb))

0.9388888888888889
              precision    recall  f1-score   support

           2       0.92      0.90      0.91        63
           3       0.96      0.97      0.96       264
           4       0.83      0.76      0.79        33

   micro avg       0.94      0.94      0.94       360
   macro avg       0.90      0.88      0.89       360
weighted avg       0.94      0.94      0.94       360



In [20]:
confusion_matrix(y_test,y_predict_xgb)

array([[ 57,   6,   0],
       [  3, 256,   5],
       [  2,   6,  25]], dtype=int64)