In [213]:
# Importing the necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [214]:
# Importing the csv file
emp_data = pd.read_excel('../../Data/INX_Future_Inc_Employee_Performance_CDS_Project2_Data_V1.8.xls')

In [215]:
# Dropping the first columns as it is of no use for analysis.
emp_data.drop(['EmpNumber'],inplace=True,axis=1)

In [216]:
# Encoding all the ordinal columns and creating a dummy variable for them to see if there are any effects on Performance Rating
enc = LabelEncoder()
for i in (2,3,4,5,6,7,16,26):
    emp_data.iloc[:,i] = enc.fit_transform(emp_data.iloc[:,i])
emp_data.head()

Unnamed: 0,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
0,32,Male,2,2,5,13,2,9,3,4,...,4,10,2,2,10,7,0,8,No,1
1,47,Male,2,2,5,13,2,13,4,4,...,4,20,2,3,7,7,1,7,No,1
2,40,Male,1,1,5,13,1,4,4,4,...,3,20,2,3,18,13,1,12,No,2
3,41,Male,0,0,3,8,2,9,4,2,...,2,23,2,2,21,6,12,6,No,1
4,60,Male,2,2,5,13,2,15,4,1,...,4,10,1,3,2,2,2,2,No,1


# Implementing Machine Learning Models

Having performed some exploratory data analysis and simple feature engineering as well as having ensured that all categorical values are encoded, we are now ready to proceed onto building our models.

### Splitting Data into Train and Test sets

But before we even start training a model, we will have to partition our dataset into a training set and a test set. To split our data we will utilise sklearn's

In [217]:
# Here we have selected only the important columns
y = emp_data.PerformanceRating
#X = data.iloc[:,0:-1]  All predictors were selected it resulted in dropping of accuracy.
X = emp_data.iloc[:,[4,5,9,16,20,21,22,23,24]] # Taking only variables with correlation coeffecient greater than 0.1
X.head()

Unnamed: 0,EmpDepartment,EmpJobRole,EmpEnvironmentSatisfaction,EmpLastSalaryHikePercent,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,5,13,4,1,2,10,7,0,8
1,5,13,4,1,3,7,7,1,7
2,5,13,4,10,3,18,13,1,12
3,3,8,2,4,2,21,6,12,6
4,5,13,1,3,3,2,2,2,2


In [218]:
# Splitting into train and test for calculating the accuracy
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.75,random_state=10)

### SMOTE to oversample due to the skewness in target

Since we have already noted the severe imbalance in the values within the target variable, let us implement the SMOTE method in the dealing with this skewed value via the imblearn Python package.

In [219]:
from imblearn.over_sampling import SMOTE

oversampler= SMOTE(random_state=0)
smote_train, smote_target = oversampler.fit_sample(X_train,y_train)

In [220]:
# Standardization technique is used
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [221]:
X_train.shape

(300, 9)

In [222]:
X_test.shape

(900, 9)

## 1 .Logistic regression

In [223]:
model_logr = LogisticRegression()
model_logr.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [224]:
# Predicting the model
y_predict_log = model_logr.predict(X_test)

In [225]:
# Finding accuracy, precision, recall and confusion matrix
print(accuracy_score(y_test,y_predict_log))
print(classification_report(y_test,y_predict_log))

0.8
              precision    recall  f1-score   support

           0       0.70      0.33      0.45       150
           1       0.81      0.96      0.88       650
           2       0.77      0.49      0.60       100

   micro avg       0.80      0.80      0.80       900
   macro avg       0.76      0.59      0.64       900
weighted avg       0.79      0.80      0.78       900



## 2. Decision Tree with GridSearchCV

In [226]:
classifier_dtg=DecisionTreeClassifier(random_state=42,splitter='best')
parameters=[{'min_samples_split':[2,3,4,5],'criterion':['gini']},{'min_samples_split':[2,3,4,5],'criterion':['entropy']}]

model_griddtree=GridSearchCV(estimator=classifier_dtg, param_grid=parameters, scoring='accuracy',cv=10)
model_griddtree.fit(X_train,y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'min_samples_split': [2, 3, 4, 5], 'criterion': ['gini']}, {'min_samples_split': [2, 3, 4, 5], 'criterion': ['entropy']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [227]:
model_griddtree.best_params_

{'criterion': 'gini', 'min_samples_split': 4}

In [228]:
# Predicting the model
y_predict_dtree = model_griddtree.predict(X_test)

In [229]:
# Finding accuracy, precision, recall and confusion matrix
print(accuracy_score(y_test,y_predict_dtree))
print(classification_report(y_test,y_predict_dtree))

0.8944444444444445
              precision    recall  f1-score   support

           0       0.79      0.82      0.81       150
           1       0.92      0.94      0.93       650
           2       0.87      0.69      0.77       100

   micro avg       0.89      0.89      0.89       900
   macro avg       0.86      0.82      0.84       900
weighted avg       0.89      0.89      0.89       900



In [230]:
confusion_matrix(y_test,y_predict_dtree)

array([[123,  27,   0],
       [ 27, 613,  10],
       [  5,  26,  69]])

## 3. XGBoost Classifier

In [231]:
model_xgb = XGBClassifier()
model_xgb.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [232]:
# Predicting the model
y_predict_xgb = model_xgb.predict(X_test)

In [233]:
# Finding accuracy, precision, recall and confusion matrix
print(accuracy_score(y_test,y_predict_xgb))
print(classification_report(y_test,y_predict_xgb))

0.9233333333333333
              precision    recall  f1-score   support

           0       0.89      0.89      0.89       150
           1       0.93      0.97      0.95       650
           2       0.91      0.69      0.78       100

   micro avg       0.92      0.92      0.92       900
   macro avg       0.91      0.85      0.87       900
weighted avg       0.92      0.92      0.92       900



In [234]:
confusion_matrix(y_test,y_predict_xgb)

array([[134,  16,   0],
       [ 15, 628,   7],
       [  2,  29,  69]])

## 4. Random Forest Classifier

In [235]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=10)

In [236]:
seed = 0   # We set our random seed to zero for reproducibility
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 800,
    'warm_start': True, 
    'max_features': 0.3,
    'max_depth': 9,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'random_state' : seed,
    'verbose': 0
}

In [237]:
rf = RandomForestClassifier(**rf_params)

In [238]:
rf.fit(smote_train, smote_target)
print("Fitting of Random Forest as finished")

Fitting of Random Forest as finished


In [239]:
rf_predictions = rf.predict(X_test)
print("Predictions finished")


display(rf_predictions)
display(y_test)

Predictions finished


array([1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 0, 1, 1, 1, 2, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 2, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 2, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 1, 2, 1, 0, 1, 1, 0, 1, 1, 1, 2, 0, 0, 0, 1, 1,
       1, 0, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 2, 1, 1, 1, 2, 0,
       0, 1, 1, 1, 1, 1, 2, 2, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 2, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 2, 1,
       1, 1, 1, 1, 0, 2, 2, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1])

123     1
1036    1
1034    1
426     0
726     1
508     1
483     1
67      1
598     1
840     1
87      1
941     1
190     2
858     1
1136    1
894     1
363     0
228     1
74      1
142     1
31      2
585     1
413     1
1199    0
895     1
443     0
1168    1
261     1
131     1
982     1
       ..
1023    0
41      1
596     1
1016    1
808     1
845     1
725     0
723     1
124     2
24      1
173     1
1100    1
121     1
567     1
615     0
329     2
374     2
672     0
868     1
126     1
143     1
577     1
1001    1
434     1
375     0
665     1
699     0
403     1
520     0
888     1
Name: PerformanceRating, Length: 240, dtype: int64

In [240]:
accuracy_score(y_test, rf_predictions) # To find the accuracy of the Prediction Model 

0.9166666666666666

In [241]:
print(classification_report(y_test,rf_predictions))

              precision    recall  f1-score   support

           0       0.88      0.92      0.90        38
           1       0.95      0.93      0.94       179
           2       0.72      0.78      0.75        23

   micro avg       0.92      0.92      0.92       240
   macro avg       0.85      0.88      0.86       240
weighted avg       0.92      0.92      0.92       240



# 5. Gradient Boosted Classifier


In [242]:
# Gradient Boosting Parameters
gb_params ={
    'n_estimators': 500,
    'max_features': 0.9,
    'learning_rate' : 0.2,
    'max_depth': 11,
    'min_samples_leaf': 2,
    'subsample': 1,
    'max_features' : 'sqrt',
    'random_state' : seed,
    'verbose': 0
}

In [243]:
gb = GradientBoostingClassifier(**gb_params)
# Fit the model to our SMOTEd train and target
gb.fit(smote_train, smote_target)
# Get our predictions
gb_predictions = gb.predict(X_test)
print("Predictions have finished")

Predictions have finished


In [244]:
accuracy_score(y_test, gb_predictions)

0.9083333333333333