## Data Preparation

In [1]:
# Now, Let's prepare the data for ML algorithms

In [2]:
# Importing the necessary libraries

import os
import numpy as np
import pandas as pd

In [3]:
# To display all the columns in the dataframe

pd.set_option('display.max_columns', None)

In [4]:
# Reading the train and test sets (These were split before EDA)

def read_data(filename):
    return pd.read_csv(os.path.join('dataset', filename), index_col='index').rename_axis(None)

In [5]:
df_train = read_data('df_train.csv')
df_test = read_data('df_test.csv')
data = read_data('data.csv')

In [6]:
print(df_train.shape)
print(df_test.shape)
print(data.shape)

(1176, 35)
(294, 35)
(1176, 45)


In [7]:
# Encoding the target variable in both train and test set

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df_train['Attrition'] = le.fit_transform(df_train['Attrition'])

df_test['Attrition'] = le.transform(df_test['Attrition'])

In [8]:
# Separating X and y i.e. the independent tariables and the target variable

employees = df_train.drop('Attrition', axis=1)
employees_labels = df_train['Attrition'].copy()

employees_test = df_test.drop('Attrition', axis=1)
employees_test_labels = df_test['Attrition'].copy()

In [9]:
print(employees.shape, employees_labels.shape)
print(employees_test.shape, employees_test_labels.shape)

(1176, 34) (1176,)
(294, 34) (294,)


In [10]:
# Data Cleaning 
    # 1. Remove unnecessary attributes
    # 2. Handle missing values if any

In [11]:
# 1. Dropping unnecessary variables from both train and test sets

unnecessary_vars = ['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours']

employees = employees.drop(unnecessary_vars, axis=1)
employees_test = employees_test.drop(unnecessary_vars, axis=1)

In [12]:
# 2. Checking if there are any missing values

employees.isnull().sum()

Age                         0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

In [13]:
employees_test.isnull().sum()

Age                         0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

In [14]:
# There aren't any missing values in any of the variables

In [15]:
# Handling numeric variables

employees_num = employees.select_dtypes(exclude='object')

employees_num.columns

Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education',
       'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel',
       'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [16]:
# Let's create a custom class to add the derived attribute 'MonthlRateToIncome_Diff'

from sklearn.base import BaseEstimator, TransformerMixin

monthly_rate_idx, monthly_income_idx = 16, 15

class AddDerivedAttributes(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        monthly_rate_to_income_diff = X[:, monthly_rate_idx] - X[:, monthly_income_idx]
        return np.c_[X, monthly_rate_to_income_diff]

In [17]:
# Handling Categorical Attributes

employees_cat = employees.select_dtypes(include='object')

employees_cat

Unnamed: 0,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,OverTime
941,Travel_Rarely,Research & Development,Technical Degree,Female,Laboratory Technician,Married,No
865,Travel_Rarely,Sales,Life Sciences,Male,Sales Executive,Divorced,No
18,Travel_Rarely,Sales,Life Sciences,Female,Manager,Married,No
786,Non-Travel,Research & Development,Life Sciences,Male,Laboratory Technician,Married,No
416,Travel_Frequently,Research & Development,Life Sciences,Male,Laboratory Technician,Married,Yes
...,...,...,...,...,...,...,...
1332,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Single,Yes
159,Travel_Frequently,Sales,Marketing,Female,Sales Representative,Married,No
318,Travel_Rarely,Research & Development,Life Sciences,Female,Research Scientist,Single,Yes
1286,Travel_Rarely,Research & Development,Life Sciences,Female,Laboratory Technician,Married,No


In [18]:
# Using one hot encoding technique to encode all the categorical variables

from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()

employees_cat_1hot = ohe.fit_transform(employees_cat)

In [19]:
len(employees_cat_1hot.toarray()[0])

28

In [20]:
# Building the pipeline for both numeric and categorical attributes

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder

num_attrs = list(employees_num.columns)
cat_attrs = list(employees_cat.columns)

# Pipeline step to add the derived attribute MonthlyRateToIncomeDiff
add_derived_attributes_pipeline = Pipeline([
    ('attribute_adder', AddDerivedAttributes())
])

# Pipeline to scale all the numeric attributes using standard scaler and encode all the categorical attributes using 
# one-hot-encoding
transform_pipeline = ColumnTransformer([
    ('scaling', StandardScaler(), make_column_selector(dtype_include=np.number)),
    ('one_hot_encoding', OneHotEncoder(), make_column_selector(dtype_include=object))
])

# Full Pipeline combining above two
full_pipeline = Pipeline([
    ('transform', transform_pipeline),
    ('add_derived_attributes', add_derived_attributes_pipeline),
])

In [21]:
employees_prepared = full_pipeline.fit_transform(employees)

In [22]:
employees_test_prepared = full_pipeline.transform(employees_test)

# 

## Modelling

In [23]:
# Function to evaluate the models that we are going to try

from sklearn import metrics

def evaluate_model(model, X_train=employees_prepared, y_train=employees_labels, 
                   X_test=employees_test_prepared, y_test=employees_test_labels):
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    train_confusion_matrix = metrics.confusion_matrix(y_true=y_train, y_pred=y_train_pred)
    test_confusion_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_test_pred)
    
    train_accuracy = metrics.accuracy_score(y_true=y_train, y_pred=y_train_pred)
    test_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)
    
    train_recall_rate = metrics.recall_score(y_true=y_train, y_pred=y_train_pred)
    test_recall_rate = metrics.recall_score(y_true=y_test, y_pred=y_test_pred)
    
    train_fpr, train_tpr, train_thresholds = metrics.roc_curve(y_true=y_train, y_score=model.predict_proba(X_train)[:, 1])
    train_auc = metrics.auc(train_fpr, train_tpr)
    
    test_fpr, test_tpr, test_thresholds = metrics.roc_curve(y_true=y_test, y_score=model.predict_proba(X_test)[:, 1])
    test_auc = metrics.auc(test_fpr, test_tpr)
    
    print(f'Train Confusion Matrix:\n {train_confusion_matrix}\n')
    print(f'Test Confusion Matrix:\n {test_confusion_matrix}\n')
    print(f'Train Accuracy: {train_accuracy}')
    print(f'Test Accuracy: {test_accuracy} \n')
    print(f'Train Recall Score: {train_recall_rate}')
    print(f'Test Recall Score: {test_recall_rate} \n')
    print(f'Train AUC: {train_auc}')
    print(f'Test AUC: {test_auc}')

#### Logistic Regression

In [24]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

log_reg_clf = LogisticRegression()

log_reg_clf.fit(employees_prepared, employees_labels)

In [25]:
evaluate_model(model=log_reg_clf)

Train Confusion Matrix:
 [[959  27]
 [108  82]]

Test Confusion Matrix:
 [[244   3]
 [ 27  20]]

Train Accuracy: 0.8852040816326531
Test Accuracy: 0.8979591836734694 

Train Recall Score: 0.43157894736842106
Test Recall Score: 0.425531914893617 

Train AUC: 0.8642735133980997
Test AUC: 0.8676027220260144


In [26]:
# We have obtained decent accuracy and AUC but the Recall rate is very low

#### Naive Bayes

In [27]:
# Gaussian Naive Bayes

from sklearn.naive_bayes import GaussianNB

gnb_clf = GaussianNB()

gnb_clf.fit(employees_prepared, employees_labels)

In [28]:
evaluate_model(model=gnb_clf)

Train Confusion Matrix:
 [[728 258]
 [ 52 138]]

Test Confusion Matrix:
 [[188  59]
 [ 12  35]]

Train Accuracy: 0.7363945578231292
Test Accuracy: 0.7585034013605442 

Train Recall Score: 0.7263157894736842
Test Recall Score: 0.7446808510638298 

Train AUC: 0.7695793743994876
Test AUC: 0.7946420880351451


# 

In [29]:
# Bernoulli Naive Bayes

from sklearn.naive_bayes import BernoulliNB

bnb_clf = BernoulliNB()

bnb_clf.fit(employees_prepared, employees_labels)

In [30]:
evaluate_model(model=bnb_clf)

Train Confusion Matrix:
 [[874 112]
 [ 82 108]]

Test Confusion Matrix:
 [[224  23]
 [ 23  24]]

Train Accuracy: 0.8350340136054422
Test Accuracy: 0.8435374149659864 

Train Recall Score: 0.5684210526315789
Test Recall Score: 0.5106382978723404 

Train AUC: 0.7912298494715491
Test AUC: 0.8371091394607633


# 

In [31]:
# Decision Tree