In [1]:
# Importing the python libraries for model.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss

# Suppress all warnings so they do not clutter the notebook output.
# NOTE(review): this also hides deprecation warnings from pandas/sklearn.
import warnings
warnings.filterwarnings('ignore')

# Step 1 -  Load the Data

In [2]:
# Load the HR employee-attrition CSV into a DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
data = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')

In [5]:
# Preview the first rows to sanity-check the load.
data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [4]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(1470, 35)

In [16]:
# Per-column summary: name, non-null count and dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [35]:
# How many columns there are of each dtype.
data.dtypes.value_counts()

int64     26
object     9
dtype: int64

In [37]:
# Partition the columns by dtype: object columns are treated as categorical,
# int64 columns as numerical. Each part is an independent copy, so later edits
# to df_cat / df_num never touch `data`.
df_cat = data.select_dtypes(include='object').copy()
df_num = data.select_dtypes(include='int64').copy()

print("Categorical Columns", df_cat.shape)
print("Numerical Columns", df_num.shape)

Categorical Columns (1470, 9)
Numerical Columns (1470, 26)


**Observations:-**
- We have HR data on the employees of the companies.
- The dataframe has 1470 rows and 35 columns.
- There are 9 columns that are categorical in nature (dtype `object`) and 26 columns that hold numerical values (dtype `int64`).

In [15]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numerical columns.
data.describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,2.912925,1.0,1024.865306,2.721769,65.891156,2.729932,2.063946,...,2.712245,80.0,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,403.5091,8.106864,1.024165,0.0,602.024335,1.093082,20.329428,0.711561,1.10694,...,1.081209,0.0,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,1.0,491.25,2.0,48.0,2.0,1.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,1.0,1020.5,3.0,66.0,3.0,2.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,1.0,1555.75,4.0,83.75,3.0,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,5.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


#### Let's check the null values present in the data.

In [24]:
# Count the columns in which every value is null.
data.isnull().all(axis = 0).sum()

0

In [25]:
# Count the rows in which every value is null.
data.isnull().all(axis = 1).sum()

0

In [26]:
# Count the columns that contain at least one null value
# (0 means there is no null cell anywhere in the data).
data.isnull().any(axis = 0).sum()

0

**Observations:-**
- There is no column in the data whose values are all null.
- There is no row in the data whose values are all null.
- There are no null values anywhere in the data.

# Step 2 - Feature engineering

Some columns in our data are categorical in nature. For model building we have to convert these categorical values into numerical form. For this we use dummy-variable creation.

Some columns in our data are numerical in nature, but the ranges of their values are too wide. For model building we have to convert these columns into a standardised form. For this we use the standard scaler for data scaling.

#### Dummy column creation

In [38]:
# Display the categorical columns (the target 'Attrition' is still included here).
df_cat

Unnamed: 0,Attrition,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18,OverTime
0,Yes,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Y,Yes
1,No,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Married,Y,No
2,Yes,Travel_Rarely,Research & Development,Other,Male,Laboratory Technician,Single,Y,Yes
3,No,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Married,Y,Yes
4,No,Travel_Rarely,Research & Development,Medical,Male,Laboratory Technician,Married,Y,No
...,...,...,...,...,...,...,...,...,...
1465,No,Travel_Frequently,Research & Development,Medical,Male,Laboratory Technician,Married,Y,No
1466,No,Travel_Rarely,Research & Development,Medical,Male,Healthcare Representative,Married,Y,No
1467,No,Travel_Rarely,Research & Development,Life Sciences,Male,Manufacturing Director,Married,Y,Yes
1468,No,Travel_Frequently,Sales,Medical,Male,Sales Executive,Married,Y,No


Here we have a feature, 'Attrition', which is our target variable. So we have to exclude this feature from the categorical columns (`df_cat`) before creating the dummy values.

In [43]:
# Remove the target column 'Attrition' from the categorical features so it is
# not dummy-encoded together with the predictors.
# Re-assign instead of dropping in place, and ignore an already-missing column,
# so re-running this cell does not raise a KeyError.
df_cat = df_cat.drop(columns='Attrition', errors='ignore')

In [44]:
# Confirm that 'Attrition' is no longer among the categorical features.
df_cat

Unnamed: 0,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18,OverTime
0,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Y,Yes
1,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Married,Y,No
2,Travel_Rarely,Research & Development,Other,Male,Laboratory Technician,Single,Y,Yes
3,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Married,Y,Yes
4,Travel_Rarely,Research & Development,Medical,Male,Laboratory Technician,Married,Y,No
...,...,...,...,...,...,...,...,...
1465,Travel_Frequently,Research & Development,Medical,Male,Laboratory Technician,Married,Y,No
1466,Travel_Rarely,Research & Development,Medical,Male,Healthcare Representative,Married,Y,No
1467,Travel_Rarely,Research & Development,Life Sciences,Male,Manufacturing Director,Married,Y,Yes
1468,Travel_Frequently,Sales,Medical,Male,Sales Executive,Married,Y,No


In [50]:
# Print the frequency table of every categorical column, one column at a time,
# so that constant or heavily imbalanced columns are easy to spot.
for col in df_cat:
    print("Column Name -> ", col)
    print(
        df_cat[col].value_counts(),
        ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>",
        sep="\n",
    )

Column Name ->  BusinessTravel
Travel_Rarely        1043
Travel_Frequently     277
Non-Travel            150
Name: BusinessTravel, dtype: int64
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Column Name ->  Department
Research & Development    961
Sales                     446
Human Resources            63
Name: Department, dtype: int64
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Column Name ->  EducationField
Life Sciences       606
Medical             464
Marketing           159
Technical Degree    132
Other                82
Human Resources      27
Name: EducationField, dtype: int64
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Column Name ->  Gender
Male      882
Female    588
Name: Gender, dtype: int64
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Column Name ->  JobRole
Sales Executive              326
Research Scientist           292
Laboratory Technician        259
Manufacturing Director       145


From the above results we can see that there is one column in our data, 'Over18', which has only one value.
So we are going to delete this column, because it does not help with model building.

In [51]:
# 'Over18' holds a single value for every row, so it carries no signal.
# Re-assign instead of dropping in place, and ignore an already-missing column,
# so re-running this cell does not raise a KeyError.
df_cat = df_cat.drop(columns='Over18', errors='ignore')

In [52]:
# Confirm that 'Over18' is no longer among the categorical features.
df_cat

Unnamed: 0,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,OverTime
0,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Yes
1,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Married,No
2,Travel_Rarely,Research & Development,Other,Male,Laboratory Technician,Single,Yes
3,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Married,Yes
4,Travel_Rarely,Research & Development,Medical,Male,Laboratory Technician,Married,No
...,...,...,...,...,...,...,...
1465,Travel_Frequently,Research & Development,Medical,Male,Laboratory Technician,Married,No
1466,Travel_Rarely,Research & Development,Medical,Male,Healthcare Representative,Married,No
1467,Travel_Rarely,Research & Development,Life Sciences,Male,Manufacturing Director,Married,Yes
1468,Travel_Frequently,Sales,Medical,Male,Sales Executive,Married,No


Now we are going to create the dummy values. Here we use drop_first = True because we don't want redundant columns: the first level of each category is implied when all of its other dummy columns are 0.

In [57]:
# One-hot encode every categorical column. drop_first=True drops the first
# level of each column, because it is implied when all the other levels are 0.
dummie_df = pd.get_dummies(df_cat, drop_first=True)

In [58]:
# Inspect the dummy-encoded categorical features.
dummie_df

Unnamed: 0,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Research & Development,Department_Sales,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree,Gender_Male,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes
0,0,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
1,1,0,1,0,1,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
2,0,1,1,0,0,0,0,1,0,1,...,1,0,0,0,0,0,0,0,1,1
3,1,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,1
4,0,1,1,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,1,0,1,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
1466,0,1,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1467,0,1,1,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,1,0,1
1468,1,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,1,0,1,0,0


**Observations:-**
- After dummy-variable creation we have 21 columns available in our dummy data.

#### Scaling the Numerical Columns

In [61]:
# Inspect the numerical (int64) columns before scaling.
df_num

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1102,1,2,1,1,2,94,3,2,...,1,80,0,8,0,1,6,4,0,5
1,49,279,8,1,1,2,3,61,2,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1373,2,2,1,4,4,92,2,1,...,2,80,0,7,3,3,0,0,0,0
3,33,1392,3,4,1,5,4,56,3,1,...,3,80,0,8,3,3,8,7,3,0
4,27,591,2,1,1,7,1,40,3,1,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,884,23,2,1,2061,3,41,4,2,...,3,80,1,17,3,3,5,2,0,3
1466,39,613,6,1,1,2062,4,42,2,3,...,1,80,1,9,5,3,7,7,1,7
1467,27,155,4,3,1,2064,2,87,4,2,...,2,80,1,6,0,3,6,2,0,3
1468,49,1023,2,3,1,2065,4,63,2,2,...,4,80,0,17,3,2,9,6,0,8


In [62]:
# Use a standard scaler to put the numerical columns on a common scale.
sc = StandardScaler()

In [64]:
# Fit the scaler on the numerical columns and scale them in one step.
# NOTE(review): the scaler is fit on all rows before any train/test split; if a
# split follows (train_test_split is imported above), fitting on the training
# rows only would avoid leaking test-set statistics -- confirm.
# NOTE(review): EmployeeCount and StandardHours are constant (std 0 in
# data.describe()), so they scale to all zeros and add no information.
sc_df = sc.fit_transform(df_num)

In [66]:
# Wrap the scaled values in a DataFrame that re-uses df_num's column names and
# row index, so it lines up row-for-row with the other feature frames.
df_num_scaled = pd.DataFrame(data = sc_df, columns = df_num.columns, index = df_num.index)

In [68]:
# Sanity check: after scaling, means should be ~0 and standard deviations ~1.
df_num_scaled.describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,-4.0179500000000004e-17,7.197568e-17,5.1640480000000005e-17,2.697011e-16,0.0,1.132881e-16,7.05407e-17,1.728021e-16,-6.495182e-18,1.479542e-16,...,-1.910792e-16,0.0,6.600918000000001e-17,-9.236753000000001e-17,-1.106069e-16,-5.165936e-17,-1.941002e-17,6.117555e-17,-4.236974e-17,-5.823006e-17
std,1.00034,1.00034,1.00034,1.00034,0.0,1.00034,1.00034,1.00034,1.00034,1.00034,...,1.00034,0.0,1.00034,1.00034,1.00034,1.00034,1.00034,1.00034,1.00034,1.00034
min,-2.072192,-1.736576,-1.010909,-1.868426,0.0,-1.701283,-1.575686,-1.766079,-2.432006,-0.9614864,...,-1.584178,0.0,-0.9320144,-1.450167,-2.171982,-2.49382,-1.144294,-1.167687,-0.6791457,-1.155935
25%,-0.75817,-0.8366616,-0.8875151,-0.8916883,0.0,-0.88667,-0.6605307,-0.8803615,-1.026167,-0.9614864,...,-0.6589728,0.0,-0.9320144,-0.6787735,-0.6201892,-1.077862,-0.6544537,-0.6154916,-0.6791457,-0.5952272
50%,-0.1011589,-0.001204135,-0.270544,0.08504925,0.0,-0.007253514,0.2546249,0.005355811,0.3796721,-0.05778755,...,0.2662326,0.0,0.2419883,-0.1645114,0.1557071,0.3380962,-0.3278933,-0.3393937,-0.3687153,-0.3148735
75%,0.6653541,0.8788772,0.5932157,1.061787,0.0,0.8821327,1.169781,0.8787715,0.3796721,0.8459113,...,1.191438,0.0,0.2419883,0.4783162,0.1557071,0.3380962,0.3252275,0.7649976,0.2521455,0.8065415
max,2.526886,1.72673,2.444129,2.038524,0.0,1.733302,1.169781,1.678377,1.785511,2.653309,...,1.191438,0.0,2.589994,3.692454,2.483396,1.754054,5.386914,3.802074,3.97731,3.610079


In [69]:
# Build the final feature frame by placing the scaled numerical columns and the
# dummy-encoded categorical columns side by side (axis=1 concatenates columns).
df_final = pd.concat([df_num_scaled, dummie_df], axis=1)

In [70]:
# (rows, total feature columns) of the combined feature frame.
df_final.shape

(1470, 47)

In [76]:
# Take the label column from the original frame; it still holds 'Yes'/'No' strings.
target = data['Attrition']
target

0       Yes
1        No
2       Yes
3        No
4        No
       ... 
1465     No
1466     No
1467     No
1468     No
1469     No
Name: Attrition, Length: 1470, dtype: object

In [77]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
# The lookup table is deliberately not called `map`, which would shadow the
# Python builtin for the rest of the notebook.
attrition_codes = {'Yes': 1, 'No': 0}

# Always encode from the raw column so the cell can be re-run safely
# (encoding an already-encoded `target` would fail on the integer values).
target = data['Attrition'].map(attrition_codes)

# Series.map yields NaN for labels missing from the table; fail loudly instead.
assert target.notna().all(), "unexpected label in 'Attrition' (expected 'Yes'/'No')"
target.shape

(1470,)

In [78]:
# Inspect the encoded target (1 = attrition 'Yes', 0 = 'No').
target

0       1
1       0
2       1
3       0
4       0
       ..
1465    0
1466    0
1467    0
1468    0
1469    0
Name: Attrition, Length: 1470, dtype: int64

In [79]:
# X: feature matrix (scaled numerical + dummy columns); y: 0/1 attrition label.
# Both names refer to the existing objects; no copy is made.
X = df_final
y = target