In [1]:
# It allows you to handle the warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Importing necessary packages
import pandas as pd   #It is used for importing the dataset and managing dataset
import numpy as np    #It is used because it contains mathematical tools

In [3]:
# Importing the data file
data = pd.read_excel('INX_Future_Inc_Employee_Performance_CDS_Project2_Data_V1.8.xls')
data.head()

Unnamed: 0,EmpNumber,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
0,E1001000,32,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,10,3,...,4,10,2,2,10,7,0,8,No,3
1,E1001006,47,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,14,4,...,4,20,2,3,7,7,1,7,No,3
2,E1001007,40,Male,Life Sciences,Married,Sales,Sales Executive,Travel_Frequently,5,4,...,3,20,2,3,18,13,1,12,No,4
3,E1001009,41,Male,Human Resources,Divorced,Human Resources,Manager,Travel_Rarely,10,4,...,2,23,2,2,21,6,12,6,No,3
4,E1001010,60,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,16,4,...,4,10,1,3,2,2,2,2,No,3


# Data types of the features

In [4]:
# To know the types of data for each features
data.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 28 columns):
EmpNumber                       1200 non-null object
Age                             1200 non-null int64
Gender                          1200 non-null object
EducationBackground             1200 non-null object
MaritalStatus                   1200 non-null object
EmpDepartment                   1200 non-null object
EmpJobRole                      1200 non-null object
BusinessTravelFrequency         1200 non-null object
DistanceFromHome                1200 non-null int64
EmpEducationLevel               1200 non-null int64
EmpEnvironmentSatisfaction      1200 non-null int64
EmpHourlyRate                   1200 non-null int64
EmpJobInvolvement               1200 non-null int64
EmpJobLevel                     1200 non-null int64
EmpJobSatisfaction              1200 non-null int64
NumCompaniesWorked              1200 non-null int64
OverTime                        1200 non-null object
E

# Data Cleaning or Data Mugging or Data Wrangling

In [5]:
# Looking for missing data
data.isnull().sum()

EmpNumber                       0
Age                             0
Gender                          0
EducationBackground             0
MaritalStatus                   0
EmpDepartment                   0
EmpJobRole                      0
BusinessTravelFrequency         0
DistanceFromHome                0
EmpEducationLevel               0
EmpEnvironmentSatisfaction      0
EmpHourlyRate                   0
EmpJobInvolvement               0
EmpJobLevel                     0
EmpJobSatisfaction              0
NumCompaniesWorked              0
OverTime                        0
EmpLastSalaryHikePercent        0
EmpRelationshipSatisfaction     0
TotalWorkExperienceInYears      0
TrainingTimesLastYear           0
EmpWorkLifeBalance              0
ExperienceYearsAtThisCompany    0
ExperienceYearsInCurrentRole    0
YearsSinceLastPromotion         0
YearsWithCurrManager            0
Attrition                       0
PerformanceRating               0
dtype: int64

There is no Nan values in the Dataset

# Encoding the Categorical values using LabelEncoder

In [6]:
# Label Encoder is used to convert Categorical values into numerical values
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
data.EmpNumber = enc.fit_transform(data.EmpNumber)
data.Gender = enc.fit_transform(data.Gender)
data.EducationBackground = enc.fit_transform(data.EducationBackground)
data.MaritalStatus = enc.fit_transform(data.MaritalStatus)
data.EmpDepartment = enc.fit_transform(data.EmpDepartment)
data.EmpJobRole = enc.fit_transform(data.EmpJobRole)
data.BusinessTravelFrequency = enc.fit_transform(data.BusinessTravelFrequency)
data.OverTime = enc.fit_transform(data.OverTime)
data.Attrition = enc.fit_transform(data.Attrition)

# Feature selection

### Feature Selection by Recursive Feature Elimination of RandomForestClassifier

In [7]:
#import the Recursive Feature Elimination library(RFE) and RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

In [8]:
# selecting x and y parameters
X = data.iloc[:,0:27]
y = data.PerformanceRating

In [9]:
# Splitting the X_train,X_test,y_train,y_test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=10)

In [10]:
#Fitting the model
data1 = RFE(RandomForestClassifier(),n_features_to_select=19)
data1.fit(X_train,y_train)

RFE(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
  n_features_to_select=19, step=1, verbose=0)

In [11]:
#Using get_support is getting a mask, or integer index, of the features selected
data1.get_support()

array([ True,  True, False, False, False,  True,  True, False,  True,
        True,  True,  True, False, False,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False])

In [12]:
features = X_train.columns[data1.get_support()]
features

Index(['EmpNumber', 'Age', 'EmpDepartment', 'EmpJobRole', 'DistanceFromHome',
       'EmpEducationLevel', 'EmpEnvironmentSatisfaction', 'EmpHourlyRate',
       'EmpJobSatisfaction', 'NumCompaniesWorked', 'EmpLastSalaryHikePercent',
       'EmpRelationshipSatisfaction', 'TotalWorkExperienceInYears',
       'TrainingTimesLastYear', 'EmpWorkLifeBalance',
       'ExperienceYearsAtThisCompany', 'ExperienceYearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

This are the top 19 important features for model

# Feature Scaling

Feature Scaling is a technique of data preprocessing which is applid to independent variables to normalize
the data within particular range.

### Why and Where to apply feature scaling?

The dataset contains features that highly vary in magnitudes and units and range. The algorithm which us Euclidean distance are sensitive to magnitudes. Here feature scaling helps to scales all the features equally.Those algorithms are 1.KMeans 2.KNearestNeighbors 3.Principle Component Analusis (PCA)

We dont have to use feature scaling for Decision Tree, Random Forest and Xgboost. If we perform feature scaling then also it is not going to make such impact.

Techniques to perform feature Scaling:-
1. Normalization :- It is technique to transfrom the data between 0 and 1.
2. Standardization :- It is very effective technique which rescales a feature value. So that it has distribution with 0 mean and variance equals to 1.

In [13]:
# Standardization technique is used
#sc = StandardScaler()
#X_train = sc.fit_transform(X_train)
#X_test = sc.transform(X_test)