## Data Preparation with Advanced Pipeline

In [2]:
# Now, Let's prepare the data for ML algorithms

In [3]:
# Importing the necessary libraries

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# To display all the columns in the dataframe

pd.set_option('display.max_columns', None)

In [5]:
# Reading the train and test sets (These were split before EDA)

def read_data(filename):
    """Read one CSV file from the ``dataset`` directory into a DataFrame.

    The file's ``index`` column is used as the row index, and the index name
    is then cleared (set to ``None``).

    Parameters
    ----------
    filename : str
        Name of the CSV file inside the ``dataset`` directory.

    Returns
    -------
    pandas.DataFrame
        The loaded table, indexed by the former ``index`` column.
    """
    csv_path = os.path.join('dataset', filename)
    frame = pd.read_csv(csv_path, index_col='index')
    return frame.rename_axis(None)

In [6]:
# Load the three CSV files with the shared helper; the names on the left
# line up one-to-one with the file names on the right.
df_train, df_test, data = [
    read_data(csv_name)
    for csv_name in ('df_train.csv', 'df_test.csv', 'data.csv')
]

In [7]:
# One (rows, columns) tuple per line, in the order: train, test, data.
print(df_train.shape, df_test.shape, data.shape, sep='\n')

(1176, 35)
(294, 35)
(1176, 45)


In [8]:
# Encoding the target variable in both train and test set

from sklearn.preprocessing import LabelEncoder

# A single encoder is fitted on the training target only and then reused for
# the test target, so both sets share the same label -> integer mapping.
le = LabelEncoder()

train_codes = le.fit_transform(df_train['Attrition'])
test_codes = le.transform(df_test['Attrition'])

# Replace the original target column with its integer codes in both frames.
df_train['Attrition'] = train_codes
df_test['Attrition'] = test_codes

In [9]:
# Separating X and y, i.e. the independent variables and the target variable

# Training set: the target is copied out first, then dropped from the features.
employees_labels = df_train['Attrition'].copy()
employees = df_train.drop(columns='Attrition')

# Test set: same split.
employees_test_labels = df_test['Attrition'].copy()
employees_test = df_test.drop(columns='Attrition')

In [10]:
# Features and labels must have the same number of rows within each split.
print(
    f'{employees.shape} {employees_labels.shape}',
    f'{employees_test.shape} {employees_test_labels.shape}',
    sep='\n',
)

(1176, 34) (1176,)
(294, 34) (294,)


In [54]:
# Let's create a custom transformer class that adds the derived attribute 'monthly_rate_income_diff' (MonthlyRate - MonthlyIncome)

from sklearn.base import BaseEstimator, TransformerMixin

# Zero-based positions of the MonthlyRate and MonthlyIncome columns in the
# feature frame.
# NOTE(review): these are not referenced anywhere in the visible notebook (the
# transformer selects the columns by name) - likely a leftover; confirm before
# removing.
monthly_rate_idx = 18
monthly_income_idx = 17

class AddDerivedAttributes(BaseEstimator, TransformerMixin):
    """Transformer that adds the derived column ``monthly_rate_income_diff``.

    The new column is ``MonthlyRate - MonthlyIncome``, computed row by row.
    The transformer takes no hyper-parameters and learns nothing from the
    data; it expects a pandas DataFrame that contains both source columns.
    """

    # Columns read by ``transform`` and the column it writes.
    _SOURCE_COLUMNS = ('MonthlyRate', 'MonthlyIncome')
    _DERIVED_COLUMN = 'monthly_rate_income_diff'

    def fit(self, X, y=None):
        """Remember the input column names (when X has them) and return ``self``.

        Nothing else is estimated; ``y`` is accepted only for pipeline
        compatibility and is ignored.
        """
        if hasattr(X, 'columns'):
            self.feature_names_in_ = np.asarray(X.columns, dtype=object)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the derived column added.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the ``MonthlyRate`` and ``MonthlyIncome`` columns.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` (the input is not modified) including
            ``monthly_rate_income_diff``.

        Raises
        ------
        KeyError
            If one of the source columns is missing from ``X``.
        """
        missing = [col for col in self._SOURCE_COLUMNS if col not in X.columns]
        if missing:
            # Same exception type as the plain column lookup would raise,
            # but naming every missing column at once.
            raise KeyError(
                f"AddDerivedAttributes requires column(s) {missing} to compute "
                f"'{self._DERIVED_COLUMN}'"
            )

        X_ = X.copy()
        X_[self._DERIVED_COLUMN] = X_['MonthlyRate'] - X_['MonthlyIncome']
        return X_

    def get_feature_names_out(self, input_features=None):
        """Return the output column names, following scikit-learn's protocol.

        Parameters
        ----------
        input_features : sequence of str, optional
            Input column names. Defaults to the names recorded by ``fit``.

        Returns
        -------
        numpy.ndarray of object
            The input names, plus the derived column if it is not already
            among them (``transform`` overwrites an existing column in place
            rather than appending a second one).

        Raises
        ------
        ValueError
            If ``input_features`` is omitted and ``fit`` did not see named
            columns.
        """
        if input_features is None:
            if not hasattr(self, 'feature_names_in_'):
                raise ValueError(
                    'input_features must be provided when the transformer '
                    'was not fitted on a DataFrame'
                )
            input_features = self.feature_names_in_

        names = list(input_features)
        if self._DERIVED_COLUMN not in names:
            names.append(self._DERIVED_COLUMN)
        return np.asarray(names, dtype=object)

In [15]:
# Preview the first five rows of the training features.
employees.head(n=5)

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
941,30,Travel_Rarely,1138,Research & Development,6,3,Technical Degree,1,1311,1,Female,48,2,2,Laboratory Technician,4,Married,4627,23631,0,Y,No,12,3,1,80,1,10,6,3,9,2,6,7
865,30,Travel_Rarely,1329,Sales,29,4,Life Sciences,1,1211,3,Male,61,3,2,Sales Executive,1,Divorced,4115,13192,8,Y,No,19,3,3,80,3,8,3,3,4,3,0,3
18,53,Travel_Rarely,1219,Sales,2,4,Life Sciences,1,23,1,Female,78,2,4,Manager,4,Married,15427,22021,2,Y,No,16,3,3,80,0,31,3,3,25,8,3,7
786,27,Non-Travel,1277,Research & Development,8,5,Life Sciences,1,1094,1,Male,87,1,1,Laboratory Technician,3,Married,4621,5869,1,Y,No,19,3,4,80,3,3,4,3,3,2,1,2
416,38,Travel_Frequently,1490,Research & Development,2,2,Life Sciences,1,556,4,Male,42,3,1,Laboratory Technician,4,Married,1702,12106,1,Y,Yes,23,4,3,80,1,1,3,3,1,0,0,0


In [22]:
# Share of each encoded class in the training target.
class_shares = employees_labels.value_counts(normalize=True)
class_shares

0    0.838435
1    0.161565
Name: Attrition, dtype: float64

In [None]:
# Our target variable is highly imbalanced. We have to balance it at the time of data preparation

In [12]:
# Now let's build a pipeline to clean and prepare the data

In [13]:
# 1. Dropping unnecessary variables from both train and test sets

# Columns regarded as carrying no useful signal.
# NOTE(review): this list is not passed to any pipeline step in the visible
# notebook (only 'EmployeeNumber' is dropped explicitly) - confirm whether it
# is still needed.
unnecessary_vars = [
    'EmployeeCount',
    'EmployeeNumber',
    'Over18',
    'StandardHours',
]

In [55]:
# Importing the necessary libraries for pipeline

from sklearn.pipeline import Pipeline, make_pipeline
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from feature_engine.selection import DropFeatures, DropConstantFeatures, DropDuplicateFeatures
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer

# First draft of the preparation pipeline, built with scikit-learn's Pipeline.
# NOTE(review): only the first five steps of this object are fitted in the
# next cell; the notebook later rebuilds it with imblearn's Pipeline so that
# the SMOTE step is actually applied.
ppl = Pipeline([
    # 1. Add derived features (monthly_rate_income_diff)
    ('add_derived_features', AddDerivedAttributes()),

    # 2. Drop irrelevant features: the explicit identifier column, then
    #    constant columns (tol=1 -> only strictly constant ones) and duplicates
    ('drop_features', DropFeatures(['EmployeeNumber'])),
    ('drop_constant_value_features', DropConstantFeatures(tol=1, missing_values='ignore')),
    ('drop_duplicate_features', DropDuplicateFeatures()),

    # 3. Impute + scale numeric columns, impute + one-hot encode categorical ones.
    #    The numeric selector uses 'number' rather than 'int64': a numeric
    #    column containing NaN is float64, so an int64-only selector would
    #    silently drop exactly the columns the median imputer is meant for
    #    (ColumnTransformer discards unselected columns by default).
    ('cleaning', ColumnTransformer([
        ('num', make_pipeline(SimpleImputer(strategy='median'),
                              StandardScaler()),
         make_column_selector(dtype_include='number')),

        # NOTE: `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and
        # removed in 1.4; switch the keyword when upgrading.
        ('cat', make_pipeline(SimpleImputer(strategy='most_frequent'),
                              OneHotEncoder(sparse=False, handle_unknown='ignore')),
         make_column_selector(dtype_include=object))
    ])),

    # 4. Sampling: handle class imbalance (fixed seed so the synthetic samples
    #    are the same on every run)
    ('smote', SMOTE(random_state=42))
])

In [56]:
# Now, let's see which features make it through the pipeline up to step 3, i.e. cleaning (we are excluding the SMOTE step for now)

# Keep the first five steps (everything before the sampler) and fit them
# on the training data.
ppl_fts = ppl[:5]
ppl_fts.fit(X=employees, y=employees_labels)

In [60]:
# Extracting the output feature names from the fitted pipeline

# The first step (the custom derived-attribute transformer) is skipped when
# asking for the output names; the remaining fitted steps provide them.
named_steps_only = ppl_fts[1:]
features = named_steps_only.get_feature_names_out()

pd.Series(features)

0                                   num__Age
1                             num__DailyRate
2                      num__DistanceFromHome
3                             num__Education
4               num__EnvironmentSatisfaction
5                            num__HourlyRate
6                        num__JobInvolvement
7                              num__JobLevel
8                       num__JobSatisfaction
9                         num__MonthlyIncome
10                          num__MonthlyRate
11                   num__NumCompaniesWorked
12                    num__PercentSalaryHike
13                    num__PerformanceRating
14             num__RelationshipSatisfaction
15                     num__StockOptionLevel
16                    num__TotalWorkingYears
17                num__TrainingTimesLastYear
18                      num__WorkLifeBalance
19                       num__YearsAtCompany
20                   num__YearsInCurrentRole
21              num__YearsSinceLastPromotion
22        

In [None]:
# Meaning of the feature-name prefixes:
# num__ -> numeric feature (imputed and scaled)
# cat__ -> categorical feature (imputed and one-hot encoded)

In [61]:
# Let's change the pipeline from sklearn to imblearn so that SMOTE works in our pipeline

# Same preparation steps, now assembled with imblearn's Pipeline.
ppl = imbPipeline([
    # 1. Add derived features (monthly_rate_income_diff)
    ('add_derived_features', AddDerivedAttributes()),

    # 2. Drop irrelevant features: the explicit identifier column, then
    #    constant columns (tol=1 -> only strictly constant ones) and duplicates
    ('drop_features', DropFeatures(['EmployeeNumber'])),
    ('drop_constant_value_features', DropConstantFeatures(tol=1, missing_values='ignore')),
    ('drop_duplicate_features', DropDuplicateFeatures()),

    # 3. Impute + scale numeric columns, impute + one-hot encode categorical ones.
    #    The numeric selector uses 'number' rather than 'int64': a numeric
    #    column containing NaN is float64, so an int64-only selector would
    #    silently drop exactly the columns the median imputer is meant for
    #    (ColumnTransformer discards unselected columns by default).
    ('cleaning', ColumnTransformer([
        ('num', make_pipeline(SimpleImputer(strategy='median'),
                              StandardScaler()),
         make_column_selector(dtype_include='number')),

        # NOTE: `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and
        # removed in 1.4; switch the keyword when upgrading.
        ('cat', make_pipeline(SimpleImputer(strategy='most_frequent'),
                              OneHotEncoder(sparse=False, handle_unknown='ignore')),
         make_column_selector(dtype_include=object))
    ])),

    # 4. Sampling: handle class imbalance (fixed seed so the synthetic samples
    #    are the same on every run)
    ('smote', SMOTE(random_state=42))
])

In [62]:
# Now let's pass our data into the pipeline so that the classes are also balanced now

# Fit every step, including the sampler, on the training data and keep the
# object returned by fit() under its own name.
ppl_fts_bal = ppl.fit(X=employees, y=employees_labels)

# Display the fitted pipeline.
ppl_fts_bal

In [None]:
# Now the SMOTE is also part of our pipeline

In [64]:
# Let's add Logistic Regression to our pipeline and see how it behaves

from sklearn.linear_model import LogisticRegression

# Full pipeline: preparation, resampling and a logistic-regression classifier.
ppl = imbPipeline([
    # 1. Add derived features (monthly_rate_income_diff)
    ('add_derived_features', AddDerivedAttributes()),

    # 2. Drop irrelevant features: the explicit identifier column, then
    #    constant columns (tol=1 -> only strictly constant ones) and duplicates
    ('drop_features', DropFeatures(['EmployeeNumber'])),
    ('drop_constant_value_features', DropConstantFeatures(tol=1, missing_values='ignore')),
    ('drop_duplicate_features', DropDuplicateFeatures()),

    # 3. Impute + scale numeric columns, impute + one-hot encode categorical ones.
    #    The numeric selector uses 'number' rather than 'int64': a numeric
    #    column containing NaN is float64, so an int64-only selector would
    #    silently drop exactly the columns the median imputer is meant for
    #    (ColumnTransformer discards unselected columns by default).
    ('cleaning', ColumnTransformer([
        ('num', make_pipeline(SimpleImputer(strategy='median'),
                              StandardScaler()),
         make_column_selector(dtype_include='number')),

        # NOTE: `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and
        # removed in 1.4; switch the keyword when upgrading.
        ('cat', make_pipeline(SimpleImputer(strategy='most_frequent'),
                              OneHotEncoder(sparse=False, handle_unknown='ignore')),
         make_column_selector(dtype_include=object))
    ])),

    # 4. Sampling: handle class imbalance (fixed seed so the synthetic samples,
    #    and therefore the fitted model, are the same on every run)
    ('smote', SMOTE(random_state=42)),

    # 5. Logistic Regression model (max_iter raised to 400 iterations)
    ('logistic_regression_model', LogisticRegression(max_iter=400))
])

In [65]:
# Display the (not yet fitted) pipeline definition to check its steps.
ppl

In [66]:
# Fit all steps (preparation, resampling, classifier) on the training data.
log_reg_ppl = ppl.fit(X=employees, y=employees_labels)

In [67]:
# accuracy_score is not imported anywhere else in the visible notebook, so it
# is imported here to keep the cell runnable after Restart & Run All.
from sklearn.metrics import accuracy_score

# NOTE: this is accuracy on the same data the pipeline was fitted on
# (training accuracy), not an estimate of performance on unseen data.
accuracy_score(y_true=employees_labels, y_pred=log_reg_ppl.predict(employees))

0.7772108843537415