## Bagging Classifier

In [14]:
import pandas as pd

In [15]:
# Load the IBM HR attrition dataset and preview its first rows.
attrition_csv = "../dataset/ibm-hr-analytics-employee-attrition-performance/WA_Fn-UseC_-HR-Employee-Attrition.csv"
df = pd.read_csv(attrition_csv)
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [16]:
# Binary target: 1 where Attrition is 'Yes', 0 for every other value.
df['Attrition_ind'] = (df['Attrition'] == 'Yes').astype('int64')

In [17]:
# Columns that should not be used as features.
# 'Attrition_ind' is the target indicator built from 'Attrition'; the other four
# were noted by the author as carrying no signal (not changing across observations).
# NOTE(review): in the df.head() preview EmployeeNumber does differ per row, so it
# looks like a row identifier rather than a constant -- confirm.
# NOTE(review): this list is not referenced by the visible cells, which filter
# with `to_ignore_continuous` instead.
to_ignore = [
    'EmployeeNumber',
    'EmployeeCount',
    'Over18',
    'StandardHours',
    'Attrition_ind',
]

In [18]:
# Split column names by dtype: object-typed columns are treated as discrete,
# every other dtype as continuous. Column order is preserved in both lists.
discrete_columns, continuous_columns = [], []
for col_name, col_dtype in zip(df.columns, df.dtypes):
    bucket = discrete_columns if col_dtype == object else continuous_columns
    bucket.append(col_name)

In [19]:
# Show both column groups; sep='' because the label strings carry their own newlines.
print(
    "Discrete: \n", discrete_columns,
    '\n\n',
    "Continuous: \n", continuous_columns,
    sep=''
)

Discrete: 
['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']

Continuous: 
['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'Attrition_ind']


In [20]:
# One-hot encode the categorical feature columns.
# Each entry is (source column in df, prefix for the generated dummy columns).
dummy_specs = [
    ('BusinessTravel', 'busns_trvl'),
    ('Department', 'dept'),
    ('EducationField', 'edufield'),
    ('Gender', 'gender'),
    ('JobRole', 'jobrole'),
    ('MaritalStatus', 'maritalstat'),
    ('OverTime', 'overtime'),
]
(dummy_busnstrvl, dummy_dept, dummy_edufield, dummy_gender,
 dummy_jobrole, dummy_maritstat, dummy_overtime) = [
    pd.get_dummies(df[source_col], prefix=dummy_prefix)
    for source_col, dummy_prefix in dummy_specs
]

In [21]:
# Non-object columns to drop from the continuous feature list;
# 'Attrition_ind' is the target and is appended separately when the model frame is built.
to_ignore_continuous = [
    'EmployeeNumber',
    'EmployeeCount',
    'StandardHours',
    'Attrition_ind',
]

In [22]:
# Keep only the continuous columns that are not on the ignore list (original order preserved).
excluded_columns = set(to_ignore_continuous)
continuous_columns = [col for col in continuous_columns if col not in excluded_columns]

In [23]:
# Display the continuous feature names that remain after filtering.
print(continuous_columns)

['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']


In [24]:
# Continuous features only: all rows, columns restricted to the filtered list.
df_continuous = df.loc[:, continuous_columns]

In [25]:
# Assemble the modelling frame column-wise: dummy-encoded categoricals first,
# then the continuous features, with the target indicator as the last column.
# Multi-collinearity does not create a problem in decision trees as opposed to logistic or linear regression,
# hence there is no need to remove any extra derived categorical dummy variable
model_frames = [
    dummy_busnstrvl,
    dummy_dept,
    dummy_edufield,
    dummy_gender,
    dummy_jobrole,
    dummy_maritstat,
    dummy_overtime,
    df_continuous,
    df['Attrition_ind'],
]
df_new = pd.concat(model_frames, axis=1)

In [26]:
# Target is the indicator column; features are every column except the last one,
# which is where the indicator was placed when df_new was concatenated.
target_column = 'Attrition_ind'
y = df_new[target_column]
X = df_new.iloc[:, :-1]

In [28]:
# Train test split: hold out 30% of the rows for testing, with a fixed seed for reproducibility.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [32]:
# Bagging Classifier
# Fits an ensemble of depth-limited decision trees on bootstrap samples of the
# training data, then reports confusion matrix, accuracy and classification
# report for both the training and the test split.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Base learner: shallow Gini tree; class 1 (attrition) is weighted 0.7 against
# 0.3 for class 0.
dt_fit = DecisionTreeClassifier(criterion='gini', max_depth=5, min_samples_split=2, min_samples_leaf=1, random_state=42,
                               class_weight={0:0.3, 1:0.7})

# The base learner is passed positionally on purpose: the keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional parameter is the base learner in
# every release. max_samples=0.67 with bootstrap=True draws 67% of the training
# rows (with replacement) for each tree; all features are used for every tree.
bag_fit = BaggingClassifier(dt_fit, n_estimators=5000, max_samples=0.67,
                           max_features=1.0, bootstrap=True,
                           bootstrap_features=False, n_jobs=-1, random_state=42)
bag_fit.fit(X_train, y_train)

# Training-set evaluation (kept under its own name so it is not overwritten
# by the test predictions below).
bag_pred_train = bag_fit.predict(X_train)
print("\nBagging - Train Confusion Matrix\n\n", pd.crosstab(y_train, bag_pred_train,
                                                           rownames=['Actuall'], colnames=['Predicted']))
print('\nBagging - Train accuracy', round(accuracy_score(y_train, bag_pred_train), 3))
print('\nBagging - Train Classification Report\n', classification_report(y_train, bag_pred_train))

# Test-set evaluation; `bag_pred` ends up holding the test predictions.
bag_pred = bag_fit.predict(X_test)
print('\n\nBagging - Test Confusion Matrix\n\n', pd.crosstab(y_test, bag_pred,
                                                            rownames=['Actuall'], colnames=['Predicted']))
print('\nBagging - Test accuracy', round(accuracy_score(y_test, bag_pred), 3))
print('\nBagging - Test Classification Report\n', classification_report(y_test, bag_pred))




Bagging - Train Confusion Matrix

 Predicted    0    1
Actuall            
0          846    7
1           66  110

Bagging - Train accuracy 0.929

Bagging - Train Classification Report
              precision    recall  f1-score   support

          0       0.93      0.99      0.96       853
          1       0.94      0.62      0.75       176

avg / total       0.93      0.93      0.92      1029



Bagging - Test Confusion Matrix

 Predicted    0   1
Actuall           
0          372   8
1           49  12

Bagging - Test accuracy 0.871

Bagging - Test Classification Report
              precision    recall  f1-score   support

          0       0.88      0.98      0.93       380
          1       0.60      0.20      0.30        61

avg / total       0.84      0.87      0.84       441



From the confusion matrix for the test data, we can see that the number of false positives (observations of class 0 classified as 1) dropped significantly, to 8, compared with 19 for the single decision tree (DT).

Overall, bagging improves performance over the single decision tree, although recall for the attrition class (1) on the test set remains low at 0.20.

In [None]:
# References and credits to
# Statistics in Machine Learning