# Bagging Models

### Import Libraries

In [1]:
!pip install liac-arff

Collecting liac-arff
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
Building wheels for collected packages: liac-arff
  Building wheel for liac-arff (setup.py) ... done
  Created wheel for liac-arff: filename=liac_arff-2.5.0-py3-none-any.whl size=11731 sha256=3c150117f41df3b29112434dc48032a3ef55083290267a50d930376445e4ea67
  Stored in directory: /root/.cache/pip/wheels/1f/0f/15/332ca86cbebf25ddf98518caaf887945fbe1712b97a0f2493b
Successfully built liac-arff
Installing collected packages: liac-arff
Successfully installed liac-arff-2.5.0


In [26]:
import pandas as pd
import numpy as np
import arff
from matplotlib import pyplot as plt
from sklearn import tree
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree, export_text, DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import plot_confusion_matrix, plot_roc_curve, accuracy_score
import requests

### Import Data

In [36]:
# Download the ARFF document. A timeout keeps the cell from hanging forever, and
# raise_for_status() stops here on an HTTP error instead of handing an error page to the parser.
HR_ARFF_URL = 'https://raw.githubusercontent.com/juwon0502/MIS-373-Predictive-Analytics/master/datasets/HR_employee_attrition.arff'
response = requests.get(HR_ARFF_URL, timeout=30)
response.raise_for_status()

# arff.loads parses an ARFF document held in a string; the result is indexed by 'attributes' and 'data'.
hr_arff = arff.loads(response.text)

# Each entry of `meta` is indexed as [0] = attribute name, [1] = attribute type.
meta = hr_arff['attributes']
col_val = [attribute[0] for attribute in meta]
hr_df = pd.DataFrame(hr_arff['data'], columns = col_val)

In [37]:
def clean_df(df):
  """Return a copy of ``df`` in which the strings 'Yes' and 'No' are recoded as 1 and 0.

  Every non-numeric column is scanned; columns that contain neither string are
  left exactly as they are. A column made up only of 'Yes'/'No' comes back with
  a numeric dtype, so a later ``pd.get_dummies`` call does not split it into
  separate indicator columns. The input frame is never modified.

  Args:
    df: DataFrame to recode.

  Returns:
    A new DataFrame with the same columns and index as ``df``.
  """
  yes_no_codes = {'Yes': 1, 'No': 0}
  cleaned = df.copy()
  for col in cleaned.columns:
    column = cleaned[col]
    # Numeric (and boolean) columns cannot hold the strings being recoded.
    if pd.api.types.is_numeric_dtype(column):
      continue
    # Skip untouched columns so their dtype (e.g. category) is preserved.
    if not column.isin(list(yes_no_codes)).any():
      continue
    # Only strings are looked up, so unhashable or missing values pass through unchanged.
    recoded = [yes_no_codes.get(value, value) if isinstance(value, str) else value
               for value in column]
    # Building a fresh Series lets pandas infer the dtype explicitly rather than
    # relying on `replace` downcasting an object column behind the scenes.
    cleaned[col] = pd.Series(recoded, index=column.index)
  return cleaned

# Recode Yes/No values first, then one-hot encode the cleaned frame for modelling.
hr_df = hr_df.pipe(clean_df)
hr_df_dummies = pd.get_dummies(data=hr_df)

### Visualize Attributes

In [15]:
def display_attribute(df, meta, col_name):
  """Draw a bar chart of how ``col_name`` is distributed, overall and for attrition rows.

  Two bar series share the same x positions. The first holds the count of all
  rows per value (legend entry 'No attrition'); the second is drawn over it and
  holds the count of rows where ``Attrition == 1`` (legend entry
  'Yes attrition'). The part of the first bar that stays visible is therefore
  the count of rows without attrition.

  How the x axis is built depends on the attribute type found in ``meta``:
    * a list of labels: one bar per label, matched against the raw column values
      (a column whose values were re-encoded, e.g. Yes/No to 1/0, will show zeros);
    * anything else is treated as numeric: one bar per integer when the range is
      below 12, otherwise half-open bins of width ``range // 8``.

  Args:
    df: DataFrame containing ``col_name`` and an ``Attrition`` column where 1 marks attrition.
    meta: sequence of entries indexed as [0] = attribute name, [1] = attribute type.
    col_name: name of the column to chart.

  Raises:
    ValueError: if ``col_name`` has no entry in ``meta``.
  """
  # Look the attribute up in `meta` itself so the function does not depend on a notebook-level global.
  for attribute in meta:
    if attribute[0] == col_name:
      attribute_type = attribute[1]
      break
  else:
    raise ValueError(f"{col_name!r} is not described in meta")

  values = df[col_name]
  attrition_values = df.loc[df['Attrition'] == 1, col_name]

  labels = []
  total_counts = []
  attrition_counts = []

  if isinstance(attribute_type, list):
    for label in attribute_type:
      labels.append(label)
      total_counts.append(int((values == label).sum()))
      attrition_counts.append(int((attrition_values == label).sum()))
  else:
    min_val = int(values.min())
    max_val = int(values.max())
    value_range = max_val - min_val
    if value_range < 12:
      for x in range(min_val, max_val + 1):
        labels.append(x)
        total_counts.append(int((values == x).sum()))
        attrition_counts.append(int((attrition_values == x).sum()))
    else:
      bin_width = value_range // 8
      # Half-open bins [start, start + width) count every row exactly once and agree
      # with the "start-(start + width - 1)" labels; the range runs through max_val
      # so the largest value always lands in a bin.
      for start in range(min_val, max_val + 1, bin_width):
        stop = start + bin_width
        labels.append(f"{start}-{stop - 1}")
        total_counts.append(int(((values >= start) & (values < stop)).sum()))
        attrition_counts.append(int(((attrition_values >= start) & (attrition_values < stop)).sum()))

  # Bar positions are categorical, so every label is passed as a string (also safe when there are no labels).
  labels = [str(label) for label in labels]
  plt.figure(dpi = 150)
  plt.bar(labels, total_counts, label = 'No attrition')
  plt.bar(labels, attrition_counts, label = 'Yes attrition')
  plt.legend()
  plt.title(f'{col_name} distribution')
  plt.show()

In [None]:
# Set this to any other column name to chart a different attribute.
attribute_to_plot = 'Department'
display_attribute(df=hr_df, meta=meta, col_name=attribute_to_plot)

### Create Bagging Model

In [38]:
# Features are every encoded column except the target; the target is the Attrition column.
X = hr_df_dummies.drop(columns=['Attrition'])
y = hr_df_dummies.Attrition

# Depth-limited entropy tree that serves as the base learner of the ensemble.
clf = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth = 5)
n_estimators = int(input("Number of estimators "))

# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2, and the positional form
# is accepted on both sides of that rename.
bagging_model = BaggingClassifier(clf, n_estimators=n_estimators, random_state=0).fit(X, y)

Number of estimators 15


### Create Random Forest Model

In [39]:
# Same feature/target split as the bagging cell, repeated so this cell can run on its own.
X = hr_df_dummies.drop(columns=['Attrition'])
y = hr_df_dummies.Attrition
max_features = int(input("How many features (between 4-6)? "))

# random_state is fixed (matching the bagging model's random_state=0) so the
# fitted forest and its cross-validation score are the same on every run.
random_forest_model = RandomForestClassifier(criterion = 'entropy', max_features = max_features, random_state = 0).fit(X,y)

How many features (between 4-6)? 5


### Evaluation

In [40]:
# 10-fold cross-validation of the bagging ensemble; the mean fold score is reported later.
bagging_scores = cross_val_score(estimator=bagging_model, X=X, y=y, cv=10)
bagging_accuracy = bagging_scores.mean()
# The fold-to-fold spread is available as bagging_scores.std() if it is needed.

In [41]:
# 10-fold cross-validation of the random forest; the mean fold score is reported later.
rf_scores = cross_val_score(estimator=random_forest_model, X=X, y=y, cv=10)
rf_accuracy = rf_scores.mean()
# The fold-to-fold spread is available as rf_scores.std() if it is needed.

In [54]:
# Single unpruned entropy tree as the baseline. random_state is fixed so the
# baseline score is the same on every run, like the seeded bagging model.
dt_model = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X,y)
dt_scores = cross_val_score(dt_model, X, y, cv=10)
dt_accuracy = dt_scores.mean()
# The fold-to-fold spread is available as dt_scores.std() if it is needed.

In [55]:
# Report the mean cross-validated score of each model, rounded to four decimals.
model_accuracies = (
    ("Bagging", bagging_accuracy),
    ("Random Forest", rf_accuracy),
    ("Decision Tree", dt_accuracy),
)
for model_label, model_accuracy in model_accuracies:
    print(f"{model_label}: {round(model_accuracy,4)}")

Bagging: 0.8558
Random Forest: 0.8578
Decision Tree: 0.7878
