# Basic Ensemble

Term 1 2019 - Instructor: Teerapong Leelanupab

Teaching Assistant: Suttida Satjasunsern
***

Again we revisit the dataset of McKeen Sea, a consulting firm. One of McKeen Sea's most important customers has requested an additional consultant, and you've been tasked to find the best one available out of two potential candidates: Aaron and Ben. Fortunately, McKeen Sea has collected some data on the past performance of these consultants, and so you would like to use the data to make a better decision. 

In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import sys
sys.path.append("..")
from ds_utils.sample_data import get_project_data

# Load the consultant project records through the local ds_utils helper,
# then preview the first five rows.
data = get_project_data()
data.head(n=5)

Unnamed: 0,Consultant,Customer,Project,Service,Performance
74,Ben,New,Small,Old,0.555556
81,Ben,New,Small,Old,0.555556
84,Ben,Old,Small,New,0.888889
47,Aaron,Old,Small,Old,1.0
72,Ben,New,Small,Old,0.555556


In [2]:
# Preview the last five rows.
data.tail(n=5)

Unnamed: 0,Consultant,Customer,Project,Service,Performance
26,Aaron,Old,Large,Old,0.666667
18,Aaron,Old,Large,New,0.555556
15,Aaron,Old,Large,New,0.555556
34,Aaron,Old,Large,Old,0.666667
3,Aaron,New,Large,Old,0.222222


In [3]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
data.describe()

Unnamed: 0,Performance
count,96.0
mean,0.555556
std,0.284766
min,0.111111
25%,0.388889
50%,0.555556
75%,0.722222
max,1.0


In [4]:
# Number of records per consultant, to check the two groups are balanced.
data['Consultant'].value_counts()

Aaron    48
Ben      48
Name: Consultant, dtype: int64

## Encoding Categorical Attributes

In [5]:
# Integer-encode the three nominal attributes (Consultant, Customer, Service).
# One LabelEncoder is kept per column so each fitted mapping stays available.
consult_le, cus_le, serv_le = LabelEncoder(), LabelEncoder(), LabelEncoder()
for source_col, label_col, encoder in (
    ('Consultant', 'Consultant_Label', consult_le),
    ('Customer', 'Gen_Label', cus_le),
    ('Service', 'Service_Label', serv_le),
):
    data[label_col] = encoder.fit_transform(data[source_col])

# Project size is an ordinal attribute, so it is mapped by hand to keep
# the order Small < Large.
proj_ord_map = {'Small': 1, 'Large': 2}
data['Project_Label'] = data['Project'].map(proj_ord_map)

In [6]:
# Check the newly added label columns on the first five rows.
data.head(n=5)

Unnamed: 0,Consultant,Customer,Project,Service,Performance,Consultant_Label,Gen_Label,Service_Label,Project_Label
74,Ben,New,Small,Old,0.555556,1,0,1,1
81,Ben,New,Small,Old,0.555556,1,0,1,1
84,Ben,Old,Small,New,0.888889,1,1,0,1
47,Aaron,Old,Small,Old,1.0,0,1,1,1
72,Ben,New,Small,Old,0.555556,1,0,1,1


In [7]:
# Check the newly added label columns on the last five rows.
data.tail(n=5)

Unnamed: 0,Consultant,Customer,Project,Service,Performance,Consultant_Label,Gen_Label,Service_Label,Project_Label
26,Aaron,Old,Large,Old,0.666667,0,1,1,2
18,Aaron,Old,Large,New,0.555556,0,1,0,2
15,Aaron,Old,Large,New,0.555556,0,1,0,2
34,Aaron,Old,Large,Old,0.666667,0,1,1,2
3,Aaron,New,Large,Old,0.222222,0,0,1,2


In [8]:
# The original string columns are no longer needed once encoded; remove them
# in place so only Performance and the numeric label columns remain.
data.drop(columns=['Consultant', 'Customer', 'Project', 'Service'], inplace=True)
data.head(n=5)

Unnamed: 0,Performance,Consultant_Label,Gen_Label,Service_Label,Project_Label
74,0.555556,1,0,1,1
81,0.555556,1,0,1,1
84,0.888889,1,1,0,1
47,1.0,0,1,1,1
72,0.555556,1,0,1,1


In [9]:
# Features: every column except the target.
X = data.drop(columns=['Performance'])
# Target: kept as a one-column DataFrame (note the list selector).
Y = data.loc[:, ['Performance']]
Y

Unnamed: 0,Performance
74,0.555556
81,0.555556
84,0.888889
47,1.000000
72,0.555556
65,0.444444
30,0.666667
60,0.444444
13,0.555556
59,0.444444


### Partition Data
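Partition the data into a combined training + validation portion (70%) and a test portion (30%). In this notebook the 70% portion is used directly for training, and the 30% portion is used for evaluation.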

In [10]:
# Simple partition: 70% of the rows for training (+ validation) and the
# remaining rows for testing. The seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, train_size=0.7, random_state=0
)



## Random Forest Regressor
Build a Random Forest Regressor, an ensemble technique based on bagging that also randomly samples features. Because the target (Performance) is a continuous value, the regressor variant is used; it is built from decision-tree regressors. A simple holdout evaluation is performed: the model is trained on the training set and evaluated on the test set.

In [11]:
# Train a random-forest regressor (an ensemble of bagged decision trees) on
# the training split and evaluate it on the held-out test split.
#
# `criterion` and `max_features` are deliberately left at their defaults:
# for a regressor the defaults mean squared-error splits over all features,
# which is what the former 'mse' / 'auto' values selected. Recent
# scikit-learn releases reject the literal values 'mse' and 'auto', so
# relying on the defaults keeps this cell working on old and new versions.
rf_model = RandomForestRegressor(n_estimators=100, max_depth=50)

# y_train is a one-column DataFrame; flatten it to the 1-D array fit() expects.
rf_model.fit(X_train, y_train.values.ravel())
print(rf_model)

# Score the predictions on the test split.
predicted = rf_model.predict(X_test)
mae = mean_absolute_error(y_test, predicted)
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)
print("MAE = %.3f" % (mae))
print("MSE = %.3f" % (mse))
print("R Squared = %.3f" % (r2))

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=50,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
MAE = 0.001
MSE = 0.000
R Squared = 1.000


## Adaptive Boosting (AdaBoost)
Build an AdaBoost Regressor, another ensemble technique. This simple AdaBoost Regressor uses a DecisionTreeRegressor as its base learner. A simple holdout evaluation is performed: the model is trained on the training set and evaluated on the test set.

In [12]:
# Train an AdaBoost regressor whose weak learner is a decision-tree regressor,
# then evaluate it on the held-out test split.
#
# The base learner is passed positionally because its keyword name differs
# between scikit-learn versions (`base_estimator` in older releases,
# `estimator` in newer ones); the first positional slot is the same in both.
# `criterion` and `max_features` are left at their defaults, which select
# squared-error splits over all features -- the same behaviour as the former
# 'mse' / 'auto' values, which recent releases no longer accept.
model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=50),
                          n_estimators=100)

# y_train is a one-column DataFrame; flatten it to the 1-D array fit() expects.
model.fit(X_train, y_train.values.ravel())
print(model)

# Score the predictions on the test split.
predicted = model.predict(X_test)
mae = mean_absolute_error(y_test, predicted)
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)
print("MAE = %.3f" % (mae))
print("MSE = %.3f" % (mse))
print("R Squared = %.3f" % (r2))

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=50, max_features='auto',
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
         learning_rate=1.0, loss='linear', n_estimators=100,
         random_state=None)
MAE = 0.000
MSE = 0.000
R Squared = 1.000
