# <center>IEE 520: Fall 2019</center>

# <center> Ensembles (11/05/19)</center>

## <center>Klim Drobnyh (klim.drobnyh@asu.edu)</center>

In [None]:
# For compatibility with Python 2
from __future__ import print_function

# To load datasets
from sklearn import datasets

# To import decision tree
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# To import random forest
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# To import adaboost
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor

# To import bagging
from sklearn.ensemble import BaggingClassifier, BaggingRegressor

# To display a tree
from sklearn.tree import plot_tree

# To measure accuracy
from sklearn import metrics

from sklearn.model_selection import GridSearchCV, cross_validate

# To support plots
import matplotlib.pyplot as plt

import seaborn as sn

import numpy as np
import pandas as pd

import math

# To display all the plots inline
%matplotlib inline

In [None]:
# Use a large default figure size (30 x 10 inches) for every plot below
plt.rcParams.update({"figure.figsize": (30, 10)})

In [None]:
# To import the scalers
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Binarizer


class DummyScaler:
    """Pass-through scaler: exposes the fit/transform interface but leaves data untouched.

    Useful as the "no scaling" option wherever a scaler object is expected.
    """

    def fit(self, data, y=None):
        """Do nothing and return ``self`` so calls can be chained.

        Returning ``self`` (instead of ``None``) makes
        ``scaler.fit(X).transform(X)`` work.

        Args:
            data: Training data; ignored.
            y: Optional targets; ignored.

        Returns:
            This scaler instance.
        """
        return self

    def transform(self, data):
        """Return ``data`` unchanged (the very same object, not a copy)."""
        return data

    def fit_transform(self, data, y=None):
        """Fit (a no-op) and return ``data`` unchanged."""
        return self.fit(data, y).transform(data)

def create_scaler_dummy():
    """Factory for the pass-through scaler: returns a new ``DummyScaler``."""
    scaler = DummyScaler()
    return scaler
    
def create_scaler_standard():
    """Factory returning a new ``StandardScaler`` built with default arguments."""
    scaler = StandardScaler()
    return scaler

def create_scaler_minmax():
    """Factory returning a new ``MinMaxScaler`` built with default arguments."""
    scaler = MinMaxScaler()
    return scaler

def create_scaler_binarizer():
    """Factory returning a new ``Binarizer`` built with default arguments."""
    return Binarizer()


# Backward-compatible alias: the factory was originally published under this
# misspelled name, so keep it working for any existing callers.
crete_scaler_binarizer = create_scaler_binarizer

## <center>Classification</center>

### <center>The dataset</center>

Here we will use the Pen-Based Recognition of Handwritten Digits data set.

This is quite an old dataset (1998); it contains features derived from the pen trajectories of handwritten digits (0–9) written by 44 subjects.

You can find more information about it here:
https://archive.ics.uci.edu/ml/datasets/Pen-Based+Recognition+of+Handwritten+Digits

In [None]:
import requests
import os

def download_file(url, timeout=60):
    """Download ``url`` into the current working directory unless it is already there.

    The local file name is the last path component of ``url``. If a file with
    that name already exists, nothing is downloaded and the existing file is
    reused.

    Args:
        url: Address of the file to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The local file name (relative to the current working directory).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    filename = os.path.basename(url)
    if not os.path.exists(filename):
        response = requests.get(url, timeout=timeout)
        # Fail loudly on HTTP errors so an error page is never cached as data.
        response.raise_for_status()
        # The context manager makes sure the file is flushed and closed.
        with open(filename, 'wb') as handle:
            handle.write(response.content)
    return filename


from sklearn.datasets import load_svmlight_file
# Locations of the pendigits training and test files
train_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/pendigits'
test_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/pendigits.t'

# Training split: download (or reuse the local copy), parse, cast labels to
# int32 and convert the feature matrix to a dense array
X_train, y_train = load_svmlight_file(download_file(train_url), dtype=np.int32)
y_train = y_train.astype(np.int32)
X_train = X_train.toarray()

# Test split: same steps
X_test, y_test = load_svmlight_file(download_file(test_url), dtype=np.int32)
y_test = y_test.astype(np.int32)
X_test = X_test.toarray()

# Histogram of the training labels
plt.hist(y_train)
plt.show()

In [None]:
# Show the dimensions of the training matrix, then of the test matrix
for split in (X_train, X_test):
    print(split.shape)

### <center>Decision Tree Classifier</center>

In [None]:
# Hyper-parameter candidates for the single decision tree
tree_search_space = {
    "max_depth": list(range(1, 40, 2)),
    "min_samples_split": list(range(2, 5, 2)),
}

# 5-fold cross-validated grid search over the candidates above
model_tree = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=520),
    param_grid=tree_search_space,
    cv=5,
)
model_tree.fit(X_train, y_train)
print('The parameters found by CV search:')
print(model_tree.best_params_)

# Evaluate on the held-out test split
y_test_hat = model_tree.predict(X_test)
print('Accuracy:', metrics.accuracy_score(y_test, y_test_hat))

# Confusion matrix drawn as an annotated heatmap
cm = metrics.confusion_matrix(y_test, y_test_hat)
ax = sn.heatmap(cm, annot=True, fmt='g', square=True)
ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix')
plt.show()

### <center>Random Forest Classifier</center>

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
# Reuse the depth / split settings that the grid search selected for the single tree
best_tree_params = model_tree.best_params_
model_forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=best_tree_params['max_depth'],
    min_samples_split=best_tree_params['min_samples_split'],
    random_state=520,
    n_jobs=-1,
)
model_forest.fit(X_train, y_train)

# Evaluate on the held-out test split
y_test_hat = model_forest.predict(X_test)
print('Accuracy:', metrics.accuracy_score(y_test, y_test_hat))

# Confusion matrix drawn as an annotated heatmap
cm = metrics.confusion_matrix(y_test, y_test_hat)
ax = sn.heatmap(cm, annot=True, fmt='g', square=True)
ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix')
plt.show()

### <center>Ada Boost Classifier</center>

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html

In [None]:
# Boost decision trees configured with the settings selected by the grid search.
# The ensemble itself is seeded too (not only the base tree) so that repeated
# runs give the same result, like the other seeded models in this notebook.
model_adaboost = AdaBoostClassifier(DecisionTreeClassifier(random_state=520,
                                                           max_depth=model_tree.best_params_['max_depth'],
                                                           min_samples_split=model_tree.best_params_['min_samples_split']),
                                    random_state=520)
model_adaboost.fit(X_train, y_train)

# Evaluate on the held-out test split
y_test_hat = model_adaboost.predict(X_test)

print('Accuracy:', metrics.accuracy_score(y_test, y_test_hat))

# Confusion matrix drawn as an annotated heatmap
cm = metrics.confusion_matrix(y_test, y_test_hat)
ax = sn.heatmap(cm, annot=True, fmt='g', square=True)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

### <center>Bagging Classifier</center>

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html

In [None]:
# Bag decision trees configured with the settings selected by the grid search.
# Each tree is trained on a random 60% of the samples and 60% of the features,
# drawn without replacement, so the ensemble needs its own seed to be
# reproducible from run to run.
model_bagging = BaggingClassifier(DecisionTreeClassifier(random_state=520,
                                                         max_depth=model_tree.best_params_['max_depth'],
                                                         min_samples_split=model_tree.best_params_['min_samples_split']),
                                  bootstrap=False,
                                  max_samples=0.6,
                                  bootstrap_features=False,
                                  max_features=0.6,
                                  n_estimators=100,
                                  random_state=520)
model_bagging.fit(X_train, y_train)

# Evaluate on the held-out test split
y_test_hat = model_bagging.predict(X_test)

print('Accuracy:', metrics.accuracy_score(y_test, y_test_hat))

# Confusion matrix drawn as an annotated heatmap
cm = metrics.confusion_matrix(y_test, y_test_hat)
ax = sn.heatmap(cm, annot=True, fmt='g', square=True)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

## <center>Regression</center>

Let's consider the following dataset:
http://staff.pubhealth.ku.dk/~tag/Teaching/share/data/Bodyfat.html

The data contain estimates of the percentage of body fat determined by underwater weighing and various body circumference measurements for 252 men.

Accurate measurement of body fat is inconvenient/costly and it is desirable to have easy methods of estimating body fat that are not inconvenient/costly.

Thanks to StatLib and Roger W. Johnson who contributed this data set.

The variables in the data set are:

* Density determined from underwater weighing
* Percent body fat from Siri's (1956) equation
* Age (years)
* Weight (lbs)
* Height (inches)
* Neck circumference (cm)
* Chest circumference (cm)
* Abdomen 2 circumference (cm)
* Hip circumference (cm)
* Thigh circumference (cm)
* Knee circumference (cm)
* Ankle circumference (cm)
* Biceps (extended) circumference (cm)
* Forearm circumference (cm)
* Wrist circumference (cm)

In [None]:
# Read the body-fat table straight from its published CSV location
bodyfat_url = 'http://staff.pubhealth.ku.dk/~tag/Teaching/share/data/Bodyfat.csv'
data = pd.read_csv(bodyfat_url)

In [None]:
# Preview the first five rows of the table
print(data.head(n=5))

Let's consider a realistic model: the input contains all the measurements, and the output is the body fat percentage.

In [None]:
# Work on the raw array behind the DataFrame
vals = data.values
# Target: column 1 (presumably body fat, going by the variable list; confirm with data.head())
y_complex = vals[:, 1]
# Inputs: every column from index 2 onwards; column 0 is left out on purpose
X_complex = vals[:, 2:]

In [None]:
def rmse_loss(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    mse = metrics.mean_squared_error(y_true, y_pred)
    return math.sqrt(mse)

# greater_is_better=True keeps the scorer from flipping the sign, so the
# cross-validation scores below are reported as plain (positive) RMSE values.
# NOTE: this makes the scorer suitable for reporting only; a search that
# maximises it would favour the models with the largest error.
rmse_score = metrics.make_scorer(score_func=rmse_loss, greater_is_better=True)

In [None]:
# 5-fold cross-validated grid search over the tree depth and the minimum split size.
# `iid` is deliberately not passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24, where passing it raises a TypeError.
model_tree = GridSearchCV(DecisionTreeRegressor(random_state=520, max_features='sqrt'),
                          cv=5,
                          param_grid={
                             "max_depth": list(range(1, 40, 2)),
                             "min_samples_split": list(range(2, 5, 2)),
                          })
model_tree.fit(X_complex, y_complex)
print('The parameters found by CV search:')
print(model_tree.best_params_)

# The whole search is cross-validated (10 outer folds), so the grid search is
# repeated inside every outer training fold.
test_score = cross_validate(model_tree, X_complex, y_complex, cv=10, scoring=rmse_score)['test_score']

print('RMSE:', np.mean(test_score))

# Predictions on the same data the model was fitted on (not an out-of-sample check)
y_complex_hat = model_tree.predict(X_complex)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.scatter(y_complex, y_complex_hat)
plt.show()

### <center>Random Forest Regressor</center>

In [None]:
# Reuse the depth / split settings that the grid search selected for the single tree
best_tree_params = model_tree.best_params_
model_forest = RandomForestRegressor(
    n_estimators=100,
    max_depth=best_tree_params['max_depth'],
    min_samples_split=best_tree_params['min_samples_split'],
    max_features='sqrt',
    random_state=520,
    n_jobs=-1,
)
model_forest.fit(X_complex, y_complex)

# 10-fold cross-validated RMSE
cv_results = cross_validate(model_forest, X_complex, y_complex, cv=10, scoring=rmse_score)
test_score = cv_results['test_score']
print('RMSE:', np.mean(test_score))

# Predictions on the same data the model was fitted on (not an out-of-sample check)
y_complex_hat = model_forest.predict(X_complex)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.scatter(y_complex, y_complex_hat)
plt.show()

In [None]:
# Tune the forest itself with a 5-fold cross-validated grid search.
# `iid` is deliberately not passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24, where passing it raises a TypeError.
model_forest = GridSearchCV(RandomForestRegressor(random_state=598, max_features='sqrt'),
                            cv=5,
                            n_jobs=-1,
                            param_grid={
                                'n_estimators': [300, 400, 500],
                                'max_depth': [7, 8, 9],
                                'min_samples_split': [4, 6, 8]
                            })
model_forest.fit(X_complex, y_complex)
print('The parameters found by CV search:')
print(model_forest.best_params_)

# Only the selected configuration is cross-validated here (10 folds); the
# grid search is not repeated inside the folds.
test_score = cross_validate(model_forest.best_estimator_, X_complex, y_complex, cv=10, scoring=rmse_score)['test_score']

print('RMSE:', np.mean(test_score))

# Predictions on the same data the model was fitted on (not an out-of-sample check)
y_complex_hat = model_forest.predict(X_complex)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.scatter(y_complex, y_complex_hat)
plt.show()

### <center>AdaBoost Regressor</center>

In [None]:
from sklearn.ensemble import AdaBoostRegressor
# Boost regression trees configured with the settings selected by the grid search.
# The ensemble itself is seeded too (not only the base tree) so that repeated
# runs give the same result, like the other seeded models in this notebook.
model_adaboost = AdaBoostRegressor(DecisionTreeRegressor(random_state=520,
                                                         max_depth=model_tree.best_params_['max_depth'],
                                                         min_samples_split=model_tree.best_params_['min_samples_split'],
                                                         max_features='sqrt'),
                                   random_state=520)
model_adaboost.fit(X_complex, y_complex)

# 10-fold cross-validated RMSE
test_score = cross_validate(model_adaboost, X_complex, y_complex, cv=10, scoring=rmse_score)['test_score']

print('RMSE:', np.mean(test_score))

# Predictions on the same data the model was fitted on (not an out-of-sample check)
y_complex_hat = model_adaboost.predict(X_complex)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.scatter(y_complex, y_complex_hat)
plt.show()

### <center>Bagging Regressor</center>

In [None]:
# Bag regression trees configured with the settings selected by the grid search.
# Each tree is trained on a random 60% of the samples and 60% of the features,
# drawn without replacement, so the ensemble needs its own seed to be
# reproducible from run to run.
model_bagging = BaggingRegressor(DecisionTreeRegressor(random_state=520,
                                                       max_depth=model_tree.best_params_['max_depth'],
                                                       min_samples_split=model_tree.best_params_['min_samples_split'],
                                                       max_features='sqrt'),
                                 bootstrap=False,
                                 max_samples=0.6,
                                 bootstrap_features=False,
                                 max_features=0.6,
                                 n_estimators=100,
                                 random_state=520)
model_bagging.fit(X_complex, y_complex)

# 10-fold cross-validated RMSE
test_score = cross_validate(model_bagging, X_complex, y_complex, cv=10, scoring=rmse_score)['test_score']

print('RMSE:', np.mean(test_score))

# Predictions on the same data the model was fitted on (not an out-of-sample check)
y_complex_hat = model_bagging.predict(X_complex)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.scatter(y_complex, y_complex_hat)
plt.show()