# **UGain ML Tree-based Models Exercise Notebook**

## **Import Libraries**

In [None]:
# Install Auto-Sklearn with the %pip magic so the package lands in the environment of the
# running kernel, and pin the version (0.15.0 is the release this notebook was run with)
# so that a later re-run resolves the same package.
%pip install auto-sklearn==0.15.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting auto-sklearn
  Downloading auto-sklearn-0.15.0.tar.gz (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 22.9 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting liac-arff
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
Collecting pynisher<0.7,>=0.6.3
  Downloading pynisher-0.6.4.tar.gz (11 kB)
Collecting distro
  Downloading distro-1.8.0-py3-none-any.whl (20 kB)
Collecting pyrfr<0.9,>=0.8.1
  Downloading pyrfr-0.8.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 44.0 MB/s 
[?25hCollecting smac<1.3,>=1.2
  Downloading smac-1.2.tar.gz (260 kB)
[K     |████████████████████████████████| 260 kB 54.9 MB/s 
Collecting ConfigSpace<0.5,>=0.4.21
  Downloading ConfigSpace-0.4.21-cp

In [None]:
# Auto-Sklearn

# Basic libraries
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", RuntimeWarning)

# Sklearn
## Data
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

## Models
from sklearn import tree
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

## Model Explanation
from sklearn.inspection import permutation_importance

## Metrics
from sklearn.metrics import mean_squared_error

# XGBoost
import xgboost

# Plotting
import graphviz
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display

# Auto-Sklearn
# Import Auto-Sklearn with one retry.
# NOTE(review): presumably the duplicated import exists because the first import right
# after a fresh install in the same session can fail while a second attempt succeeds.
# The retry lives in `except` rather than `finally`: with `finally`, Python re-raises the
# first exception after the retry has run, so the cell would fail even when the retry
# worked. If the retry fails as well, its exception propagates normally.
try:
  import autosklearn.regression
  import autosklearn.metrics
except Exception:
  import autosklearn.regression
  import autosklearn.metrics

## **Load Dataset**

In [None]:
# Fetch the diabetes regression dataset bundled with scikit-learn
diabetes_data = load_diabetes()

# Split the bundle into the feature matrix and the regression target
predictors, labels = diabetes_data.data, diabetes_data.target

# Show the description text shipped with the dataset
print(diabetes_data.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

In [None]:
# Parameters
# One seed, reused below for the train/test split and as random_state of the tree models.
seed = 0

# Train - Test Split
# No test_size/train_size is given, so scikit-learn's default split proportions apply;
# random_state=seed makes the shuffling (and therefore the split) reproducible.
X_train, X_test, y_train, y_test = train_test_split(predictors, 
                                                    labels, 
                                                    random_state=seed)

## **Exercises**

In [None]:
# Utility functions
from sklearn.metrics import make_scorer

def get_rmse(model, predictors, labels):
  """Return the root mean squared error (RMSE) of a fitted model on the given data.

  Args:
    model: Fitted estimator exposing `predict(predictors)`.
    predictors: Feature matrix handed to `model.predict`.
    labels: True target values the predictions are compared against.

  Returns:
    The RMSE between `labels` and the model's predictions.
  """
  predictions = model.predict(predictors)
  # Take the square root of the MSE explicitly instead of passing `squared=False`:
  # that keyword is deprecated and later removed in newer scikit-learn releases, whereas
  # this form works on every version. The two are equivalent for single-target
  # regression, which is what this notebook uses.
  rmse = np.sqrt(mean_squared_error(labels, predictions))
  return rmse

def rmse_loss(true_labels, pred_labels):
  """Return the root mean squared error between true and predicted target values.

  Written with the `(y_true, y_pred)` signature so it can be wrapped by `make_scorer`.

  Args:
    true_labels: Ground-truth target values.
    pred_labels: Predicted target values.

  Returns:
    The RMSE (lower is better).
  """
  # Explicit square root instead of `squared=False`, which newer scikit-learn releases
  # deprecate and then remove; equivalent for single-target regression.
  return np.sqrt(mean_squared_error(true_labels, pred_labels))

# Wrap rmse_loss as a scorer object for GridSearchCV. RMSE is a loss (lower is better), so
# greater_is_better=False is passed; per scikit-learn's make_scorer documentation the
# resulting scorer sign-flips the value, so scores reported by a grid search are negative RMSE.
# Despite its name, this scorer is also used for the ensemble grid search in Exercise 2b.
score_function_decision_tree = make_scorer(rmse_loss, greater_is_better=False)

### **Exercise 1a: Fit Decision Tree (Regression)**

In [None]:
# Create decision tree regressor object
decision_tree_regressor = tree.DecisionTreeRegressor(random_state=seed)

# Fit the training data to the regressor
decision_tree_regressor = , ...# FILL HERE

# Calculate root mean square error of the train and test sets
train_rmse = get_rmse(decision_tree_regressor, ...# FILL HERE)
test_rmse = get_rmse(decision_tree_regressor, ...# FILL HERE)

# Verbose
print("Train set root mean squared error is: {} and test set root mean squared error is: {}".format(round(train_rmse, 4), 
                                                                                                    round(test_rmse, 4)))

### **Exercise 1b: Search for the Best Cost-Complexity Pruning (alpha)**

In [None]:
# Call built-in method to compute the pruning path during Minimal Cost-Complexity Pruning.
# NOTE: this uses the decision_tree_regressor object left by the Exercise 1a cell, so that
# cell has to be run first.
ccp_alphas = decision_tree_regressor.cost_complexity_pruning_path(X_train, y_train).ccp_alphas

# Define parameter space to search
param_grid = {... #FILL HERE (HINT: parameter name is 'ccp_alpha' and ccp_alphas has type numpy.ndarray)
              ,}

# Create decision tree regressor object
# (a fresh, unfitted estimator; GridSearchCV receives it as the model to tune)
decision_tree_regressor = tree.DecisionTreeRegressor(random_state=seed)

# Perform grid search in the defined parameter space with cross validation
# cv=5 requests 5 cross-validation folds; the scorer is the RMSE scorer built with
# greater_is_better=False in the utility cell.
CV_decision_tree_regressor = GridSearchCV(estimator=decision_tree_regressor, 
                                          param_grid=param_grid, 
                                          cv= 5, 
                                          scoring=score_function_decision_tree)
CV_decision_tree_regressor.fit(...# FILL HERE)

# Verbose best parameters from the GridSearchCV
print('Best Parameters:', ...# FILL HERE)

# Fit decision tree regressor model with best parameters
# (the estimator is re-created with the same seed plus the selected parameters)
decision_tree_regressor = tree.DecisionTreeRegressor(random_state=seed,
                                                     ...# FILL HERE)
decision_tree_regressor = decision_tree_regressor.fit(X_train, y_train)

# Calculate root mean square error of the train and test sets
train_rmse = get_rmse(decision_tree_regressor, X_train, y_train)
test_rmse = get_rmse(decision_tree_regressor, X_test, y_test)

# Verbose
print("Train set root mean squared error is: {} and test set root mean squared error is: {}".format(round(train_rmse, 4), 
                                                                                                    round(test_rmse, 4)))

### **Exercise 2a: Fit an Ensemble Model of Your Choice (Regression)**

To find a suitable regression ensemble model, consult the following documentation:
1. https://scikit-learn.org/stable/modules/ensemble.html
2. https://xgboost.readthedocs.io/en/stable/python/python_api.html

In [None]:
# Create an ensemble regressor object
# (any regressor from sklearn.ensemble or xgboost; both are imported at the top of the notebook)
ensemble_regressor = ...# FILL HERE

# Fit the training data to the ensemble regressor
ensemble_regressor = ...# FILL HERE

# Calculate root mean square error of the train and test sets
# get_rmse calls ensemble_regressor.predict, so the object above must be a fitted estimator.
train_rmse = get_rmse(ensemble_regressor, X_train, y_train)
test_rmse = get_rmse(ensemble_regressor, X_test, y_test)
print("Train set root mean squared error is: {} and test set root mean squared error is: {}".format(round(train_rmse, 4), 
                                                                                                    round(test_rmse, 4)))

### **Exercise 2b: Search Hyperparameter Space of your Choice of Model**

In [None]:
# Define parameter space to search
# (keys must be parameter names of the ensemble regressor chosen below)
param_grid = {
  ...# FILL HERE
}

# Create ensemble tree regressor object
ensemble_regressor = ...# FILL HERE

# Perform grid search in the defined parameter space with cross validation
# cv=5 requests 5 cross-validation folds. The scorer is the RMSE scorer from the utility
# cell; its name mentions decision trees, but it is model-agnostic.
CV_ensemble_regressor = GridSearchCV(estimator=ensemble_regressor, 
                                    param_grid=param_grid, 
                                    cv= 5, 
                                    scoring=score_function_decision_tree)
CV_ensemble_regressor.fit(...# FILL HERE)

# Verbose best parameters from the GridSearchCV
print('Best Parameters:', CV_ensemble_regressor.best_params_)

# Fit ensemble regressor model with best parameters
ensemble_regressor = ...# FILL HERE
                              
ensemble_regressor = ensemble_regressor.fit(X_train, y_train)

# Calculate root mean square error of the train and test sets
train_rmse = get_rmse(ensemble_regressor, X_train, y_train)
test_rmse = get_rmse(ensemble_regressor, X_test, y_test)

# Verbose
print("Train set root mean squared error is: {} and test set root mean squared error is: {}".format(round(train_rmse, 4), 
                                                                                                    round(test_rmse, 4)))

### **Exercise 2c: Plot: Ensemble Regressor Feature Importance**

In [None]:
# Perform permutation feature importance using the best ensemble model
# NOTE(review): the plot title and y-axis label below assume the importances are computed
# on the test set with an RMSE-based scorer -- keep them in sync with whatever is filled in here.
permutation_importance_result = ... # FILL HERE

# Extract the mean and standard deviation of the feature importances from the results and create Pandas Dataframe
# Rows are labelled with the dataset's feature names, sorted by mean importance (largest
# first), and only the top 5 features are kept.
forest_importances = pd.DataFrame({"importances" : permutation_importance_result.importances_mean, 
                                   "stdev" : permutation_importance_result.importances_std }, 
                                   index=diabetes_data['feature_names']).sort_values("importances", ascending=False).iloc[:5]

# Plot the feature importances
# Bars show the mean importance; error bars show the standard deviation.
fig, ax = plt.subplots(figsize=(15,8))
forest_importances["importances"].plot.bar(yerr=forest_importances.stdev, ax=ax)
ax.set_title("Feature importances using permutation on test data")
ax.set_ylabel("Mean RMSE decrease")
# The y-axis is forced to start at 0, so any negative importance is not visible.
ax.set_ylim(bottom=0)
plt.show()

### **Exercise 3 (Bonus): Use AutoML to Improve RMSE Metric**

In [None]:
# Define AutoML regression model from Auto-Sklearn
# The search optimizes Auto-Sklearn's root mean squared error metric, with a budget of
# time_left_for_this_task=120 (seconds, per the auto-sklearn documentation).
automl = autosklearn.regression.AutoSklearnRegressor(time_left_for_this_task=120,
                                                     metric=autosklearn.metrics.root_mean_squared_error)

# Fit AutoML regression model 
...# FILL HERE

# Calculate root mean square error of the train and test sets
# (these must be numbers: they are passed to round() below)
train_rmse = ...# FILL HERE
test_rmse = ...# FILL HERE

# Verbose
print("Train set root mean squared error is: {} and test set root mean squared error is: {}".format(round(train_rmse, 4), 
                                                                                                    round(test_rmse, 4)))
# Verbose Final Model Leaderboard from AutoML
print(automl.leaderboard())