<img src="mmu_logo.png" style="height: 80px;" align=left> 

# Learning Objectives

By the end of this lesson, you should be able to:
- save and load a predictive model
- construct a stacking ensemble model
- compare the performance of a stacking ensemble with that of other models
- experiment with GridSearchCV

# Load Libraries

In [1]:
import pandas as pd

from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

from matplotlib import pyplot

import warnings
# Install a global 'ignore' filter: no warning of any category is shown from here on.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below,
# so API changes in pandas / scikit-learn can go unnoticed.
warnings.filterwarnings('ignore')

# Allow pandas to display up to 500 columns before truncating wide frames.
pd.set_option("display.max_columns", 500)

# Save and Load a Model

### Read Dataset

In [None]:
# Load the banking dataset; the relative path is resolved against the working directory.
df = pd.read_csv("banking.csv")
# Preview the first rows as the cell output.
df.head()

### One-Hot Encoding

In [None]:
# your codes here...



### Separate into X, y

In [None]:
# Target: the column named "y".
y = df.y
# Features: every column except the target. The columns are named by keyword because
# pandas 2.0 no longer accepts `axis` as a second positional argument to `drop`.
X = df.drop(columns="y")
# Keep the feature names for later reference.
colnames = X.columns

# Train-Test-Split using 20% test data
# your codes here...
X_train, X_test, y_train, y_test = train_test_split(...)
        

### Train, Save, and Load a Model

In [None]:
# train a Random Forest classifier
# max_depth=3, random_state=10
# your codes here...

# your codes here...

# Report accuracy, recall, precision and F1 on the test split, each as a
# percentage rounded to two decimals. `y_pred` must be produced by the code above.

for template, metric in (
    ("Accuracy = {}", accuracy_score),
    ("Recall = {}", recall_score),
    ("Precision = {}", precision_score),
    ("F1 = {}", f1_score),
):
    print(template.format(round(metric(y_test, y_pred) * 100, 2)))


In [None]:
# save the model - method 1 (using pickle)

import pickle

# your codes here...



In [None]:
# save the model - method 2 (using joblib)

import joblib

# your codes here...



# Stacked Ensemble Modeling

In [None]:
# get a stacking ensemble of models

def get_stacking(random_state=None):
    """Build an unfitted two-level stacking classifier.

    Level 0 (base models): k-nearest neighbours ('knn'), a decision tree
    ('cart'), a random forest ('rf') and Gaussian naive Bayes ('bayes').
    Level 1 (meta-model): Gaussian naive Bayes, combined with the base
    models through ``StackingClassifier`` using 5-fold cross-validation.

    Parameters
    ----------
    random_state : int, RandomState instance or None, default=None
        Seed forwarded to the two randomised base learners (decision tree and
        random forest). The default ``None`` leaves them unseeded, which is
        the behaviour of calling ``get_stacking()`` without arguments.

    Returns
    -------
    StackingClassifier
        The configured, not yet fitted, ensemble.
    """
    # define the base models
    level0 = [
        ('knn', KNeighborsClassifier()),
        ('cart', DecisionTreeClassifier(random_state=random_state)),
        ('rf', RandomForestClassifier(random_state=random_state)),
        ('bayes', GaussianNB()),
    ]

    # define the stacking ensemble
    level1 = GaussianNB()
    return StackingClassifier(estimators=level0, final_estimator=level1, cv=5)

### Preparing the list of models

In [None]:
# Empty dictionary that the exercise code below fills with the models to compare.

models = {}

# your codes here...



### Evaluating the Models

In [None]:
# evaluate a given model using cross-validation

def evaluate_model(model, X, y):
    """Return the cross-validated F1 scores of ``model`` on ``(X, y)``.

    The data is split with a repeated stratified k-fold scheme: 10 folds,
    repeated 3 times, seeded with ``random_state=1``. Scoring runs with
    ``n_jobs=-1``, and ``error_score='raise'`` makes any failure during
    fitting or scoring surface as an exception.
    """
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(
        model,
        X,
        y,
        scoring='f1',
        cv=splitter,
        n_jobs=-1,
        error_score='raise',
    )


In [None]:
# Evaluate the models and store the results.
# Note: this step may take around 5-10 minutes on some machines.

results = []
names = []

# your codes here...

    
# Plot model performance for comparison, based on the F1 scores collected above.
# One box per entry in `results`, labelled with the matching entry in `names`;
# `showmeans=True` adds a marker for each mean.

pyplot.boxplot(results, labels=names, showmeans=True)
# Title and axis labels so the figure can be read on its own.
pyplot.title("Cross-validated F1-score by model")
pyplot.xlabel("Model")
pyplot.ylabel("F1-score")
pyplot.show()

# Model Hyperparameter Tuning

In [None]:
# Instantiate a DecisionTreeClassifier 'dt'.
# random_state is fixed (10) so that repeated runs use the same seed.

dt = DecisionTreeClassifier(random_state=10)

In [None]:
# Print out the hyperparameters of 'dt' and their current values,
# as a reference for choosing what to tune in the grid search.

print(dt.get_params())

In [None]:
# Hyperparameter grid 'params_dt' for the decision tree:
# 3 x 2 x 4 = 24 candidate combinations.

params_dt = {
    'max_depth': [2, 3, 4],
    # float values: scikit-learn interprets these as a fraction of the samples
    'min_samples_leaf': [0.04, 0.06],
    # float values: scikit-learn interprets these as a fraction of the features
    'max_features': [0.2, 0.4, 0.6, 0.8],
}

# Instantiate a 10-fold CV grid search object 'grid_dt', scoring is based on "accuracy"

# your codes here...


# Fit 'grid_dt' to the training data.
# 'grid_dt' must already have been instantiated by the exercise code above.

grid_dt.fit(X_train, y_train)

In [None]:
# Extract the best hyperparameter combination found by 'grid_dt'

best_hyperparams = grid_dt.best_params_

# Label spelling corrected ("hyperparameters").
print('Best hyperparameters:\n', best_hyperparams)

In [None]:
# Extract the best cross-validation score found by 'grid_dt'.
# NOTE(review): the value is an accuracy only if 'grid_dt' was created with
# scoring="accuracy", as the exercise instructions ask — confirm.

best_CV_score = grid_dt.best_score_

print('Best CV accuracy = {}'.format(best_CV_score))

# Exercise on GridSearchCV: 
### Predict Diamond Price using RandomForestRegressor

### Read Dataset

In [None]:
# Load the diamonds dataset. This rebinds `df`, which previously held the banking data.
df = pd.read_csv("diamonds.csv")
# Preview the first rows as the cell output.
df.head()

### One-Hot Encoding

In [None]:
# Names of the object-dtype (text) columns, which are the ones to one-hot encode.
col_list = [col for col in df.columns.tolist() if df[col].dtype.name == "object"]
df_oh = df[col_list]
# Drop the original text columns. The columns are named by keyword because pandas 2.0
# no longer accepts `axis` as a second positional argument to `drop`.
df = df.drop(columns=col_list)
# Expand the text columns into indicator columns and append them to the remaining columns.
df_oh = pd.get_dummies(df_oh)
df = pd.concat([df, df_oh], axis=1)

df.head()

### Separate into X, y

In [None]:
# Target: the diamond price.
y = df.price
# Features: every column except the target. The columns are named by keyword because
# pandas 2.0 no longer accepts `axis` as a second positional argument to `drop`.
X = df.drop(columns="price")
colnames = X.columns

# Hold out 20% of the rows for testing. The split is seeded (same seed value as the
# decision tree earlier in this notebook) so that re-running the cell gives the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
        

### Train a Random Forest Regressor

In [None]:
# Instantiate a random forests regressor 'rf'

# your codes here...

# Inspect rf's hyperparameters

# your codes here...


### Define the hyper-parameters

In [None]:
# Define a grid of hyperparameter 'params_rf'

# your codes here...

# Instantiate 'grid_rf'

# your codes here...



### Train the model

In [None]:
# Fit 'grid_rf' to the training set.
# 'grid_rf' must already have been instantiated by the exercise code above.

grid_rf.fit(X_train, y_train)


### Print out the best hyperparameters


In [None]:
# Extract the best hyperparameter combination found by 'grid_rf'

best_hyperparams = grid_rf.best_params_
# Label spelling corrected ("hyperparameters").
print('Best hyperparameters:\n', best_hyperparams)

### Perform Prediction using the Best Model

In [None]:
# Extract the best model from 'grid_rf'.
# NOTE(review): `best_estimator_` exists only if the grid search refits the best
# configuration (the scikit-learn default) — confirm against how 'grid_rf' is created.

best_model = grid_rf.best_estimator_

# Predict the test set prices (this is a regression task: the target is `df.price`)
y_pred = best_model.predict(X_test)

# Evaluate the test set RMSE and store it in `rmse_test`, which the print below reads

# your codes here...

# Print the test set RMSE, formatted to two decimals

print('Test set RMSE of rfr: {:.2f}'.format(rmse_test))