# Hyperparameter Optimization

In [None]:
import pickle

import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

### Get data

In [None]:
# Load the training table; the first CSV column is used as the row index.
df = pd.read_csv('../data/housing/housing_train.csv', index_col=0)

# Hold out 20% of the rows for validation (seeded so the split is repeatable).
train, val = train_test_split(df, test_size=0.2, random_state=42)


def _split_features_target(frame):
    """Return (features, target): every column but the last, and the last column."""
    return frame.iloc[:, :-1], frame.iloc[:, -1]


Xtrain, ytrain = _split_features_target(train)
Xval, yval = _split_features_target(val)

### Define a pipeline

In [None]:
# Per-column preprocessing: one-hot encode the categorical 'Street' column and
# forward the two listed numeric columns unchanged. Columns not listed here
# are dropped (ColumnTransformer's default `remainder='drop'`).
column_trans = ColumnTransformer([

    # ('name', transformer, column names)
    # NOTE: `sparse_output` is the current name of the former `sparse`
    # argument (renamed in scikit-learn 1.2, old name removed in 1.4).
    ('cat-to-binary', OneHotEncoder(sparse_output=False), ['Street']),
    ('do nothing', 'passthrough', ['OverallQual', 'YrSold']),

])

# make_pipeline auto-names the steps after their lowercased class names
# ('columntransformer', 'polynomialfeatures', 'randomforestregressor').
pipe = make_pipeline(

    column_trans,
    # Pairwise interaction terms only; no squared terms and no constant column.
    PolynomialFeatures(interaction_only=True, include_bias=False),
    # Seeded so repeated runs of the search give the same scores, matching the
    # seeded train/validation split.
    RandomForestRegressor(random_state=42),
)

In [None]:
# Show the (name, estimator) pairs of the pipeline. These step names are the
# prefixes used when addressing parameters in a parameter grid.
pipe.steps

### Define a parameter grid

In [None]:
# Candidate values for each tuned hyperparameter.
encoder_drop_options = [None, 'first']
forest_size_options = [5, 10, 50, 100]
forest_depth_options = [2, 3, 4]

# Keys follow the pattern: stepname__substepname__parametername
param_grid = {
    'columntransformer__cat-to-binary__drop': encoder_drop_options,
    'randomforestregressor__n_estimators': forest_size_options,
    'randomforestregressor__max_depth': forest_depth_options,
}

### Run the grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Search settings, gathered in one place for readability.
search_options = {
    # GridSearchCV maximizes the score, hence the negated MSE.
    'scoring': 'neg_mean_squared_error',
    'cv': 5,
    'n_jobs': -1,  # -1 : use all available CPU cores
    'return_train_score': True,
}

grid = GridSearchCV(pipe, param_grid, **search_options)

# Fit every parameter combination with cross-validation on the training split.
grid.fit(Xtrain, ytrain)

### Collect the results

In [None]:
# Tabulate the cross-validation results recorded by the search.
res = pd.DataFrame(data=grid.cv_results_)
res.head(n=5)

In [None]:
# List the column names available in the results table.
res.columns

In [None]:
# Show only the tuned parameters alongside the mean train/test scores.
summary_columns = [
    'param_columntransformer__cat-to-binary__drop',
    'param_randomforestregressor__max_depth',
    'param_randomforestregressor__n_estimators',
    'mean_train_score',
    'mean_test_score',
]
res[summary_columns]

In [None]:
# Scores are negated MSE, so higher (closer to zero) is better.
res.plot(
    kind='scatter',
    x='param_randomforestregressor__max_depth',
    y='mean_test_score',
)

### Make a final prediction on the held-out validation set

In [None]:
# Display the pipeline selected by the grid search.
grid.best_estimator_

In [None]:
# Predict on the held-out split with the fitted search object, then report
# the mean absolute error against the true targets.
pred = grid.predict(Xval)
validation_mae = mean_absolute_error(y_true=yval, y_pred=pred)
validation_mae