# Small Examples for Literture Surveys
All kind of stuff

## Bayesian Hyper-Parameter Optimization

In [176]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import pandas as pd
import plotly.express as px
import time
import tqdm
import lightgbm as lgb
import numpy as np
from sklearn import pipeline
from hyperopt import hp,STATUS_OK, Trials, fmin
from sklearn.ensemble import RandomForestRegressor

In [202]:
param_gridsearch = {
    'n_estimators': [100,200,300,500],#range(50,500,25),
    'max_depth': [2,10,25,50],#range(2,52,5),
    'min_samples_split': [2,5,7,9]#range(2,9)
}

param_random = {
    'min_samples_split': range(2,9),
    'n_estimators': range(50,500),
    'max_depth': range(2,50),
}

param_hyperopt= {
    'n_estimators': hp.uniformint('n_estimators', 50, 500),
    'max_depth': hp.uniformint('max_depth', 2, 50),
    'min_samples_split': hp.uniformint('min_sample_split', 2, 9)
}

EVALS=200

In [207]:
X, y = load_boston(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

results=pd.DataFrame({'bayes': range(EVALS),
                      'grid': range(EVALS),
                      'random': range(EVALS),
                      'iteration': range(EVALS)})

In [208]:
def objective_function(params):
    clf = RandomForestRegressor(**params)
    preds = cross_val_predict(clf, X_train, y_train, cv=5)
    return {'loss': mean_squared_error(preds, y_train), 'status': STATUS_OK}

In [209]:
# Bayesian
tpe_algo = tpe.suggest
tpe_trials = Trials()

tpe_best = fmin(fn=objective_function, 
                space = param_hyperopt, 
                algo=tpe_algo, 
                trials=tpe_trials,
                max_evals=EVALS)
results['bayes'] = [x['loss'] for x in tpe_trials.results]

100%|██████████| 200/200 [11:18<00:00,  3.39s/trial, best loss: 11.701056657283134]


In [142]:
# Grid
grid_optimizer = GridSearchCV(RandomForestRegressor(), 
                              param_gridsearch, 
                              scoring='neg_mean_squared_error', 
                              cv=5)
grid_optimizer.fit(X_train, y_train)
results['grid'] = [-x for x in grid_optimizer.cv_results_['mean_test_score']]

In [210]:
# Random
random_optimizer = RandomizedSearchCV(RandomForestRegressor(), 
                                      param_random, 
                                      scoring='neg_mean_squared_error', 
                                      cv=5, 
                                      n_iter=EVALS)
random_optimizer.fit(X_train, y_train)
results['random'] = [-x for x in random_optimizer.cv_results_['mean_test_score']]

In [211]:
# Plotting
melted_res = pd.melt(results, id_vars="iteration", value_vars=["bayes", "random"])
fig = px.line(melted_res, x="iteration", y="value", color="variable")
fig.update_layout(title="MSE per Iteration", 
                  yaxis_title="MSE",
                  xaxis_title="Iteration")
fig.show()

In [212]:
results.min()

bayes        11.701057
grid          0.000000
random       11.977216
iteration     0.000000
dtype: float64