# Small Examples for Literature Surveys
All kinds of stuff

## Bayesian Hyper-Parameter Optimization

In [31]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score
import pandas as pd
import plotly.express as px
import time
import tqdm
import lightgbm as lgb
import numpy as np
from sklearn import pipeline
from hyperopt import hp,STATUS_OK, Trials, fmin, tpe
from sklearn.ensemble import RandomForestClassifier

In [22]:
# Search spaces for the three optimizers. All of them tune the same three
# RandomForestClassifier arguments.

# Exhaustive grid: 4 * 4 * 4 = 64 candidate combinations.
param_gridsearch = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [2, 10, 25, 50],
    'min_samples_split': [2, 5, 7, 9],
}

# Discrete ranges sampled by RandomizedSearchCV.
# Note: range() excludes its upper bound.
param_random = {
    'min_samples_split': range(2, 9),
    'n_estimators': range(50, 500),
    'max_depth': range(2, 50),
}

# Integer-valued hyperopt space. Each label (first argument) is kept identical
# to its dict key: the dict returned by fmin is keyed by label, so a label such
# as 'min_sample_split' could not be passed back to RandomForestClassifier.
param_hyperopt = {
    'n_estimators': hp.uniformint('n_estimators', 50, 500),
    'max_depth': hp.uniformint('max_depth', 2, 50),
    'min_samples_split': hp.uniformint('min_samples_split', 2, 9),
}

EVALS = 25        # evaluations per optimizer; also the number of rows in `results`
CV = 2            # cross-validation folds
TEST_SIZE = 0.5   # fraction of the data held out by train_test_split
N_FEATURES = 250  # number of leading feature columns kept

In [None]:
# Read the data.
# as_frame=False requests NumPy arrays: the positional slicing done later
# (X_train[:, :N_FEATURES]) does not work on a pandas DataFrame, which recent
# scikit-learn versions return by default for this dataset.
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

In [23]:
# Split and cut.
# A fixed random_state keeps the train/test partition identical across kernel
# restarts, so the optimizers below are compared on the same data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=42
)

# Keep only the first N_FEATURES columns to decrease running time.
# np.asarray makes the positional slice valid whether X arrived as a NumPy
# array or as a pandas DataFrame.
X_train = np.asarray(X_train)[:, :N_FEATURES]
X_test = np.asarray(X_test)[:, :N_FEATURES]

# One row per evaluation. The loss columns start as NaN (instead of dummy
# integers that could be mistaken for real losses) and are overwritten by the
# optimizer cells.
results = pd.DataFrame({'bayes': np.nan,
                        'grid': np.nan,
                        'random': np.nan,
                        'iteration': range(EVALS)})

In [34]:
def objective_function(params):
    """Hyperopt objective: cross-validated error of a random forest.

    Args:
        params: keyword arguments forwarded to RandomForestClassifier.

    Returns:
        dict with 'loss' (1 - accuracy of the cross-validated predictions on
        X_train / y_train, using CV folds) and 'status' set to STATUS_OK.
    """
    forest = RandomForestClassifier(**params)
    cv_predictions = cross_val_predict(forest, X_train, y_train, cv=CV)
    error_rate = 1 - accuracy_score(cv_predictions, y_train)
    return {'loss': error_rate, 'status': STATUS_OK}

In [35]:
# Bayesian optimization with hyperopt's TPE suggestion algorithm
tpe_trials = Trials()  # filled by fmin with the result of every evaluation
tpe_algo = tpe.suggest

tpe_best = fmin(
    fn=objective_function,
    space=param_hyperopt,
    algo=tpe_algo,
    max_evals=EVALS,
    trials=tpe_trials,
)

# One loss per trial, taken from the dicts returned by objective_function.
results['bayes'] = [trial_result['loss'] for trial_result in tpe_trials.results]

100%|██████████| 25/25 [10:19<00:00, 24.76s/trial, best loss: 0.25560000000000005]


In [142]:
# Grid
grid_optimizer = GridSearchCV(RandomForestClassifier(),
                              param_gridsearch,
                              scoring='accuracy',
                              cv=CV)
grid_optimizer.fit(X_train, y_train)

# Loss (1 - mean CV accuracy) of every grid candidate, in cv_results_ order.
grid_losses = pd.Series(1 - np.asarray(grid_optimizer.cv_results_['mean_test_score']))

# The grid has one entry per parameter combination (4 * 4 * 4 = 64), while
# `results` has EVALS rows; assigning a plain list of a different length raises
# ValueError. A Series is aligned on the index instead, so `results` receives
# the candidates whose position matches its rows (NaN where none exists).
# The complete set of losses stays available in `grid_losses`.
results['grid'] = grid_losses

In [38]:
# Random search: EVALS parameter settings sampled from param_random
random_optimizer = RandomizedSearchCV(
    RandomForestClassifier(),
    param_random,
    n_iter=EVALS,
    scoring='accuracy',
    cv=CV,
)
random_optimizer.fit(X_train, y_train)

# Convert the mean CV accuracy of each sampled setting into a loss.
random_scores = random_optimizer.cv_results_['mean_test_score']
results['random'] = [1 - score for score in random_scores]

In [44]:
# Plotting
# Long format: one row per (iteration, optimizer). Only the Bayesian and the
# random search are compared here; the grid column is not plotted.
melted_res = pd.melt(results, id_vars="iteration", value_vars=["bayes", "random"])

# The stored values are losses (1 - accuracy); convert back for display.
melted_res['accuracy'] = 1 - melted_res['value']

fig = px.line(melted_res, x="iteration", y="accuracy", color="variable")
# Labels name the quantity actually drawn (accuracy); the previous "MSE"
# labels did not match the plotted data.
fig.update_layout(title="Accuracy per Iteration",
                  yaxis_title="Accuracy",
                  xaxis_title="Iteration")
fig.show()