# Gradient Boosting

## Setup

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn import set_config
from sklearn.datasets import fetch_openml
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline

from shared import RANDOM_STATE, DIABETES_DATASET_ID, BANKNOTE_DATASET_ID, CREDIT_DATASET_ID, SPAMBASE_DATASET_ID
from shared.utilities import prepare_and_split, create_preprocessor

In [None]:
random_state = RANDOM_STATE

# Results are written to ../results/gradient-boost relative to the notebook.
# The paths are assembled with pathlib so the separator is correct on every OS;
# hard-coded "..\\results\\..." strings only denote a directory on Windows.
RESULTS_DIR = Path("..") / "results" / "gradient-boost"
# Create the directory up front so a long search cannot fail at the final to_csv.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
# Kept as plain strings so any code expecting str paths keeps working.
output_path = str(RESULTS_DIR / "random_search_results.csv")
bayes_output_path = str(RESULTS_DIR / "bayes_search_results.csv")

# Ask scikit-learn transformers to output pandas DataFrames.
set_config(transform_output = "pandas")

In [None]:
# Bounds of the hyperparameter search space, written as "low, high" pairs.
# They are used by both the randomized search and the Bayesian optimisation.
N_ESTIMATORS_MIN, N_ESTIMATORS_MAX = 10, 400
DEPTH_MIN, DEPTH_MAX = 3, 15
SPLIT_SAMPLES_MIN, SPLIT_SAMPLES_MAX = 2, 10
LEAF_SAMPLES_MIN, LEAF_SAMPLES_MAX = 1, 10
# Candidate values for the model's max_features parameter.
SPLIT_TYPE = ['sqrt', 'log2', 0.2, 0.4]

# Execution settings: n_jobs for the randomized search and the
# number of configurations each search evaluates.
N_JOBS, N_ITERS = -1, 100

In [None]:
# Download the four OpenML datasets (as pandas frames), in a fixed order.
(
    diabetes_dataset,
    banknote_authentication_dataset,
    credit_dataset,
    spambase_dataset,
) = (
    fetch_openml(data_id=dataset_id, as_frame=True)
    for dataset_id in (
        DIABETES_DATASET_ID,
        BANKNOTE_DATASET_ID,
        CREDIT_DATASET_ID,
        SPAMBASE_DATASET_ID,
    )
)

In [None]:
# Run the shared preparation helper on each dataset and unpack the
# (features, target) pair it returns.
(
    (diabetes_train_x, diabetes_train_y),
    (banknotes_train_x, banknotes_train_y),
    (credit_train_x, credit_train_y),
    (spambase_train_x, spambase_train_y),
) = (
    prepare_and_split(raw_dataset)
    for raw_dataset in (
        diabetes_dataset,
        banknote_authentication_dataset,
        credit_dataset,
        spambase_dataset,
    )
)

## Randomized Search

In [None]:
# Discrete grids sampled by the randomized search.
# np.arange(lo, hi + 1) enumerates every integer of the inclusive range.
# np.linspace(lo, hi, hi - lo).astype(int) generated one point too few, so the
# truncation to int silently skipped a value (max_depth=14, min_samples_split=9,
# min_samples_leaf=9) and did not match the inclusive integer ranges used by
# the Bayesian search space.
param_distributions = {
    'model__n_estimators': np.arange(N_ESTIMATORS_MIN, N_ESTIMATORS_MAX + 1, 10),
    'model__max_depth': np.arange(DEPTH_MIN, DEPTH_MAX + 1),
    'model__min_samples_split': np.arange(SPLIT_SAMPLES_MIN, SPLIT_SAMPLES_MAX + 1),
    'model__min_samples_leaf': np.arange(LEAF_SAMPLES_MIN, LEAF_SAMPLES_MAX + 1),
    'model__max_features': SPLIT_TYPE
}

# Preprocessing followed by the classifier being tuned. The model is seeded
# because the search's own random_state only controls which candidates are
# sampled; without a model seed, max_features < n_features makes every fit
# draw different feature subsets and the scores are not reproducible.
pipeline = Pipeline([
    ('preprocessing', create_preprocessor()),
    ('model', GradientBoostingClassifier(random_state=RANDOM_STATE))
])

# 5-fold randomized search scored with ROC AUC; error_score='raise' surfaces
# failing candidates immediately instead of recording NaN scores.
randomized_search_CV = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_jobs=N_JOBS,
    n_iter=N_ITERS,
    verbose=1,
    random_state=RANDOM_STATE,
    cv=5,
    scoring='roc_auc',
    error_score='raise'
    )

In [None]:
# (features, target, label) triples iterated by every experiment below.
datasets = [
    (train_x, train_y, dataset_label)
    for dataset_label, train_x, train_y in (
        ("diabetes", diabetes_train_x, diabetes_train_y),
        ("banknotes", banknotes_train_x, banknotes_train_y),
        ("credit", credit_train_x, credit_train_y),
        ("spambase", spambase_train_x, spambase_train_y),
    )
]

In [None]:
# Fit the randomized search once per dataset and keep each run's cv_results_
# table, tagged with the dataset it came from.
random_search_results = []

for x, y, name in datasets:
    randomized_search_CV.fit(x, y)
    random_search_results.append(
        pd.DataFrame(randomized_search_CV.cv_results_).assign(dataset=name)
    )

# Stack the per-dataset tables and persist them for later analysis.
all_results = pd.concat(random_search_results)
all_results.to_csv(output_path, index=False)

In [None]:
def get_average_combination_scores(results_df):
    """Summarise ``mean_test_score`` per hyperparameter combination.

    Rows of ``results_df`` that share the same values in the five
    ``param_model__*`` columns are grouped together. For every group the mean
    and the standard deviation of ``mean_test_score`` are computed and rounded
    to four decimals.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Must contain the five ``param_model__*`` columns listed below and a
        ``mean_test_score`` column.

    Returns
    -------
    pandas.DataFrame
        One row per combination with the parameter columns plus
        ``average_score`` and ``std_between_datasets``, ordered from the
        highest to the lowest ``average_score``.
    """
    combination_columns = [
        'param_model__n_estimators',
        'param_model__min_samples_split',
        'param_model__min_samples_leaf',
        'param_model__max_features',
        'param_model__max_depth',
    ]

    per_combination = (
        results_df
        .groupby(combination_columns)['mean_test_score']
        .agg(average_score='mean', std_between_datasets='std')
        .round(4)
        .reset_index()
    )

    return per_combination.sort_values('average_score', ascending=False)


In [None]:
def get_best_hyperparams(results_df):
    """Return the hyperparameters of the row with the highest ``average_score``.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Table with an ``average_score`` column and the five
        ``param_model__*`` columns read below.

    Returns
    -------
    dict
        The model parameter names (without the ``param_model__`` prefix)
        mapped to the values of the top-scoring row, plus that row's
        ``average_score``.
    """
    # (key in the returned dict, column it is read from)
    key_to_column = (
        ('n_estimators', 'param_model__n_estimators'),
        ('min_samples_split', 'param_model__min_samples_split'),
        ('min_samples_leaf', 'param_model__min_samples_leaf'),
        ('max_features', 'param_model__max_features'),
        ('max_depth', 'param_model__max_depth'),
        ('average_score', 'average_score'),
    )

    ranked = results_df.sort_values('average_score', ascending=False)
    top_row = ranked.iloc[0]

    return {key: top_row[column] for key, column in key_to_column}

In [None]:
# Collapse the per-dataset search results into one row per hyperparameter
# combination (sorted best first) and show the top five.
results_with_avg_scores = get_average_combination_scores(results_df=all_results)
print("Best hyperparameters:")
results_with_avg_scores.iloc[:5]

In [None]:
# The table is sorted by average_score in descending order, so its last five
# rows are the lowest-scoring combinations.
print("Worst hyperparameters:")
results_with_avg_scores.iloc[-5:]

In [None]:
# Best mean CV score reached by the randomized search on each dataset.
all_results.groupby('dataset')[['mean_test_score']].max()

## Default parameters score

In [None]:
# Baseline: 5-fold cross-validated score of the untuned pipeline on each dataset.
# ROC AUC is used so the numbers can be compared directly with the randomized
# and Bayesian searches, which both optimise 'roc_auc'. Scoring the baseline
# with accuracy put it on a different scale than the tuned results.
scoring = 'roc_auc'
results = []
for x, y, name in datasets:
    fold_scores = cross_validate(pipeline, x, y, cv=5, scoring=scoring)['test_score']
    results.append({'dataset': name, 'score': fold_scores.mean()})
pd.DataFrame(results)

## Bayes Optimization

In [None]:
from smac.scenario import Scenario
from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformIntegerHyperparameter, CategoricalHyperparameter
from smac.facade.hyperparameter_optimization_facade import HyperparameterOptimizationFacade
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np

In [None]:
# One record per (configuration, dataset) evaluation made by objective_function.
bayes_results_log = []


def objective_function(config, seed = RANDOM_STATE):
    """Cost of one hyperparameter configuration for the Bayesian optimiser.

    The configuration is evaluated on every entry of ``datasets`` with 5-fold
    cross-validated ROC AUC. Each per-dataset score is appended to
    ``bayes_results_log`` together with the hyperparameters that produced it.

    Parameters
    ----------
    config : mapping
        Provides ``n_estimators``, ``max_depth``, ``min_samples_split``,
        ``min_samples_leaf`` and ``max_features``.
    seed : int
        Random state given to the classifier. It was previously ignored, which
        left the model unseeded and the cost noisy even though the scenario is
        declared deterministic.

    Returns
    -------
    float
        ``1 - mean ROC AUC`` over the datasets, so a lower value is better.
    """
    # Key order matters: it defines the column order of the logged records.
    hyperparameters = {
        param_name: config[param_name]
        for param_name in (
            'n_estimators',
            'max_depth',
            'min_samples_split',
            'min_samples_leaf',
            'max_features',
        )
    }

    dataset_scores = []
    for x, y, name in datasets:
        # A fresh pipeline per dataset so no fitted state is shared between runs.
        pipeline = Pipeline([
            ('preprocessing', create_preprocessor()),
            ('model', GradientBoostingClassifier(random_state=seed, **hyperparameters))
        ])

        score = cross_val_score(pipeline, x, y, cv=5, scoring='roc_auc').mean()
        dataset_scores.append(score)

        bayes_results_log.append({**hyperparameters, 'score': score, 'dataset': name})

    return 1 - np.mean(dataset_scores)

In [None]:
# Search space for the Bayesian optimisation; it uses the same bounds as the
# randomized search.
cs = ConfigurationSpace()

n_estimators = CategoricalHyperparameter("n_estimators", np.arange(N_ESTIMATORS_MIN, N_ESTIMATORS_MAX + 1, 10).tolist())
max_depth = UniformIntegerHyperparameter("max_depth", DEPTH_MIN, DEPTH_MAX, default_value=10)
min_samples_split = UniformIntegerHyperparameter("min_samples_split", SPLIT_SAMPLES_MIN, SPLIT_SAMPLES_MAX, default_value=2)
min_samples_leaf = UniformIntegerHyperparameter("min_samples_leaf", LEAF_SAMPLES_MIN, LEAF_SAMPLES_MAX, default_value=1)
max_features = CategoricalHyperparameter("max_features", SPLIT_TYPE, default_value="sqrt")

cs.add([n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features])

default_cfg = cs.get_default_configuration()
scenario = Scenario(cs, deterministic=True, n_trials=N_ITERS)

# Make the cell safe to re-run. The log lives in another cell, so without
# clearing it a second run would append duplicate records. overwrite=True
# makes SMAC start from scratch instead of continuing a finished run found in
# its output directory, which would skip the objective and leave the log empty.
bayes_results_log.clear()
smac = HyperparameterOptimizationFacade(scenario, objective_function, overwrite=True)
bayes_best_hiperparameters = smac.optimize()

pd.DataFrame(bayes_results_log).to_csv(bayes_output_path, index=False)

In [None]:
# Best ROC AUC reached on each dataset during the Bayesian optimisation.
pd.DataFrame(bayes_results_log).groupby('dataset')[['score']].max()