# Find best hyperparameter settings
We deduce the 'best' hyperparameter settings of an algorithm from performance data of 100 OpenML datasets.

In [63]:
from hyperimp import settings
from scipy.io import arff
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns

In [64]:
# load data: one ARFF file of meta-data per algorithm
# (the first element of loadarff's result is what goes into the DataFrame)
arff_paths = [
    ('svm', 'data/meta_svm.arff'),
    ('rf', 'data/meta_random_forest.arff'),
    ('ada', 'data/meta_adaboost.arff'),
]
frames = {}
for key, path in arff_paths:
    data = arff.loadarff(path)
    frames[key] = pd.DataFrame(data[0])
svm, rf, ada = frames['svm'], frames['rf'], frames['ada']

# split the svm data into one frame per kernel
g = svm.groupby('kernel')
kernels = list(g.groups)

# map algorithm name -> dataframe; the kernel labels are bytes, so they are
# decoded before being used in the 'svm_<kernel>' names
alg_names = ['svm_' + kernel.decode('UTF-8') for kernel in kernels] + ['rf', 'ada']
dfs_all = [g.get_group(kernel) for kernel in kernels] + [rf, ada]
dfs = dict(zip(alg_names, dfs_all))

# initialize Alg objects
algs = settings.init_algs('data/parameters.csv', dfs)

In [78]:
# find 'best' settings for each algorithm
# n and m are forwarded unchanged to settings.find_settings; their meaning is
# defined there and not visible in this notebook — NOTE(review): confirm and
# document what they control.
n = 10
m = 40

# algorithm name -> result of find_settings; each result is iterated below as
# a mapping of parameter name -> 3-tuple
best_settings = {alg.name: settings.find_settings(alg, n, m) for alg in algs}

# Each tuple is ordered (lower, upper, average): index 0 is the lower bound and
# index 1 the upper bound, which is how the range check further down reads it.
print("Best parameter settings (lower, upper, average):")
for alg_name, params in best_settings.items():
    print(alg_name)
    for param in params.items():
        # prints the (parameter name, (lower, upper, average)) pair as-is
        print(param)
    print('')

Best parameter settings (upper, lower, average):
ada
('learning_rate', (0.9403636048583641, 1.3690599802270496, 1.1346427535897055))
('max_depth', (10.0, 10.0, 10.0))
('n_estimators', (347.0, 347.0, 347.0))

rf
('max_features', (0.10049095794053714, 0.1671092005338944, 0.13380007923721576))
('min_samples_leaf', (1.0, 1.0, 1.0))
('min_samples_split', (3.0, 3.0, 3.0))

svm_poly
('gamma', (0.02777172820303183, 0.0861657709598875, 0.048918016839429306))
('C', (750.2445960405138, 2639.280475839149, 1407.1623624989254))
('tol', (2.3121597756484174e-05, 5.33996069994535e-05, 3.5138073842994044e-05))
('coef0', (0.6354975528283304, 0.8170906933257429, 0.7262941230770367))

svm_rbf
('gamma', (3.064979617150178e-05, 9.517823176868809e-05, 5.401104890366645e-05))
('C', (3187.5329804684493, 10097.45003439787, 5673.266695941192))
('tol', (1.0050080883373552e-05, 2.316065882326913e-05, 1.5256686877761962e-05))

svm_sigmoid
('gamma', (3.054518416172193e-05, 8.611483342565479e-05, 5.1287361465010395e-0

In [98]:
# set ranges used in random search verification experiment
# every hyperparameter gets a grid with the same number of points; the grid is
# log-spaced (logspace) or evenly spaced (linspace) depending on the parameter
n_points = 10
ranges = dict(
    gamma=np.logspace(-15, 3, n_points, base=2),
    C=np.logspace(-5, 15, n_points, base=2),
    tol=np.logspace(-5, -1, n_points, base=10),
    coef0=np.linspace(0, 1, n_points),
    max_features=np.linspace(0.1, 0.9, n_points),
    min_samples_leaf=np.linspace(1, 20, n_points),
    min_samples_split=np.linspace(2, 20, n_points),
    # log-spaced between 0.01 and 2, expressed through base-10 exponents
    learning_rate=np.logspace(np.log10(0.01), np.log10(2), n_points, base=10),
    max_depth=np.linspace(1, 10, n_points),
    n_estimators=np.linspace(50, 500, n_points),
)

print('Parameter ranges used in random search verification experiment:')
for entry in ranges.items():
    # one (name, array) pair per line, followed by a blank line
    print('%s \n' % str(entry))

Parameter ranges used in random search verification experiment:
('gamma', array([3.05175781e-05, 1.22070312e-04, 4.88281250e-04, 1.95312500e-03,
       7.81250000e-03, 3.12500000e-02, 1.25000000e-01, 5.00000000e-01,
       2.00000000e+00, 8.00000000e+00])) 

('C', array([3.12500000e-02, 1.45816130e-01, 6.80395000e-01, 3.17480210e+00,
       1.48139954e+01, 6.91238233e+01, 3.22539789e+02, 1.50500812e+03,
       7.02254271e+03, 3.27680000e+04])) 

('tol', array([1.00000000e-05, 2.78255940e-05, 7.74263683e-05, 2.15443469e-04,
       5.99484250e-04, 1.66810054e-03, 4.64158883e-03, 1.29154967e-02,
       3.59381366e-02, 1.00000000e-01])) 

('coef0', array([0.        , 0.11111111, 0.22222222, 0.33333333, 0.44444444,
       0.55555556, 0.66666667, 0.77777778, 0.88888889, 1.        ])) 

('max_features', array([0.1       , 0.18888889, 0.27777778, 0.36666667, 0.45555556,
       0.54444444, 0.63333333, 0.72222222, 0.81111111, 0.9       ])) 

('min_samples_leaf', array([ 1.        ,  3.11111111, 

In [102]:
# report every grid value of the verification experiment that lies inside the
# [lower, upper] interval of the corresponding 'best' hyperparameter setting
print("The following parameters of rs verification experiment fall within the 'best' settings:\n")
for alg_name, params in best_settings.items():
    for name, bounds in params.items():
        # the first two entries of each setting tuple are the interval bounds
        lower, upper = bounds[0], bounds[1]
        for value in ranges[name]:
            if lower <= value <= upper:
                print('%s %s: %.5f falls within (%.5f, %.5f)' %(alg_name, name, value, lower, upper))

The following parameters of rs verification experiment fall within the 'best' settings:

ada learning_rate: 1.11009 falls within (0.94036, 1.36906)
ada max_depth: 10.00000 falls within (10.00000, 10.00000)
rf min_samples_leaf: 1.00000 falls within (1.00000, 1.00000)
svm_poly gamma: 0.03125 falls within (0.02777, 0.08617)
svm_poly C: 1505.00812 falls within (750.24460, 2639.28048)
svm_poly tol: 0.00003 falls within (0.00002, 0.00005)
svm_poly coef0: 0.66667 falls within (0.63550, 0.81709)
svm_poly coef0: 0.77778 falls within (0.63550, 0.81709)
svm_rbf C: 7022.54271 falls within (3187.53298, 10097.45003)
svm_sigmoid tol: 0.00167 falls within (0.00150, 0.00346)
