# Parameter search comparisons

1. GridSearch
2. RandomSearch
3. HyperOpt Usage

> We will use IDAO-2020 data for demonstration.
https://www.kaggle.com/datasets/neibyr/idao2020

![title](ECI-ECEF.png)

In [1]:
# Глубина дерева [2,...,12]
# Количество дереьвев [10,..,20]

In [2]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import KFold, cross_val_score

In [3]:
# !pip install catboost

In [4]:
# !kaggle datasets download -d neibyr/idao2020
# !unzip  idao2020.zip

In [5]:
RANDOM_SEED=5

train = pd.read_csv('./data/train.csv', index_col=0)
test =  pd.read_csv('./data/Track 1/test.csv', index_col=0)

In [6]:
train.head(10)

Unnamed: 0_level_0,epoch,sat_id,x,y,z,Vx,Vy,Vz,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,2014-01-01T00:00:00.000,0,-8855.823863,13117.780146,-20728.353233,-0.908303,-3.808436,-2.022083,-8843.131454,13138.22169,-20741.615306,-0.907527,-3.80493,-2.024133
1,2014-01-01T00:46:43.000,0,-10567.672384,1619.746066,-24451.813271,-0.30259,-4.272617,-0.612796,-10555.500066,1649.289367,-24473.089556,-0.303704,-4.269816,-0.616468
2,2014-01-01T01:33:26.001,0,-10578.684043,-10180.46746,-24238.280949,0.277435,-4.047522,0.723155,-10571.858472,-10145.939908,-24271.169776,0.27488,-4.046788,0.718768
3,2014-01-01T02:20:09.001,0,-9148.251857,-20651.43746,-20720.381279,0.7156,-3.373762,1.722115,-9149.620794,-20618.200201,-20765.019094,0.712437,-3.375202,1.718306
4,2014-01-01T03:06:52.002,0,-6719.092336,-28929.061629,-14938.907967,0.992507,-2.519732,2.344703,-6729.358857,-28902.271436,-14992.399986,0.989382,-2.522618,2.342237
5,2014-01-01T03:53:35.002,0,-3708.453525,-34767.115528,-7863.224747,1.136613,-1.651984,2.660079,-3726.986435,-34749.558551,-7921.459045,1.133861,-1.655467,2.659121
6,2014-01-01T04:40:18.003,0,-437.699227,-38249.612548,-234.351187,1.183619,-0.846348,2.752309,-463.278088,-38241.966025,-293.331552,1.181359,-0.849775,2.752681
7,2014-01-01T05:27:01.003,0,2863.147037,-39594.503233,7420.53828,1.162076,-0.128606,2.687907,2831.900642,-39595.997138,7364.088245,1.160316,-0.131566,2.689303
8,2014-01-01T06:13:44.004,0,6031.593902,-39056.319613,14731.102545,1.091816,0.497608,2.512783,5996.014434,-39065.326088,14679.572942,1.090515,0.495341,2.514879
9,2014-01-01T07:00:27.004,0,8950.655291,-36886.362968,21432.111677,0.985854,1.036692,2.25693,8911.9528,-36900.814799,21387.028371,0.984956,1.035218,2.259425


In [7]:
# np.sqrt(np.mean((train['x'] - train['x_sim'])**2))

In [8]:
test.head(2)

Unnamed: 0_level_0,sat_id,epoch,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3927,1,2014-02-01T00:01:45.162,-13366.891347,-14236.753503,6386.774555,4.333815,-0.692764,0.810774
3928,1,2014-02-01T00:22:57.007,-7370.434039,-14498.77152,7130.411325,5.077413,0.360609,0.313402


In [9]:
def prepare_features(df):
    '''minimal preprocessing'''
    date = pd.to_datetime(df.epoch)
    # year and month are the same accross the data
    df['day'] = date.dt.day
    df['weekday'] = date.dt.weekday
    df['hour'] = date.dt.hour
    df['minute'] = date.dt.minute
    df['second'] = date.dt.second
    
    return df.drop('epoch', axis=1)

In [10]:
train = prepare_features(train)
X = train[['x_sim', 'y_sim', 'z_sim',
           'Vx_sim', 'Vy_sim', 'Vz_sim',
           'sat_id', 'day', 'weekday', 'hour', 'minute','second']]
Y = train[['x', 'y', 'z',
           'Vx', 'Vy', 'Vz']]

# 1. GridSearch

In [11]:
from sklearn.model_selection import GridSearchCV

In [12]:
# Sattelite based cross-validation

rgn = RandomForestRegressor(n_estimators=10)
cv = list(KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED).split(X, Y['x'], groups=X['sat_id']))

In [13]:
# [1][2][3][4][5]

# Train [1-4],     Test [5] --> Accuracy = 0.78
# Train [1,2,3,5], Test [4] --> Accuracy = 0.73
# Train [1,2,4,5], Test [3] --> Accuracy = 0.88
# Train [1,3,4,5], Test [2] --> Accuracy = 0.71
# Train [2,3,4,5], Test [1] --> Accuracy = 0.75


# RandomForest(n_estimators=10, max_depth=4, min_samples_split=10)

# mean = 0.72 std = 0.1

# RandomForest(n_estimators=10, max_depth=8, min_samples_split=10)

# mean = 0.75 std = 0.08

# Leave-one-out

In [14]:
len(np.arange(3,8,2)) * len(np.arange(2,25,5)) * 1 * 2

30

In [15]:
params = {
    'max_depth': np.arange(3,8,2),
    'min_samples_split': np.arange(2,25,5),
    'min_samples_leaf': [5],
    'max_features': [0.3, 0.7],
    'random_state':[RANDOM_SEED],
}

gs = GridSearchCV(estimator=rgn,
                  param_grid=params,
                  scoring='neg_mean_squared_error',
                  cv=cv,
                  n_jobs=-1,
                  verbose=5,)

In [16]:
# train-validation

# 80-20

In [17]:
# 8 - 1,1
# 8 - 0,0
# 4 - 1,0

# accuracy = 16/20 = 0.8

# 20-20-20-20-20
# 1,2,3,4,5

# 1 -> 0.8 w_j-1
# 2 -> 0.9 w_j-2
# 3 -> 0.7
# ...

# Mean, Std

In [18]:
# 100+-10, 90+-20

# TTtest, Student t-test

In [19]:
# train-validation-test
# train - обучаете "параметры" алгоритма
# validation - подбор гиперпараметров
# test - оценка качества работы алгоритма

# train-test
# train: fold1, fold2, fold3 - cross-validation
# fold1+fold2 -> fold3
# fold2+fold3 -> fold1
# fold1+fold3 -> fold2

In [20]:
gs.fit(X, Y['x'])
# ~10 min

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [21]:
params = {
    'max_depth': np.arange(3,8,2),
    'min_samples_split': np.arange(2,25,5),
    'min_samples_leaf': [5],
    'max_features': [0.3, 0.7],
    'random_state':[RANDOM_SEED],
}


In [34]:
train.head(3)

Unnamed: 0_level_0,sat_id,x,y,z,Vx,Vy,Vz,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim,day,weekday,hour,minute,second
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,0,-8855.823863,13117.780146,-20728.353233,-0.908303,-3.808436,-2.022083,-8843.131454,13138.22169,-20741.615306,-0.907527,-3.80493,-2.024133,1,2,0,0,0
1,0,-10567.672384,1619.746066,-24451.813271,-0.30259,-4.272617,-0.612796,-10555.500066,1649.289367,-24473.089556,-0.303704,-4.269816,-0.616468,1,2,0,46,43
2,0,-10578.684043,-10180.46746,-24238.280949,0.277435,-4.047522,0.723155,-10571.858472,-10145.939908,-24271.169776,0.27488,-4.046788,0.718768,1,2,1,33,26


In [22]:
gs.best_score_

-46118936.64006169

In [35]:
np.sqrt(46118936)

6791.09240107952

In [37]:
6500 * 2 * 3.1415

40839.5

In [39]:
np.sqrt(np.mean((train['x'] - train['x_sim'])**2))

7203.823696124034

In [23]:
gs.best_params_

{'max_depth': 7,
 'max_features': 0.7,
 'min_samples_leaf': 5,
 'min_samples_split': 22,
 'random_state': 5}

In [24]:
gs.best_estimator_

# 2. RandomSearch

In [25]:
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

In [26]:
params = {
    'max_depth': stats.randint(2, 8),
    'min_samples_split': stats.randint(2, 25),
    'min_samples_leaf': [5],
    'max_features': stats.uniform(),
    'random_state':[RANDOM_SEED],
}


rs = RandomizedSearchCV(estimator=rgn,
                       param_distributions=params,
                       n_iter=30,
                       scoring='neg_mean_squared_error',
                       n_jobs=-1,
                       cv=cv,
                       verbose=5,
                       random_state=RANDOM_SEED)

In [27]:
rs.fit(X, Y['x'])

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [28]:
rs.best_params_

{'max_depth': 7,
 'max_features': 0.8265542486873563,
 'min_samples_leaf': 5,
 'min_samples_split': 18,
 'random_state': 5}

In [29]:
gs.best_params_

{'max_depth': 7,
 'max_features': 0.7,
 'min_samples_leaf': 5,
 'min_samples_split': 22,
 'random_state': 5}

In [40]:
rs.best_score_

-45767720.244413085

In [41]:
np.sqrt(45767720)

6765.184402512617

In [30]:
# RandomForestRegressor(**rs.best_params_)
# 10*0.83

# 3. Hyperopt usage 

http://hyperopt.github.io/hyperopt/

In [42]:
!pip install hyperopt

Looking in indexes: https://neuro_hub_pip:****@gitlab.com/api/v4/groups/11793222/-/packages/pypi/simple/, https://neuro_hub_pip:****@gitlab.com/api/v4/groups/11951174/-/packages/pypi/simple/, https://neuro_hub_pip:****@gitlab.com/api/v4/groups/11828481/-/packages/pypi/simple/, https://neuro_hub_pip:****@gitlab.com/api/v4/groups/68941111/-/packages/pypi/simple/, https://neuro_hub_pip:****@gitlab.com/api/v4/groups/78793364/-/packages/pypi/simple/
Collecting hyperopt
  Obtaining dependency information for hyperopt from https://files.pythonhosted.org/packages/b6/cd/5b3334d39276067f54618ce0d0b48ed69d91352fbf137468c7095170d0e5/hyperopt-0.2.7-py2.py3-none-any.whl.metadata
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting future (from hyperopt)
  Obtaining dependency information for future from https://files.pythonhosted.org/packages/da/71/ae30dadffc90b9006d77af76b393cb9dfbfc9629f339fc1574a1c52e6806/future-1.0.0-py3-none-any.whl.metadata
  Downloading future-1.0.0-

In [43]:
from hyperopt import Trials, fmin, hp, tpe

In [44]:
rgn = RandomForestRegressor(n_estimators=10, min_samples_leaf=5, random_state=RANDOM_SEED)

def score(params):
    print(f"Training with params: {params}")
    rgn.set_params(**params)
    cv = list(KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED).split(X, Y['x'], groups=X['sat_id']))
    neg_mse = cross_val_score(rgn, X, Y['x'], scoring='neg_mean_squared_error', cv=cv).mean()        
    return -neg_mse


def optimize(random_state=RANDOM_SEED, niter=2):
    param_space = {
    'max_depth': hp.choice('max_depth', np.arange(2, 8, dtype=int)),
    'min_samples_split': hp.choice('min_samples_split', np.arange(2, 25, dtype=int)),
    'max_features': hp.uniform('max_features', 0, 1.),
    }
    trials = Trials()
    best = fmin(score, 
                param_space, 
                algo=tpe.suggest, 
                trials=trials, 
                max_evals=niter,
                rstate=np.random.default_rng(RANDOM_SEED)
               )
    return best, trials, param_space

In [45]:
# Use niter=2 for minimal example
best_hyperparams, trials = optimize(niter=50) #30 

Training with params: {'max_depth': 5, 'max_features': 0.23244083818919758, 'min_samples_split': 19}
Training with params: {'max_depth': 5, 'max_features': 0.12850062719790667, 'min_samples_split': 8}
Training with params: {'max_depth': 3, 'max_features': 0.5151489749049859, 'min_samples_split': 3}
Training with params: {'max_depth': 2, 'max_features': 0.23370057559854263, 'min_samples_split': 14}
Training with params: {'max_depth': 5, 'max_features': 0.6319311937777371, 'min_samples_split': 10}
Training with params: {'max_depth': 3, 'max_features': 0.16199009717757973, 'min_samples_split': 7}
Training with params: {'max_depth': 3, 'max_features': 0.27304662452385786, 'min_samples_split': 9}
Training with params: {'max_depth': 3, 'max_features': 0.744845460895223, 'min_samples_split': 11}
Training with params: {'max_depth': 4, 'max_features': 0.855854763369589, 'min_samples_split': 15}
Training with params: {'max_depth': 2, 'max_features': 0.023657216970935924, 'min_samples_split': 13}

ValueError: too many values to unpack (expected 2)

# HyperOpt `fmin` returns indexes for `choice` defined parameters 

In [None]:
# np.arange(2, 8, dtype=int)[5], np.arange(2, 25, dtype=int)[1]

best_hyperparams

In [None]:
param_space = {
    'max_depth': hp.choice('max_depth', np.arange(2, 8, dtype=int)),
    'min_samples_split': hp.choice('min_samples_split', np.arange(2, 25, dtype=int)),
    'max_features': hp.uniform('max_features', 0, 1.),
    }

# Use `space_eval` to get optimal hyperparameters values!

In [None]:
from hyperopt import space_eval
print(space_eval(param_space, best_hyperparams))

In [None]:
# Checkout trials object

print(trials.results)
print(trials.best_trial)
print(trials.idxs_vals)

In [None]:
gs.best_params_

In [None]:
rs.best_params_

In [None]:
best_hyperparams

In [None]:
rgn = RandomForestRegressor(n_estimators=10, min_samples_leaf=5, random_state=RANDOM_SEED, n_jobs=-1)
rgn.set_params(**gs.best_params_)
cross_val_score(rgn, X, Y['x'], cv=cv, scoring='neg_mean_squared_error').mean()

In [None]:
np.sqrt(46_118_936)

In [None]:
rgn = RandomForestRegressor(n_estimators=10, min_samples_leaf=5, random_state=RANDOM_SEED, n_jobs=-1)
rgn.set_params(**rs.best_params_)
cross_val_score(rgn, X, Y['x'], cv=cv, scoring='neg_mean_squared_error').mean()

In [None]:
np.sqrt(45_767_720)

In [None]:
rgn = RandomForestRegressor(n_estimators=10, min_samples_leaf=5, **space_eval(param_space, best_hyperparams),
                            random_state=RANDOM_SEED, n_jobs=-1)
cross_val_score(rgn, X, Y['x'], cv=cv, scoring='neg_mean_squared_error').mean()

In [None]:
np.sqrt(44_255_009)

# Optuna (an alternative to HyperOpt)

https://optuna.org/

In [None]:
!pip install optuna

In [None]:
import optuna

In [None]:
def objective(trial: optuna.trial.Trial):
    param_space = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 50),
        'max_depth': trial.suggest_int('max_depth', 2, 8),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 25),
        'max_features': trial.suggest_float('max_features', 1e-6, 1., log=False),
        }

    rgn = RandomForestRegressor(n_estimators=10, random_state=RANDOM_SEED, **param_space, n_jobs=6)

    cv = list(KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED).split(X, Y['x'], groups=X['sat_id']))
    mse = cross_val_score(rgn, X, Y['x'], scoring='neg_mean_squared_error', cv=cv).mean()

    return mse

In [None]:
study = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner(n_warmup_steps=5))
study.optimize(objective, n_trials=20, show_progress_bar=True, n_jobs=1)

In [None]:
study.best_params

In [None]:
study.best_value

In [None]:
rgn = RandomForestRegressor(n_estimators=10, **study.best_params,
                            random_state=RANDOM_SEED, n_jobs=-1)
cross_val_score(rgn, X, Y['x'], cv=cv, scoring='neg_mean_squared_error').mean()

In [None]:
np.sqrt(40665758)