# Parameter search comparisons

1. GridSearch
2. RandomSearch
3. HyperOpt Usage

> We will use IDAO-2020 data for demonstration.
https://www.kaggle.com/datasets/neibyr/idao2020

In [None]:
# Tree depth: [2, ..., 12]
# Number of trees: [10, ..., 20]

In [12]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import KFold, cross_val_score

In [11]:
# !pip install catboost

In [3]:
# !kaggle datasets download -d neibyr/idao2020
# !unzip  idao2020.zip

In [4]:
# Seed passed to the CV splitters and to the `random_state` arguments used below.
RANDOM_SEED=5

# `index_col=0` makes the first CSV column (the row `id`) the frame index.
train = pd.read_csv('./data/train.csv', index_col=0)
test =  pd.read_csv('./data/Track 1/test.csv', index_col=0)

In [5]:
train.head(2)

Unnamed: 0_level_0,epoch,sat_id,x,y,z,Vx,Vy,Vz,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,2014-01-01T00:00:00.000,0,-8855.823863,13117.780146,-20728.353233,-0.908303,-3.808436,-2.022083,-8843.131454,13138.22169,-20741.615306,-0.907527,-3.80493,-2.024133
1,2014-01-01T00:46:43.000,0,-10567.672384,1619.746066,-24451.813271,-0.30259,-4.272617,-0.612796,-10555.500066,1649.289367,-24473.089556,-0.303704,-4.269816,-0.616468


In [6]:
test.head(2)

Unnamed: 0_level_0,sat_id,epoch,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3927,1,2014-02-01T00:01:45.162,-13366.891347,-14236.753503,6386.774555,4.333815,-0.692764,0.810774
3928,1,2014-02-01T00:22:57.007,-7370.434039,-14498.77152,7130.411325,5.077413,0.360609,0.313402


In [7]:
def prepare_features(df):
    '''Minimal preprocessing: expand ``epoch`` into calendar / time-of-day features.

    The input frame is not modified; a new frame is returned, so the function
    can be called repeatedly on the same raw data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``epoch`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the ``epoch`` column and with integer ``day``,
        ``weekday``, ``hour``, ``minute`` and ``second`` columns appended.
    '''
    date = pd.to_datetime(df['epoch'])
    # year and month are the same across the data, so only finer parts are kept
    return df.drop('epoch', axis=1).assign(
        day=date.dt.day,
        weekday=date.dt.weekday,
        hour=date.dt.hour,
        minute=date.dt.minute,
        second=date.dt.second,
    )

In [8]:
# Store the engineered frame under its own name instead of overwriting `train`:
# `prepare_features` needs the `epoch` column, so re-assigning its result to
# `train` made this cell fail when executed a second time.
train_prepared = prepare_features(train)

# Model inputs: the `*_sim` columns plus satellite id and the time features.
X = train_prepared[['x_sim', 'y_sim', 'z_sim',
                    'Vx_sim', 'Vy_sim', 'Vz_sim',
                    'sat_id', 'day', 'weekday', 'hour', 'minute', 'second']]
# Targets: the position and velocity columns without the `_sim` suffix.
Y = train_prepared[['x', 'y', 'z',
                    'Vx', 'Vy', 'Vz']]

# 1. GridSearch

In [9]:
from sklearn.model_selection import GridSearchCV

In [10]:
# Cross-validation folds shared by the searches below.
# NOTE(review): this was labelled "satellite based", but `KFold.split` ignores
# the `groups` argument, so rows of one satellite can end up in both the train
# and the validation part of a fold. A group-aware splitter (e.g. `GroupKFold`)
# would be needed for a true per-satellite split - confirm the intent.

rgn = RandomForestRegressor(n_estimators=10)
cv = list(KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED).split(X, Y['x'], groups=X['sat_id']))

In [13]:
# [1][2][3][4][5]

# Train [1-4],     Test [5] --> Accuracy = 0.78
# Train [1,2,3,5], Test [4] --> Accuracy = 0.73
# Train [1,2,4,5], Test [3] --> Accuracy = 0.88
# Train [1,3,4,5], Test [2] --> Accuracy = 0.71
# Train [2,3,4,5], Test [1] --> Accuracy = 0.75


# RandomForest(n_estimators=10, max_depth=4, min_samples_split=10)

# mean = 0.72 std = 0.1

# RandomForest(n_estimators=10, max_depth=8, min_samples_split=10)

# mean = 0.75 std = 0.08

# Leave-one-out

In [14]:
len(np.arange(3,8,2)) * len(np.arange(2,25,5)) * 1 * 2

30

In [15]:
# Exhaustive grid: 3 depths x 5 split sizes x 1 leaf size x 2 feature fractions
# (x 1 seed) = 30 candidates.
params = dict(
    max_depth=np.arange(3, 8, 2),
    min_samples_split=np.arange(2, 25, 5),
    min_samples_leaf=[5],
    max_features=[0.3, 0.7],
    random_state=[RANDOM_SEED],
)

# The scorer is negated MSE, so the search maximises it (closer to 0 is better).
gs = GridSearchCV(
    estimator=rgn,
    param_grid=params,
    cv=cv,
    scoring='neg_mean_squared_error',
    verbose=5,
    n_jobs=-1,
)

In [None]:
# train-validation split

# 80-20

In [None]:
# 8 - 1,1
# 8 - 0,0
# 4 - 1,0

# accuracy = 16/20 = 0.8

# 20-20-20-20-20
# 1,2,3,4,5

# 1 -> 0.8 w_j-1
# 2 -> 0.9 w_j-2
# 3 -> 0.7
# ...

# Mean, Std

In [None]:
# 100+-10, 90+-20

# T-test (Student's t-test)

In [45]:
# train-validation-test
# train - fit the algorithm's "parameters"
# validation - select the hyperparameters
# test - evaluate the quality of the algorithm

# train-test
# train: fold1, fold2, fold3 - cross-validation
# fold1+fold2 -> fold3
# fold2+fold3 -> fold1
# fold1+fold3 -> fold2

In [16]:
gs.fit(X, Y['x'])
# ~10 min

Fitting 5 folds for each of 30 candidates, totalling 150 fits


GridSearchCV(cv=[(array([     3,      4,      5, ..., 649908, 649910, 649911]),
                  array([     0,      1,      2, ..., 649906, 649907, 649909])),
                 (array([     0,      1,      2, ..., 649909, 649910, 649911]),
                  array([     7,     10,     13, ..., 649889, 649893, 649908])),
                 (array([     0,      1,      2, ..., 649908, 649909, 649910]),
                  array([     3,      4,     11, ..., 649904, 649905, 649911])),
                 (array([     0,      1,      2, ..., 649909, 649910, 649911]),
                  array([     5,      8,     15, ..., 649898, 649900, 649902])),
                 (array([     0,      1,      2, ..., 649908, 649909, 649911]),
                  array([     9,     17,     18, ..., 649897, 649903, 649910]))],
             estimator=RandomForestRegressor(n_estimators=10), n_jobs=-1,
             param_grid={'max_depth': array([3, 5, 7]),
                         'max_features': [0.3, 0.7], 'min_sample

In [None]:
# Same grid as the one passed to `GridSearchCV` above, repeated here next to
# the search results for reference.
params = {
    'max_depth': np.arange(3,8,2),
    'min_samples_split': np.arange(2,25,5),
    'min_samples_leaf': [5],
    'max_features': [0.3, 0.7],
    'random_state':[RANDOM_SEED],
}


In [19]:
gs.best_score_

-46118936.64006169

In [21]:
gs.best_params_

{'max_depth': 7,
 'max_features': 0.7,
 'min_samples_leaf': 5,
 'min_samples_split': 22,
 'random_state': 5}

In [22]:
gs.best_estimator_

RandomForestRegressor(max_depth=7, max_features=0.7, min_samples_leaf=5,
                      min_samples_split=22, n_estimators=10, random_state=5)

# 2. RandomSearch

In [23]:
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

In [46]:
stats.uniform()

<scipy.stats._distn_infrastructure.rv_continuous_frozen at 0x7f50dfc6ac40>

In [24]:
# Distributions to sample from instead of a fixed grid:
#   stats.randint(low, high) draws integers from [low, high)
#   stats.uniform() draws floats from the unit interval
# Single-element lists keep a parameter fixed.
params = dict(
    max_depth=stats.randint(2, 8),
    min_samples_split=stats.randint(2, 25),
    min_samples_leaf=[5],
    max_features=stats.uniform(),
    random_state=[RANDOM_SEED],
)

# 30 sampled candidates, evaluated on the same folds and with the same scorer
# as the grid search.
rs = RandomizedSearchCV(
    estimator=rgn,
    param_distributions=params,
    n_iter=30,
    cv=cv,
    scoring='neg_mean_squared_error',
    random_state=RANDOM_SEED,
    verbose=5,
    n_jobs=-1,
)

In [25]:
rs.fit(X, Y['x'])

Fitting 5 folds for each of 30 candidates, totalling 150 fits


RandomizedSearchCV(cv=[(array([     3,      4,      5, ..., 649908, 649910, 649911]),
                        array([     0,      1,      2, ..., 649906, 649907, 649909])),
                       (array([     0,      1,      2, ..., 649909, 649910, 649911]),
                        array([     7,     10,     13, ..., 649889, 649893, 649908])),
                       (array([     0,      1,      2, ..., 649908, 649909, 649910]),
                        array([     3,      4,     11, ..., 649904, 649905, 649911])),
                       (array([     0,      1,      2, ..., 649909, 649910, 649911]),
                        array([     5,      8,     15, ..., 649898, 649900, 64990...
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x7f50dfc1ca30>,
                                        'max_features': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f50dfc1cac0>,
                                     

In [26]:
rs.best_params_

{'max_depth': 7,
 'max_features': 0.8265542486873563,
 'min_samples_leaf': 5,
 'min_samples_split': 18,
 'random_state': 5}

In [47]:
gs.best_params_

{'max_depth': 7,
 'max_features': 0.7,
 'min_samples_leaf': 5,
 'min_samples_split': 22,
 'random_state': 5}

In [27]:
# RandomForestRegressor(**rs.best_params_)
# 10*0.83

# 3. Hyperopt usage 

http://hyperopt.github.io/hyperopt/

In [31]:
# !pip install hyperopt

In [32]:
from hyperopt import Trials, fmin, hp, tpe

In [56]:
rgn = RandomForestRegressor(n_estimators=10, min_samples_leaf=5, random_state=RANDOM_SEED)

def score(params):
    """HyperOpt objective: mean cross-validated MSE of ``rgn`` for ``params``.

    Updates the module-level ``rgn`` estimator in place with ``params``,
    evaluates it with 5-fold shuffled ``KFold`` on ``X`` / ``Y['x']`` and
    returns the positive mean MSE (lower is better, as ``fmin`` minimises).
    """
    print(f"Training with params: {params}")
    rgn.set_params(**params)

    # NOTE(review): `groups` is passed here, but `KFold.split` ignores it.
    splitter = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    folds = list(splitter.split(X, Y['x'], groups=X['sat_id']))

    fold_scores = cross_val_score(rgn, X, Y['x'], scoring='neg_mean_squared_error', cv=folds)
    # The scorer yields negated MSE; flip the sign to get a loss.
    return -fold_scores.mean()


def optimize(random_state=RANDOM_SEED, niter=2):
    """Search the random-forest hyperparameters with HyperOpt's TPE algorithm.

    Parameters
    ----------
    random_state : int
        Seed of the NumPy generator handed to ``fmin`` as ``rstate``.
    niter : int
        Number of ``score`` evaluations (``max_evals``).

    Returns
    -------
    tuple
        ``(best, trials, param_space)``. ``best`` is the ``fmin`` result; for
        ``hp.choice`` parameters it holds the index of the chosen option, so
        use ``space_eval(param_space, best)`` to recover the actual values.
        ``trials`` is the ``Trials`` object with the full evaluation history.
    """
    param_space = {
        'max_depth': hp.choice('max_depth', np.arange(2, 8, dtype=int)),
        'min_samples_split': hp.choice('min_samples_split', np.arange(2, 25, dtype=int)),
        'max_features': hp.uniform('max_features', 0, 1.),
    }
    trials = Trials()
    best = fmin(score,
                param_space,
                algo=tpe.suggest,
                trials=trials,
                max_evals=niter,
                # Honour the caller's seed; previously the global RANDOM_SEED
                # was always used and `random_state` had no effect.
                rstate=np.random.default_rng(random_state)
               )
    return best, trials, param_space

In [80]:
# Use niter=2 for minimal example
# `optimize` returns (best, trials, param_space); unpacking only two names
# raises "ValueError: too many values to unpack" on a fresh run.
best_hyperparams, trials, param_space = optimize(niter=50) #30 

Training with params: {'max_depth': 5, 'max_features': 0.23244083818919758, 'min_samples_split': 19}    
Training with params: {'max_depth': 5, 'max_features': 0.12850062719790667, 'min_samples_split': 8}     
Training with params: {'max_depth': 3, 'max_features': 0.5151489749049859, 'min_samples_split': 3}      
Training with params: {'max_depth': 2, 'max_features': 0.23370057559854263, 'min_samples_split': 14}    
Training with params: {'max_depth': 5, 'max_features': 0.6319311937777371, 'min_samples_split': 10}     
Training with params: {'max_depth': 3, 'max_features': 0.16199009717757973, 'min_samples_split': 7}     
Training with params: {'max_depth': 3, 'max_features': 0.27304662452385786, 'min_samples_split': 9}     
Training with params: {'max_depth': 3, 'max_features': 0.744845460895223, 'min_samples_split': 11}      
Training with params: {'max_depth': 4, 'max_features': 0.855854763369589, 'min_samples_split': 15}      
Training with params: {'max_depth': 2, 'max_features': 

# HyperOpt `fmin` returns indices (not values) for parameters defined with `hp.choice`

In [81]:
# np.arange(2, 8, dtype=int)[5], np.arange(2, 25, dtype=int)[1]

best_hyperparams

{'max_depth': 5, 'max_features': 0.9407602571513763, 'min_samples_split': 11}

In [83]:
# Same search space as the one built inside `optimize`; `space_eval` needs it
# to map the `hp.choice` indices in `best_hyperparams` back to actual values.
param_space = {
    'max_depth': hp.choice('max_depth', np.arange(2, 8, dtype=int)),
    'min_samples_split': hp.choice('min_samples_split', np.arange(2, 25, dtype=int)),
    'max_features': hp.uniform('max_features', 0, 1.),
    }

# Use `space_eval` to get the optimal hyperparameter values!

In [90]:
from hyperopt import space_eval
print(space_eval(param_space, best_hyperparams))

{'max_depth': 7, 'max_features': 0.9407602571513763, 'min_samples_split': 13}


In [77]:
# Checkout trials object

print(trials.results)
print(trials.best_trial)
print(trials.idxs_vals)

[{'loss': 310758559.1580734, 'status': 'ok'}, {'loss': 382120055.5637368, 'status': 'ok'}, {'loss': 155642079.99346083, 'status': 'ok'}, {'loss': 422269175.05689955, 'status': 'ok'}, {'loss': 75231158.40544374, 'status': 'ok'}]
{'state': 2, 'tid': 4, 'spec': None, 'result': {'loss': 75231158.40544374, 'status': 'ok'}, 'misc': {'tid': 4, 'cmd': ('domain_attachment', 'FMinIter_Domain'), 'workdir': None, 'idxs': {'max_depth': [4], 'max_features': [4], 'min_samples_split': [4]}, 'vals': {'max_depth': [3], 'max_features': [0.6319311937777371], 'min_samples_split': [8]}}, 'exp_key': None, 'owner': None, 'version': 0, 'book_time': datetime.datetime(2023, 3, 30, 16, 52, 41, 714000), 'refresh_time': datetime.datetime(2023, 3, 30, 16, 52, 48, 854000)}
({'max_depth': [0, 1, 2, 3, 4], 'max_features': [0, 1, 2, 3, 4], 'min_samples_split': [0, 1, 2, 3, 4]}, {'max_depth': [3, 3, 1, 0, 3], 'max_features': [0.23244083818919758, 0.12850062719790667, 0.5151489749049859, 0.23370057559854263, 0.63193119377

In [39]:
gs.best_params_

{'max_depth': 7,
 'max_features': 0.7,
 'min_samples_leaf': 5,
 'min_samples_split': 22,
 'random_state': 5}

In [40]:
rs.best_params_

{'max_depth': 7,
 'max_features': 0.8265542486873563,
 'min_samples_leaf': 5,
 'min_samples_split': 18,
 'random_state': 5}

In [48]:
best_hyperparams

{'max_depth': 4, 'max_features': 0.9384392973316729, 'min_samples_split': 1}

In [41]:
rgn = RandomForestRegressor(n_estimators=10, min_samples_leaf=5, random_state=RANDOM_SEED, n_jobs=-1)
rgn.set_params(**gs.best_params_)
cross_val_score(rgn, X, Y['x'], cv=cv, scoring='neg_mean_squared_error').mean()

-46118936.64006169

In [50]:
np.sqrt(46_118_936)

6791.09240107952

In [42]:
rgn = RandomForestRegressor(n_estimators=10, min_samples_leaf=5, random_state=RANDOM_SEED, n_jobs=-1)
rgn.set_params(**rs.best_params_)
cross_val_score(rgn, X, Y['x'], cv=cv, scoring='neg_mean_squared_error').mean()

-45767720.244413085

In [51]:
np.sqrt(45_767_720)

6765.184402512617

In [91]:
rgn = RandomForestRegressor(n_estimators=10, min_samples_leaf=5, **space_eval(param_space, best_hyperparams),
                            random_state=RANDOM_SEED, n_jobs=-1)
cross_val_score(rgn, X, Y['x'], cv=cv, scoring='neg_mean_squared_error').mean()

-44255009.564420745

In [93]:
np.sqrt(44_255_009)

6652.443836666342

# Optuna (an alternative to HyperOpt)

https://optuna.org/

In [98]:
# !pip install optuna

In [99]:
import optuna

In [111]:
def objective(trial: optuna.trial.Trial):
    """Optuna objective: mean cross-validated negated MSE for one trial.

    Asks ``trial`` for ``max_depth``, ``min_samples_split`` and
    ``max_features``, builds a 10-tree random forest with them and evaluates it
    with 5-fold shuffled ``KFold`` on ``X`` / ``Y['x']``. The returned value is
    the mean of the ``neg_mean_squared_error`` scores, i.e. higher is better.
    """
    sampled = {
#         'n_estimators': trial.suggest_int('n_estimators', 10, 50),
        'max_depth': trial.suggest_int('max_depth', 2, 8),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 25),
        'max_features': trial.suggest_float('max_features', 1e-6, 1., log=False),
    }
    forest = RandomForestRegressor(n_estimators=10, random_state=RANDOM_SEED, n_jobs=6, **sampled)

    # NOTE(review): `groups` is passed here, but `KFold.split` ignores it.
    splitter = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    folds = list(splitter.split(X, Y['x'], groups=X['sat_id']))

    fold_scores = cross_val_score(forest, X, Y['x'], scoring='neg_mean_squared_error', cv=folds)
    return fold_scores.mean()

In [112]:
# `objective` returns negated MSE, hence direction='maximize'.
# The sampler is seeded so that repeated runs propose the same trials; without
# a seed every execution of this cell explored different hyperparameters.
# NOTE(review): `objective` never calls `trial.report` / `trial.should_prune`,
# so the MedianPruner has no intermediate values to act on.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
)
study.optimize(objective, n_trials=20, show_progress_bar=True, n_jobs=1)

[32m[I 2023-04-05 12:37:49,008][0m A new study created in memory with name: no-name-6e2d623f-ef20-4e40-b376-a5807092cc86[0m
  self._init_valid()


  0%|          | 0/20 [00:00<?, ?it/s]

[32m[I 2023-04-05 12:37:52,763][0m Trial 0 finished with value: -422269175.05689955 and parameters: {'max_depth': 2, 'min_samples_split': 23, 'max_features': 0.21698009297885476}. Best is trial 0 with value: -422269175.05689955.[0m
[32m[I 2023-04-05 12:38:01,107][0m Trial 1 finished with value: -130513293.2825294 and parameters: {'max_depth': 3, 'min_samples_split': 15, 'max_features': 0.6256556934473542}. Best is trial 1 with value: -130513293.2825294.[0m
[32m[I 2023-04-05 12:38:04,778][0m Trial 2 finished with value: -319833435.91123134 and parameters: {'max_depth': 6, 'min_samples_split': 11, 'max_features': 0.1201239238944989}. Best is trial 1 with value: -130513293.2825294.[0m
[32m[I 2023-04-05 12:38:20,771][0m Trial 3 finished with value: -54956240.06746979 and parameters: {'max_depth': 5, 'min_samples_split': 14, 'max_features': 0.8416357332728771}. Best is trial 3 with value: -54956240.06746979.[0m
[32m[I 2023-04-05 12:38:27,976][0m Trial 4 finished with value: -1

In [113]:
study.best_params

{'max_depth': 8, 'min_samples_split': 25, 'max_features': 0.9961552727861642}

In [114]:
study.best_value

-40665758.104110725

In [117]:
rgn = RandomForestRegressor(n_estimators=10, **study.best_params,
                            random_state=RANDOM_SEED, n_jobs=-1)
cross_val_score(rgn, X, Y['x'], cv=cv, scoring='neg_mean_squared_error').mean()

-40665758.10411073