# Parameter search comparisons

1. GridSearch
2. RandomSearch
3. HyperOpt Usage

> We will use IDAO-2020 data for demonstration.
https://www.kaggle.com/datasets/neibyr/idao2020

In [10]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import KFold, cross_val_score

In [11]:
# !pip install catboost

In [12]:
RANDOM_SEED=5

train = pd.read_csv('/home/anvar/model-selection/idao2020/data/train.csv', index_col=0)
test =  pd.read_csv('/home/anvar/model-selection/idao2020/data/Track 1/test.csv', index_col=0)

In [13]:
train.head(2)

Unnamed: 0_level_0,epoch,sat_id,x,y,z,Vx,Vy,Vz,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,2014-01-01T00:00:00.000,0,-8855.823863,13117.780146,-20728.353233,-0.908303,-3.808436,-2.022083,-8843.131454,13138.22169,-20741.615306,-0.907527,-3.80493,-2.024133
1,2014-01-01T00:46:43.000,0,-10567.672384,1619.746066,-24451.813271,-0.30259,-4.272617,-0.612796,-10555.500066,1649.289367,-24473.089556,-0.303704,-4.269816,-0.616468


In [15]:
test.head(2)

Unnamed: 0_level_0,sat_id,epoch,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3927,1,2014-02-01T00:01:45.162,-13366.891347,-14236.753503,6386.774555,4.333815,-0.692764,0.810774
3928,1,2014-02-01T00:22:57.007,-7370.434039,-14498.77152,7130.411325,5.077413,0.360609,0.313402


In [16]:
def prepare_features(df):
    '''minimal preprocessing'''
    date = pd.to_datetime(df.epoch)
    # year and month are the same accross the data
    df['day'] = date.dt.day
    df['weekday'] = date.dt.weekday
    df['hour'] = date.dt.hour
    df['minute'] = date.dt.minute
    df['second'] = date.dt.second
    
    return df.drop('epoch', axis=1)

In [17]:
train = prepare_features(train)
X = train[['x_sim', 'y_sim', 'z_sim',
           'Vx_sim', 'Vy_sim', 'Vz_sim',
           'sat_id', 'day', 'weekday', 'hour', 'minute','second']]
Y = train[['x', 'y', 'z',
           'Vx', 'Vy', 'Vz']]

# 1. GridSearch

In [18]:
from sklearn.model_selection import GridSearchCV

In [21]:
# Sattelite based cross-validation

rgn = RandomForestRegressor(n_estimators=10)
cv = list(KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED).split(X, Y['x'], groups=X['sat_id']))

In [None]:
[1][2][3][4][5]

Train [1-4],     Test [5] --> Accuracy = 0.78
Train [1,2,3,5], Test [4] --> Accuracy = 0.73
Train [1,2,4,5], Test [3] --> Accuracy = 0.88
Train [1,3,4,5], Test [2] --> Accuracy = 0.71
Train [2,3,4,5], Test [1] --> Accuracy = 0.75


RandomForest(n_estimators=10, max_depth=4, min_samples_split=10)

mean = 0.72 std = 0.1

RandomForest(n_estimators=10, max_depth=8, min_samples_split=10)

mean = 0.75 std = 0.08

Leave-one-out

In [22]:
len(np.arange(3,8,2)) * len(np.arange(2,25,5)) * 1 * 2

30

In [25]:
params = {
    'max_depth': np.arange(3,8,2),
    'min_samples_split': np.arange(2,25,5),
    'min_samples_leaf': [5],
    'max_features': [0.3, 0.7],
    'random_state':[RANDOM_SEED],
}

gs = GridSearchCV(estimator=rgn,
                  param_grid=params,
                  scoring='neg_mean_squared_error',
                  cv=cv,
                  n_jobs=-1,
                  verbose=5,)
#                   iid=False)

In [26]:
gs.fit(X, Y['x'])
# ~10 min

Fitting 5 folds for each of 30 candidates, totalling 150 fits


GridSearchCV(cv=[(array([     3,      4,      5, ..., 649908, 649910, 649911]),
                  array([     0,      1,      2, ..., 649906, 649907, 649909])),
                 (array([     0,      1,      2, ..., 649909, 649910, 649911]),
                  array([     7,     10,     13, ..., 649889, 649893, 649908])),
                 (array([     0,      1,      2, ..., 649908, 649909, 649910]),
                  array([     3,      4,     11, ..., 649904, 649905, 649911])),
                 (array([     0,      1,      2, ..., 649909, 649910, 649911]),
                  array([     5,      8,     15, ..., 649898, 649900, 649902])),
                 (array([     0,      1,      2, ..., 649908, 649909, 649911]),
                  array([     9,     17,     18, ..., 649897, 649903, 649910]))],
             estimator=RandomForestRegressor(n_estimators=10), n_jobs=-1,
             param_grid={'max_depth': array([3, 5, 7]),
                         'max_features': [0.3, 0.7], 'min_sample

In [28]:
gs.best_params_

{'max_depth': 7,
 'max_features': 0.7,
 'min_samples_leaf': 5,
 'min_samples_split': 22,
 'random_state': 5}

In [None]:
{'max_depth': 7,
 'max_features': 0.8265542486873563,
 'min_samples_leaf': 5,
 'min_samples_split': 18,
 'random_state': 5}

In [38]:
gs.best_score_

-46118936.64006169

In [39]:
rs.best_score_

-45767720.244413085

In [29]:
gs.best_estimator_

RandomForestRegressor(max_depth=7, max_features=0.7, min_samples_leaf=5,
                      min_samples_split=22, n_estimators=10, random_state=5)

# 2. RandomSearch

In [30]:
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

In [33]:
params = {
    'max_depth': stats.randint(2, 8),
    'min_samples_split': stats.randint(2, 25),
    'min_samples_leaf': [5],
    'max_features': stats.uniform(),
    'random_state':[RANDOM_SEED],
}


rs = RandomizedSearchCV(estimator=rgn,
                       param_distributions=params,
                       n_iter=30,
                       scoring='neg_mean_squared_error',
                       n_jobs=-1,
                       cv=cv,
                       verbose=5,
                       random_state=RANDOM_SEED)

In [34]:
rs.fit(X, Y['x'])

Fitting 5 folds for each of 30 candidates, totalling 150 fits


RandomizedSearchCV(cv=[(array([     3,      4,      5, ..., 649908, 649910, 649911]),
                        array([     0,      1,      2, ..., 649906, 649907, 649909])),
                       (array([     0,      1,      2, ..., 649909, 649910, 649911]),
                        array([     7,     10,     13, ..., 649889, 649893, 649908])),
                       (array([     0,      1,      2, ..., 649908, 649909, 649910]),
                        array([     3,      4,     11, ..., 649904, 649905, 649911])),
                       (array([     0,      1,      2, ..., 649909, 649910, 649911]),
                        array([     5,      8,     15, ..., 649898, 649900, 64990...
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fbdff1f1d60>,
                                        'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fbdff1e1fa0>,
                                        'min_samples_leaf

In [37]:
rs.best_params_

{'max_depth': 7,
 'max_features': 0.8265542486873563,
 'min_samples_leaf': 5,
 'min_samples_split': 18,
 'random_state': 5}

In [42]:
RandomForestRegressor(**rs.best_params_)
# 10*0.83

8.299999999999999

# 3. Hyperopt usage 

In [35]:
from hyperopt import Trials, fmin, hp, tpe

In [20]:
# !pip install hyperopt

In [36]:
rgn = RandomForestRegressor(n_estimators=10, min_samples_leaf=5, random_state=RANDOM_SEED)

def score(params):
    print(f"Training with params: {params}")
    rgn.set_params(**params)
    cv = list(KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED).split(X, Y['x'], groups=X['sat_id']))
    neg_mse = cross_val_score(rgn, X, Y['x'], scoring='neg_mean_squared_error', cv=cv).mean()        
    return -neg_mse


def optimize(random_state=RANDOM_SEED, niter=2):
    param_space = {
    'max_depth': hp.choice('max_depth', np.arange(2, 8, dtype=int)),
    'min_samples_split': hp.choice('min_samples_split', np.arange(2, 25, dtype=int)),
    'max_features': hp.uniform('max_features',0, 1.),
    }
    trials = Trials()
    best = fmin(score, param_space, algo=tpe.suggest, 
                trials=trials, 
                max_evals=niter,
                rstate=np.random.RandomState(random_state)
               )
    return best, trials

In [40]:
# Use niter=2 for minimal example
best_hyperparams, trials = optimize(niter=30) 

  0%|          | 0/30 [00:00<?, ?trial/s, best loss=?]


AttributeError: 'numpy.random.mtrand.RandomState' object has no attribute 'integers'

In [None]:
# HyperOpt `fmin` returns indexes for `choice` defined parameters 

np.arange(2, 8, dtype=int)[5], np.arange(2, 25, dtype=int)[1]

best_hyperparams

In [None]:
# Checkout trials object

print(trials.results)
print(trials.best_trial)
print(trials.idxs_vals)

In [None]:
gs.best_params_

In [None]:
rs.best_params_

In [None]:
rgn = RandomForestRegressor(n_estimators=10, min_samples_leaf=5, random_state=RANDOM_SEED, n_jobs=-1)
rgn.set_params(**gs.best_params_)
cross_val_score(rgn, X, Y['x'], cv=cv, scoring='neg_mean_squared_error').mean()

In [None]:
rgn = RandomForestRegressor(n_estimators=10, min_samples_leaf=5, random_state=RANDOM_SEED, n_jobs=-1)
rgn.set_params(**rs.best_params_)
cross_val_score(rgn, X, Y['x'], cv=cv, scoring='neg_mean_squared_error').mean()

In [None]:
#{'max_depth': 5, 'max_features': 0.9336701952987806, 'min_samples_split': 1}

rgn = RandomForestRegressor(n_estimators=10, min_samples_leaf=5, max_features=0.9336701952987806,
                            min_samples_split = 3, max_depth=7,
                            random_state=RANDOM_SEED, n_jobs=-1)
cross_val_score(rgn, X, Y['x'], cv=cv, scoring='neg_mean_squared_error').mean()