In [44]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, ShuffleSplit
from data_preprocess import load_and_preprocess_data

In [2]:
# Load the prepared data via the project-local helper. train_x / train_y are the
# inputs later handed to `.fit(X, y)`; test_x is presumably the matching
# hold-out feature set (not used in the cells visible here - confirm).
train_x, train_y, test_x = load_and_preprocess_data()

In [22]:
# Randomized hyper-parameter search for the histogram gradient-boosting regressor.
#
# A fixed seed is passed both to the search (which draws the 100 candidates) and
# to the estimator (which uses randomness internally, e.g. feature subsampling
# when max_features < 1), so that re-running the cell yields the same winner.
HGB_SEARCH_SEED = 42

params = {
    'max_depth': np.arange(5, 16),                     # 5 .. 15
    'learning_rate': np.linspace(0.05, 0.2, num=10),   # 10 evenly spaced values
    'max_iter': np.arange(100, 1100, step=100),        # 100 .. 1000
    'min_samples_leaf': np.arange(10, 200, step=20),   # 10 .. 190
    'max_features': np.linspace(0.1, 1, num=10),       # fraction of features per split
    'max_leaf_nodes': np.arange(20, 200, step=20),     # 20 .. 180
}

# 'parentspecies' is declared categorical by column name, so train_x is expected
# to expose named columns (e.g. a DataFrame) - confirm against the loader.
hgb_model = HistGradientBoostingRegressor(
    categorical_features=['parentspecies'],
    random_state=HGB_SEARCH_SEED,
)

# No `cv` argument is given, so the search uses scikit-learn's default splitter.
r_search = RandomizedSearchCV(
    hgb_model,
    params,
    n_iter=100,
    scoring='r2',
    verbose=3,
    random_state=HGB_SEARCH_SEED,
)
r_search.fit(train_x, train_y)
print(r_search.best_estimator_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5] END learning_rate=0.08333333333333334, max_depth=10, max_features=0.2, max_iter=100, max_leaf_nodes=140, min_samples_leaf=110;, score=0.752 total time=   0.8s
[CV 2/5] END learning_rate=0.08333333333333334, max_depth=10, max_features=0.2, max_iter=100, max_leaf_nodes=140, min_samples_leaf=110;, score=0.736 total time=   0.6s
[CV 3/5] END learning_rate=0.08333333333333334, max_depth=10, max_features=0.2, max_iter=100, max_leaf_nodes=140, min_samples_leaf=110;, score=0.741 total time=   0.6s
[CV 4/5] END learning_rate=0.08333333333333334, max_depth=10, max_features=0.2, max_iter=100, max_leaf_nodes=140, min_samples_leaf=110;, score=0.733 total time=   0.6s
[CV 5/5] END learning_rate=0.08333333333333334, max_depth=10, max_features=0.2, max_iter=100, max_leaf_nodes=140, min_samples_leaf=110;, score=0.750 total time=   0.7s
[CV 1/5] END learning_rate=0.13333333333333336, max_depth=12, max_features=0.2, max_iter=600, max

In [11]:
# Preview five evenly spaced values between 1e-5 and 0.1 (both ends included).
print(list(np.linspace(1e-5, 0.1, num=5)))

[1e-05, 0.025007500000000002, 0.05000500000000001, 0.0750025, 0.1]


In [12]:
# Randomized hyper-parameter search for a scaled SVR pipeline.
#
# The grid holds 10 * 6 * 5 = 300 combinations and only 100 are sampled, so the
# search is seeded: without random_state a different subset is evaluated on
# every run and the reported best estimator is not reproducible.
SVR_SEARCH_SEED = 42

# Keys use the `svr__` prefix because make_pipeline names each step after its
# lowercased class name.
params = {
    'svr__C': list(np.linspace(0.4, 2, 10)),
    # gamma mixes numeric candidates with the 'scale' heuristic, so it has to be
    # a plain list (a NumPy array would coerce every entry to a string).
    'svr__gamma': list(np.linspace(0.00001, 0.1, 5)) + ['scale'],
    'svr__epsilon': list(np.linspace(0.01, 0.2, 5)),
}

svm_pipe = make_pipeline(StandardScaler(), SVR())
r_search = RandomizedSearchCV(
    svm_pipe,
    params,
    n_iter=100,
    scoring='r2',
    verbose=3,
    cv=3,
    random_state=SVR_SEARCH_SEED,
)
r_search.fit(train_x, train_y)
print(r_search.best_estimator_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV 1/3] END svr__C=2.0, svr__epsilon=0.0575, svr__gamma=scale;, score=0.750 total time=  21.8s
[CV 2/3] END svr__C=2.0, svr__epsilon=0.0575, svr__gamma=scale;, score=0.744 total time=  22.0s
[CV 3/3] END svr__C=2.0, svr__epsilon=0.0575, svr__gamma=scale;, score=0.745 total time=  21.2s
[CV 1/3] END svr__C=2.0, svr__epsilon=0.0575, svr__gamma=0.0750025;, score=0.746 total time=  22.4s
[CV 2/3] END svr__C=2.0, svr__epsilon=0.0575, svr__gamma=0.0750025;, score=0.740 total time=  22.5s
[CV 3/3] END svr__C=2.0, svr__epsilon=0.0575, svr__gamma=0.0750025;, score=0.742 total time=  22.1s
[CV 1/3] END svr__C=0.5777777777777778, svr__epsilon=0.01, svr__gamma=0.0750025;, score=0.740 total time=  22.9s
[CV 2/3] END svr__C=0.5777777777777778, svr__epsilon=0.01, svr__gamma=0.0750025;, score=0.733 total time=  23.0s
[CV 3/3] END svr__C=0.5777777777777778, svr__epsilon=0.01, svr__gamma=0.0750025;, score=0.736 total time=  23.2s
[CV 1/3] E

In [1]:
# Tabulate the randomized-search results: one row per sampled candidate with its
# mean cross-validated score, the sampled SVR parameters and its rank.
#
# NOTE: this cell needs `pd` from the import cell and `r_search` from the SVR
# search cell. Running it on a fresh kernel before those cells raises NameError.
search_results = r_search.cv_results_
svm_tuning_results = pd.DataFrame({
    'Mean score': search_results['mean_test_score'],
    'C': search_results['param_svr__C'],
    'epsilon': search_results['param_svr__epsilon'],
    'gamma': search_results['param_svr__gamma'],
    'rank': search_results['rank_test_score'],
})

# Rows of cv_results_ follow the order in which candidates were sampled, so a
# plain [:10] slice shows ten arbitrary candidates. Sort by rank (1 = best) to
# preview the ten best ones; the stored frame itself is left in sampling order.
print(svm_tuning_results.sort_values('rank').head(10))

NameError: name 'pd' is not defined