In [31]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, ShuffleSplit
from data_preprocess import load_and_preprocess_data
import pickle

In [2]:
# Unpack the three objects returned by the project's preprocessing helper
# (presumably training features, training targets and test features -- confirm in data_preprocess).
train_x, train_y, test_x = load_and_preprocess_data()

## Gradient Boosting

In [None]:
# Columns removed from the training frame before hyper-parameter tuning.
dropped_features = [
    'apin_toluene',
    'decane_toluene',
    'C=C-C=O in non-aromatic ring',
    'toluene',
    'NumOfN',
    'nitroester',
    'nitrate',
    'NumOfAtoms',
    'MW',
]
# Keep every column whose name is not listed above; listed names that are
# absent from train_x are simply ignored by the boolean mask.
train_x = train_x.loc[:, np.logical_not(train_x.columns.isin(dropped_features))]

In [44]:
# Randomised hyper-parameter search for HistGradientBoostingRegressor.
# The raw cv_results_ dict is cached as a pickle; the search itself only runs
# when that cache cannot be read.
try:
    # NOTE: pickle.load can execute arbitrary code -- only load cache files
    # that were produced by this notebook.
    with open('../pickles/gb_random_search_tuning_results.pickle', 'rb') as handle:
        gb_random_search_results = pickle.load(handle)
except (OSError, EOFError, pickle.UnpicklingError):
    # Cache missing or unreadable: run the search from scratch.
    # (A bare `except:` would also swallow KeyboardInterrupt and then start
    # the long-running search, so only cache-related errors are caught.)
    params = {
        'max_depth': [5, 10, 15, None],
        'learning_rate': [0.05, 0.1, 0.15],
        'max_iter': [100, 1000, 1500],
        'min_samples_leaf': [20, 60, 100],
        'max_features': [0.5, 0.75, 1.0],
        'max_leaf_nodes': [20, 31, 100, None]
    }

    # refit=False: only the cross-validation scores are used below, so no
    # final model is refitted on the full training set.
    # NOTE(review): no random_state is set, so the sampled candidates (and the
    # resulting scores) are not reproducible between runs.
    r_search = RandomizedSearchCV(
        HistGradientBoostingRegressor(),
        params,
        n_iter=500,
        scoring=['r2', 'neg_mean_squared_error'],
        verbose=3,
        refit=False,
    )
    r_search.fit(train_x, train_y)
    gb_random_search_results = r_search.cv_results_

# Flatten the cv_results_ dict into one row per sampled candidate.
# The 'MSE' column holds the *negated* mean squared error (higher is better).
gb_random_search_tuning_results = pd.DataFrame({
    'R2': gb_random_search_results['mean_test_r2'],
    'MSE': gb_random_search_results['mean_test_neg_mean_squared_error'],
    'max_depth': gb_random_search_results['param_max_depth'],
    'max_iter': gb_random_search_results['param_max_iter'],
    'max_features': gb_random_search_results['param_max_features'],
    'learning_rate': gb_random_search_results['param_learning_rate'],
    'max_leaf_nodes': gb_random_search_results['param_max_leaf_nodes'],
    'min_samples_leaf': gb_random_search_results['param_min_samples_leaf'],
    'rank R2': gb_random_search_results['rank_test_r2'],
    'rank MSE': gb_random_search_results['rank_test_neg_mean_squared_error'],
})
print(gb_random_search_tuning_results.sort_values('rank R2'))

           R2       MSE max_depth  max_iter  max_features  leraning_rate  \
55   0.746955 -2.467405         5      1500           0.5           0.05   
87   0.746943 -2.467667        15      1500           0.5           0.05   
108  0.746796 -2.468978        10      1500           0.5           0.05   
166  0.746786 -2.469194      None      1500           0.5           0.05   
15   0.746741 -2.469502        10      1500           0.5           0.05   
..        ...       ...       ...       ...           ...            ...   
341  0.725771 -2.673996         5       100           0.5           0.05   
158  0.725659 -2.675150         5       100           0.5           0.05   
39   0.724594 -2.685317      None       100           1.0           0.15   
174  0.724102 -2.690233      None      1500           1.0           0.15   
368  0.723222 -2.698604      None      1000           1.0           0.15   

    max_leaf_nodes  min_samples_leaf  rank R2  rank MSE  
55              31           

In [50]:
# Exhaustive grid search for HistGradientBoostingRegressor over a narrower
# parameter grid. The raw cv_results_ dict is cached as a pickle; the search
# only runs when that cache cannot be read.
try:
    # NOTE: pickle.load can execute arbitrary code -- only load cache files
    # that were produced by this notebook.
    with open('../pickles/gb_grid_search_tuning_results.pickle', 'rb') as handle:
        gb_grid_search_results = pickle.load(handle)
except (OSError, EOFError, pickle.UnpicklingError):
    # Cache missing or unreadable: run the search from scratch.
    # (A bare `except:` would also swallow KeyboardInterrupt and then start
    # the long-running search, so only cache-related errors are caught.)
    # Grid size: 7 learning rates x 5 max_iter values (1000..1400) x
    # 5 min_samples_leaf values (50..90) x 6 max_features values = 1050 candidates.
    params = {
        'max_depth': [None],
        'learning_rate': np.linspace(0.01, 0.07, num=7),
        'max_iter': np.arange(1000, 1500, step=100),
        'min_samples_leaf': np.arange(50, 100, step=10),
        'max_features': np.linspace(0.4, 0.9, 6),
        'max_leaf_nodes': [31]
    }

    # refit=False: only the cross-validation scores are used below, so no
    # final model is refitted on the full training set.
    g_search = GridSearchCV(
        HistGradientBoostingRegressor(),
        params,
        scoring=['r2', 'neg_mean_squared_error'],
        verbose=3,
        refit=False,
    )
    g_search.fit(train_x, train_y)
    gb_grid_search_results = g_search.cv_results_

# Flatten the cv_results_ dict into one row per grid point.
# The 'MSE' column holds the *negated* mean squared error (higher is better).
gb_grid_search_tuning_results = pd.DataFrame({
    'R2': gb_grid_search_results['mean_test_r2'],
    'MSE': gb_grid_search_results['mean_test_neg_mean_squared_error'],
    'max_depth': gb_grid_search_results['param_max_depth'],
    'max_iter': gb_grid_search_results['param_max_iter'],
    'max_features': gb_grid_search_results['param_max_features'],
    'learning_rate': gb_grid_search_results['param_learning_rate'],
    'max_leaf_nodes': gb_grid_search_results['param_max_leaf_nodes'],
    'min_samples_leaf': gb_grid_search_results['param_min_samples_leaf'],
    'rank R2': gb_grid_search_results['rank_test_r2'],
    'rank MSE': gb_grid_search_results['rank_test_neg_mean_squared_error'],
})
print(gb_grid_search_tuning_results.sort_values('rank R2'))

           R2       MSE max_depth  max_iter  max_features  learning_rate  \
336  0.748433 -2.452987      None      1200           0.5           0.03   
451  0.748010 -2.457181      None      1000           0.4           0.04   
21   0.747888 -2.458278      None      1400           0.4           0.01   
603  0.747862 -2.458545      None      1000           0.4           0.05   
346  0.747814 -2.459080      None      1400           0.5           0.03   
..        ...       ...       ...       ...           ...            ...   
101  0.744538 -2.490917      None      1000           0.8           0.01   
295  0.744414 -2.492132      None      1400           0.9           0.02   
126  0.744351 -2.492727      None      1000           0.9           0.01   
406  0.744196 -2.494383      None      1100           0.8           0.03   
125  0.744120 -2.495000      None      1000           0.9           0.01   

     max_leaf_nodes  min_samples_leaf  rank R2  rank MSE  
336              31         

## Pickle

In [45]:
# Persist the random-search results so the tuning cell can reload them instead
# of re-running the search.
# Dump gb_random_search_results rather than r_search.cv_results_: r_search only
# exists when the search was actually run in this kernel, whereas
# gb_random_search_results is assigned on both the cache-hit and search paths.
with open('../pickles/gb_random_search_tuning_results.pickle', 'wb') as handle:
    pickle.dump(gb_random_search_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [49]:
# Persist the grid-search results so the tuning cell can reload them instead
# of re-running the search.
# Dump gb_grid_search_results rather than g_search.cv_results_: g_search only
# exists when the search was actually run in this kernel, whereas
# gb_grid_search_results is assigned on both the cache-hit and search paths.
with open('../pickles/gb_grid_search_tuning_results.pickle', 'wb') as handle:
    pickle.dump(gb_grid_search_results, handle, protocol=pickle.HIGHEST_PROTOCOL)