In [23]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.svm import SVR
import pickle
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from data_preprocess import load_and_preprocess_data

In [7]:
(train_x, train_y, test_x) = load_and_preprocess_data()

In [37]:
dropped_features = [
    "decane",
    "aromatic hydroxyl",
    "C=C-C=O in non-aromatic ring",
    "MW",
    "apin_toluene",
    "nitro",
    "apin_decane",
    "nitroester",
    "apin_decane_toluene"
]
train_x_bf = train_x.loc[:, ~train_x.columns.isin(dropped_features)]

In [38]:
params = {
    'svr__kernel': ['rbf'],
    'svr__gamma': ['auto', 'scale', 0.01      , 0.02689655, 0.0437931 , 0.06068966, 0.07758621,
       0.09448276, 0.11137931, 0.12827586, 0.14517241, 0.16206897,
       0.17896552, 0.19586207, 0.21275862, 0.22965517, 0.24655172,
       0.26344828, 0.28034483, 0.29724138, 0.31413793, 0.33103448,
       0.34793103, 0.36482759, 0.38172414, 0.39862069, 0.41551724,
       0.43241379, 0.44931034, 0.4662069 , 0.48310345, 0.5]
}

svr_pipe = make_pipeline(StandardScaler(), SVR())
grid_search_gamma = GridSearchCV(svr_pipe, params, cv=5, scoring=['r2', 'neg_mean_squared_error'], verbose=3, refit=False)
grid_search_gamma.fit(train_x_bf, train_y)
print(grid_search_gamma.best_estimator_)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV 1/5] END svr__gamma=auto, svr__kernel=rbf; neg_mean_squared_error: (test=-2.371) r2: (test=0.760) total time=  20.3s
[CV 2/5] END svr__gamma=auto, svr__kernel=rbf; neg_mean_squared_error: (test=-2.527) r2: (test=0.740) total time=  26.5s
[CV 3/5] END svr__gamma=auto, svr__kernel=rbf; neg_mean_squared_error: (test=-2.478) r2: (test=0.746) total time=  25.4s
[CV 4/5] END svr__gamma=auto, svr__kernel=rbf; neg_mean_squared_error: (test=-2.570) r2: (test=0.738) total time=  26.3s
[CV 5/5] END svr__gamma=auto, svr__kernel=rbf; neg_mean_squared_error: (test=-2.358) r2: (test=0.753) total time=  26.0s
[CV 1/5] END svr__gamma=scale, svr__kernel=rbf; neg_mean_squared_error: (test=-2.371) r2: (test=0.760) total time=  26.2s
[CV 2/5] END svr__gamma=scale, svr__kernel=rbf; neg_mean_squared_error: (test=-2.527) r2: (test=0.740) total time=  26.1s
[CV 3/5] END svr__gamma=scale, svr__kernel=rbf; neg_mean_squared_error: (test=-2.478) r2:

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [13]:
svr_grid_search_results = grid_search_gamma.cv_results_

pca_random_search_tuning_results = pd.DataFrame({
    'R2': svr_grid_search_results['mean_test_r2'],
    'MSE': svr_grid_search_results['mean_test_neg_mean_squared_error'],
    'rank R2': svr_grid_search_results['rank_test_r2'],
    'rank MSE': svr_grid_search_results['rank_test_neg_mean_squared_error'],
    'gamma': svr_grid_search_results['param_svr__gamma'],
    'epsilon': svr_grid_search_results['param_svr__epsilon'],
    'C': svr_grid_search_results['param_svr__C'],
})


{'mean_fit_time': array([29.21525097, 28.72960768, 28.41681471, 28.89358807, 29.52513909,
       25.59901581, 24.03620214, 26.41210961, 30.72962294, 31.61832638,
       29.21788683, 30.50271811, 31.75164747, 32.49174323, 35.41562409,
       37.03133068, 38.45033655, 38.16915021, 38.69365373, 39.68196025,
       39.16915975, 39.67800107, 38.88467512, 38.29413991, 40.37333965,
       45.74591007, 38.81836982, 38.86622672, 39.21942453, 39.31421947,
       38.33761559, 39.22983546]), 'std_fit_time': array([0.48062439, 0.18526299, 0.59999663, 0.22015486, 0.17120931,
       2.5659253 , 0.03536535, 3.41504887, 0.21231975, 3.40866708,
       0.27029799, 0.34095544, 1.40483114, 0.34524318, 0.83403222,
       0.67281152, 1.33529989, 0.37704037, 0.2150765 , 1.84992695,
       0.18167089, 0.48222297, 0.25387597, 0.6436727 , 4.28114743,
       0.23387254, 1.53852594, 1.92667833, 2.08364939, 1.63943082,
       0.38566229, 1.51385975]), 'mean_score_time': array([8.1571353 , 8.18880658, 8.40856647, 8.

In [10]:
params = {
    'svr__kernel': ['rbf'],
    'svr__gamma': ['scale', 0.03, 0.0375, 0.045],
    'svr__epsilon': np.linspace(0.1, 1, num=15),
    'svr__C': np.linspace(2, 6, num=15)
}

svr_pipe = make_pipeline(StandardScaler(), SVR())
rs_svr = RandomizedSearchCV(svr_pipe, params, cv=5, n_iter=150, scoring=['r2', 'neg_mean_squared_error'], verbose=3, refit=False)
rs_svr.fit(train_x_bf, train_y)

Fitting 5 folds for each of 150 candidates, totalling 750 fits
[CV 1/5] END svr__C=5.142857142857142, svr__epsilon=1.0, svr__gamma=scale, svr__kernel=rbf; neg_mean_squared_error: (test=-2.361) r2: (test=0.761) total time=  12.4s
[CV 2/5] END svr__C=5.142857142857142, svr__epsilon=1.0, svr__gamma=scale, svr__kernel=rbf; neg_mean_squared_error: (test=-2.486) r2: (test=0.744) total time=  11.6s
[CV 3/5] END svr__C=5.142857142857142, svr__epsilon=1.0, svr__gamma=scale, svr__kernel=rbf; neg_mean_squared_error: (test=-2.424) r2: (test=0.752) total time=  12.5s
[CV 4/5] END svr__C=5.142857142857142, svr__epsilon=1.0, svr__gamma=scale, svr__kernel=rbf; neg_mean_squared_error: (test=-2.576) r2: (test=0.737) total time=  12.7s
[CV 5/5] END svr__C=5.142857142857142, svr__epsilon=1.0, svr__gamma=scale, svr__kernel=rbf; neg_mean_squared_error: (test=-2.311) r2: (test=0.758) total time=  12.0s
[CV 1/5] END svr__C=6.0, svr__epsilon=0.2928571428571429, svr__gamma=scale, svr__kernel=rbf; neg_mean_squar

In [52]:
rs_svr_results = rs_svr.cv_results_

pca_random_search_tuning_results = pd.DataFrame({
    'R2': rs_svr_results['mean_test_r2'],
    'MSE': rs_svr_results['mean_test_neg_mean_squared_error'],
    'rank R2': rs_svr_results['rank_test_r2'],
    'rank MSE': rs_svr_results['rank_test_neg_mean_squared_error'],
    'gamma': rs_svr_results['param_svr__gamma'],
    'epsilon': rs_svr_results['param_svr__epsilon'],
    'C': rs_svr_results['param_svr__C']
})

pca_random_search_tuning_results.sort_values('rank R2').head(10)

Unnamed: 0,R2,MSE,rank R2,rank MSE,gamma,epsilon,C
36,0.753483,-2.403737,1,1,0.025,0.678571,8.0
32,0.753454,-2.404012,2,2,0.025,0.678571,7.611111
28,0.753421,-2.404327,3,3,0.025,0.678571,7.222222
24,0.753366,-2.404871,4,4,0.025,0.678571,6.833333
37,0.753326,-2.405293,5,5,0.028333,0.678571,8.0
33,0.753305,-2.405493,6,6,0.028333,0.678571,7.611111
20,0.753272,-2.405781,7,7,0.025,0.678571,6.444444
29,0.753267,-2.405861,8,8,0.028333,0.678571,7.222222
25,0.753226,-2.406258,9,9,0.028333,0.678571,6.833333
21,0.753187,-2.40663,10,10,0.028333,0.678571,6.444444


In [32]:
params = {
    'svr__kernel': ['rbf'],
    'svr__gamma': np.linspace(0.025, 0.035, num=4),
    'svr__epsilon': [0.678571],
    'svr__C': np.linspace(4.5, 8, num=10)
}


svr_pipe = make_pipeline(StandardScaler(), SVR())
rs_svr = RandomizedSearchCV(svr_pipe, params, cv=5, n_iter=50, scoring=['r2', 'neg_mean_squared_error'], verbose=3, refit=False)
rs_svr.fit(train_x_bf, train_y)



Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END svr__C=4.5, svr__epsilon=0.678571, svr__gamma=0.025, svr__kernel=rbf; neg_mean_squared_error: (test=-2.324) r2: (test=0.765) total time=  15.3s
[CV 2/5] END svr__C=4.5, svr__epsilon=0.678571, svr__gamma=0.025, svr__kernel=rbf; neg_mean_squared_error: (test=-2.474) r2: (test=0.746) total time=  24.8s
[CV 3/5] END svr__C=4.5, svr__epsilon=0.678571, svr__gamma=0.025, svr__kernel=rbf; neg_mean_squared_error: (test=-2.413) r2: (test=0.753) total time=  26.0s
[CV 4/5] END svr__C=4.5, svr__epsilon=0.678571, svr__gamma=0.025, svr__kernel=rbf; neg_mean_squared_error: (test=-2.545) r2: (test=0.740) total time=  26.4s
[CV 5/5] END svr__C=4.5, svr__epsilon=0.678571, svr__gamma=0.025, svr__kernel=rbf; neg_mean_squared_error: (test=-2.311) r2: (test=0.758) total time=  25.3s
[CV 1/5] END svr__C=4.5, svr__epsilon=0.678571, svr__gamma=0.028333333333333335, svr__kernel=rbf; neg_mean_squared_error: (test=-2.325) r2: (test=0.765) 

In [50]:
rs_svr_results = rs_svr.cv_results_

rs_search_tuning_results = pd.DataFrame({
    'R2': rs_svr_results['mean_test_r2'],
    'MSE': rs_svr_results['mean_test_neg_mean_squared_error'],
    'rank R2': rs_svr_results['rank_test_r2'],
    'rank MSE': rs_svr_results['rank_test_neg_mean_squared_error'],
    'gamma': rs_svr_results['param_svr__gamma'],
    'epsilon': rs_svr_results['param_svr__epsilon'],
    'C': rs_svr_results['param_svr__C']
})

rs_search_tuning_results.sort_values('rank R2').head(10)

Unnamed: 0,R2,MSE,rank R2,rank MSE,gamma,epsilon,C
36,0.753483,-2.403737,1,1,0.025,0.678571,8.0
32,0.753454,-2.404012,2,2,0.025,0.678571,7.611111
28,0.753421,-2.404327,3,3,0.025,0.678571,7.222222
24,0.753366,-2.404871,4,4,0.025,0.678571,6.833333
37,0.753326,-2.405293,5,5,0.028333,0.678571,8.0
33,0.753305,-2.405493,6,6,0.028333,0.678571,7.611111
20,0.753272,-2.405781,7,7,0.025,0.678571,6.444444
29,0.753267,-2.405861,8,8,0.028333,0.678571,7.222222
25,0.753226,-2.406258,9,9,0.028333,0.678571,6.833333
21,0.753187,-2.40663,10,10,0.028333,0.678571,6.444444


In [48]:
params = {
    'svr__kernel': ['rbf'],
    'svr__gamma': np.linspace(0.020, 0.027, num=4),
    'svr__epsilon': [0.678571],
    'svr__C': np.linspace(8, 10, num=10)
}
try:
    with open('../pickles/svm_feature_importances_tuning_results.pickle', 'rb') as handle:
            rs_svr_2_results = pickle.load(handle)
except:
    svr_pipe = make_pipeline(StandardScaler(), SVR())
    rs_svr_2 = GridSearchCV(svr_pipe, params, cv=5, scoring=['r2', 'neg_mean_squared_error'], verbose=3, refit=False)
    rs_svr_2.fit(train_x_bf, train_y)
    rs_svr_2_results = rs_svr_2.cv_results_

In [51]:
rs_svr_2_results = rs_svr_2.cv_results_

rs_search_tuning_results_2 = pd.DataFrame({
    'R2': rs_svr_2_results['mean_test_r2'],
    'MSE': rs_svr_2_results['mean_test_neg_mean_squared_error'],
    'rank R2': rs_svr_2_results['rank_test_r2'],
    'rank MSE': rs_svr_2_results['rank_test_neg_mean_squared_error'],
    'gamma': rs_svr_2_results['param_svr__gamma'],
    'epsilon': rs_svr_2_results['param_svr__epsilon'],
    'C': rs_svr_2_results['param_svr__C']
})

rs_search_tuning_results_2.sort_values('rank R2').head(10)

Unnamed: 0,R2,MSE,rank R2,rank MSE,gamma,epsilon,C
37,0.75375,-2.401117,1,1,0.022333,0.678571,10.0
33,0.753735,-2.401265,2,2,0.022333,0.678571,9.777778
29,0.753715,-2.401461,3,3,0.022333,0.678571,9.555556
38,0.753697,-2.401653,4,4,0.024667,0.678571,10.0
25,0.753692,-2.401689,5,5,0.022333,0.678571,9.333333
34,0.753673,-2.401885,6,6,0.024667,0.678571,9.777778
21,0.753661,-2.40199,7,7,0.022333,0.678571,9.111111
36,0.753659,-2.402,8,8,0.02,0.678571,10.0
30,0.75365,-2.402113,9,9,0.024667,0.678571,9.555556
17,0.753629,-2.402293,10,10,0.022333,0.678571,8.888889


In [33]:
svm_pipe = make_pipeline(StandardScaler(), SVR(C=10, epsilon=0.678571, gamma=0.022333))
svm_scores = cross_val_score(svm_pipe, train_x_bf, train_y, cv=10, scoring='r2')

In [36]:
np.mean(svm_scores)

0.7551369812780145

### Pickle

In [43]:
with open('../pickles/svm_feature_importances_tuning_results.pickle', 'wb+') as handle:
    pickle.dump(rs_svr_2_results, handle, protocol=pickle.HIGHEST_PROTOCOL)