In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, ShuffleSplit
import pickle

import sys
sys.path.append('../okko')

from data_preprocess import load_and_preprocess_data


In [2]:
(train_x, train_y, test_x) = load_and_preprocess_data()

## Random Forest

In [3]:
dropped_features = ['apin_toluene', 'decane_toluene', 'C=C-C=O in non-aromatic ring', 'toluene',
    'NumOfN', 'nitroester', 'nitrate', 'NumOfAtoms', 'MW']
train_x = train_x.loc[:, ~train_x.columns.isin(dropped_features)]

In [None]:
try:
    with open('../pickles/rf_random_search_tuning_results.pickle', 'rb') as handle:
        rf_random_search_results = pickle.load(handle)
except:
    params = {
        'n_estimators': [int(x) for x in np.linspace(start=50, stop=1000, num=10)],  # Extended range
        'max_features': ['auto', 'sqrt', 'log2', 0.5, 0.8],  # Additional options
        'max_depth': [None] + [int(x) for x in np.linspace(10, 250, num=25)],  # Extended depth range
        'min_samples_split': [2, 5, 10, 15, 20],  # More values to explore
        'min_samples_leaf': [1, 2, 4, 8, 10],  # Additional values
        'bootstrap': [True, False]  # Bootstrap options
    }
    

    r_search = RandomizedSearchCV(RandomForestRegressor(n_jobs=-1), params, n_iter=500, scoring=['r2', 'neg_mean_squared_error'], verbose=3, refit=False)
    r_search.fit(train_x, train_y)
    rf_random_search_results = r_search.cv_results_

rf_random_search_tuning_results = pd.DataFrame({
    'R2': rf_random_search_results['mean_test_r2'],
    'MSE': rf_random_search_results['mean_test_neg_mean_squared_error'],
    'n_estimators': rf_random_search_results['param_n_estimators'],
    'max_features': rf_random_search_results['param_max_features'],
    'max_depth': rf_random_search_results['param_max_depth'],
    'min_samples_split': rf_random_search_results['param_min_samples_split'],
    'min_samples_leaf': rf_random_search_results['param_min_samples_leaf'],
    'bootstrap' : rf_random_search_results['param_bootstrap'],
    'rank R2': rf_random_search_results['rank_test_r2'],
    'rank MSE': rf_random_search_results['rank_test_neg_mean_squared_error'],
})

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
[CV 1/5] END bootstrap=False, max_depth=130, max_features=0.8, min_samples_leaf=10, min_samples_split=2, n_estimators=683; neg_mean_squared_error: (test=-2.672) r2: (test=0.730) total time=   8.4s
[CV 2/5] END bootstrap=False, max_depth=130, max_features=0.8, min_samples_leaf=10, min_samples_split=2, n_estimators=683; neg_mean_squared_error: (test=-2.738) r2: (test=0.719) total time=   8.2s
[CV 3/5] END bootstrap=False, max_depth=130, max_features=0.8, min_samples_leaf=10, min_samples_split=2, n_estimators=683; neg_mean_squared_error: (test=-2.754) r2: (test=0.718) total time=   8.1s
[CV 4/5] END bootstrap=False, max_depth=130, max_features=0.8, min_samples_leaf=10, min_samples_split=2, n_estimators=683; neg_mean_squared_error: (test=-2.804) r2: (test=0.714) total time=   8.2s
[CV 5/5] END bootstrap=False, max_depth=130, max_features=0.8, min_samples_leaf=10, min_samples_split=2, n_estimators=683; neg_mean_squared_error: (

470 fits failed out of a total of 2500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
470 fits failed with the following error:
Traceback (most recent call last):
  File "/tmp/ipykernel_110656/222628487.py", line 2, in <module>
    with open('../pickles/rf_random_search_tuning_results.pickle', 'rb') as handle:
  File "/home/niklaska/.virtualenvs/r-virtualenv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 324, in _modified_open
    return io_open(file, *args, **kwargs)
FileNotFoundError: [Errno 2] No such file or directory: '../pickles/rf_random_search_tuning_results.pickle'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/niklaska/.virtualenvs/r

In [6]:

print(rf_random_search_tuning_results.sort_values('rank R2'))

           R2       MSE  n_estimators max_features max_depth  \
434  0.734716 -2.586794           683          0.5        50   
122  0.734545 -2.588470          1000         log2        50   
251  0.734413 -2.589722           472          0.5       220   
65   0.734378 -2.590042          1000          0.5        90   
52   0.734296 -2.590893           577         sqrt      None   
..        ...       ...           ...          ...       ...   
200       NaN       NaN            50         auto       120   
204       NaN       NaN           155         auto      None   
373       NaN       NaN           683         auto        60   
219       NaN       NaN           788         auto        10   
249       NaN       NaN           366         auto       140   

     min_samples_split  min_samples_leaf  bootstrap  rank R2  rank MSE  
434                 15                 1       True        1         1  
122                 15                 1      False        2         2  
251         

In [None]:
try:
    with open('../pickles/rf_grid_search_tuning_results.pickle', 'rb') as handle:
        rf_grid_search_results = pickle.load(handle)
except:
    params = {
        'n_estimators': [int(x) for x in np.linspace(start=650, stop=700, num=5)],
        'max_features': [0.5],
        'max_depth': [int(x) for x in np.linspace(45, 55, num=1)],
        'min_samples_split': [14, 15, 16],
        'min_samples_leaf': [1, 2, 3],
        'bootstrap': [True]
    }

    g_search = GridSearchCV(RandomForestRegressor(n_jobs=-1), params, scoring=['r2', 'neg_mean_squared_error'], verbose=3, refit=False)
    g_search.fit(train_x, train_y)
    rf_grid_search_results = g_search.cv_results_

rf_grid_search_tuning_results = pd.DataFrame({
    'R2': rf_random_search_results['mean_test_r2'],
    'MSE': rf_random_search_results['mean_test_neg_mean_squared_error'],
    'n_estimators': rf_random_search_results['param_n_estimators'],
    'max_features': rf_random_search_results['param_max_features'],
    'max_depth': rf_random_search_results['param_max_depth'],
    'min_samples_split': rf_random_search_results['param_min_samples_split'],
    'min_samples_leaf': rf_random_search_results['param_min_samples_leaf'],
    'bootstrap' : rf_random_search_results['param_bootstrap'],
    'rank R2': rf_random_search_results['rank_test_r2'],
    'rank MSE': rf_random_search_results['rank_test_neg_mean_squared_error'],
})


Fitting 5 folds for each of 45 candidates, totalling 225 fits
[CV 1/5] END bootstrap=True, max_depth=45, max_features=0.5, min_samples_leaf=1, min_samples_split=14, n_estimators=650; neg_mean_squared_error: (test=-2.531) r2: (test=0.744) total time=   4.9s
[CV 2/5] END bootstrap=True, max_depth=45, max_features=0.5, min_samples_leaf=1, min_samples_split=14, n_estimators=650; neg_mean_squared_error: (test=-2.622) r2: (test=0.730) total time=   4.8s
[CV 3/5] END bootstrap=True, max_depth=45, max_features=0.5, min_samples_leaf=1, min_samples_split=14, n_estimators=650; neg_mean_squared_error: (test=-2.611) r2: (test=0.733) total time=   4.7s
[CV 4/5] END bootstrap=True, max_depth=45, max_features=0.5, min_samples_leaf=1, min_samples_split=14, n_estimators=650; neg_mean_squared_error: (test=-2.683) r2: (test=0.726) total time=   4.8s
[CV 5/5] END bootstrap=True, max_depth=45, max_features=0.5, min_samples_leaf=1, min_samples_split=14, n_estimators=650; neg_mean_squared_error: (test=-2.471)

  _data = np.array(data, dtype=dtype, copy=copy,


In [8]:

print(rf_grid_search_tuning_results.sort_values('rank R2'))

           R2       MSE  n_estimators max_features max_depth  \
434  0.734716 -2.586794           683          0.5        50   
122  0.734545 -2.588470          1000         log2        50   
251  0.734413 -2.589722           472          0.5       220   
65   0.734378 -2.590042          1000          0.5        90   
52   0.734296 -2.590893           577         sqrt      None   
..        ...       ...           ...          ...       ...   
200       NaN       NaN            50         auto       120   
204       NaN       NaN           155         auto      None   
373       NaN       NaN           683         auto        60   
219       NaN       NaN           788         auto        10   
249       NaN       NaN           366         auto       140   

     min_samples_split  min_samples_leaf  bootstrap  rank R2  rank MSE  
434                 15                 1       True        1         1  
122                 15                 1      False        2         2  
251         

## Pickle

In [5]:
with open('../pickles/rf_random_search_tuning_results.pickle', 'wb+') as handle:
    pickle.dump(r_search.cv_results_, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
with open('../pickles/rf_grid_search_tuning_results.pickle', 'wb+') as handle:
    pickle.dump(g_search.cv_results_, handle, protocol=pickle.HIGHEST_PROTOCOL)