In [1]:
import numpy as np
import pandas as pd #to read dataset
#for graphics of best model
import matplotlib.pyplot as plt
import seaborn as sns
#import tensorflow as tf
from tensorflow import keras

# random forest
from sklearn.ensemble import RandomForestRegressor

# evaluation metrics
from sklearn.metrics import mean_squared_error #mean_squared_error(y_true, y_pred)
from sklearn.metrics import r2_score #r2_score(y_true, y_pred)

In [2]:
#Dataset 
df = pd.read_csv('12kr2.csv')
#print(df.columns)
#print(df.shape)
#print(df.dtypes)

In [3]:
#first, separate the full df into 2 by rows assigning some % of the rows to training and some to testing, for example by

train = df.sample(frac=0.8, random_state=200)
test = df.drop(train.index)

#in this way you slice your data horizontally.

#then, identify which columns should be in your training data (X) and which in your training labels(Y). Then explicitly select them:

X_train = train[['MONOISOTOPIC_MASS', 'Number of Carbons', 'Number of Fluorines',
       'Contains N', 'Contains O',
       'ATMOSPHERIC_HYDROXYLATION_RATE_(AOH)_CM3/MOLECULE*SEC_OPERA_PRED',
       'BIODEGRADATION_HALF_LIFE_DAYS_DAYS_OPERA_PRED',
       'BOILING_POINT_DEGC_OPERA_PRED', 'HENRYS_LAW_ATM-M3/MOLE_OPERA_PRED',
       'OPERA_KM_DAYS_OPERA_PRED',
       'OCTANOL_AIR_PARTITION_COEFF_LOGKOA_OPERA_PRED',
       'SOIL_ADSORPTION_COEFFICIENT_KOC_L/KG_OPERA_PRED',
       'OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED',
       'MELTING_POINT_DEGC_OPERA_PRED', 'VAPOR_PRESSURE_MMHG_OPERA_PRED',
       'WATER_SOLUBILITY_MOL/L_OPERA_PRED', 'Acid Group', 'Hyde Group',
       'Carbon Ring Group', 'Alcohol Group', 'Ether Linkage',
       'Linear Vs. Branched', 'ide', 'ate', 'ite', 'sodium', 'polymer',
       'telomer', 'nitrile', 'Bis', 'Amide', 'Amine', 'Si', 'Sulfur',]].values.astype(float)
y_train = train['BIOCONCENTRATION_FACTOR_OPERA_PRED'].values.astype(float) 

#Note that columns should be the same for training and testing, including the order:

X_test = test[['MONOISOTOPIC_MASS', 'Number of Carbons', 'Number of Fluorines',
       'Contains N', 'Contains O',
       'ATMOSPHERIC_HYDROXYLATION_RATE_(AOH)_CM3/MOLECULE*SEC_OPERA_PRED',
       'BIODEGRADATION_HALF_LIFE_DAYS_DAYS_OPERA_PRED',
       'BOILING_POINT_DEGC_OPERA_PRED', 'HENRYS_LAW_ATM-M3/MOLE_OPERA_PRED',
       'OPERA_KM_DAYS_OPERA_PRED',
       'OCTANOL_AIR_PARTITION_COEFF_LOGKOA_OPERA_PRED',
       'SOIL_ADSORPTION_COEFFICIENT_KOC_L/KG_OPERA_PRED',
       'OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED',
       'MELTING_POINT_DEGC_OPERA_PRED', 'VAPOR_PRESSURE_MMHG_OPERA_PRED',
       'WATER_SOLUBILITY_MOL/L_OPERA_PRED', 'Acid Group', 'Hyde Group',
       'Carbon Ring Group', 'Alcohol Group', 'Ether Linkage',
       'Linear Vs. Branched', 'ide', 'ate', 'ite', 'sodium', 'polymer',
       'telomer', 'nitrile', 'Bis', 'Amide', 'Amine', 'Si', 'Sulfur',]].values.astype(float)
y_test = test['BIOCONCENTRATION_FACTOR_OPERA_PRED'].values.astype(float)

In [4]:
#Sherpa parameter optimization
import time
import sherpa
import sherpa.algorithms.bayesian_optimization as bayesian_optimization

In [5]:
#SHERPA Parameter Optimization
from sklearn.model_selection import cross_val_score
parameters = [sherpa.Discrete('n_estimators', [2, 100]),
              #sherpa.Choice('criterion', ['mse', 'rmse']),
              sherpa.Continuous('max_features', [0.1, 0.9])]

algorithm = bayesian_optimization.GPyOpt(max_concurrent=1,
                                         model_type='GP_MCMC',
                                         acquisition_type='EI_MCMC',
                                         max_num_trials=10)

study = sherpa.Study(parameters=parameters,
                     algorithm=algorithm,
                     lower_is_better=False)

for trial in study:
    print("Trial ", trial.id, " with parameters ", trial.parameters)
    regressor = RandomForestRegressor(max_features=trial.parameters['max_features'],
                                 n_estimators=trial.parameters['n_estimators'],
                                 random_state=0)
    scores = cross_val_score(regressor, X_train, y_train, cv=5)
    print("Score: ", scores.mean())
    study.add_observation(trial, iteration=1, objective=scores.mean())
    study.finalize(trial)
print(study.get_best_result())

INFO:sherpa.core:
-------------------------------------------------------
SHERPA Dashboard running. Access via
http://192.168.1.197:8882 if on a cluster or
http://localhost:8882 if running locally.
-------------------------------------------------------


 * Serving Flask app 'sherpa.app.app' (lazy loading)
 * Environment: production
 * Debug mode: on
[2m   Use a production WSGI server instead.[0m




Trial  1  with parameters  {'n_estimators': 90, 'max_features': 0.32052568705791107}
Score:  0.3100507899781772
Trial  2  with parameters  {'n_estimators': 51, 'max_features': 0.7509349997317084}
Score:  0.2046020749194871
Trial  3  with parameters  {'n_estimators': 17, 'max_features': 0.2746228948698371}


INFO:GP:initializing Y
INFO:GP:initializing inference method
INFO:GP:adding kernel and likelihood as parameters


Score:  0.23272121550412192




Trial  4  with parameters  {'n_estimators': 90.0, 'max_features': 0.32037333716365735}
Score:  nan


KeyError: nan