In [1]:
import numpy as np
import pandas as pd
import math

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [5]:
# Read the solubility table and the feature table, in that order.
df, df_f = (pd.read_csv(csv_path) for csv_path in ('data_combined.csv', 'features2.csv'))

In [6]:
# Preview the table read from data_combined.csv (pandas abbreviates the middle rows).
df

Unnamed: 0,solubility,SMILES
0,-2.1800,ClCC(Cl)(Cl)Cl
1,-2.0000,CC(Cl)(Cl)Cl
2,-1.7400,ClC(Cl)C(Cl)Cl
3,-1.4800,ClCC(Cl)Cl
4,-3.0400,FC(F)(Cl)C(F)(Cl)Cl
...,...,...
4490,-3.3319,NS(=O)(=O)c1ccc(C(=O)c2ccc(CNCc3ccccc3)cc2)s1
4491,-2.1669,CCCCNCc1ccc(C(=O)c2ccc(S(N)(=O)=O)s2)cc1
4492,-1.4812,NS(=O)(=O)c1ccc(C(=O)c2ccc(CN3CCOCC3)cc2)s1
4493,-1.8802,CN1CCN(Cc2ccc(C(=O)c3ccc(S(N)(=O)=O)s3)cc2)CC1


In [7]:
# Preview the feature table read from features2.csv (pandas abbreviates the middle rows).
df_f

Unnamed: 0,MolLogP,MolWt,NumRotatableBonds,num_aromatic_atoms,num_heavy_atoms,Aromatic_Proportion,valence_electrons,num_aromatic_rings,exact_MolWt,heavyatomWt,LASA
0,2.5954,167.850,0.0,0,6,0.000000,38,0,165.891061,165.834,56.317258
1,2.3765,133.405,0.0,0,5,0.000000,32,0,131.930033,130.381,46.013992
2,2.5938,167.850,1.0,0,6,0.000000,38,0,165.891061,165.834,56.317258
3,2.0289,133.405,1.0,0,5,0.000000,32,0,131.930033,130.381,46.013992
4,2.9189,187.375,1.0,0,8,0.000000,50,0,185.901768,187.375,58.510598
...,...,...,...,...,...,...,...,...,...,...,...
4490,2.9163,386.498,7.0,17,26,0.653846,134,3,386.075884,368.354,156.092832
4491,2.5162,352.481,8.0,11,23,0.478261,124,2,352.091535,332.321,140.130564
4492,1.4587,366.464,5.0,11,24,0.458333,128,2,366.070799,348.320,144.448250
4493,1.3739,379.507,5.0,11,25,0.440000,134,2,379.102434,358.339,151.459480


In [8]:
## Set X and y variables
# X: the feature table loaded from features2.csv, used as-is.
X = df_f
# y: the solubility column selected as a 1-D Series. Dropping SMILES instead
# leaves a one-column DataFrame, i.e. a column-vector target, which
# scikit-learn's forest regressors flag with a DataConversionWarning at fit time.
y = df['solubility']

In [9]:
## Train test split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=202,
    test_size=0.2,
)

In [10]:
## Standard scaler; it is fitted further down on the training features
## and then reused to transform the test features.
scaler = StandardScaler()

## Baseline predictions

In [12]:
# Number of rows in the target.
len(y)

4495

In [37]:
## Baseline based on mean
# Predict the mean of y for every sample. np.full builds a single array with
# the same shape as y instead of a Python list holding len(y) copies of the
# y.mean() result.
# NOTE(review): this baseline is computed on all of y, while the models below
# are scored on the train/test split - confirm the comparison is intended.
y_pred_baseline_mean = np.full(y.shape, y.mean())

In [41]:
# Score the mean-value baseline against y: MAE, MSE, RMSE (square root of the MSE) and R^2.
mse_baseline = mean_squared_error(y, y_pred_baseline_mean)
rmse_baseline = math.sqrt(mse_baseline)
mae_baseline = mean_absolute_error(y, y_pred_baseline_mean)
r2_baseline = r2_score(y, y_pred_baseline_mean)

# Print one labelled line per score, in a fixed order.
for label, score in (
    ('Baseline MAE: ', mae_baseline),
    ('Baseline MSE: ', mse_baseline),
    ('Baseline RMSE: ', rmse_baseline),
    ('Baseline r2: ', r2_baseline),
):
    print(label, score)

Baseline MAE:  1.5737798933681106
Baseline MSE:  4.184337003741072
Baseline RMSE:  2.045565203981792
Baseline r2:  2.220446049250313e-16


In [47]:
## Baseline from the Delaney research paper: load the table that holds its
## measured and ESOL-predicted log-solubility columns.
df_delaney = pd.read_csv('data_delaney.csv')

In [49]:
# Show the first rows of the Delaney table to check its columns.
df_delaney.head()

Unnamed: 0,Compound ID,measured log(solubility:mol/L),ESOL predicted log(solubility:mol/L),SMILES
0,"1,1,1,2-Tetrachloroethane",-2.18,-2.794,ClCC(Cl)(Cl)Cl
1,"1,1,1-Trichloroethane",-2.0,-2.232,CC(Cl)(Cl)Cl
2,"1,1,2,2-Tetrachloroethane",-1.74,-2.549,ClC(Cl)C(Cl)Cl
3,"1,1,2-Trichloroethane",-1.48,-1.961,ClCC(Cl)Cl
4,"1,1,2-Trichlorotrifluoroethane",-3.04,-3.077,FC(F)(Cl)C(F)(Cl)Cl


In [50]:
# Pull the measured values and the ESOL predictions out of the Delaney table.
y_pred_delaney = df_delaney.loc[:, 'ESOL predicted log(solubility:mol/L)']
y_delaney = df_delaney.loc[:, 'measured log(solubility:mol/L)']

In [51]:
# Score the ESOL predictions against the measured values.
# Note: this reassigns the *_baseline names used for the mean baseline.
mse_baseline = mean_squared_error(y_delaney, y_pred_delaney)
rmse_baseline = math.sqrt(mse_baseline)
mae_baseline = mean_absolute_error(y_delaney, y_pred_delaney)
r2_baseline = r2_score(y_delaney, y_pred_delaney)

# Print one labelled line per score, in a fixed order.
for label, score in (
    ('Baseline MAE: ', mae_baseline),
    ('Baseline MSE: ', mse_baseline),
    ('Baseline RMSE: ', rmse_baseline),
    ('Baseline r2: ', r2_baseline),
):
    print(label, score)

Baseline MAE:  0.6944676573426574
Baseline MSE:  0.8217520533216783
Baseline RMSE:  0.9065054072214233
Baseline r2:  0.8128757557709796


## Linear Regression

In [13]:
# Linear regression model with scikit-learn's default settings.
model_lr = LinearRegression()

In [14]:
## Scale data
# Learn the scaling statistics from the training features only, then apply
# the same fitted scaler to both splits.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
# Fit the linear model on the scaled training features, then predict both splits.
model_lr.fit(X_train_scaled, y_train)
y_pred_train, y_pred_test = (
    model_lr.predict(features) for features in (X_train_scaled, X_test_scaled)
)

In [16]:
# Score the linear-regression predictions on each split:
# MAE, MSE, RMSE (square root of the MSE) and R^2.
mae_train = mean_absolute_error(y_train, y_pred_train)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = math.sqrt(mse_train)
r2_train = r2_score(y_train, y_pred_train)

mae_test = mean_absolute_error(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = math.sqrt(mse_test)
r2_test = r2_score(y_test, y_pred_test)

# Print one labelled line per score, train before test for each metric.
for label, score in (
    ("The train MAE score is: ", mae_train),
    ("The test MAE score is: ", mae_test),
    ("The train MSE is: ", mse_train),
    ("The test MSE is: ", mse_test),
    ("The train RMSE is: ", rmse_train),
    ("The test RMSE is: ", rmse_test),
    ("The train r2 score is: ", r2_train),
    ("The test r2 score is: ", r2_test),
):
    print(label, score)

The train MAE score is:  0.808686974840313
The test MAE score is:  0.8154177510530086
The train MSE is:  1.1390764033403933
The test MSE is:  1.1136574512050215
The train RMSE is:  1.0672752238014305
The test RMSE is:  1.0552996973395858
The train r2 score is:  0.7283247693288075
The test r2 score is:  0.7315665007363614


## XGBoost

In [17]:
# XGBoost regressor created with its default hyperparameters.
model_xgb = XGBRegressor()

In [18]:
# Fit the XGBoost model on the unscaled training features, then predict both splits.
model_xgb.fit(X_train, y_train)
y_pred_train, y_pred_test = (
    model_xgb.predict(features) for features in (X_train, X_test)
)

In [19]:
# Score the XGBoost predictions on each split:
# MAE, MSE, RMSE (square root of the MSE) and R^2.
mae_train = mean_absolute_error(y_train, y_pred_train)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = math.sqrt(mse_train)
r2_train = r2_score(y_train, y_pred_train)

mae_test = mean_absolute_error(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = math.sqrt(mse_test)
r2_test = r2_score(y_test, y_pred_test)

# Print one labelled line per score, train before test for each metric.
for label, score in (
    ("The train MAE score is: ", mae_train),
    ("The test MAE score is: ", mae_test),
    ("The train MSE is: ", mse_train),
    ("The test MSE is: ", mse_test),
    ("The train RMSE is: ", rmse_train),
    ("The test RMSE is: ", rmse_test),
    ("The train r2 score is: ", r2_train),
    ("The test r2 score is: ", r2_test),
):
    print(label, score)

The train MAE score is:  0.25901265574160864
The test MAE score is:  0.5053593575387365
The train MSE is:  0.14131368683037163
The test MSE is:  0.5889091209613779
The train RMSE is:  0.3759171276097587
The test RMSE is:  0.7674041444775874
The train r2 score is:  0.966296002310246
The test r2 score is:  0.8580506636785988


## Random Forest

In [13]:
# Random forest with default hyperparameters. The seed is fixed so the forest's
# random choices, and therefore the reported scores, repeat across runs;
# 202 matches the seed used for the train/test split.
model_rfr = RandomForestRegressor(random_state=202)

In [None]:
# Fit the random forest on the unscaled training features, then predict both splits.
model_rfr.fit(X_train, y_train)
y_pred_train, y_pred_test = (
    model_rfr.predict(features) for features in (X_train, X_test)
)

In [15]:
# Score the random-forest predictions on each split:
# MAE, MSE, RMSE (square root of the MSE) and R^2.
mae_train = mean_absolute_error(y_train, y_pred_train)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = math.sqrt(mse_train)
r2_train = r2_score(y_train, y_pred_train)

mae_test = mean_absolute_error(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = math.sqrt(mse_test)
r2_test = r2_score(y_test, y_pred_test)

# Print one labelled line per score, train before test for each metric.
for label, score in (
    ("The train MAE score is: ", mae_train),
    ("The test MAE score is: ", mae_test),
    ("The train MSE is: ", mse_train),
    ("The test MSE is: ", mse_test),
    ("The train RMSE is: ", rmse_train),
    ("The test RMSE is: ", rmse_test),
    ("The train r2 score is: ", r2_train),
    ("The test r2 score is: ", r2_test),
):
    print(label, score)

The train MAE score is:  0.22619660183931817
The test MAE score is:  0.47830305418371183
The train MSE is:  0.12063739989941574
The test MSE is:  0.5632394350800597
The train RMSE is:  0.3473289505633179
The test RMSE is:  0.7504927948222153
The train r2 score is:  0.971227396732006
The test r2 score is:  0.8642380273052371


## Extra Trees Regressor

In [4]:
# Extra-trees regressor with default hyperparameters. The seed is fixed so the
# estimator's random choices, and therefore the reported scores, repeat across
# runs; 202 matches the seed used for the train/test split.
model_ET = ExtraTreesRegressor(random_state=202)

In [None]:
# Fit the extra-trees model on the unscaled training features, then predict both splits.
model_ET.fit(X_train, y_train)
y_pred_train, y_pred_test = (
    model_ET.predict(features) for features in (X_train, X_test)
)

In [12]:
# Score the extra-trees predictions on each split:
# MAE, MSE, RMSE (square root of the MSE) and R^2.
mae_train = mean_absolute_error(y_train, y_pred_train)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = math.sqrt(mse_train)
r2_train = r2_score(y_train, y_pred_train)

mae_test = mean_absolute_error(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = math.sqrt(mse_test)
r2_test = r2_score(y_test, y_pred_test)

# Print one labelled line per score, train before test for each metric.
for label, score in (
    ("The train MAE score is: ", mae_train),
    ("The test MAE score is: ", mae_test),
    ("The train MSE is: ", mse_train),
    ("The test MSE is: ", mse_test),
    ("The train RMSE is: ", rmse_train),
    ("The test RMSE is: ", rmse_test),
    ("The train r2 score is: ", r2_train),
    ("The test r2 score is: ", r2_test),
):
    print(label, score)

The train MAE score is:  0.09001704945298201
The test MAE score is:  0.44152522715952924
The train MSE is:  0.05493743542216311
The test MSE is:  0.5677957303094437
The train RMSE is:  0.23438736190793885
The test RMSE is:  0.7535222162016484
The train r2 score is:  0.9868971559791501
The test r2 score is:  0.8631397881018102
