In [43]:
from math import sqrt

import lightgbm as lgb
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv('pdXY_rdkit_descriptors_200ft.csv')
# Remove rows where code is pred
df = df[df['train_test'] != 'pred']
# Drop train_test column
df = df.drop('train_test', axis=1)
df = df.drop('code', axis=1)
df.head()

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,id,smiles,dG,smiles_len
0,8.645556,0.169259,8.645556,0.169259,0.490728,110.112,104.064,110.036779,42.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0_,C1=CC(=CC=C1O)O,-12.21,15
1,8.975183,0.177713,8.975183,0.177713,0.794127,265.362,248.226,265.111759,96.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1_,CC1=C(SC=[N+]1CC2=CN=C(N=C2N)C)CCO,-4.62,34
2,10.660372,-4.412252,10.660372,0.026376,0.520728,345.341,327.197,345.078089,120.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,2_,CC1=C(SC=[N+]1CC2=CN=C(N=C2N)C)CCOP(=O)(O)O,-5.39,43
3,11.405303,-5.115401,11.405303,0.245126,0.351107,425.32,406.168,425.04442,144.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,3_,CC1=C(SC=[N+]1CC2=CN=C(N=C2N)C)CCOP(=O)(O)OP(=...,-9.61,52
4,8.541389,0.225231,8.541389,0.225231,0.669277,143.211,134.139,143.040485,50.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,4_,CC1=C(SC=N1)CCO,-3.48,15


In [3]:
# encode the SMILES strings to categorical values
le = LabelEncoder()
df['smiles'] = le.fit_transform(df['smiles'])

# remove _ at the end of column id and convert to int
df['id'] = df['id'].str.replace('_','').astype(int)

# Split X and y where the dG is the target
X = df.drop('dG', axis=1)
y = df['dG']

# Split train test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train

array([[ 0.0583933 , -0.04460835,  0.0583933 , ..., -0.82493055,
         0.01831198, -1.29653196],
       [ 0.56167447, -0.15965239,  0.56167447, ...,  1.00594357,
         0.0318679 ,  0.23367727],
       [ 1.22061204, -0.25825254,  1.22061204, ...,  0.25986237,
         1.54109301,  2.11974911],
       ...,
       [-0.30878601,  0.78843885, -0.30878601, ..., -0.49995039,
         0.91300232, -0.47804795],
       [ 1.25697854, -0.26479181,  1.25697854, ...,  0.25528518,
         1.52301846,  2.44002546],
       [ 0.44265924, -0.43351015,  0.44265924, ..., -1.26891753,
        -1.64906546, -0.40687543]])

In [15]:
# Check NaN values in X_train
pd.DataFrame(X_train).isnull().sum()
# Replace NaN values with 0
X_train = pd.DataFrame(X_train).fillna(0)
X_test = pd.DataFrame(X_test).fillna(0)

In [41]:
# Fit an XGBRegressor model
xgb = XGBRegressor(max_depth=100, n_estimators=2000, learning_rate=0.1, min_child_weight=1, subsample=0.8, colsample_bytree=0.8, gamma=0.1, reg_alpha=0.1, reg_lambda=0.1)
xgb.fit(X_train, y_train)

In [42]:
# Predict the test set and calculate the accuracy
y_pred = xgb.predict(X_test)

rmse = sqrt(mean_squared_error(y_test, y_pred))
print('RMSE: %f' % rmse)
mae = mean_absolute_error(y_test, y_pred)
print('MAE: %f' % mae)

RMSE: 1.498628
MAE: 1.042112


# Linear Regression

In [18]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [20]:
# Predict the test set and calculate the accuracy
y_pred = lr.predict(X_test)

rmse = sqrt(mean_squared_error(y_test, y_pred))
print('RMSE: %f' % rmse)
mae = mean_absolute_error(y_test, y_pred)
print('MAE: %f' % mae)

RMSE: 4.361776
MAE: 2.127011


# CatBoost

In [23]:
cat = CatBoostRegressor()
cat.fit(X_train, y_train)

Learning rate set to 0.037857
0:	learn: 2.2925222	total: 142ms	remaining: 2m 21s
1:	learn: 2.2718670	total: 149ms	remaining: 1m 14s
2:	learn: 2.2471879	total: 155ms	remaining: 51.5s
3:	learn: 2.2265299	total: 161ms	remaining: 40.1s
4:	learn: 2.2097520	total: 167ms	remaining: 33.3s
5:	learn: 2.1884163	total: 174ms	remaining: 28.8s
6:	learn: 2.1705005	total: 180ms	remaining: 25.6s
7:	learn: 2.1508855	total: 186ms	remaining: 23.1s
8:	learn: 2.1236736	total: 193ms	remaining: 21.2s
9:	learn: 2.1046479	total: 199ms	remaining: 19.7s
10:	learn: 2.0846209	total: 206ms	remaining: 18.5s
11:	learn: 2.0661652	total: 213ms	remaining: 17.6s
12:	learn: 2.0500033	total: 221ms	remaining: 16.8s
13:	learn: 2.0281696	total: 229ms	remaining: 16.1s
14:	learn: 2.0137062	total: 237ms	remaining: 15.5s
15:	learn: 1.9950932	total: 244ms	remaining: 15s
16:	learn: 1.9784757	total: 251ms	remaining: 14.5s
17:	learn: 1.9645289	total: 257ms	remaining: 14s
18:	learn: 1.9431262	total: 264ms	remaining: 13.6s
19:	learn: 1.

<catboost.core.CatBoostRegressor at 0x1a3839f3ee0>

In [24]:
# Predict the test set and calculate the accuracy
y_pred = cat.predict(X_test)

rmse = sqrt(mean_squared_error(y_test, y_pred))
print('RMSE: %f' % rmse)
mae = mean_absolute_error(y_test, y_pred)
print('MAE: %f' % mae)

RMSE: 1.451579
MAE: 1.035624


# SVR

In [44]:
# Fit an SVR model
svr = SVR(kernel='rbf', C=1e3, gamma=0.1)
svr.fit(X_train, y_train)

In [45]:
# Predict the test set and calculate the accuracy
y_pred = svr.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
print('RMSE: %f' % rmse)
mae = mean_absolute_error(y_test, y_pred)
print('MAE: %f' % mae)

RMSE: 1.783255
MAE: 1.365067


# Random Forest

In [46]:
rf = RandomForestRegressor(n_estimators=1000, max_depth=100, random_state=42)
rf.fit(X_train, y_train)

In [47]:
# Predict the test set and calculate the accuracy
y_pred = rf.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
print('RMSE: %f' % rmse)
mae = mean_absolute_error(y_test, y_pred)
print('MAE: %f' % mae)

RMSE: 1.480722
MAE: 1.062004


# Gradient Boosting

In [48]:
gb = GradientBoostingRegressor(n_estimators=1000, max_depth=100, random_state=42)
gb.fit(X_train, y_train)

In [49]:
# Predict the test set and calculate the accuracy
y_pred = gb.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
print('RMSE: %f' % rmse)
mae = mean_absolute_error(y_test, y_pred)
print('MAE: %f' % mae)

RMSE: 2.093648
MAE: 1.396960


# K-Nearest Neighbors

In [56]:
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)

In [57]:
# Predict the test set and calculate the accuracy
y_pred = knn.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
print('RMSE: %f' % rmse)
mae = mean_absolute_error(y_test, y_pred)
print('MAE: %f' % mae)

RMSE: 1.660513
MAE: 1.193190


# LightGBM

In [58]:
lgbm = lgb.LGBMRegressor()
lgbm.fit(X_train, y_train)

In [59]:
# Predict the test set and calculate the accuracy
y_pred = lgbm.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
print('RMSE: %f' % rmse)
mae = mean_absolute_error(y_test, y_pred)
print('MAE: %f' % mae)

RMSE: 1.441519
MAE: 1.021037
