In [32]:
pip install pandas scikit-learn numpy rdkit


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [39]:
#import libraries 
import warnings

import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [68]:
# Load the activity table from CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="~/Documents/glp1-20.csv")

In [34]:
# Pull out the SMILES column and the pChEMBL column as separate Series.
smiles, pchembl_value = df['Smiles'], df['pChEMBL-value']

In [67]:
# Peek at the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,Smiles,pChEMBL-value
0,CCc1cc(OC)ccc1-c1ccc(C[C@H](NC(=O)[C@H](CC(=O)...,6.01
1,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C...,7.82
2,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@...,7.77
3,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C...,10.26
4,CC[C@H](C)[C@@H]1NC(=O)[C@H](CCC(=O)O)NC(=O)[C...,9.85


In [66]:
# Row count of the loaded table.
len(df.index)

19

In [46]:
#convert smiles to molecular fingerprints 
from rdkit.Chem import rdFingerprintGenerator

def smiles_to_fingerprint(smiles, radius=2, n_bits=2048):
    """Convert a SMILES string into a Morgan fingerprint bit vector.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.
    radius : int, default 2
        Radius handed to RDKit's Morgan fingerprint generator.
    n_bits : int, default 2048
        Length of the fingerprint (passed to the generator as ``fpSize``).

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``n_bits`` holding the fingerprint bits.
        An all-zero vector is returned, and a ``UserWarning`` is emitted,
        when the input is not a string or cannot be parsed by RDKit, so a
        single bad row does not abort featurisation of a whole column.
    """
    # Missing values (None / NaN) are not valid input for MolFromSmiles;
    # route them to the same fallback as unparsable SMILES.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        warnings.warn(
            f"Could not parse SMILES {smiles!r}; using an all-zero fingerprint.",
            stacklevel=2,
        )
        return np.zeros(n_bits)

    # Pass the size explicitly so the fingerprint length always matches the
    # length of the fallback vector above.
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
    fp = generator.GetFingerprint(mol)

    # ConvertToNumpyArray fills `arr` in place with the fingerprint bits.
    arr = np.zeros((n_bits,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

# Featurise every SMILES string; the rows of X follow the order of `smiles`.
X = np.array(list(map(smiles_to_fingerprint, smiles)))


In [48]:
# Regression target: the pChEMBL column of the table.
y = df.loc[:, 'pChEMBL-value']

In [49]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [51]:
# Standardise features and target with separate scalers
# (SVR is sensitive to the scale of its inputs).
scaler_X, scaler_y = StandardScaler(), StandardScaler()

# Learn the feature scaling from the training split only, then apply it to both splits.
X_train_scaled = scaler_X.fit(X_train).transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# The scaler works on 2-D input, so turn the target into a single column,
# scale it, and flatten the result back to 1-D.
y_train_scaled = scaler_y.fit_transform(y_train.to_numpy().reshape(-1, 1)).ravel()

In [63]:
# Support-vector regressor with a polynomial kernel.
svr = SVR(C=1.0, epsilon=0.1, kernel='poly')

# Train on the standardised training split.
svr.fit(X=X_train_scaled, y=y_train_scaled)


In [64]:
# Predict in the standardised target space and keep the result as a 2-D
# column, which is the shape handed to the scaler's inverse_transform.
y_pred_scaled = svr.predict(X_test_scaled)[:, np.newaxis]

# Map the predictions back to the original target scale.
y_pred = scaler_y.inverse_transform(y_pred_scaled)


In [65]:
# Score the held-out predictions. An R2 below zero means the model does worse
# than always predicting the mean of y_test.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

Mean Squared Error: 5.335665750110055
R2 Score: -8.007338334626956


In [62]:
#hyperparameter tuning 
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: every combination of C, epsilon and kernel is tried.
param_grid = dict(
    C=[0.1, 1, 10],
    epsilon=[0.01, 0.1, 1],
    kernel=['linear', 'poly', 'rbf'],
)

# 5-fold cross-validated search on the standardised training split, ranked by R2.
grid_search = GridSearchCV(estimator=SVR(), param_grid=param_grid, scoring='r2', cv=5)
grid_search.fit(X_train_scaled, y_train_scaled)

# Report the best-scoring combination.
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")


Best parameters: {'C': 1, 'epsilon': 0.1, 'kernel': 'poly'}
