# Support Vector Machine (SVC) models

## DATASETS:
(a) Carbonic Anhydrase II (CHEMBL205), a protein lyase,  
(b) Cyclin-dependent kinase 2 (CHEMBL301), a protein kinase,  
(c) ether-a-go-go-related gene potassium channel 1 (HERG) (CHEMBL240), a voltage-gated ion channel,  
(d) Dopamine D4 receptor (CHEMBL219), a monoamine GPCR,  
(e) Coagulation factor X (CHEMBL244), a serine protease,  
(f) Cannabinoid CB1 receptor (CHEMBL218), a lipid-like GPCR and  
(g) Cytochrome P450 19A1 (CHEMBL1978), a cytochrome P450.  
The activity classes were selected based on data availability and as representatives of therapeutically important target classes or as anti-targets.

In [4]:
!nvidia-smi

Tue Apr 26 14:25:51 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.172.01   Driver Version: 450.172.01   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-DGXS...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   36C    P0    53W / 300W |   2449MiB / 32505MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-DGXS...  On   | 00000000:08:00.0 Off |                    0 |
| N/A   37C    P0    38W / 300W |      9MiB / 32508MiB |      0%      Default |
|       

In [5]:
# Import
import pandas as pd
import numpy as np
from pathlib import Path

In [6]:
from rdkit import Chem
from rdkit.Chem import AllChem

[14:25:52] Enabling RDKit 2019.09.3 jupyter extensions


In [7]:
# Root folder of the extracted dataset; it contains per-target sub-directories
# (e.g. CHEMBL205) alongside a mol_images directory.
path = Path('../dataset/13321_2017_226_MOESM1_ESM/')

In [8]:
#df.head()
list(path.iterdir())

[PosixPath('../dataset/13321_2017_226_MOESM1_ESM/mol_images'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL218'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL219'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL240'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL244'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL301'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL205'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL1978')]

# Run the functions on a file from dataset and store the results

In [9]:
dataset='CHEMBL205'

In [10]:
df = pd.read_csv(path/f'{dataset}/{dataset}_ecfp_1024_train_valid.csv')

In [11]:
df.head()

Unnamed: 0,CID,SMILES,ECFP4_1,ECFP4_2,ECFP4_3,ECFP4_4,ECFP4_5,ECFP4_6,ECFP4_7,ECFP4_8,...,ECFP4_1017,ECFP4_1018,ECFP4_1019,ECFP4_1020,ECFP4_1021,ECFP4_1022,ECFP4_1023,ECFP4_1024,Activity,is_valid
0,CHEMBL1589687,S1c2n(ncn2)C(O)=C1C([NH+]1CCc2c(C1)cccc2)c1ccc...,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False
1,CHEMBL3092937,S(=O)(=O)(N)c1cc(ccc1)-c1nnn(c1)C1OC(COC(=O)C)...,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,False
2,CHEMBL325684,O=C1N(Cc2ccc(cc2)-c2ccccc2C(=O)[O-])C(=NC12CC2...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False
3,CHEMBL488713,Fc1cc(F)c(F)cc1CC([NH3+])CC(=O)N1N=CCC1C(=O)Nc...,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,False
4,CHEMBL2069846,Fc1cc(F)c(F)cc1CC([NH3+])CC(=O)N1CCN(CC1)C(=O)...,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False


# Split data

In [12]:
# Split into X,y
X, y = df.drop(["CID", "SMILES", "Activity", 'is_valid'], axis=1), df["Activity"]

In [13]:
# check info of dataframe
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10764 entries, 0 to 10763
Columns: 1024 entries, ECFP4_1 to ECFP4_1024
dtypes: int64(1024)
memory usage: 84.1 MB


In [14]:
X.head()

Unnamed: 0,ECFP4_1,ECFP4_2,ECFP4_3,ECFP4_4,ECFP4_5,ECFP4_6,ECFP4_7,ECFP4_8,ECFP4_9,ECFP4_10,...,ECFP4_1015,ECFP4_1016,ECFP4_1017,ECFP4_1018,ECFP4_1019,ECFP4_1020,ECFP4_1021,ECFP4_1022,ECFP4_1023,ECFP4_1024
0,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# y is a pandas series
y.head(), y.size, type(y)

(0    0
 1    1
 2    0
 3    0
 4    0
 Name: Activity, dtype: int64,
 10764,
 pandas.core.series.Series)

# Train test split

In [16]:
from sklearn.model_selection import train_test_split, KFold

In [17]:
# regular train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

# 5-Fold Cross Validation

In [18]:
#5-fold
kf = KFold(n_splits=5, shuffle=True, random_state=999)

In [19]:
# Materialise the cross-validation folds once so later cells can index them.
# (Writing them to CSV instead would keep the splits stable across sessions.)
fold_indices = list(kf.split(X_train))

X_train_list = [X_train.iloc[fit_idx] for fit_idx, _ in fold_indices]
X_valid_list = [X_train.iloc[held_idx] for _, held_idx in fold_indices]
y_train_list = [y_train.iloc[fit_idx] for fit_idx, _ in fold_indices]
y_valid_list = [y_train.iloc[held_idx] for _, held_idx in fold_indices]

In [20]:
y_train_list[0].head()

8760    0
18      1
2813    0
6590    0
969     0
Name: Activity, dtype: int64

In [21]:
X_train_list[1].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6889 entries, 8760 to 6380
Columns: 1024 entries, ECFP4_1 to ECFP4_1024
dtypes: int64(1024)
memory usage: 53.9 MB


In [22]:
# TODO: add splits to csv file

# SVC

In [24]:
from sklearn.svm import SVC

In [25]:
from sklearn.metrics import auc,roc_auc_score,recall_score,precision_score,f1_score
from  sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score

In [34]:
# train method for the support vector classifier (SVC)
def train_svm(X_train, X_test, y_train, y_test, cost=1, kern='linear',
              random_state=None, verbose=True):
    """Fit an SVC on the training data and score it on the evaluation data.

    Parameters
    ----------
    X_train, y_train
        Features and labels used to fit the classifier.
    X_test, y_test
        Features and labels the fitted classifier is evaluated on.
    cost
        Regularisation parameter, passed to ``SVC`` as ``C``.
    kern
        Kernel name, passed to ``SVC`` as ``kernel``.
    random_state
        Passed to ``SVC``. The classifier is built with ``probability=True``;
        pass an int here to make the probability estimates repeatable.
        The default (``None``) keeps the previous behaviour.
    verbose
        Passed to ``SVC``; ``True`` (the default, as before) prints the
        LibSVM optimisation log for every fit.

    Returns
    -------
    tuple
        ``(roc_auc, accuracy, mcc, recall, precision, f1, svm)`` where the
        last element is the fitted classifier.
    """
    svm = SVC(C=cost, kernel=kern, degree=3, shrinking=True, probability=True, tol=0.001,
              cache_size=200, verbose=verbose, max_iter=-1, random_state=random_state)
    svm.fit(X_train, y_train)

    y_pred = svm.predict(X_test)
    # Second probability column is used as the positive-class score
    # (assumes binary 0/1 labels, consistent with pos_label=1 below).
    positive_prob = svm.predict_proba(X_test)[:, 1]

    # Local is named roc_auc so it does not shadow sklearn.metrics.auc.
    roc_auc = roc_auc_score(np.asarray(y_test), positive_prob)
    accuracy = accuracy_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    recall = recall_score(y_test, y_pred, pos_label=1)
    precision = precision_score(y_test, y_pred, pos_label=1)
    f1 = f1_score(y_test, y_pred, pos_label=1)

    return roc_auc, accuracy, mcc, recall, precision, f1, svm

In [35]:
def train_on_dataset(dataset, test_size=0.4, n_splits=5):
    """Cross-validate the SVC on one dataset and report the mean fold metrics.

    Reads ``<path>/<dataset>/<dataset>_ecfp_1024_train_valid.csv``, sets aside
    ``test_size`` of the rows, and runs shuffled K-fold cross-validation with
    ``train_svm`` on the remainder.

    Parameters
    ----------
    dataset
        Name of the dataset folder / file prefix (e.g. ``'CHEMBL205'``).
    test_size
        Fraction of rows removed before cross-validation. The removed rows
        are not used anywhere in this function.
        NOTE(review): the exploratory cell earlier in the notebook uses 0.2;
        confirm which value is intended.
    n_splits
        Number of cross-validation folds.

    Returns
    -------
    tuple
        ``(score, mean_mcc, svm)``: ``score`` is the mean of the six averaged
        metrics, ``mean_mcc`` the average Matthews correlation, and ``svm``
        the classifier fitted on the LAST fold only (it is not refitted on
        the full development data).
    """
    print(f'Training on dataset: {dataset}')

    df = pd.read_csv(path/f'{dataset}/{dataset}_ecfp_1024_train_valid.csv')
    X, y = df.drop(["CID", "SMILES", "Activity", 'is_valid'], axis=1), df["Activity"]
    # Only the development part of the split is used below.
    X_dev, _, y_dev, _ = train_test_split(X, y, test_size=test_size, random_state=666)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=999)

    # One row per fold: [auc, accuracy, mcc, recall, precision, f1].
    fold_metrics = []
    svm = None
    for train_index, valid_index in kf.split(X_dev):
        # Slice per fold without rebinding X_dev / y_dev.
        *metrics, svm = train_svm(X_dev.iloc[train_index], X_dev.iloc[valid_index],
                                  y_dev.iloc[train_index], y_dev.iloc[valid_index])
        fold_metrics.append(metrics)

    mean_metrics = np.mean(fold_metrics, axis=0)
    mean_auc, mean_acc, mean_mcc, mean_recall, mean_prec, mean_f1 = mean_metrics

    print(f"Average ROCAUC of the folds: {mean_auc}")
    print(f"Average accuracy of the folds: {mean_acc}")
    print(f"Average Matthews correlation of the folds: {mean_mcc}")
    print(f"Average recall of the folds: {mean_recall}")
    print(f"Average precision of the folds: {mean_prec}")
    print(f"Average f1 score of the folds: {mean_f1}")
    print()

    # Single summary number: unweighted mean of the six averaged metrics.
    score = np.mean(mean_metrics)
    return score, mean_mcc, svm

In [36]:
_, _, svm = train_on_dataset(dataset)

Training on dataset: CHEMBL205
[LibSVM].....*....*
optimization finished, #iter = 9315
obj = -62.854469, rho = -0.457068
nSV = 524, nBSV = 15
Total nSV = 524
......*...*
optimization finished, #iter = 9688
obj = -62.109719, rho = -0.655050
nSV = 528, nBSV = 12
Total nSV = 528
.....*....*.*
optimization finished, #iter = 9347
obj = -58.162893, rho = -0.687911
nSV = 506, nBSV = 16
Total nSV = 506
.....*...*
optimization finished, #iter = 8572
obj = -52.703721, rho = -0.972076
nSV = 516, nBSV = 13
Total nSV = 516
.....*...*
optimization finished, #iter = 8505
obj = -49.960418, rho = -0.353565
nSV = 485, nBSV = 9
Total nSV = 485
........*.....*
optimization finished, #iter = 13353
obj = -85.505738, rho = 0.612364
nSV = 574, nBSV = 30
Total nSV = 574
[LibSVM]....*...*
optimization finished, #iter = 7770
obj = -50.321161, rho = -0.323727
nSV = 480, nBSV = 14
Total nSV = 480
....*...*
optimization finished, #iter = 7574
obj = -50.938960, rho = -0.601549
nSV = 487, nBSV = 14
Total nSV = 487
..

# Test

In [37]:
df_test = pd.read_csv(path/f'{dataset}/{dataset}_ecfp_1024_test1.csv')
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3589 entries, 0 to 3588
Columns: 1027 entries, CID to Activity
dtypes: int64(1025), object(2)
memory usage: 28.1+ MB


In [38]:
# Split the test frame into features (X) and labels (y).
# NOTE: this rebinds the notebook-level X and y, which until here held the
# train/valid data loaded earlier.
X, y = df_test.drop(["CID", "SMILES", "Activity"], axis=1), df_test["Activity"]

In [40]:
y_pred= svm.predict(X)
len(y_pred)

3589

In [41]:
y_pred_prob=svm.predict_proba(X)
y_pred_prob

array([[9.99975146e-01, 2.48538045e-05],
       [9.90211842e-01, 9.78815796e-03],
       [9.45116426e-01, 5.48835735e-02],
       ...,
       [9.99494813e-01, 5.05187016e-04],
       [2.02364386e-02, 9.79763561e-01],
       [9.90595261e-01, 9.40473924e-03]])

In [42]:
# Pair every true label with its predicted class-probability vector.
preds = y.to_frame(name='class').assign(predictions=list(y_pred_prob))
preds.head()

Unnamed: 0,class,predictions
0,0,"[0.9999751461955029, 2.485380449699628e-05]"
1,0,"[0.9902118420358993, 0.009788157964100711]"
2,0,"[0.945116426485229, 0.05488357351477098]"
3,0,"[0.999606827121969, 0.00039317287803097145]"
4,1,"[0.004347499404470977, 0.995652500595529]"


In [43]:
preds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3589 entries, 0 to 3588
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   class        3589 non-null   int64 
 1   predictions  3589 non-null   object
dtypes: int64(1), object(1)
memory usage: 56.2+ KB


In [44]:
# Drop the final test row before saving.
# NOTE(review): why the last sample is discarded is not documented - confirm
# it is still required.
# The slice is bounded by len(df_test), so re-running this cell gives the same
# result instead of removing one more row on every execution.
preds = preds.iloc[:len(df_test) - 1]
preds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3588 entries, 0 to 3587
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   class        3588 non-null   int64 
 1   predictions  3588 non-null   object
dtypes: int64(1), object(1)
memory usage: 56.2+ KB


In [45]:
preds.to_csv(path/f'{dataset}/{dataset}_predictions_SVM.csv', index=False)