#### Import necessary libraries 

In [1]:
# importing utility modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
 
# importing machine learning models for prediction
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier 
# importing voting classifier
from sklearn.ensemble import VotingClassifier
import deepchem as dc
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from deepchem import metrics

import numpy as np
from sklearn.metrics import matthews_corrcoef
from deepchem.splits import RandomSplitter
from scipy.stats import ttest_ind

#### Read in the preprocessed BBB dataset from the Adenot paper

In [2]:
# Load the preprocessed BBB table; the relative path is resolved against the kernel's working directory.
bbb_df = pd.read_csv('adenot_processed.csv')

In [3]:
# Preview the first rows to sanity-check the loaded columns.
bbb_df.head()

Unnamed: 0,Drug,SMILES,permeable,0,1,2,3,4,5,6,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,ACEBUTOLOL,CCCC(=O)Nc1ccc(c(c1)C(C)=O)OCC(O)CNC(C)C,0,0,1,0,0,1,0,0,...,1,0,0,1,0,0,0,0,0,0
1,DACTINOMYCIN,CC(C)[C@H]1NC(=O)[C@@H](NC(=O)c2ccc(c3c2N=C2C(...,0,0,1,0,0,1,1,0,...,0,0,0,0,0,1,0,0,0,0
2,ALDOSTERONE,C[C@@]12CCC(=O)C=C2CC[C@H]2C3CC[C@H](C(=O)CO)C...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,AMILORIDE,N\C(=N)\NC(=O)c1nc(c(nc1N)N)Cl,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AMIODARONE,CCCCc1oc2ccccc2c1C(=O)c1cc(c(c(c1)[I])OCCN(CC)...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Data preprocessing

In [4]:
# Feature matrix: every column from position 3 onward.
X = bbb_df.iloc[:, 3:].copy()
# Target vector: the column at position 2.
y = bbb_df.iloc[:, 2].copy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

## Optimized Models

#### Optimized Random Forest

In [5]:
# Random Forest with the tuned hyper-parameters; the fixed seed makes the fit repeatable.
rf_best = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=20,
    random_state=0,
)

# Fit on the training split (the fitted estimator is the cell's displayed value).
rf_best.fit(X_train, y_train)

#### Optimized SVM

In [6]:
# Linear-kernel SVM with the tuned hyper-parameters. probability=True enables
# predict_proba, which the soft-voting ensemble needs.
# random_state pins the internal shuffling used for the probability estimates so
# that results are repeatable, matching the seeded Random Forest.
# NOTE: per the scikit-learn docs, gamma only affects the 'rbf', 'poly' and
# 'sigmoid' kernels, so it has no effect with kernel='linear'; it is kept only to
# mirror the tuned configuration.
SVM_best = SVC(C=0.1, gamma=1, kernel='linear', probability=True, random_state=0)

# Train exactly once on the training split (previously the same model was fitted
# twice: once chained on the constructor and once here).
SVM_best.fit(X_train, y_train)

#### Optimized XGBoost

In [7]:
# Gradient-boosted tree classifier with the tuned hyper-parameters.
xg_best = XGBClassifier(
    objective='binary:logistic',
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=4,
    min_child_weight=3,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.85,
    reg_alpha=1e-05,
    scale_pos_weight=1,
    nthread=4,
)

In [8]:
# Fit the XGBoost classifier on the training split.
xg_best.fit(X_train, y_train)

### Ensemble

In [9]:
# Soft-voting ensemble over the three tuned classifiers.
final_model = VotingClassifier(
    estimators=[
        ('svm', SVM_best),
        ('xgb', xg_best),
        ('rf', rf_best),
    ],
    voting='soft',
)

# Fit the ensemble on the training split.
final_model.fit(X_train, y_train)


## Data Preprocessing (DeepChem dataset for cross-validation)

In [10]:
# Feature columns (position 3 onward) and label column (position 2) of the full table.
Xs = bbb_df.iloc[:, 3:].copy()
Ys = bbb_df.iloc[:, 2].copy()

# Wrap the full table in a DeepChem dataset; the SMILES strings are passed as sample ids.
dataset = dc.data.DiskDataset.from_numpy(
    X=Xs,
    y=Ys,
    ids=bbb_df['SMILES'].tolist(),
)

# Splitter used for the scaffold-based folds.
scaffoldsplitter = dc.splits.ScaffoldSplitter()

## K-Fold and MCC Calculations

In [11]:
def K_fold_MCC(dataset, h, split_name="Random Split", splitter=None, k=4):
    """Cross-validate ``h`` on ``dataset`` and report Matthews correlation coefficients.

    For each fold produced by ``splitter.k_fold_split`` the model is fitted on the
    fold's training part and scored on its held-out part. The per-fold MCCs, their
    mean, and the MCC over all held-out predictions pooled together are printed.

    Parameters
    ----------
    dataset : object accepted by ``splitter.k_fold_split``
        Each resulting fold part must expose ``.X`` and ``.y``.
    h : estimator with ``fit(X, y)`` and ``predict(X)``
        NOTE: ``h`` is refit in place on every fold, so after this call it holds
        the model trained on the last fold, not whatever it was fitted on before.
    split_name : str
        Label used in the printed header.
    splitter : splitter with ``k_fold_split(dataset, k=...)``, optional
        Defaults to a fresh ``RandomSplitter()`` created per call.
    k : int
        Number of folds (default 4).

    Returns
    -------
    list of float
        One MCC per fold, in fold order.
    """
    # Build the default splitter per call instead of once at definition time.
    if splitter is None:
        splitter = RandomSplitter()

    fold_mccs = []
    y_true = []
    h_predictions = []

    for fold in splitter.k_fold_split(dataset, k=k):
        train_part, held_out_part = fold[0], fold[1]

        # Flatten labels so that (n, 1) label arrays behave like 1-D vectors;
        # this is a no-op for labels that are already 1-D.
        train_labels = np.ravel(train_part.y)
        held_out_labels = np.ravel(held_out_part.y)

        h.fit(train_part.X, train_labels)
        y_pred = np.ravel(h.predict(held_out_part.X))

        # Keep the pooled truth/prediction pairs for the overall MCC below.
        y_true.extend(held_out_labels.tolist())
        h_predictions.extend(y_pred.tolist())
        fold_mccs.append(matthews_corrcoef(held_out_labels, y_pred))

    print(f"{split_name} MCC Values:")
    for fold_mcc in fold_mccs:
        print(fold_mcc)
    print("Mean: " + str(np.mean(fold_mccs)))

    print("MCC value across full test data: " + str(matthews_corrcoef(y_true, h_predictions)))

    return fold_mccs

#### Comparing MCC Values

In [12]:
# Maps model name -> (scaffold-split fold MCCs, random-split fold MCCs).
model_MCC_dict = {}

#### MCC values for RF Model

In [13]:
# Random Forest: scaffold-split folds first, then random-split folds (default splitter).
scaffold_split_mcc = K_fold_MCC(
    dataset=dataset,
    h=rf_best,
    split_name='Scaffold Split',
    splitter=scaffoldsplitter,
)
random_split_mcc = K_fold_MCC(dataset, rf_best)
model_MCC_dict["Random Forest"] = (scaffold_split_mcc, random_split_mcc)

Scaffold Split MCC Values:
0.7846914939903075
0.807007127726689
0.8145075485434284
0.6801964731777397
Mean: 0.7716006608595412
MCC value across full test data: 0.7700602690681662
Random Split MCC Values:
0.7925529062308825
0.7909259824878913
0.830220173642609
0.8464134917307761
Mean: 0.8150281385230397
MCC value across full test data: 0.8154185234666431


#### MCC values for SVM Model

In [14]:
# SVM: scaffold-split folds first, then random-split folds (default splitter).
scaffold_split_mcc = K_fold_MCC(
    dataset=dataset,
    h=SVM_best,
    split_name='Scaffold Split',
    splitter=scaffoldsplitter,
)
random_split_mcc = K_fold_MCC(dataset, SVM_best)
model_MCC_dict["SVM"] = (scaffold_split_mcc, random_split_mcc)

Scaffold Split MCC Values:
0.6887294242922333
0.8368969503903855
0.3739754258990208
0.7728042598325765
Mean: 0.668101515103554
MCC value across full test data: 0.7598248879287592
Random Split MCC Values:
0.867336487000795
0.8459374696729989
0.8977288048677402
0.8351349744198275
Mean: 0.8615344339903405
MCC value across full test data: 0.8616504529564221


#### MCC values for XG-Boost Model

In [15]:
# XGBoost: scaffold-split folds first, then random-split folds (default splitter).
scaffold_split_mcc = K_fold_MCC(
    dataset=dataset,
    h=xg_best,
    split_name='Scaffold Split',
    splitter=scaffoldsplitter,
)
random_split_mcc = K_fold_MCC(dataset, xg_best)
model_MCC_dict["XG-Boost"] = (scaffold_split_mcc, random_split_mcc)

Scaffold Split MCC Values:
0.6887294242922333
0.8229210098537754
0.3739754258990208
0.7626409821295004
Mean: 0.6620667105436324
MCC value across full test data: 0.7532089305387478
Random Split MCC Values:
0.8697312983332525
0.8717265195335877
0.7793273863779517
0.8025247677050317
Mean: 0.8308274929874558
MCC value across full test data: 0.8305919141597708


#### MCC values for Ensemble Model

In [16]:
# Ensemble: scaffold-split folds first, then random-split folds (default splitter).
scaffold_split_mcc = K_fold_MCC(
    dataset=dataset,
    h=final_model,
    split_name='Scaffold Split',
    splitter=scaffoldsplitter,
)
random_split_mcc = K_fold_MCC(dataset, final_model)
model_MCC_dict["Ensemble"] = (scaffold_split_mcc, random_split_mcc)

Scaffold Split MCC Values:
0.7586646256039993
0.830687003199148
0.5094404799339134
0.7657810101349732
Mean: 0.7161432797180085
MCC value across full test data: 0.7900369379200567
Random Split MCC Values:
0.9066508331602346
0.842997522397687
0.8468453655033208
0.8608891184542393
Mean: 0.8643457098788705
MCC value across full test data: 0.8637013483692094


#### Statistical Comparison

In [17]:
splits = ["Scaffold Split", "Random Split"]

# For each split strategy, run an independent two-sample t-test on the per-fold
# MCCs of every ordered pair of distinct models and print the p-value.
for i, split_label in enumerate(splits):
    print(split_label + " P Values")
    model_names = list(model_MCC_dict)
    for model in model_names:
        for model_2 in model_names:
            if model_2 == model:
                continue
            p_value = ttest_ind(model_MCC_dict[model][i], model_MCC_dict[model_2][i]).pvalue
            print(model + " vs. " + model_2 + ": " + str(p_value))
        # Blank separator after every model's group except the last one.
        if model != model_names[-1]:
            print()
    print("____________________")
    print()

Scaffold Split P Values
Random Forest vs. SVM: 0.3717673941105037
Random Forest vs. XG-Boost: 0.33540065212223014
Random Forest vs. Ensemble: 0.5001751872734981

SVM vs. Random Forest: 0.3717673941105037
SVM vs. XG-Boost: 0.9677533056066379
SVM vs. Ensemble: 0.713266198876052

XG-Boost vs. Random Forest: 0.33540065212223014
XG-Boost vs. SVM: 0.9677533056066379
XG-Boost vs. Ensemble: 0.6741389198820624

Ensemble vs. Random Forest: 0.5001751872734981
Ensemble vs. SVM: 0.713266198876052
Ensemble vs. XG-Boost: 0.6741389198820624
____________________

Random Split P Values
Random Forest vs. SVM: 0.054842391853314756
Random Forest vs. XG-Boost: 0.5837840840960108
Random Forest vs. Ensemble: 0.04984436380165369

SVM vs. Random Forest: 0.054842391853314756
SVM vs. XG-Boost: 0.30315475455318236
SVM vs. Ensemble: 0.8933380284046588

XG-Boost vs. Random Forest: 0.5837840840960108
XG-Boost vs. SVM: 0.30315475455318236
XG-Boost vs. Ensemble: 0.2716666509903061

Ensemble vs. Random Forest: 0.0498443