#### Import necessary libraries 

In [1]:
# importing utility modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.metrics import matthews_corrcoef
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# importing machine learning models for prediction
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
# importing voting classifier
from sklearn.ensemble import VotingClassifier

# importing DeepChem dataset, metric and splitting utilities
import deepchem as dc
from deepchem import metrics
from deepchem.splits import RandomSplitter

#### Read in the preprocessed BBB dataset from the Adenot paper

In [2]:
# Load the preprocessed dataset from a CSV file in the working directory
bbb_df = pd.read_csv('adenot_processed.csv')

In [3]:
# Preview the first rows to check the column layout
bbb_df.head()

Unnamed: 0,Drug,SMILES,permeable,0,1,2,3,4,5,6,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,ACEBUTOLOL,CCCC(=O)Nc1ccc(c(c1)C(C)=O)OCC(O)CNC(C)C,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,DACTINOMYCIN,CC(C)[C@H]1NC(=O)[C@@H](NC(=O)c2ccc(c3c2N=C2C(...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,ALDOSTERONE,C[C@@]12CCC(=O)C=C2CC[C@H]2C3CC[C@H](C(=O)CO)C...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,AMILORIDE,N\C(=N)\NC(=O)c1nc(c(nc1N)N)Cl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,AMIODARONE,CCCCc1oc2ccccc2c1C(=O)c1cc(c(c(c1)[I])OCCN(CC)...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Data preprocessing

In [4]:
# Select columns by name instead of by position. With the layout shown by
# bbb_df.head() (Unnamed: 0, Drug, SMILES, permeable, 0..1023), positional
# slicing would pick SMILES as the target and leave the 'permeable' label
# inside the feature matrix.
# errors='ignore' keeps this working if the CSV has no 'Unnamed: 0' index column.
X = bbb_df.drop(columns=['Unnamed: 0', 'Drug', 'SMILES', 'permeable'], errors='ignore')
y = bbb_df['permeable'].copy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

## Optimized Models

#### Optimized Random Forest

In [5]:
# Build the tuned Random Forest (seeded so the fitted trees are repeatable)
rf_best = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=20,
    random_state=0,
)

# Fit on the training split
rf_best.fit(X=X_train, y=y_train)

#### Optimized SVM

In [6]:
# Create a Support Vector Machine Classifier.
# probability=True enables predict_proba, which the soft-voting ensemble below
# relies on. gamma is kept from the tuned configuration, although the linear
# kernel does not use it. random_state seeds the internal shuffling used for
# the probability estimates so repeated runs give the same model.
SVM_best = SVC(C=0.1, gamma=1, kernel='linear', probability=True, random_state=0)

# Train the model once using the training sets (fitting twice only repeats the
# expensive probability calibration without changing the result)
SVM_best.fit(X_train, y_train)

#### Optimized XGBoost

In [7]:
# Tuned XGBoost configuration for the binary permeability task
xg_best = XGBClassifier(
    objective='binary:logistic',
    # boosting schedule
    n_estimators=1000,
    learning_rate=0.1,
    # tree shape
    max_depth=4,
    min_child_weight=3,
    gamma=0,
    # row / column sampling
    subsample=0.8,
    colsample_bytree=0.85,
    # regularisation and class weighting
    reg_alpha=1e-05,
    scale_pos_weight=1,
    # parallelism
    nthread=4,
)

In [8]:
# Fit the tuned XGBoost classifier on the training split
xg_best.fit(X_train, y_train)

### Ensemble

In [9]:
# Build the final model: a soft-voting ensemble of the three tuned classifiers.
# Soft voting combines the members' predicted class probabilities.
final_model = VotingClassifier(
    estimators=[
        ('svm', SVM_best),
        ('xgb', xg_best),
        ('rf', rf_best),
    ],
    voting='soft',
)

# Train every member of the ensemble on the training split
final_model.fit(X=X_train, y=y_train)


## Data Preprocessing

In [11]:
# Select columns by name instead of by position. With the layout shown by
# bbb_df.head() (Unnamed: 0, Drug, SMILES, permeable, 0..1023), positional
# slicing would pick SMILES as the target and leave the 'permeable' label
# inside the feature matrix.
# errors='ignore' keeps this working if the CSV has no 'Unnamed: 0' index column.
Xs = bbb_df.drop(columns=['Unnamed: 0', 'Drug', 'SMILES', 'permeable'], errors='ignore')
Ys = bbb_df['permeable'].copy()

# from_numpy expects array inputs, so convert the pandas objects explicitly.
# The SMILES strings are stored as ids for use by the scaffold splitter.
dataset = dc.data.DiskDataset.from_numpy(
    X=Xs.to_numpy(),
    y=Ys.to_numpy(),
    ids=bbb_df['SMILES'].tolist(),
)
scaffoldsplitter = dc.splits.ScaffoldSplitter()

## K-Fold and MCC Calculations

In [None]:
def K_fold_MCC(dataset, h, split_name="Random Split", splitter=None, k=4):
    """Cross-validate a classifier and report Matthews correlation coefficients.

    Parameters
    ----------
    dataset : object
        Dataset accepted by ``splitter.k_fold_split``; each resulting fold must
        expose ``X`` and ``y`` attributes.
    h : estimator
        scikit-learn style classifier. A fresh clone is trained on every fold,
        so the object passed in (and any fit it already carries) is untouched.
    split_name : str, optional
        Label printed in front of the per-fold scores.
    splitter : object, optional
        Object providing ``k_fold_split(dataset, k=...)``. When omitted, a new
        ``RandomSplitter`` is created for this call.
    k : int, optional
        Number of folds (default 4).

    Returns
    -------
    list
        One MCC value per fold, in fold order.
    """
    # Build the default splitter per call rather than once at definition time,
    # so calls never share a splitter instance.
    if splitter is None:
        splitter = RandomSplitter()

    split_data = splitter.k_fold_split(dataset, k=k)

    MCCs = []
    y_true = []
    h_predictions = []

    for train_fold, test_fold in split_data:
        # Train a copy with identical hyper-parameters; fitting `h` directly
        # would overwrite the model the caller trained earlier.
        fold_model = clone(h)
        # ravel() guards against labels stored with a trailing singleton axis
        fold_model.fit(train_fold.X, np.ravel(train_fold.y))

        fold_true = np.ravel(test_fold.y)
        y_pred = np.ravel(fold_model.predict(test_fold.X))

        y_true.extend(list(fold_true))
        h_predictions.extend(list(y_pred))
        MCCs.append(matthews_corrcoef(fold_true, y_pred))

    print(split_name + " MCC Values:")

    for mcc in MCCs:
        print(mcc)
    print("Mean: " + str(np.mean(MCCs)))

    # Pooled score over the held-out predictions of all folds
    print("MCC value across full test data: " + str(matthews_corrcoef(y_true, h_predictions)))

    return MCCs

#### Comparing MCC Values

In [None]:
# Maps a model's display name to a (scaffold-split MCCs, random-split MCCs) tuple.
# Filled in by the per-model cells below and read by the statistical comparison.
model_MCC_dict = {}

#### MCC values for RF Model

In [None]:
# Cross-validate the Random Forest: scaffold split first, then the default random split
scaffold_split_mcc = K_fold_MCC(
    dataset=dataset,
    h=rf_best,
    split_name='Scaffold Split',
    splitter=scaffoldsplitter,
)
random_split_mcc = K_fold_MCC(dataset, rf_best)
model_MCC_dict["Random Forest"] = (scaffold_split_mcc, random_split_mcc)

#### MCC values for SVM Model

In [None]:
# Cross-validate the SVM: scaffold split first, then the default random split
scaffold_split_mcc = K_fold_MCC(
    dataset=dataset,
    h=SVM_best,
    split_name='Scaffold Split',
    splitter=scaffoldsplitter,
)
random_split_mcc = K_fold_MCC(dataset, SVM_best)
model_MCC_dict["SVM"] = (scaffold_split_mcc, random_split_mcc)

#### MCC values for XG-Boost Model

In [None]:
# Cross-validate XGBoost: scaffold split first, then the default random split
scaffold_split_mcc = K_fold_MCC(
    dataset=dataset,
    h=xg_best,
    split_name='Scaffold Split',
    splitter=scaffoldsplitter,
)
random_split_mcc = K_fold_MCC(dataset, xg_best)
model_MCC_dict["XG-Boost"] = (scaffold_split_mcc, random_split_mcc)

#### MCC values for Ensemble Model

In [None]:
# Cross-validate the voting ensemble: scaffold split first, then the default random split
scaffold_split_mcc = K_fold_MCC(
    dataset=dataset,
    h=final_model,
    split_name='Scaffold Split',
    splitter=scaffoldsplitter,
)
random_split_mcc = K_fold_MCC(dataset, final_model)
model_MCC_dict["Ensemble"] = (scaffold_split_mcc, random_split_mcc)

#### Statistical Comparison

In [None]:
# Pairwise independent t-tests on the per-fold MCC values, one pass per split type.
# Position 0 of each stored tuple holds the scaffold-split scores, position 1 the
# random-split scores, matching the order of `splits`.
splits = ["Scaffold Split", "Random Split"]
model_names = list(model_MCC_dict)

for split_idx, split_label in enumerate(splits):
    print(split_label + " P Values")
    for name_a in model_names:
        for name_b in model_names:
            if name_a == name_b:
                continue
            p_value = ttest_ind(
                model_MCC_dict[name_a][split_idx],
                model_MCC_dict[name_b][split_idx],
            ).pvalue
            print(name_a + " vs. " + name_b + ": " + str(p_value))
        # Blank line between model groups, but not after the final group
        if name_a != model_names[-1]:
            print()
    print("____________________")
    print()