# Import packages

In [22]:
import pickle  # Local Python (3.8) is fine with this. If you're suing Google
# colab, which uses a Python version of 3.6, you need to do import pickel5
# as pickle
import cloudpickle as cp
from urllib.request import urlopen

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.cluster import MiniBatchKMeans
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids
from imblearn.combine import SMOTEENN

from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from collinearity import SelectNonCollinear

from sklearn.decomposition import PCA
from sklearn.svm import SVR, SVC

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, \
    StratifiedKFold, RepeatedStratifiedKFold
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import mean_squared_error, mean_absolute_error, \
    r2_score, make_scorer, recall_score, accuracy_score, f1_score, \
    precision_score, balanced_accuracy_score, roc_curve, auc

# Custom functions

In [36]:
from dataset_expansion import dataset_feature_expansion, merge_multiple_dfs
from dataset_cleanup import filter_low_variance
from dataset_plot import simple_pie_plot
from pickle_managment import save_pickle, load_pickle


# Read in data

## Regression

In [None]:
regression_df = pd.read_csv('datasets\cleaned_datasets\BBB_regression.csv')
regression_df

In [None]:
(regression_df['logBB'] <= -1.01).sum()  #These are BBB-

In [None]:
(regression_df['logBB'] >= -1).sum()  #These are BBB+

## Classification

In [None]:
classification_df = pd.read_csv(
    'datasets\cleaned_datasets\BBB_classification.csv'
)
classification_df

In [None]:
classification_df['BBB+/BBB-'].value_counts()

# Dataset expansion & cleaning
Major expansion steps:
1. Add in RDKit descriptors
2. Add in Morgan fingerprints
3. Add in MACCS keys

Major cleaning steps:
1. Remove columns whose variance is 0--all values are hte same
    * Done by a function so later the threshold for filtering
    based on variance level is adjustable

## Regression

In [None]:
regression_df_expanded, regression_expansion_errors = dataset_feature_expansion(
    regression_df)
regression_df_expanded  #Missing SMILES are the chemicals that have errors
# when going through the calculations

In [None]:
regression_df_expanded_cleaned = filter_low_variance(
    regression_df_expanded,
    exclude_col_list=['SMILES', 'logBB'],
    threshold_level=0
)
regression_df_expanded_cleaned

In [None]:
regression_df_expanded_cleaned.to_csv(
    'datasets\expanded_datasets\BBB_regression_expanded.csv.zip',
    index=False,
    compression='zip'  # Have to use zip here since the classification
    # dataset will become very large. Zipped .csv files can be directly read
    # by pd.read_csv()
)
print('Done!')

## Classification

In [None]:
classification_df_expanded, classification_expansion_errors = dataset_feature_expansion(
    classification_df)
classification_df_expanded

In [None]:
classification_df_expanded_cleaned = filter_low_variance(
    classification_df_expanded,
    exclude_col_list=['SMILES', 'BBB+/BBB-'],
    threshold_level=0
)
classification_df_expanded_cleaned

In [None]:
classification_df_expanded_cleaned.to_csv(
    'datasets\expanded_datasets\BBB_classification_expanded.csv.zip',
    index=False,
    compression='zip'
)
print('Done!')

# Dataset rebalance
Before center and standardization

## Regression
Regression dataset doesn't need rebalancing

## Classification

In [None]:
classification_df_expanded_cleaned = pd.read_csv(
    'datasets\expanded_datasets\BBB_classification_expanded.csv.zip')
classification_df_expanded_cleaned

In [None]:
classification_BBB_N = (classification_df_expanded_cleaned
[classification_df_expanded_cleaned['BBB+/BBB-'] == 'BBB-']).shape[0]  #These
# are BBB-
classification_BBB_Y = (classification_df_expanded_cleaned
[classification_df_expanded_cleaned['BBB+/BBB-'] == 'BBB+']).shape[0]  #These
# are BBB+

simple_pie_plot(
    label_list=['BBB-', 'BBB+'],
    num_list=[classification_BBB_N, classification_BBB_Y],
    title_str='Composition of 2 categories in regression dataset before balancing'
)
plt.show()

In [None]:
X = classification_df_expanded_cleaned.loc[
    :,
    ~classification_df_expanded_cleaned.columns.isin(['SMILES', 'BBB+/BBB-'])
]
y = classification_df_expanded_cleaned['BBB+/BBB-']

### Under-sampling by ClusterCentroids

In [None]:
cluster_centroids = ClusterCentroids(
    estimator=MiniBatchKMeans(n_init=1, random_state=1),
    random_state=1
)



X_resample, y_resample = cluster_centroids.fit_resample(X, y)

classification_df_after_centroid_balancing = merge_multiple_dfs\
    (df_list=[classification_df_expanded_cleaned['SMILES'], y_resample, X_resample])
classification_df_after_centroid_balancing

In [None]:
classification_BBB_N = (classification_df_after_centroid_balancing
[classification_df_after_centroid_balancing['BBB+/BBB-'] == 'BBB-']).shape[0]  #These
# are BBB-
classification_BBB_Y = (classification_df_after_centroid_balancing
[classification_df_after_centroid_balancing['BBB+/BBB-'] == 'BBB+']).shape[0]  #These
# are BBB+

simple_pie_plot(
    label_list=['BBB-', 'BBB+'],
    num_list=[classification_BBB_N, classification_BBB_Y],
    title_str='Composition of 2 categories in regression dataset after '
              'balancing by centroids method'
)
plt.show()

In [None]:
classification_df_after_centroid_balancing.to_csv(
    r'datasets\balanced_datasets\BBB_classification_balanced_centroid.csv.zip',
    index=False,
    compression='zip'
)
print('Done!')

### Over-sample by SMOTE then cleaning using ENN
Not using SMOTE only to create lots of hypothetical chemicals that might
not exist

In [None]:
smoteenn = SMOTEENN(random_state=1)

X_resample, y_resample = smoteenn.fit_resample(X, y)

classification_df_after_smoteenn_balancing = merge_multiple_dfs\
    (df_list=[classification_df_expanded_cleaned['SMILES'], y_resample, X_resample])
classification_df_after_smoteenn_balancing

In [None]:
classification_BBB_N = (classification_df_after_smoteenn_balancing
[classification_df_after_smoteenn_balancing['BBB+/BBB-'] == 'BBB-']).shape[0]  #These
# are BBB-
classification_BBB_Y = (classification_df_after_smoteenn_balancing
[classification_df_after_smoteenn_balancing['BBB+/BBB-'] == 'BBB+']).shape[0]  #These
# are BBB+

simple_pie_plot(
    label_list=['BBB-', 'BBB+'],
    num_list=[classification_BBB_N, classification_BBB_Y],
    title_str='Composition of 2 categories in regression dataset after '
              'balancing by SMOTE-ENN method'
)
plt.show()

In [None]:
classification_df_after_smoteenn_balancing.to_csv(
    r'datasets\balanced_datasets\BBB_classification_balanced_smoteenn.csv.zip',
    index=False,
    compression='zip'
)
print('Done!')

# Model training
Datasets transformation will be done along the way

## Regression

In [6]:
regression_df_expanded = pd.read_csv(r'datasets/expanded_datasets/BBB_regression_expanded.csv.zip')
regression_df_expanded

Unnamed: 0,SMILES,logBB,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,...,157_y,158_y,159_y,160_y,161_y,162_y,163_y,164_y,165_y,166_y
0,CN1C(=NN=N1)SCC2=C(N3C(C(C3=O)(NC(=O)C(C4=CC=C...,-2.52,13.190522,13.190522,0.042537,-2.144257,0.133795,22.000000,520.480,500.320,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,CN1CC[C@]23[C@@H]4[C@H]1CC5=C2C(=C(C=C5)OC6[C@...,-2.15,11.445328,11.445328,0.165306,-1.798901,0.346256,45.303030,461.467,434.251,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,CN1CC[C@]23[C@@H]4[C@H]1CC5=C2C(=C(C=C5)O)O[C@...,-2.09,11.479044,11.479044,0.060963,-1.790095,0.359144,45.393939,461.467,434.251,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,CC1=NC=C(C=C1)CC2CNC(NC2=O)NCCCCC3=NC=C(C=C3C)Br,-1.88,12.391214,12.391214,0.061101,-0.159783,0.543803,19.464286,446.393,418.169,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,c1(c2c3n(c4c(C(N(C)C3)=O)c(Cl)ccc4)cn2)noc(C(O...,-1.82,12.699094,12.699094,0.092039,-2.255140,0.648321,14.192308,375.772,361.660,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1046,C[NH2+]CCCN1C2=CC=CC=C2CCC3=CC=CC=C31,1.20,2.515046,2.515046,1.095602,1.095602,0.843816,13.550000,267.396,244.212,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
1047,CN(C)CCCN1C2=CC=CC=C2SC3=CC=CC=C31,1.23,2.462963,2.462963,1.062269,1.062269,0.828858,13.250000,284.428,264.268,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
1048,CN(C)CCOC(C1=CC=CC=C1)C2=CC=CC=C2,1.30,6.083380,6.083380,0.016065,0.016065,0.784550,11.157895,255.361,234.193,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1049,CNCCCN1C2=CC=CC=C2SC3=C1C=C(C=C3)Cl,1.40,6.182100,6.182100,0.793840,0.793840,0.834133,13.000000,304.846,287.710,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0


In [12]:
regression_X = regression_df_expanded.loc[
                   :,
                   ~regression_df_expanded.columns.isin(
                       ['SMILES', 'logBB'])
                   ]

regression_y = regression_df_expanded['logBB']


data_processing_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(
        n_components=0.95, #When using the svd_solver='full', n_components
        # can be between 0 and 1 to represent the percentage of variance
        # that you want to explain
        svd_solver='full'
    ))
])

regression_X_processed=data_processing_pipeline.fit_transform(regression_X)
regression_X_processed

array([[ 14.78371761,   3.1919762 ,   3.58857037, ...,   0.35103119,
         -0.14879143,   0.15825741],
       [ 12.43488374,   8.49247331, -13.0038119 , ...,   0.0434666 ,
         -0.4067051 ,   0.56873198],
       [ 13.58017443,   9.86524109,  -9.14099145, ...,  -0.28354372,
         -0.14371695,  -1.29357963],
       ...,
       [ -4.71408993,   0.4475601 ,  -1.87718704, ...,   4.05773864,
         -1.76356108,  -4.33797849],
       [ -3.54861458,  -5.6401522 ,  -0.73223939, ...,  -0.52063431,
         -0.26357589,   0.12871897],
       [ -5.17044143,  -0.28775045,  -0.47069693, ...,  -0.45914922,
          0.01601137,   0.45803261]])

In [14]:
regression_X_train, regression_X_test, regression_y_train, regression_y_test = train_test_split(
    regression_X_processed,
    regression_y,
    test_size=0.2,
    random_state=1,
    shuffle=True
)


In [18]:
start_time = datetime.now()

svr_model = SVR()

svr_regressor_grid_search = GridSearchCV(
    estimator=svr_model,
    param_grid={
        'kernel': ['rbf'],  #Kernel to solve with model, should try rbf, poly, and sigmoid
        'gamma': [0.1, 0.5],  #Solver value important for rbf kernel
        #'degree': [2,3,4] #Used for polynomial kernel
        #'C': [0.001, 0.01, 0.1, 1, 10, 100] #Regularization parameter
    },
    cv=2,  #Number of fold for cross validation. It should be 8 or 10
    scoring={
        # All these are only viable in the negative option
        'MAE': 'neg_mean_absolute_error',
        'MSE': 'neg_mean_squared_error',
        'R2': 'r2'
    },
    refit='R2',

    n_jobs=1,
    # -1 means using all processors, but it won't give you any messages.
    # Only using 1 for my computer print out the training messages

    verbose=10  #Provide detailed messages
)

svr_regressor_grid_search.fit(regression_X_train, regression_y_train)

end_time = datetime.now()
print('GridSearchCV took {}', end_time - start_time)

Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV 1/2; 1/2] START gamma=0.1, kernel=rbf.......................................
[CV 1/2; 1/2] END gamma=0.1, kernel=rbf; MAE: (test=-0.534) MSE: (test=-0.501) R2: (test=0.103) total time=   0.0s
[CV 2/2; 1/2] START gamma=0.1, kernel=rbf.......................................
[CV 2/2; 1/2] END gamma=0.1, kernel=rbf; MAE: (test=-0.527) MSE: (test=-0.472) R2: (test=0.130) total time=   0.0s
[CV 1/2; 2/2] START gamma=0.5, kernel=rbf.......................................
[CV 1/2; 2/2] END gamma=0.5, kernel=rbf; MAE: (test=-0.537) MSE: (test=-0.505) R2: (test=0.095) total time=   0.0s
[CV 2/2; 2/2] START gamma=0.5, kernel=rbf.......................................
[CV 2/2; 2/2] END gamma=0.5, kernel=rbf; MAE: (test=-0.530) MSE: (test=-0.477) R2: (test=0.122) total time=   0.0s
GridSearchCV took {} 0:00:00.626605


## Classification: Centroid

In [19]:
classification_centroid_df = pd.read_csv(
    r"datasets/balanced_datasets/BBB_classification_balanced_centroid.csv.zip"
)

classification_centroid_df

Unnamed: 0,SMILES,BBB+/BBB-,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,...,157_y,158_y,159_y,160_y,161_y,162_y,163_y,164_y,165_y,166_y
0,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,BBB+,7.521176,7.521176,0.635885,0.411976,0.787615,16.129212,268.543,251.743,...,1.0,0.777778,0.333333,1.0,0.777778,1.0,1.0,1.0,1.0,0.0
1,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,BBB+,15.163364,15.163364,0.020535,-1.624425,0.147925,26.905882,1202.635,1090.747,...,1.0,1.000000,1.000000,1.0,1.000000,0.0,0.0,1.0,1.0,0.0
2,Oc1c(I)cc(Cl)c2cccnc12,BBB+,15.022344,15.022344,0.036460,-1.947471,0.109538,42.500000,1085.156,1008.548,...,1.0,0.000000,1.000000,1.0,0.000000,1.0,1.0,1.0,1.0,0.0
3,CCNC(=NCCSCc1ncccc1Br)NC#N,BBB+,15.129540,15.129540,0.022871,-1.609940,0.147925,26.905882,1202.635,1090.747,...,1.0,1.000000,1.000000,1.0,1.000000,0.0,0.0,1.0,1.0,0.0
4,CN1CC[C@]23c4c5ccc(OC6O[C@H](C(=O)O)[C@@H](O)[...,BBB+,14.995286,14.995286,0.035518,-1.944947,0.109538,42.500000,1085.156,1008.548,...,1.0,0.000000,1.000000,1.0,0.000000,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5697,CCCCCCCCCCCCCCCCCCCCCCO,BBB-,11.219721,11.219721,0.047430,-0.047430,0.770309,17.750000,256.092,249.036,...,0.0,1.000000,0.000000,0.0,1.000000,1.0,1.0,1.0,1.0,0.0
5698,CCCCCCCCCCCCCCCCO,BBB-,12.530603,12.530603,0.037140,-1.168657,0.614448,15.760000,332.311,320.215,...,1.0,0.000000,1.000000,0.0,0.000000,1.0,1.0,1.0,1.0,0.0
5699,CCCCCCCCCCCCCCCC[N+](C)(C)CCN(Cc1ccc(OC)cc1)c1...,BBB-,4.506501,4.506501,0.832250,0.832250,0.899820,13.700000,265.360,246.208,...,0.0,1.000000,0.000000,0.0,1.000000,1.0,1.0,0.0,1.0,0.0
5700,CCCCCCCCCCCCCCOS(=O)(=O)O,BBB-,11.364205,11.364205,0.122604,-0.349254,0.778670,9.750000,211.224,202.152,...,0.0,0.000000,0.000000,1.0,1.000000,1.0,1.0,1.0,1.0,0.0


In [20]:
# Data pre-processing

classification_X = classification_centroid_df.loc[
                   :,
                   ~classification_centroid_df.columns.isin(
                       ['SMILES', 'BBB+/BBB-'])
                   ]

classification_y = classification_centroid_df['BBB+/BBB-']


data_processing_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(
        n_components=0.95, #When using the svd_solver='full', n_components
        # can be between 0 and 1 to represent the percentage of variance
        # that you want to explain
        svd_solver='full'
    ))
])

classification_X_processed=data_processing_pipeline.fit_transform(classification_X)
classification_X_processed

array([[-6.42601324,  0.29530609,  4.04143496, ..., -1.0648704 ,
        -1.02364449, -1.04507957],
       [25.45366241,  1.86331758,  7.29782431, ..., -0.39649147,
         1.26883027,  2.24679253],
       [29.01725269, -6.53697365, 10.57512951, ..., -0.04419358,
         0.27012644, -0.12043378],
       ...,
       [-9.81614895, -0.34114706,  5.17398358, ...,  0.20136597,
         0.41088972, -2.19329225],
       [-9.08588563,  1.3507683 ,  2.55011334, ..., -0.28250091,
        -3.40581269, -0.13956248],
       [-9.65904595,  2.82594513,  2.2292678 , ...,  0.1880736 ,
        -0.07262933,  0.85691918]])

In [21]:
classification_X_train, classification_X_test, classification_y_train, classification_y_test = train_test_split(
    classification_X_processed,
    classification_y,
    test_size=0.2,
    random_state=1,
    shuffle=True,
    stratify=classification_y #Ensure train set and test set have the same
    # ratio for the 2 categories
)

In [24]:
start_time = datetime.now()

svc_centroid = SVC(
    random_state=1
)

svc_centroid_grid_search = GridSearchCV(
    estimator=svc_centroid,
    param_grid={
        # 'C': [0.001, 0.01, 0.1, 1, 10, 100], #Regularization parameter
        'kernel': ['rbf'], #Algorithm kernel. Should add poly and sigmoid
        #'degree': [2,3,4], #Polynomial degree
        'gamma': [0.1, 0.5] #Coefficient for algorithms
    },
    cv=RepeatedStratifiedKFold(
        n_splits=3,
        n_repeats=2, #Each time the split will be different
        random_state=1
    ),
    scoring={
        'Recall': make_scorer(
            recall_score, #Need pos_label
            pos_label='BBB+', #Without this, pos_label is default to be 1
            # and will through an error since 1 isn't "BBB+" or "BBB-"
            average='binary'
        ),
        'Precision': make_scorer(
            precision_score, #Need pos_label
            pos_label='BBB+',
            average='binary'
        ),
        'F1': make_scorer(
            f1_score, #Need pos_label
            pos_label='BBB+',
            average='binary'
        ),
        'Accuracy': 'accuracy', #accuracy_score doesn't need pos_label
        'Balanced accuracy': 'balanced_accuracy',
        'AUROC': 'roc_auc'
    },
    refit='AUROC',

    n_jobs=1,
    verbose=10
)

svc_centroid_grid_search.fit(classification_X_train, classification_y_train)

end_time = datetime.now()
print('GridSearchCV took {}'.format(end_time - start_time))

Fitting 6 folds for each of 2 candidates, totalling 12 fits
[CV 1/6; 1/2] START gamma=0.1, kernel=rbf.......................................
[CV 1/6; 1/2] END gamma=0.1, kernel=rbf; AUROC: (test=0.812) Accuracy: (test=0.738) Balanced accuracy: (test=0.738) F1: (test=0.791) Precision: (test=0.659) Recall: (test=0.987) total time=  18.6s
[CV 2/6; 1/2] START gamma=0.1, kernel=rbf.......................................
[CV 2/6; 1/2] END gamma=0.1, kernel=rbf; AUROC: (test=0.824) Accuracy: (test=0.749) Balanced accuracy: (test=0.749) F1: (test=0.797) Precision: (test=0.670) Recall: (test=0.984) total time=  16.5s
[CV 3/6; 1/2] START gamma=0.1, kernel=rbf.......................................
[CV 3/6; 1/2] END gamma=0.1, kernel=rbf; AUROC: (test=0.837) Accuracy: (test=0.768) Balanced accuracy: (test=0.768) F1: (test=0.810) Precision: (test=0.685) Recall: (test=0.991) total time=  16.4s
[CV 4/6; 1/2] START gamma=0.1, kernel=rbf.......................................
[CV 4/6; 1/2] END gamma=0

## Classification: SMOTEENN

In [25]:
classification_smoteenn_df = pd.read_csv(
    r"datasets/balanced_datasets/BBB_classification_balanced_smoteenn.csv.zip"
)

classification_smoteenn_df

Unnamed: 0,SMILES,BBB+/BBB-,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,...,157_y,158_y,159_y,160_y,161_y,162_y,163_y,164_y,165_y,166_y
0,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,BBB+,12.955912,12.955912,0.038527,-0.090963,0.541356,17.514286,477.649,438.337,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,BBB+,11.988073,11.988073,0.084512,-0.304363,0.493562,25.181818,314.426,284.186,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
2,Oc1c(I)cc(Cl)c2cccnc12,BBB+,12.226486,12.226486,0.111551,-0.322514,0.550314,10.928571,373.416,354.264,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,CCNC(=NCCSCc1ncccc1Br)NC#N,BBB+,12.055386,12.055386,0.034054,-0.791056,0.396405,21.000000,304.302,288.174,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,CN1CC[C@]23c4c5ccc(OC6O[C@H](C(=O)O)[C@@H](O)[...,BBB+,12.055386,12.055386,0.034054,-0.791056,0.396405,21.000000,304.302,288.174,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7013,CC1(C)S[C@@H]2[C@H](NC(=O)C34C[C@H]5C[C@@H](CC...,BBB-,12.990847,12.990847,0.047104,-0.275467,0.727574,24.518519,416.349,389.133,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
7014,CC1(C)S[C@@H]2[C@H](NC(=O)CCC[C@H](N)C(=O)O)C(...,BBB-,13.528728,13.528728,0.001028,-2.638422,0.205427,30.258065,429.429,406.245,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
7015,CC1(C)S[C@@H]2[C@H](NC(=O)[C@@H](C(=O)O)c3cccc...,BBB-,14.197646,14.197646,0.133072,-1.798815,0.238475,49.615385,748.996,676.420,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
7016,CC1(C)S[C@@H]2[C@H](NC(=O)[C@@H](C(=O)O)c3ccsc...,BBB-,12.031231,12.031231,0.080972,-4.151088,0.656447,21.350000,380.662,372.598,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0


In [26]:
# Data pre-processing

classification_X = classification_smoteenn_df.loc[
                   :,
                   ~classification_smoteenn_df.columns.isin(
                       ['SMILES', 'BBB+/BBB-'])
                   ]

classification_y = classification_smoteenn_df['BBB+/BBB-']


data_processing_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(
        n_components=0.95, #When using the svd_solver='full', n_components
        # can be between 0 and 1 to represent the percentage of variance
        # that you want to explain
        svd_solver='full'
    ))
])

classification_X_processed=data_processing_pipeline.fit_transform(classification_X)
classification_X_processed

array([[-1.81126047,  0.332442  ,  7.16579034, ..., -0.22047884,
        -1.84892849,  4.02465876],
       [-2.84095883, -1.83519149,  0.11927076, ...,  0.91383698,
         0.97879081, -0.51055539],
       [-5.21047776,  5.68744537,  4.1245998 , ...,  0.17337791,
         0.55800201, -0.08844594],
       ...,
       [19.22222078, -5.57056762,  7.61060965, ..., -0.10561963,
        -0.06470579,  0.22293541],
       [-6.74793141,  4.58313598, -0.42378292, ..., -0.65296936,
        -0.67723505,  0.45762744],
       [24.59168134,  3.51108364, 18.20634069, ..., -0.20088314,
         0.11885443,  0.04185005]])

In [27]:
classification_X_train, classification_X_test, classification_y_train, classification_y_test = train_test_split(
    classification_X_processed,
    classification_y,
    test_size=0.2,
    random_state=1,
    shuffle=True,
    stratify=classification_y #Ensure train set and test set have the same
    # ratio for the 2 categories
)

In [28]:
start_time = datetime.now()

svc_smoteenn = SVC(
    random_state=1
)

svc_smoteenn_grid_search = GridSearchCV(
    estimator=svc_smoteenn,
    param_grid={
        # 'C': [0.001, 0.01, 0.1, 1, 10, 100], #Regularization parameter
        'kernel': ['rbf'], #Algorithm kernel. Should add poly and sigmoid
        #'degree': [2,3,4], #Polynomial degree
        'gamma': [0.1, 0.5] #Coefficient for algorithms
    },
    cv=RepeatedStratifiedKFold(
        n_splits=3,
        n_repeats=2, #Each time the split will be different
        random_state=1
    ),
    scoring={
        'Recall': make_scorer(
            recall_score, #Need pos_label
            pos_label='BBB+', #Without this, pos_label is default to be 1
            # and will through an error since 1 isn't "BBB+" or "BBB-"
            average='binary'
        ),
        'Precision': make_scorer(
            precision_score, #Need pos_label
            pos_label='BBB+',
            average='binary'
        ),
        'F1': make_scorer(
            f1_score, #Need pos_label
            pos_label='BBB+',
            average='binary'
        ),
        'Accuracy': 'accuracy', #accuracy_score doesn't need pos_label
        'Balanced accuracy': 'balanced_accuracy',
        'AUROC': 'roc_auc'
    },
    refit='AUROC',

    n_jobs=1,
    verbose=10
)

svc_smoteenn_grid_search.fit(classification_X_train, classification_y_train)

end_time = datetime.now()
print('GridSearchCV took {}'.format(end_time - start_time))

Fitting 6 folds for each of 2 candidates, totalling 12 fits
[CV 1/6; 1/2] START gamma=0.1, kernel=rbf.......................................
[CV 1/6; 1/2] END gamma=0.1, kernel=rbf; AUROC: (test=0.971) Accuracy: (test=0.855) Balanced accuracy: (test=0.855) F1: (test=0.873) Precision: (test=0.774) Recall: (test=1.000) total time=  15.2s
[CV 2/6; 1/2] START gamma=0.1, kernel=rbf.......................................
[CV 2/6; 1/2] END gamma=0.1, kernel=rbf; AUROC: (test=0.970) Accuracy: (test=0.841) Balanced accuracy: (test=0.841) F1: (test=0.862) Precision: (test=0.758) Recall: (test=1.000) total time=  13.0s
[CV 3/6; 1/2] START gamma=0.1, kernel=rbf.......................................
[CV 3/6; 1/2] END gamma=0.1, kernel=rbf; AUROC: (test=0.963) Accuracy: (test=0.827) Balanced accuracy: (test=0.828) F1: (test=0.852) Precision: (test=0.742) Recall: (test=1.000) total time=  13.1s
[CV 4/6; 1/2] START gamma=0.1, kernel=rbf.......................................
[CV 4/6; 1/2] END gamma=0

# Save Data

In [32]:
svr_regressor_results_df = pd.DataFrame(svr_regressor_grid_search.cv_results_)
#Make the GridSearch results into a df
svr_regressor_results_df.drop(
    list(svr_regressor_results_df.filter(regex='time|split|std')),
    axis=1,
    inplace=True
)  # Remove columns that aren't very interesting

svr_regressor_results_df = svr_regressor_results_df.sort_values(
    by='rank_test_R2')
svr_regressor_results_df

Unnamed: 0,param_gamma,param_kernel,params,mean_test_MAE,rank_test_MAE,mean_test_MSE,rank_test_MSE,mean_test_R2,rank_test_R2
0,0.1,rbf,"{'gamma': 0.1, 'kernel': 'rbf'}",-0.530265,1,-0.486448,1,0.116536,1
1,0.5,rbf,"{'gamma': 0.5, 'kernel': 'rbf'}",-0.533852,2,-0.490969,2,0.108317,2


In [33]:
svr_regressor_results_df.to_csv(
    r'model_grid_search\svr_regressor\results.csv',
    index=False
)

In [37]:
best_svr_regressor = svr_regressor_grid_search.best_estimator_
save_pickle(
    best_svr_regressor,
    r'model_pickles\svr_regressor\best_svr_regressor.pkl'
)
# To load this best model again, use load_pickle(r'model_pickles\svr_regressor\best_rf_regressor.pkl')

0

In [29]:
svc_centroid_results_df = pd.DataFrame(svc_centroid_grid_search.cv_results_)
#Make the GridSearch results into a df

svc_centroid_results_df.drop(
    list(svc_centroid_results_df.filter(regex='time|split|std')),
    axis=1,
    inplace=True
)  # Remove columns that aren't very interesting
svc_centroid_results_df = svc_centroid_results_df.sort_values(
    by='rank_test_AUROC')

svc_centroid_results_df

Unnamed: 0,param_gamma,param_kernel,params,mean_test_Recall,rank_test_Recall,mean_test_Precision,rank_test_Precision,mean_test_F1,rank_test_F1,mean_test_Accuracy,rank_test_Accuracy,mean_test_Balanced accuracy,rank_test_Balanced accuracy,mean_test_AUROC,rank_test_AUROC
0,0.1,rbf,"{'gamma': 0.1, 'kernel': 'rbf'}",0.986628,1,0.674225,1,0.801008,1,0.754771,1,0.754718,1,0.825064,1
1,0.5,rbf,"{'gamma': 0.5, 'kernel': 'rbf'}",0.986628,1,0.673815,2,0.80072,2,0.754332,2,0.754279,2,0.810061,2


In [31]:
svc_centroid_results_df.to_csv(
    r'model_grid_search\svc_centroid_classifier\results.csv',
    index=False
)

In [42]:
best_svc_centroid_classifier = svc_centroid_grid_search.best_estimator_
save_pickle(
    best_svc_centroid_classifier,
    r'model_pickles\svc_centroid_classifier\best_svc_centroid_classifier.pkl'
)
# To load this best model again, use load_pickle(r'model_pickles\svc_centroid_classifier\best_svc_centroid_classifier.pkl')

0

In [39]:
svc_smoteenn_results_df = pd.DataFrame(svc_smoteenn_grid_search.cv_results_)
#Make the GridSearch results into a df

svc_smoteenn_results_df.drop(
    list(svc_smoteenn_results_df.filter(regex='time|split|std')),
    axis=1,
    inplace=True
)  # Remove columns that aren't very interesting
svc_smoteenn_results_df = svc_smoteenn_results_df.sort_values(
    by='rank_test_AUROC')

svc_smoteenn_results_df

Unnamed: 0,param_gamma,param_kernel,params,mean_test_Recall,rank_test_Recall,mean_test_Precision,rank_test_Precision,mean_test_F1,rank_test_F1,mean_test_Accuracy,rank_test_Accuracy,mean_test_Balanced accuracy,rank_test_Balanced accuracy,mean_test_AUROC,rank_test_AUROC
0,0.1,rbf,"{'gamma': 0.1, 'kernel': 'rbf'}",1.0,1,0.756672,1,0.861436,1,0.839863,1,0.840656,1,0.966726,1
1,0.5,rbf,"{'gamma': 0.5, 'kernel': 'rbf'}",1.0,1,0.74113,2,0.851269,2,0.826058,2,0.82692,2,0.9561,2


In [40]:
svc_smoteenn_results_df.to_csv(
    r'model_grid_search\svc_smoteenn_classifier\results.csv',
    index=False
)

In [41]:
best_svc_smoteenn_classifier = svc_smoteenn_grid_search.best_estimator_
save_pickle(
    best_svc_smoteenn_classifier,
    r'model_pickles\svc_smoteenn_classifier\best_svc_smoteenn_classifier.pkl'
)
# To load this best model again, use load_pickle(r'model_pickles\svc_smoteenn_classifier\best_svc_smoteenn_classifier.pkl')

0