# Import packages

In [1]:
import pickle  # Local Python (3.8) is fine with this. If you're using Google
# Colab, which uses Python 3.6, you need to do `import pickle5 as pickle`
# instead
import cloudpickle as cp
from urllib.request import urlopen

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from collinearity import SelectNonCollinear

# Custom functions

In [2]:
from dataset_expansion import dataset_feature_expansion
from dataset_cleanup import filter_low_variance


def dump_pickle(content_to_dump, pickle_name_str):
    """Serialize an object to a pickle file with cloudpickle.

    Parameters
    ----------
    content_to_dump : object
        The object to save for later use (e.g. a GridSearchCV object).
    pickle_name_str : str
        Destination file path. Remember to include ".pkl" in the name.

    Returns
    -------
    None
    """
    # The context manager flushes and closes the file even if dumping raises;
    # a bare open() call would leave the handle open until garbage collection.
    with open(pickle_name_str, 'wb') as pickle_file:
        cp.dump(content_to_dump, pickle_file)


def load_cloud_pickle(github_url):
    """Load a pickled object (e.g. a trained model) from a GitHub link.

    Parameters
    ----------
    github_url : str
        Link to the pickle file on GitHub. '?raw=true' is appended to it so
        that the raw file is requested.

    Returns
    -------
    object
        The unpickled object.

    Notes
    -----
    SECURITY: unpickling can execute arbitrary code. Only pass URLs of
    repositories you trust.
    """
    url_raw_file = github_url + '?raw=true'
    # Close the HTTP response as soon as the object has been read instead of
    # leaving the connection open.
    with urlopen(url_raw_file) as response:
        loaded_pickle_object = pickle.load(response)
    return loaded_pickle_object

Failed to find the pandas get_adjustment() function to patch
Failed to patch pandas - PandasTools will have limited functionality


# Read in data

## Regression

In [3]:
# Forward slashes work on Windows, macOS and Linux alike, and avoid the
# invalid escape sequences ("\c", "\B") that a backslash-separated literal
# contains.
regression_df = pd.read_csv('datasets/cleaned_datasets/BBB_regression.csv')
regression_df

Unnamed: 0,SMILES,logBB
0,CN1C(=NN=N1)SCC2=C(N3C(C(C3=O)(NC(=O)C(C4=CC=C...,-2.52
1,CN1CC[C@]23[C@@H]4[C@H]1CC5=C2C(=C(C=C5)OC6[C@...,-2.15
2,CN1CC[C@]23[C@@H]4[C@H]1CC5=C2C(=C(C=C5)O)O[C@...,-2.09
3,CC1=NC=C(C=C1)CC2CNC(NC2=O)NCCCCC3=NC=C(C=C3C)Br,-1.88
4,c1(c2c3n(c4c(C(N(C)C3)=O)c(Cl)ccc4)cn2)noc(C(O...,-1.82
...,...,...
1053,C[NH2+]CCCN1C2=CC=CC=C2CCC3=CC=CC=C31,1.20
1054,CN(C)CCCN1C2=CC=CC=C2SC3=CC=CC=C31,1.23
1055,CN(C)CCOC(C1=CC=CC=C1)C2=CC=CC=C2,1.30
1056,CNCCCN1C2=CC=CC=C2SC3=C1C=C(C=C3)Cl,1.40


In [4]:
# Compounds with logBB below -1 are the poorly brain-penetrant class, BBB-.
# The strict `< -1` is the exact complement of `logBB >= -1`, so no value can
# fall between the two classes.
(regression_df['logBB'] < -1).sum()

128

In [5]:
(regression_df['logBB'] >= -1).sum()  # These are BBB+ (logBB of -1 or higher)

930

## Classification

In [6]:
# Forward slashes keep the path portable across operating systems and avoid
# the invalid escape sequences ("\c", "\B") of a backslash-separated literal.
classification_df = pd.read_csv(
    'datasets/cleaned_datasets/BBB_classification.csv'
)
classification_df

Unnamed: 0,SMILES,BBB+/BBB-
0,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,BBB-
1,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,BBB-
2,Oc1c(I)cc(Cl)c2cccnc12,BBB-
3,CCNC(=NCCSCc1ncccc1Br)NC#N,BBB-
4,CN1CC[C@]23c4c5ccc(OC6O[C@H](C(=O)O)[C@@H](O)[...,BBB-
...,...,...
7802,c1ccc(CN(CC2=NCCN2)c2ccccc2)cc1,BBB-
7803,CCOCCn1c(N2CCCN(C)CC2)nc2ccccc21,BBB+
7804,CN1CCC(=C2c3ccccc3CC(=O)c3sccc32)CC1,BBB+
7805,Cc1[nH]c(=O)c(C#N)cc1-c1ccncc1,BBB-


In [7]:
# Class balance: number of rows per label in the 'BBB+/BBB-' column.
classification_df.loc[:, 'BBB+/BBB-'].value_counts()

BBB+/BBB-
BBB+    4956
BBB-    2851
Name: count, dtype: int64

# Dataset expansion & cleaning
Major expansion steps:
1. Add in RDKit descriptors
2. Add in Morgan fingerprints
3. Add in MACCS keys

Major cleaning steps:
1. Remove columns whose variance is 0, i.e. all values are the same
    * Done by a function so that the variance threshold used for
    filtering can be adjusted later

## Regression

In [8]:
# Expand the regression dataset with the extra feature columns; the second
# return value collects the expansion errors.
(regression_df_expanded,
 regression_expansion_errors) = dataset_feature_expansion(regression_df)
# SMILES missing from the frame shown below are the chemicals that hit errors
# while going through the calculations.
regression_df_expanded

Failed to patch pandas - unable to change molecule rendering


Generating 210 RDKit descriptors:   0%|          | 0/1058 [00:00<?, ?it/s]

Generating 4096 Morgan fingerprints:   0%|          | 0/1058 [00:00<?, ?it/s]

Generating 167 MACCS keys:   0%|          | 0/1058 [00:00<?, ?it/s]

Unnamed: 0,SMILES,logBB,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,...,157_y,158_y,159_y,160_y,161_y,162_y,163_y,164_y,165_y,166_y
0,CN1C(=NN=N1)SCC2=C(N3C(C(C3=O)(NC(=O)C(C4=CC=C...,-2.52,13.190522,13.190522,0.042537,-2.144257,0.133795,22.000000,520.480,500.320,...,1,1,1,1,1,1,1,1,1,0
1,CN1CC[C@]23[C@@H]4[C@H]1CC5=C2C(=C(C=C5)OC6[C@...,-2.15,11.445328,11.445328,0.165306,-1.798901,0.346256,45.303030,461.467,434.251,...,1,1,1,1,1,1,1,1,1,0
2,CN1CC[C@]23[C@@H]4[C@H]1CC5=C2C(=C(C=C5)O)O[C@...,-2.09,11.479044,11.479044,0.060963,-1.790095,0.359144,45.393939,461.467,434.251,...,1,1,1,1,1,1,1,1,1,0
3,CC1=NC=C(C=C1)CC2CNC(NC2=O)NCCCCC3=NC=C(C=C3C)Br,-1.88,12.391214,12.391214,0.061101,-0.159783,0.543803,19.464286,446.393,418.169,...,0,1,0,1,1,1,1,1,1,0
4,c1(c2c3n(c4c(C(N(C)C3)=O)c(Cl)ccc4)cn2)noc(C(O...,-1.82,12.699094,12.699094,0.092039,-2.255140,0.648321,14.192308,375.772,361.660,...,1,1,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1046,C[NH2+]CCCN1C2=CC=CC=C2CCC3=CC=CC=C31,1.20,2.515046,2.515046,1.095602,1.095602,0.843816,13.550000,267.396,244.212,...,0,1,0,1,1,1,1,0,1,0
1047,CN(C)CCCN1C2=CC=CC=C2SC3=CC=CC=C31,1.23,2.462963,2.462963,1.062269,1.062269,0.828858,13.250000,284.428,264.268,...,0,1,0,1,1,1,1,0,1,0
1048,CN(C)CCOC(C1=CC=CC=C1)C2=CC=CC=C2,1.30,6.083380,6.083380,0.016065,0.016065,0.784550,11.157895,255.361,234.193,...,1,1,0,1,1,1,1,1,1,0
1049,CNCCCN1C2=CC=CC=C2SC3=C1C=C(C=C3)Cl,1.40,6.182100,6.182100,0.793840,0.793840,0.834133,13.000000,304.846,287.710,...,0,1,0,1,1,1,1,0,1,0


In [9]:
# Filter descriptors by variance with a threshold of 0. 'SMILES' and 'logBB'
# are passed as the columns to exclude -- presumably they are left out of the
# variance check; confirm in dataset_cleanup.filter_low_variance.
regression_df_expanded_cleaned = filter_low_variance(
    regression_df_expanded,
    threshold_level=0,
    exclude_col_list=['SMILES', 'logBB'])
regression_df_expanded_cleaned

Before removing the low-variance descriptors, the dataset has 4473 descriptors
After removing the zero-variance descriptors, the dataset has 4103 descriptors


Unnamed: 0,SMILES,logBB,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,...,157_y,158_y,159_y,160_y,161_y,162_y,163_y,164_y,165_y,166_y
0,CN1C(=NN=N1)SCC2=C(N3C(C(C3=O)(NC(=O)C(C4=CC=C...,-2.52,13.190522,13.190522,0.042537,-2.144257,0.133795,22.000000,520.480,500.320,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,CN1CC[C@]23[C@@H]4[C@H]1CC5=C2C(=C(C=C5)OC6[C@...,-2.15,11.445328,11.445328,0.165306,-1.798901,0.346256,45.303030,461.467,434.251,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,CN1CC[C@]23[C@@H]4[C@H]1CC5=C2C(=C(C=C5)O)O[C@...,-2.09,11.479044,11.479044,0.060963,-1.790095,0.359144,45.393939,461.467,434.251,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,CC1=NC=C(C=C1)CC2CNC(NC2=O)NCCCCC3=NC=C(C=C3C)Br,-1.88,12.391214,12.391214,0.061101,-0.159783,0.543803,19.464286,446.393,418.169,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,c1(c2c3n(c4c(C(N(C)C3)=O)c(Cl)ccc4)cn2)noc(C(O...,-1.82,12.699094,12.699094,0.092039,-2.255140,0.648321,14.192308,375.772,361.660,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1046,C[NH2+]CCCN1C2=CC=CC=C2CCC3=CC=CC=C31,1.20,2.515046,2.515046,1.095602,1.095602,0.843816,13.550000,267.396,244.212,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
1047,CN(C)CCCN1C2=CC=CC=C2SC3=CC=CC=C31,1.23,2.462963,2.462963,1.062269,1.062269,0.828858,13.250000,284.428,264.268,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
1048,CN(C)CCOC(C1=CC=CC=C1)C2=CC=CC=C2,1.30,6.083380,6.083380,0.016065,0.016065,0.784550,11.157895,255.361,234.193,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1049,CNCCCN1C2=CC=CC=C2SC3=C1C=C(C=C3)Cl,1.40,6.182100,6.182100,0.793840,0.793840,0.834133,13.000000,304.846,287.710,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0


In [10]:
# Forward slashes keep the output path portable across operating systems and
# avoid the invalid escape sequences ("\e", "\B") of a backslash-separated
# literal.
regression_df_expanded_cleaned.to_csv(
    'datasets/expanded_datasets/BBB_regression_expanded.csv.zip',
    index=False,
    # Zip is used here since the classification dataset will become very
    # large. Zipped .csv files can be read directly by pd.read_csv().
    compression='zip'
)
print('Done!')

Done!


## Classification

In [11]:
# Expand the classification dataset with the extra feature columns; the
# second return value collects the expansion errors.
(classification_df_expanded,
 classification_expansion_errors) = dataset_feature_expansion(classification_df)
classification_df_expanded

Failed to patch pandas - unable to change molecule rendering


Generating 210 RDKit descriptors:   0%|          | 0/7807 [00:00<?, ?it/s]

Generating 4096 Morgan fingerprints:   0%|          | 0/7807 [00:00<?, ?it/s]

Generating 167 MACCS keys:   0%|          | 0/7807 [00:00<?, ?it/s]

Unnamed: 0,SMILES,BBB+/BBB-,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,...,157_y,158_y,159_y,160_y,161_y,162_y,163_y,164_y,165_y,166_y
0,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,BBB-,12.341010,12.341010,0.023055,-3.794932,0.540588,11.428571,398.400,384.288,...,1,1,1,0,1,1,1,1,1,0
1,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,BBB-,13.190522,13.190522,0.042537,-2.144257,0.133795,22.000000,520.480,500.320,...,1,1,1,1,1,1,1,1,1,0
2,Oc1c(I)cc(Cl)c2cccnc12,BBB-,9.654043,9.654043,0.195000,0.195000,0.758308,10.615385,305.502,300.462,...,1,0,0,0,1,1,1,1,1,0
3,CCNC(=NCCSCc1ncccc1Br)NC#N,BBB-,8.544584,8.544584,0.532052,0.532052,0.272365,10.894737,342.266,326.138,...,0,1,0,1,1,1,1,0,1,0
4,CN1CC[C@]23c4c5ccc(OC6O[C@H](C(=O)O)[C@@H](O)[...,BBB-,11.445328,11.445328,0.165306,-1.798901,0.346256,45.303030,461.467,434.251,...,1,1,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7797,c1ccc(CN(CC2=NCCN2)c2ccccc2)cc1,BBB-,4.506501,4.506501,0.832250,0.832250,0.899820,13.700000,265.360,246.208,...,0,1,0,0,1,1,1,0,1,0
7798,CCOCCn1c(N2CCCN(C)CC2)nc2ccccc21,BBB+,5.564458,5.564458,0.733727,0.733727,0.793110,17.090909,302.422,276.214,...,1,1,0,1,1,1,1,1,1,0
7799,CN1CCC(=C2c3ccccc3CC(=O)c3sccc32)CC1,BBB+,12.589347,12.589347,0.264794,0.264794,0.732528,18.863636,309.434,290.282,...,0,1,0,1,1,1,1,1,1,0
7800,Cc1[nH]c(=O)c(C#N)cc1-c1ccncc1,BBB-,11.364205,11.364205,0.122604,-0.349254,0.778670,9.750000,211.224,202.152,...,0,0,0,1,1,1,1,1,1,0


In [12]:
# Filter descriptors by variance with a threshold of 0. 'SMILES' and
# 'BBB+/BBB-' are passed as the columns to exclude -- presumably they are left
# out of the variance check; confirm in dataset_cleanup.filter_low_variance.
classification_df_expanded_cleaned = filter_low_variance(
    classification_df_expanded,
    threshold_level=0,
    exclude_col_list=['SMILES', 'BBB+/BBB-'])
classification_df_expanded_cleaned

Before removing the low-variance descriptors, the dataset has 4473 descriptors
After removing the zero-variance descriptors, the dataset has 4456 descriptors


Unnamed: 0,SMILES,BBB+/BBB-,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,...,157_y,158_y,159_y,160_y,161_y,162_y,163_y,164_y,165_y,166_y
0,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,BBB-,12.341010,12.341010,0.023055,-3.794932,0.540588,11.428571,398.400,384.288,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
1,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,BBB-,13.190522,13.190522,0.042537,-2.144257,0.133795,22.000000,520.480,500.320,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,Oc1c(I)cc(Cl)c2cccnc12,BBB-,9.654043,9.654043,0.195000,0.195000,0.758308,10.615385,305.502,300.462,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
3,CCNC(=NCCSCc1ncccc1Br)NC#N,BBB-,8.544584,8.544584,0.532052,0.532052,0.272365,10.894737,342.266,326.138,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
4,CN1CC[C@]23c4c5ccc(OC6O[C@H](C(=O)O)[C@@H](O)[...,BBB-,11.445328,11.445328,0.165306,-1.798901,0.346256,45.303030,461.467,434.251,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7797,c1ccc(CN(CC2=NCCN2)c2ccccc2)cc1,BBB-,4.506501,4.506501,0.832250,0.832250,0.899820,13.700000,265.360,246.208,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0
7798,CCOCCn1c(N2CCCN(C)CC2)nc2ccccc21,BBB+,5.564458,5.564458,0.733727,0.733727,0.793110,17.090909,302.422,276.214,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
7799,CN1CCC(=C2c3ccccc3CC(=O)c3sccc32)CC1,BBB+,12.589347,12.589347,0.264794,0.264794,0.732528,18.863636,309.434,290.282,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
7800,Cc1[nH]c(=O)c(C#N)cc1-c1ccncc1,BBB-,11.364205,11.364205,0.122604,-0.349254,0.778670,9.750000,211.224,202.152,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [13]:
# Forward slashes keep the output path portable across operating systems and
# avoid the invalid escape sequences ("\e", "\B") of a backslash-separated
# literal.
classification_df_expanded_cleaned.to_csv(
    'datasets/expanded_datasets/BBB_classification_expanded.csv.zip',
    index=False,
    # The expanded dataset is written as a zipped .csv, which pd.read_csv()
    # can read back directly.
    compression='zip'
)
print('Done!')

Done!


# Rebalance datasets
Done before centering and standardization

## Regression

In [15]:
# Re-load the expanded regression dataset from the zipped .csv on disk.
# Forward slashes keep the path portable and avoid the invalid escape
# sequences ("\e", "\B") of a backslash-separated literal. The call is wrapped
# in parentheses rather than split with a trailing backslash.
regression_df_expanded_cleaned = pd.read_csv(
    'datasets/expanded_datasets/BBB_regression_expanded.csv.zip')
regression_df_expanded_cleaned

Unnamed: 0,SMILES,logBB,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,...,157_y,158_y,159_y,160_y,161_y,162_y,163_y,164_y,165_y,166_y
0,CN1C(=NN=N1)SCC2=C(N3C(C(C3=O)(NC(=O)C(C4=CC=C...,-2.52,13.190522,13.190522,0.042537,-2.144257,0.133795,22.000000,520.480,500.320,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,CN1CC[C@]23[C@@H]4[C@H]1CC5=C2C(=C(C=C5)OC6[C@...,-2.15,11.445328,11.445328,0.165306,-1.798901,0.346256,45.303030,461.467,434.251,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,CN1CC[C@]23[C@@H]4[C@H]1CC5=C2C(=C(C=C5)O)O[C@...,-2.09,11.479044,11.479044,0.060963,-1.790095,0.359144,45.393939,461.467,434.251,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,CC1=NC=C(C=C1)CC2CNC(NC2=O)NCCCCC3=NC=C(C=C3C)Br,-1.88,12.391214,12.391214,0.061101,-0.159783,0.543803,19.464286,446.393,418.169,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,c1(c2c3n(c4c(C(N(C)C3)=O)c(Cl)ccc4)cn2)noc(C(O...,-1.82,12.699094,12.699094,0.092039,-2.255140,0.648321,14.192308,375.772,361.660,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1046,C[NH2+]CCCN1C2=CC=CC=C2CCC3=CC=CC=C31,1.20,2.515046,2.515046,1.095602,1.095602,0.843816,13.550000,267.396,244.212,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
1047,CN(C)CCCN1C2=CC=CC=C2SC3=CC=CC=C31,1.23,2.462963,2.462963,1.062269,1.062269,0.828858,13.250000,284.428,264.268,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
1048,CN(C)CCOC(C1=CC=CC=C1)C2=CC=CC=C2,1.30,6.083380,6.083380,0.016065,0.016065,0.784550,11.157895,255.361,234.193,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1049,CNCCCN1C2=CC=CC=C2SC3=C1C=C(C=C3)Cl,1.40,6.182100,6.182100,0.793840,0.793840,0.834133,13.000000,304.846,287.710,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0


# Model training
Dataset transformations will be done along the way

## Regression

## Classification