# Import packages

In [1]:
import pickle  # Local Python (3.8) is fine with this. If you're suing Google
# colab, which uses a Python version of 3.6, you need to do import pickel5
# as pickle
import cloudpickle as cp
from urllib.request import urlopen

import numpy as np
import pandas as pd


from rdkit.Chem import PandasTools, AllChem, MACCSkeys
from mordred import Calculator, descriptors

from sklearn.feature_selection import VarianceThreshold  #For checking
# descriptors with low variance
from collinearity import SelectNonCollinear  #For throw out highly
# correlated variables

Failed to find the pandas get_adjustment() function to patch
Failed to patch pandas - PandasTools will have limited functionality


# Custom functions

In [2]:
from dataset_expansion import dataset_feature_expansion


def dump_pickle(content_to_dump,
                pickle_name_str):  #Dump the content_to_dump into a pickle. Remember to include ".pkl" in pickle_name_str
    cp.dump(content_to_dump, open(pickle_name_str,
                                  'wb'))  #Save the GridSearchCV object to be used later
    return


def load_cloud_pickle(
        github_url):  #Load pickles (contain trained models) from a GitHub link
    url_raw_file = github_url + '?raw=true'
    loaded_pickle_object = pickle.load(urlopen(url_raw_file))
    return loaded_pickle_object

# Read in data

## Regression

In [3]:
regression_df = pd.read_csv('datasets\cleaned_datasets\BBB_regression.csv')
regression_df

Unnamed: 0,SMILES,logBB
0,CN1C(=NN=N1)SCC2=C(N3C(C(C3=O)(NC(=O)C(C4=CC=C...,-2.52
1,CN1CC[C@]23[C@@H]4[C@H]1CC5=C2C(=C(C=C5)OC6[C@...,-2.15
2,CN1CC[C@]23[C@@H]4[C@H]1CC5=C2C(=C(C=C5)O)O[C@...,-2.09
3,CC1=NC=C(C=C1)CC2CNC(NC2=O)NCCCCC3=NC=C(C=C3C)Br,-1.88
4,c1(c2c3n(c4c(C(N(C)C3)=O)c(Cl)ccc4)cn2)noc(C(O...,-1.82
...,...,...
1053,C[NH2+]CCCN1C2=CC=CC=C2CCC3=CC=CC=C31,1.20
1054,CN(C)CCCN1C2=CC=CC=C2SC3=CC=CC=C31,1.23
1055,CN(C)CCOC(C1=CC=CC=C1)C2=CC=CC=C2,1.30
1056,CNCCCN1C2=CC=CC=C2SC3=C1C=C(C=C3)Cl,1.40


In [4]:
(regression_df['logBB'] <= -1.01).sum()  #These are BBB+

128

In [5]:
(regression_df['logBB'] >= -1).sum()  #These are BBB-

930

## Classification

In [6]:
classification_df = pd.read_csv(
    'datasets\cleaned_datasets\BBB_classification.csv'
)
classification_df

Unnamed: 0,SMILES,BBB+/BBB-
0,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,BBB-
1,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,BBB-
2,Oc1c(I)cc(Cl)c2cccnc12,BBB-
3,CCNC(=NCCSCc1ncccc1Br)NC#N,BBB-
4,CN1CC[C@]23c4c5ccc(OC6O[C@H](C(=O)O)[C@@H](O)[...,BBB-
...,...,...
7802,c1ccc(CN(CC2=NCCN2)c2ccccc2)cc1,BBB-
7803,CCOCCn1c(N2CCCN(C)CC2)nc2ccccc21,BBB+
7804,CN1CCC(=C2c3ccccc3CC(=O)c3sccc32)CC1,BBB+
7805,Cc1[nH]c(=O)c(C#N)cc1-c1ccncc1,BBB-


In [7]:
classification_df['BBB+/BBB-'].value_counts()

BBB+/BBB-
BBB+    4956
BBB-    2851
Name: count, dtype: int64

# Dataset expansion
Add descriptors and fingerprints

In [8]:
num_2D_descriptors = len(Calculator(descriptors, ignore_3D=True).descriptors)
print('Package mordred provided {} descriptors'.format(num_2D_descriptors))

Package mordred provided 1613 descriptors


## Regression

In [9]:
regression_df_expanded = dataset_feature_expansion(regression_df)
regression_df_expanded

Failed to patch pandas - unable to change molecule rendering


Generating mordred descriptors:   0%|          | 0/1058 [00:00<?, ?it/s]

  1%|▏         | 14/1058 [00:11<13:16,  1.31it/s] 

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  4%|▍         | 43/1058 [00:13<02:42,  6.23it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  4%|▍         | 45/1058 [00:14<03:52,  4.36it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  5%|▍         | 48/1058 [00:14<03:31,  4.78it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 1058/1058 [01:53<00:00,  9.29it/s]


Generating Morgan fingerprints:   0%|          | 0/1058 [00:00<?, ?it/s]

Generating MACCS keys:   0%|          | 0/1058 [00:00<?, ?it/s]

Unnamed: 0,SMILES,logBB,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,...,157_y,158_y,159_y,160_y,161_y,162_y,163_y,164_y,165_y,166_y
0,CN1C(=NN=N1)SCC2=C(N3C(C(C3=O)(NC(=O)C(C4=CC=C...,-2.52,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,4,0,45.430282,2.648849,5.297577,45.430282,...,1,1,1,1,1,1,1,1,1,0
1,CN1CC[C@]23[C@@H]4[C@H]1CC5=C2C(=C(C=C5)OC6[C@...,-2.15,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,1,1,43.435426,2.709582,5.329713,43.435426,...,1,1,1,1,1,1,1,1,1,0
2,CN1CC[C@]23[C@@H]4[C@H]1CC5=C2C(=C(C=C5)O)O[C@...,-2.09,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,1,1,43.437782,2.710688,5.333783,43.437782,...,1,1,1,1,1,1,1,1,1,0
3,CC1=NC=C(C=C1)CC2CNC(NC2=O)NCCCCC3=NC=C(C=C3C)Br,-1.88,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,2,35.718617,2.330522,4.661044,35.718617,...,0,1,0,1,1,1,1,1,1,0
4,c1(c2c3n(c4c(C(N(C)C3)=O)c(Cl)ccc4)cn2)noc(C(O...,-1.82,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,32.776103,2.541137,4.9327,32.776103,...,1,1,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1053,C[NH2+]CCCN1C2=CC=CC=C2CCC3=CC=CC=C31,1.20,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,26.815455,2.447262,4.821858,26.815455,...,0,1,0,1,1,1,1,0,1,0
1054,CN(C)CCCN1C2=CC=CC=C2SC3=CC=CC=C31,1.23,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,26.150381,2.474851,4.949701,26.150381,...,0,1,0,1,1,1,1,0,1,0
1055,CN(C)CCOC(C1=CC=CC=C1)C2=CC=CC=C2,1.30,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,24.616633,2.336046,4.672091,24.616633,...,1,1,0,1,1,1,1,1,1,0
1056,CNCCCN1C2=CC=CC=C2SC3=C1C=C(C=C3)Cl,1.40,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,26.456725,2.484489,4.968979,26.456725,...,0,1,0,1,1,1,1,0,1,0


In [10]:
regression_df_expanded.to_csv(
    'datasets\expanded_datasets\BBB_regression_expanded.csv')
print('Done!')

Done!


## Classification

In [11]:
classification_df_expanded = dataset_feature_expansion(classification_df)
classification_df_expanded

Failed to patch pandas - unable to change molecule rendering


Generating mordred descriptors:   0%|          | 0/7807 [00:00<?, ?it/s]

  1%|          | 48/7807 [00:13<1:09:30,  1.86it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  2%|▏         | 188/7807 [00:29<25:05,  5.06it/s] 

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  3%|▎         | 206/7807 [00:30<08:50, 14.33it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  3%|▎         | 224/7807 [00:32<10:37, 11.89it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 11%|█         | 831/7807 [01:30<17:51,  6.51it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 16%|█▌        | 1251/7807 [02:14<18:10,  6.01it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 26%|██▌       | 2009/7807 [03:51<28:43,  3.36it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 7807/7807 [16:45<00:00,  7.76it/s]


Generating Morgan fingerprints:   0%|          | 0/7807 [00:00<?, ?it/s]

Generating MACCS keys:   0%|          | 0/7807 [00:00<?, ?it/s]

Unnamed: 0,SMILES,BBB+/BBB-,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,...,157_y,158_y,159_y,160_y,161_y,162_y,163_y,164_y,165_y,166_y
0,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,BBB-,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,1,0,35.289886,2.38053,4.761059,35.289886,...,1,1,1,0,1,1,1,1,1,0
1,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,BBB-,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,4,0,45.430282,2.648849,5.297577,45.430282,...,1,1,1,1,1,1,1,1,1,0
2,Oc1c(I)cc(Cl)c2cccnc12,BBB-,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,16.678194,2.425683,4.851365,16.678194,...,1,0,0,0,1,1,1,1,1,0
3,CCNC(=NCCSCc1ncccc1Br)NC#N,BBB-,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,3,23.641772,2.237342,4.474683,23.641772,...,0,1,0,1,1,1,1,0,1,0
4,CN1CC[C@]23c4c5ccc(OC6O[C@H](C(=O)O)[C@@H](O)[...,BBB-,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,1,1,43.435426,2.709582,5.329713,43.435426,...,1,1,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7802,c1ccc(CN(CC2=NCCN2)c2ccccc2)cc1,BBB-,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,2,26.938755,2.326623,4.635214,26.938755,...,0,1,0,0,1,1,1,0,1,0
7803,CCOCCn1c(N2CCCN(C)CC2)nc2ccccc21,BBB+,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,28.885493,2.4832,4.768669,28.885493,...,1,1,0,1,1,1,1,1,1,0
7804,CN1CCC(=C2c3ccccc3CC(=O)c3sccc32)CC1,BBB+,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,29.305963,2.511472,4.918238,29.305963,...,0,1,0,1,1,1,1,1,1,0
7805,Cc1[nH]c(=O)c(C#N)cc1-c1ccncc1,BBB-,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,20.388826,2.390361,4.780722,20.388826,...,0,0,0,1,1,1,1,1,1,0


In [12]:
classification_df_expanded.to_csv(
    'datasets\expanded_datasets\BBB_classification_expanded.csv')
print('Done!')

Done!


# Dataset cleanup (feature selection)

In [None]:
feature_selector = VarianceThreshold(  #feature_selector is for getting rid
    # of low-variance features
    threshold=0  #Variance = 0 means to get rid of features with all the same
    # values
)

## Regression

In [None]:
# Remove low-variance features
print(
    'Before removing the zero-variance descriptors, the dataset has {} '
    'descriptors'.format(
        len(regression_df_expanded.columns)
    )
)

regression_features_df = regression_df_expanded.loc[
     :,
     ~regression_df_expanded.columns.isin(['SMILES', 'ROMol']) # Keep all
                         # the columns that aren't 'SMILES' or 'ROMol'
]

regression_features_df.columns = regression_features_df.columns.astype(str)
# Convert all column titles into str. This is required by feature_selector
# .fit_transform
feature_selector.fit_transform(regression_features_df)

regression_features_df_varianced = pd.DataFrame(
    feature_selector.fit_transform(
        regression_features_df
    ),
    columns=regression_features_df.columns[
        feature_selector.get_support(
            indices=True  # This returns the indices of the kept descriptors
        )
    ]
)

regression_df_expanded_varianced = pd.concat(
    [regression_df_expanded[
        [
            'SMILES',
            # 'ROMol' #Not adding the 'ROMol' column back since it's not
            # very useful for later modeling
        ]
    ],
    regression_features_df_varianced],
    axis=1
)

print(
    'After removing the zero-variance descriptors, the dataset has {} '
    'descriptors'.format(
        len(regression_df_expanded_varianced.columns)
    ),
)

regression_df_expanded_varianced

## Classification