To run this notebook, please install [`astartes`](https://github.com/JacksonBurns/astartes). The notebook also imports `rdkit` (used for the Murcko scaffolds), so install that as well.

In [1]:
import pickle as pkl

import numpy as np
import pandas as pd
from rdkit.Chem.Scaffolds import MurckoScaffold

from astartes import train_val_test_split

In [2]:
# Relative path to the Delaney CSV (SMILES strings with a logSolubility value).
CSV_PATH = "../data/delaney.csv"
# Read the table once; the bare name on the last line renders it as the cell output.
df = pd.read_csv(CSV_PATH)
df

Unnamed: 0,smiles,logSolubility
0,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...,-0.770
1,Cc1occc1C(=O)Nc2ccccc2,-3.300
2,CC(C)=CCCC(C)=CC(=O),-2.060
3,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43,-7.870
4,c1ccsc1,-1.330
...,...,...
1123,FC(F)(F)C(Cl)Br,-1.710
1124,CNC(=O)ON=C(SC)C(=O)N(C)C,0.106
1125,CCSCCSP(=S)(OC)OC,-3.091
1126,CCC(C)C,-3.180


In [3]:
# Summary statistics of the numeric data (here only the logSolubility column).
df.describe()

Unnamed: 0,logSolubility
count,1128.0
mean,-3.050102
std,2.096441
min,-11.6
25%,-4.3175
50%,-2.86
75%,-1.6
max,1.58


# Random Splits

In [4]:
# Build five random 85:5:10 train/val/test splits, one per seed (0-4).
# Only row positions are split (X is simply 0..len(df)-1), so each stored
# array holds positions that can be used to index the dataframe later.
RANDOM_SPLITS = []
sampler = "random"
split_fractions = dict(train_size=0.85, val_size=0.05, test_size=0.1)
for seed in range(5):
    split_output = train_val_test_split(
        np.arange(len(df)),
        sampler=sampler,
        random_state=seed,
        return_indices=True,
        **split_fractions,
    )
    # Six values are expected here; the first three (the X subsets) are
    # discarded and the trailing three index arrays are kept.
    _, _, _, train_indices, val_indices, test_indices = split_output
    RANDOM_SPLITS.append([train_indices, val_indices, test_indices])

In [5]:
# Persist the list of [train, val, test] index arrays (one entry per seed).
with open('delaney_splits_random.pkl', mode='wb') as splits_file:
    pkl.dump(RANDOM_SPLITS, splits_file)

# Scaffold Splits

In [6]:
# Compute the Murcko scaffold SMILES of every molecule and attach it as a new
# column so the scaffold distribution can be inspected.
murcko_smiles = df["smiles"].apply(MurckoScaffold.MurckoScaffoldSmiles)
df["murcko"] = murcko_smiles
df

Unnamed: 0,smiles,logSolubility,murcko
0,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...,-0.770,c1ccc(COC2CCCC(COC3CCCCO3)O2)cc1
1,Cc1occc1C(=O)Nc2ccccc2,-3.300,O=C(Nc1ccccc1)c1ccoc1
2,CC(C)=CCCC(C)=CC(=O),-2.060,
3,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43,-7.870,c1ccc2c(c1)ccc1c2ccc2c3ccccc3ccc21
4,c1ccsc1,-1.330,c1ccsc1
...,...,...,...
1123,FC(F)(F)C(Cl)Br,-1.710,
1124,CNC(=O)ON=C(SC)C(=O)N(C)C,0.106,
1125,CCSCCSP(=S)(OC)OC,-3.091,
1126,CCC(C)C,-3.180,


In [7]:
# 317 molecules yield an empty scaffold string (e.g. acyclic ones such as CCC(C)C),
# so they all fall under the same blank scaffold entry.
df["murcko"].value_counts()

                               317
c1ccccc1                       254
c1ccc(-c2ccccc2)cc1             39
c1ccc2ccccc2c1                  22
O=C1CC(=O)NC(=O)N1              21
                              ... 
O=c1cc(-c2ccccc2)oc2ccccc12      1
O=C1C2C3C4CC5C3C1C5C42           1
C1=CC(C2C=CC=C2)C=C1             1
O=C1CCC2c3ccc4ccccc4c3CCC12      1
O=C1NC(c2ccccc2)c2ccccc21        1
Name: murcko, Length: 269, dtype: int64

In [9]:
# Build five scaffold-based 85:5:10 train/val/test splits of the SMILES
# column, one per seed (0-4).
SCAFFOLD_SPLITS = []
sampler = "scaffold"
split_fractions = dict(train_size=0.85, val_size=0.05, test_size=0.1)
for seed in range(5):
    split_output = train_val_test_split(
        X=df.smiles.values,
        sampler=sampler,
        random_state=seed,
        return_indices=True,
        **split_fractions,
    )
    # Nine values are expected for this sampler: the X subsets, three
    # "*_labels" arrays (presumably the sampler's group assignments - confirm
    # against the astartes docs), and the index arrays. Only the index arrays
    # are stored.
    (
        X_train, X_val, X_test,
        train_labels, val_labels, test_labels,
        train_indices, val_indices, test_indices,
    ) = split_output
    # Sanity check: show the split sizes and the first few test indices.
    print(*(len(part) for part in (train_indices, val_indices, test_indices)))
    print(test_indices[:5])
    SCAFFOLD_SPLITS.append([train_indices, val_indices, test_indices])

960 56 112
[250 442 858 925 951]
960 56 112
[ 13 852 552 150 250]
960 56 112
[ 88 240 861 186 326]
960 56 112
[226 483 990 495 805]
960 56 112
[  5 401 602 286 353]


In [10]:
# Persist the list of [train, val, test] index arrays (one entry per seed).
with open('delaney_splits_scaffold.pkl', mode='wb') as splits_file:
    pkl.dump(SCAFFOLD_SPLITS, splits_file)