In [1]:
import numpy as np
from tqdm import tqdm

from rdkit.Chem.Scaffolds import MurckoScaffold
from collections import defaultdict

import random
import os

In [2]:
# Locations on the cluster scratch filesystem. Both strings keep their
# trailing slash because later cells build file paths by plain concatenation.
# `outdir` receives the train/validation/test splits; `indir` holds the raw arrays.
outdir = '/scratch/midway3/jshe/data/qm9/'
indir = '/scratch/midway3/jshe/data/qm9/raw/'

## Load Data

In [3]:
# Load every raw `.npy` array, keyed by the part of its file name before the
# first dot (e.g. 'smiles.npy' -> datas['smiles']).
# Directory entries that are not `.npy` files (sub-directories such as
# `.ipynb_checkpoints`, stray archives, editor backups) are skipped rather than
# handed to np.load, and the listing is sorted so the dict order does not
# depend on the filesystem's enumeration order.
datas = {
    fname.split('.')[0]: np.load(os.path.join(indir, fname))
    for fname in sorted(os.listdir(indir))
    if fname.endswith('.npy')
}

In [4]:
# SMILES array from the raw data; an entry's position in this array is the
# molecule index used by the scaffold grouping and the splits further down.
smiles = datas['smiles']

## Determine scaffolds

In [5]:
# Group molecule indices by scaffold: key = scaffold SMILES returned by RDKit's
# MurckoScaffoldSmiles, value = set of positions in `smiles` that share it.
scaffold_dict = defaultdict(set)
# tqdm wraps the array itself (not the enumerate() generator) so it can read
# len(smiles) and render a real progress bar with percentage and ETA instead
# of a bare iteration counter.
for i, smile in enumerate(tqdm(smiles)):
    scaffold_dict[MurckoScaffold.MurckoScaffoldSmiles(smile)].add(i)

133875it [00:13, 9900.14it/s] 


## Create scaffold splits

In [6]:
# Shuffle the scaffold groups with a fixed seed so the resulting split is the
# same on every Restart & Run All. A dedicated Random instance is used so the
# global `random` state is left untouched.
SPLIT_SEED = 0
scaffold_groups = list(scaffold_dict.values())
# NOTE: this is a one-shot iterator that the split cell consumes; re-run this
# cell before re-running the split, otherwise the iterator is already empty.
scaffold_sets = iter(random.Random(SPLIT_SEED).sample(scaffold_groups, len(scaffold_groups)))

In [7]:
TRAIN_FRACTION = 0.8       # minimum share of molecules placed in the training split
VALIDATION_FRACTION = 0.1  # minimum share of molecules placed in the validation split


def _take_groups(groups, target_size, split_name):
    """Collect whole scaffold groups until at least `target_size` indices are gathered.

    Args:
        groups: iterator yielding sets of molecule indices (one set per scaffold).
            It is advanced in place, so later calls continue where this one stopped.
        target_size: minimum number of indices to collect; may be fractional.
        split_name: label used only in the error message.

    Returns:
        The set of collected indices. Groups are never divided, so the result
        can overshoot `target_size` by up to the size of the last group taken.

    Raises:
        RuntimeError: if `groups` runs out before `target_size` is reached
            (e.g. the iterator was already consumed by an earlier run of this
            cell, or an oversized scaffold group left too few molecules).
    """
    taken = set()
    while len(taken) < target_size:
        group = next(groups, None)
        if group is None:
            raise RuntimeError(
                'scaffold_sets was exhausted while filling the ' + split_name
                + ' split; re-run the cell that builds scaffold_sets before this one'
            )
        taken.update(group)
    return taken


train_set = _take_groups(scaffold_sets, TRAIN_FRACTION * len(smiles), 'train')
validation_set = _take_groups(scaffold_sets, VALIDATION_FRACTION * len(smiles), 'validation')
# Every scaffold group not used above goes to the test split.
test_set = set().union(*scaffold_sets)

# Scaffold groups partition the molecule indices, so the three splits must add
# up to the full dataset; a mismatch means scaffold_sets was partially consumed
# before this cell ran.
assert len(train_set) + len(validation_set) + len(test_set) == len(smiles), \
    'splits do not cover every molecule; re-run the cell that builds scaffold_sets'

# Sorted so the saved arrays have a well-defined order (original dataset order)
# instead of whatever order set iteration happens to produce.
train_indices, validation_indices, test_indices = sorted(train_set), sorted(validation_set), sorted(test_set)
print('Train:', len(train_indices))
print('Validation:', len(validation_indices))
print('Test:', len(test_indices))

Train: 107179
Validation: 16569
Test: 10127


In [8]:
# Per-column statistics of the targets, computed over the training rows only
# (axis 0 runs over molecules); the same values are later stored next to every split.
y_mean = np.mean(datas['y'][train_indices], axis=0)
y_std = np.std(datas['y'][train_indices], axis=0)

In [10]:
# Write one directory per split under `outdir`. Each directory receives every
# loaded array restricted to that split's molecule indices, plus three files
# that are identical across splits: y_labels, y_mean and y_std.
split_indices = {
    'train': train_indices,
    'validation': validation_indices,
    'test': test_indices,
}

for split, indices in split_indices.items():
    split_dir = os.path.join(outdir, split)
    # np.save does not create missing directories, so make sure the target exists.
    os.makedirs(split_dir, exist_ok=True)

    for name, data in datas.items():
        # y_labels is stored whole further down rather than sliced by molecule
        # index (presumably it names the target columns - confirm against the raw data).
        if name == 'y_labels':
            continue
        # np.save appends the '.npy' extension to `name` automatically.
        np.save(os.path.join(split_dir, name), data[indices])

    np.save(os.path.join(split_dir, 'y_labels.npy'), datas['y_labels'])
    # The normalisation statistics come from the training rows only and are
    # written unchanged into every split directory.
    np.save(os.path.join(split_dir, 'y_mean.npy'), y_mean)
    np.save(os.path.join(split_dir, 'y_std.npy'), y_std)