In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from collections import defaultdict

import random
import os

In [2]:
# Input file locations, relative to the notebook's working directory:
#   smiles_file      -> CSV read below for its 'SMILES' column
#   y_file           -> CSV of targets (presumably normalised, going by the name -- TODO confirm)
#   coordinates_file -> NumPy .npy array, indexed per molecule along its first axis
smiles_file, y_file, coordinates_file = 'smiles.csv', 'norm_y.csv', 'coordinates.npy'

## Load Data

In [3]:
# Read the three inputs. Every later cell selects rows from all of them with
# the same integer positions, so they must describe the same molecules in the
# same order.
smiles = pd.read_csv(smiles_file)['SMILES']
y = pd.read_csv(y_file)
coordinates = np.load(coordinates_file)

# Fail fast on misaligned inputs: a longer `y` or `coordinates` would otherwise
# be split without any error and produce files that do not match the SMILES.
if not (len(smiles) == len(y) == len(coordinates)):
    raise ValueError(
        'Input row counts differ: '
        f'{len(smiles)} SMILES, {len(y)} target rows, {len(coordinates)} coordinate entries'
    )

## Determine scaffolds

In [4]:
def get_scaffold(smiles):
    """Return the Murcko scaffold of a molecule as a SMILES string.

    Thin wrapper around RDKit's ``MurckoScaffold.MurckoScaffoldSmiles``.

    Args:
        smiles: SMILES string of a single molecule.

    Returns:
        Whatever ``MurckoScaffoldSmiles`` returns for that molecule; it is used
        as the grouping key when molecules are bucketed by scaffold.
    """
    scaffold_smiles = MurckoScaffold.MurckoScaffoldSmiles(smiles=smiles)
    return scaffold_smiles

In [5]:
# Map each scaffold SMILES to the set of row positions of the molecules that
# share it. Positions (not SMILES) are stored so the same indices can later
# select rows from `smiles`, `y` and `coordinates`.
scaffold_dict = defaultdict(set)
# tqdm wraps the list itself (not the enumerate object) so it can read the
# length and show a real progress bar with total and ETA, not a bare counter.
for i, smile in enumerate(tqdm(smiles.to_list())):
    scaffold_dict[get_scaffold(smile)].add(i)

127356it [00:11, 10955.36it/s]


In [6]:
# Shuffle the scaffold groups with a dedicated, seeded generator so the split is
# identical on every Restart & Run All and is unaffected by any other use of
# the global `random` state.
SPLIT_SEED = 0
scaffold_groups = list(scaffold_dict.values())
# Exposed as an iterator: the split cell pulls groups from it one at a time and
# whatever is left over becomes the test set.
scaffold_sets = iter(random.Random(SPLIT_SEED).sample(scaffold_groups, len(scaffold_groups)))

## Create scaffold splits

In [7]:
# Target share of molecules per split; the remainder goes to the test set.
TRAIN_FRACTION = 0.8
VALIDATION_FRACTION = 0.1

n_molecules = len(smiles)
train_set, validation_set = set(), set()

# Whole scaffold groups are added until the target size is reached, so a
# scaffold never straddles two splits (sizes may overshoot slightly).
while len(train_set) < (TRAIN_FRACTION * n_molecules):
    group = next(scaffold_sets, None)
    if group is None:
        # With a fresh iterator the groups cover every molecule, so running out
        # here means `scaffold_sets` was already consumed by an earlier run.
        raise RuntimeError(
            'scaffold_sets ran out while filling the train split; '
            're-run the cell that builds scaffold_sets before this one'
        )
    train_set.update(group)

while len(validation_set) < (VALIDATION_FRACTION * n_molecules):
    group = next(scaffold_sets, None)
    if group is None:
        # A large final train group can leave less than the validation target;
        # keep what is available instead of failing with StopIteration.
        break
    validation_set.update(group)

# Everything not yet consumed from the iterator is the test split.
test_set = set().union(*scaffold_sets)

# The three splits must partition the dataset exactly.
assert len(train_set) + len(validation_set) + len(test_set) == n_molecules, \
    'splits do not cover every molecule exactly once'

# Sorted lists give a stable row order in the files written from these indices.
train_set, validation_set, test_set = sorted(train_set), sorted(validation_set), sorted(test_set)
print(len(train_set), len(validation_set), len(test_set))

101891 12738 12727


In [8]:
# Write each split's SMILES, targets and coordinates under ./scaffold.
OUTPUT_DIR = './scaffold'
# Create the output directory up front; to_csv/np.save do not create it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

split_indices = {
    'train': train_set,
    'validation': validation_set,
    'test': test_set,
}

for split_name, indices in split_indices.items():
    smiles[indices].to_csv(os.path.join(OUTPUT_DIR, f'smiles_{split_name}.csv'), index=False)
    y.loc[indices].to_csv(os.path.join(OUTPUT_DIR, f'y_{split_name}.csv'), index=False)
    # np.save appends '.npy' to any name lacking it, so the former
    # 'coordinates_<split>.csv' target always produced 'coordinates_<split>.csv.npy'.
    # The full name is spelled out here so the on-disk file is unchanged.
    # NOTE(review): consider renaming to 'coordinates_<split>.npy' once readers are updated.
    np.save(os.path.join(OUTPUT_DIR, f'coordinates_{split_name}.csv.npy'), coordinates[indices])