# Split data

In [40]:
import json
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rdkit.Chem import rdFingerprintGenerator
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from tqdm import tqdm
matplotlib.rcParams['mathtext.fontset'] = 'stix'
matplotlib.rcParams['font.family'] = 'STIXGeneral'

from rdmc.mol import RDKitMol

def make_rdkit_mol(smi):
    """Build a molecule object from a SMILES string.

    Args:
        smi: SMILES string passed to ``RDKitMol.FromSmiles``.

    Returns:
        The ``_mol`` attribute of the resulting ``RDKitMol``
        (presumably the wrapped RDKit molecule -- confirm against rdmc).
    """
    rdmc_mol = RDKitMol.FromSmiles(smi)
    return rdmc_mol._mol

In [41]:
# Load ../data/hbi_unc.csv; every split in this notebook is stored as row labels
# (index values) of this frame.
hbi_unc_df = pd.read_csv("../data/hbi_unc.csv")

# Random split

In [3]:
# Distinct radical SMILES. Splitting on these rather than on rows keeps all rows
# that share a radical_smiles value in the same partition.
unique_radicals = hbi_unc_df["radical_smiles"].unique()

In [4]:
# Hold out 10% of the unique radicals as the test set; the fixed random_state
# makes the partition reproducible.
train, test = train_test_split(unique_radicals, test_size=0.1, random_state=0)

In [5]:
# Training rows: every entry whose radical was assigned to the train partition.
train_df = hbi_unc_df.loc[hbi_unc_df["radical_smiles"].isin(train)]
train_inds = train_df.index.tolist()

In [6]:
# Test rows: every entry whose radical was assigned to the test partition.
test_df = hbi_unc_df.loc[hbi_unc_df["radical_smiles"].isin(test)]
test_inds = test_df.index.tolist()

In [7]:
# Persist the random split as a JSON array [train_inds, test_inds] holding
# hbi_unc_df row labels.
# Create the output directory first so the write also succeeds on a fresh
# checkout where ../data/splits does not exist yet (open(..., "w") does not
# create missing parent directories).
os.makedirs("../data/splits", exist_ok=True)
with open("../data/splits/random.json", "w") as f:
    json.dump((train_inds, test_inds), f)

# Random split with validation

In [3]:
unique_radicals = hbi_unc_df["radical_smiles"].unique()
# Same arguments and seed as the plain random split, so the held-out test
# radicals are the same in both split files.
train, test = train_test_split(unique_radicals, test_size=0.1, random_state=0)
# 0.1 / 0.9 of the remaining 90% equals 10% of all unique radicals, giving an
# approximately 80/10/10 train/val/test partition at the radical level.
train, val = train_test_split(train, test_size=0.1 / 0.9, random_state=0)

In [8]:
# Training rows: every entry whose radical was assigned to the train partition.
train_df = hbi_unc_df.loc[hbi_unc_df["radical_smiles"].isin(train)]
train_inds = train_df.index.tolist()
len(train_inds)

2257

In [9]:
# Validation rows: every entry whose radical was assigned to the val partition.
val_df = hbi_unc_df.loc[hbi_unc_df["radical_smiles"].isin(val)]
val_inds = val_df.index.tolist()
len(val_inds)

275

In [10]:
# Test rows: every entry whose radical was assigned to the test partition.
test_df = hbi_unc_df.loc[hbi_unc_df["radical_smiles"].isin(test)]
test_inds = test_df.index.tolist()
len(test_inds)

277

In [11]:
# Persist the split as a JSON array [train_inds, val_inds, test_inds] holding
# hbi_unc_df row labels.
# Create the output directory first so the write also succeeds on a fresh
# checkout where ../data/splits does not exist yet (open(..., "w") does not
# create missing parent directories).
os.makedirs("../data/splits", exist_ok=True)
with open("../data/splits/random_val.json", "w") as f:
    json.dump((train_inds, val_inds, test_inds), f)

# Cluster split

In [53]:
# Minimum fraction of clustered entries to hold out; whole clusters are added
# to the test set until this fraction is reached, so the final share can exceed it.
test_size = 0.1

In [42]:
# Cluster assignments; the cells below read its "cluster" and
# "resonance_radical_smiles" columns.
cluster_df = pd.read_csv("../data/resonance_radical_cluster.csv")

In [46]:
# Number of entries per cluster label. value_counts() sorts by descending count
# and to_dict() preserves that order; the test-cluster selection that follows
# relies on this ordering when it reverses the dict.
cluster_count_dict = cluster_df["cluster"].value_counts().to_dict()
cluster_count_dict

{2: 418,
 3: 362,
 10: 305,
 9: 278,
 8: 238,
 5: 212,
 0: 203,
 4: 172,
 12: 168,
 7: 114,
 1: 112,
 6: 95,
 11: 65,
 13: 39,
 14: 28}

In [64]:
# cluster_count_dict is ordered from the largest to the smallest cluster, so
# reversing its items walks the clusters in ascending size.
ascending_items = list(cluster_count_dict.items())[::-1]
sorted_clusters = np.array([cluster for cluster, _ in ascending_items])
sorted_cluster_counts = np.array([count for _, count in ascending_items])

# Cumulative share of all entries covered after adding each successive cluster.
running_total = np.cumsum(sorted_cluster_counts)
cluster_cumsums = running_total / running_total[-1]

# np.argmax on a boolean array gives the first position where the condition
# holds, i.e. the first cluster at which the cumulative share reaches test_size.
reached_target = cluster_cumsums >= test_size
ind = np.argmax(reached_target)
test_clusters = sorted_clusters[:ind+1]

# SMILES of every entry that belongs to one of the held-out clusters.
in_test_cluster = cluster_df["cluster"].isin(test_clusters)
test_smis = cluster_df.loc[in_test_cluster, "resonance_radical_smiles"]
test_smis

352     C[C](C)C(C)(C)OOC(C)(C)C(C)=O
456            C[C](C)C(C)(C)OOC(C)=O
555      C[C](C)C(C)(C)OOC(C)(C)C(C)C
592           C[C](C)C(C)(C)OOCC(C)=O
630             COOC(C)(C)C(C)(C)O[O]
                    ...              
2706                  [CH2]C=CC1C=CC1
2707                   C=C[CH]C1C=CC1
2726           C1=CC2[C](C1)C1C=CC2C1
2766                   [CH]=C1C=CC=C1
2785                   C1=CC[CH]CCC=1
Name: resonance_radical_smiles, Length: 339, dtype: object

In [68]:
# Rows whose resonance-radical SMILES belongs to a held-out cluster form the
# test set; every row not in the test set goes to train.
in_test = hbi_unc_df["resonance_radical_smiles"].isin(test_smis)
test_df = hbi_unc_df.loc[in_test]
in_test_index = hbi_unc_df.index.isin(test_df.index)
train_df = hbi_unc_df.loc[~in_test_index]

In [74]:
# Row labels of each partition, followed by the share of the full dataset
# that each partition covers (train first, then test).
train_inds = train_df.index.tolist()
test_inds = test_df.index.tolist()
n_rows = len(hbi_unc_df.index)
print(len(train_inds)/n_rows)
print(len(test_inds)/n_rows)

0.879316482734069
0.12068351726593093


In [70]:
# Persist the cluster split as a JSON array [train_inds, test_inds] holding
# hbi_unc_df row labels.
# Create the output directory first so the write also succeeds on a fresh
# checkout where ../data/splits does not exist yet (open(..., "w") does not
# create missing parent directories).
os.makedirs("../data/splits", exist_ok=True)
with open("../data/splits/cluster.json", "w") as f:
    json.dump((train_inds, test_inds), f)