# Data Splits

We want to split the SynFerm data set into train and test data.
For now, we define a 0D and a 1D split.

### 0D Split
For the 0D split, we use a random train-test split.
Due to the size of the data, we do not need to use CV or repeated sampling.
We use a 90/10 split.

### 1D Split
For the 1D split, we use a (1D) GroupShuffleSplit.
Each individual split will be 90/10 train/test (of groups, not samples!), so the fraction of samples in the test set only approximates 10%.
As groups, we use either initiator, monomer, or terminator.
For each of the 3 building blocks, we provide 3 splits for a total of 9 splits.

In [22]:
import sys
import os

sys.path.append(os.path.join("..", ".."))

import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, train_test_split

from src.definitions import DATA_DIR

In [4]:
# Read the curated SynFerm records into a DataFrame
dataset_path = DATA_DIR / "curated_data" / "synferm_dataset_2023-07-20_40433records.csv"
df = pd.read_csv(dataset_path)
df.shape  # (rows, columns), displayed as the cell output

In [5]:
# Preview the first five records
df.head(n=5)

Unnamed: 0,experiment_id,I_long,M_long,T_long,product_A_smiles,I_smiles,M_smiles,T_smiles,reaction_smiles,reaction_smiles_atom_mapped,...,binary_H,scaled_A,scaled_B,scaled_C,scaled_D,scaled_E,scaled_F,scaled_G,scaled_H,major_A-C
0,10578,Ph023,Mon017,TerTH010,CC(C)(C)OC(=O)CC[C@@H](Cc1nnc(C=Cc2ccccc2)s1)N...,O=C(c1ccc(Cl)cc1)[B-](F)(F)F.[K+],CC(C)(C)OC(=O)CC[C@H]1C[C@]2(ON1)OC1(CCCCC1)OC2=O,Cl.NNC(=S)/C=C/c1ccccc1,O=C(c1ccc(Cl)cc1)[B-](F)(F)F.CC(C)(C)OC(=O)CC[...,F[B-](F)(F)[C:2]([c:1]1[cH:13][cH:15][c:17]([C...,...,1.0,2.406501,1.281399,0.28207,0.0,0.413064,0.234782,5.510721,0.290641,A
1,10579,Ph023,Mon017,TerTH026,CC(C)(C)OC(=O)CC[C@@H](Cc1nnc(-c2cn[nH]c2)s1)N...,O=C(c1ccc(Cl)cc1)[B-](F)(F)F.[K+],CC(C)(C)OC(=O)CC[C@H]1C[C@]2(ON1)OC1(CCCCC1)OC2=O,Cl.NNC(=S)c1cn[nH]c1,O=C(c1ccc(Cl)cc1)[B-](F)(F)F.CC(C)(C)OC(=O)CC[...,F[B-](F)(F)[C:2]([c:1]1[cH:13][cH:15][c:17]([C...,...,0.0,0.378474,0.928819,0.237341,0.0,0.064908,0.342595,5.754573,0.0,B
2,10580,Ph023,Mon017,TerTH015,CC(C)(C)OC(=O)CC[C@@H](Cc1nnc(-c2cc(Cl)cc(Cl)c...,O=C(c1ccc(Cl)cc1)[B-](F)(F)F.[K+],CC(C)(C)OC(=O)CC[C@H]1C[C@]2(ON1)OC1(CCCCC1)OC2=O,Cl.NNC(=S)c1cc(Cl)cc(Cl)c1,O=C(c1ccc(Cl)cc1)[B-](F)(F)F.CC(C)(C)OC(=O)CC[...,F[B-](F)(F)[C:2]([c:1]1[cH:13][cH:15][c:17]([C...,...,0.0,0.921776,0.869821,0.041536,0.0,0.0,0.294589,5.655978,0.0,A
3,10581,Ph023,Mon017,TerTH020,CN(C)c1cccc(-c2nnc(C[C@H](CCC(=O)OC(C)(C)C)NC(...,O=C(c1ccc(Cl)cc1)[B-](F)(F)F.[K+],CC(C)(C)OC(=O)CC[C@H]1C[C@]2(ON1)OC1(CCCCC1)OC2=O,CN(C)c1cccc(C(=S)NN)c1.Cl,O=C(c1ccc(Cl)cc1)[B-](F)(F)F.CC(C)(C)OC(=O)CC[...,F[B-](F)(F)[C:2]([c:1]1[cH:13][cH:15][c:17]([C...,...,1.0,2.117499,2.55085,0.073327,0.000838,0.283949,0.324134,6.655333,0.213819,B
4,10584,Ph023,Mon017,TerABT001,CC(C)(C)OC(=O)CC[C@@H](Cc1nc2ccccc2s1)NC(=O)c1...,O=C(c1ccc(Cl)cc1)[B-](F)(F)F.[K+],CC(C)(C)OC(=O)CC[C@H]1C[C@]2(ON1)OC1(CCCCC1)OC2=O,Nc1ccccc1S,O=C(c1ccc(Cl)cc1)[B-](F)(F)F.CC(C)(C)OC(=O)CC[...,F[B-](F)(F)[C:2]([c:1]1[cH:16][cH:18][c:20]([C...,...,1.0,2.376621,0.011747,0.0,0.001575,0.209161,1.215448,7.303106,0.81199,A


## 0D split

In [10]:
# Random (0D) split over row positions: 10% of the samples are held out for testing.
# The fixed random_state keeps the split reproducible.
sample_positions = list(range(len(df)))
idx_train, idx_test = train_test_split(sample_positions, test_size=0.1, random_state=42)
len(idx_train), len(idx_test)

(36389, 4044)

In [11]:
# Persist the 0D training indices as a one-column CSV: header "index", then one row position per line.
split_dir = DATA_DIR / "curated_data" / "splits" / "0D_split"
# open(..., "w") does not create missing parent directories, so make sure the target directory exists
os.makedirs(split_dir, exist_ok=True)
with open(split_dir / "train_idx.csv", "w") as f:
    f.write("index\n")
    f.write("\n".join(str(idx) for idx in idx_train))

In [12]:
# Persist the 0D test indices as a one-column CSV: header "index", then one row position per line.
split_dir = DATA_DIR / "curated_data" / "splits" / "0D_split"
# open(..., "w") does not create missing parent directories, so make sure the target directory exists
os.makedirs(split_dir, exist_ok=True)
with open(split_dir / "test_idx.csv", "w") as f:
    f.write("index\n")
    f.write("\n".join(str(idx) for idx in idx_test))

## 1D split

In [34]:
# A RandomState instance (rather than an int seed) is passed on purpose: split() is called
# three times below, and each call should use a different, yet reproducible, random state.
# NOTE(review): the generator is stateful, so reproducing the saved splits presumably requires
# running the three split() cells exactly once, in order, after (re-)creating this splitter.
split_rng = np.random.RandomState(42)
group_splitter = GroupShuffleSplit(n_splits=3, test_size=0.1, random_state=split_rng)

In [35]:
# 1D split grouped by the "I_long" column (presumably the initiator building block): folds 0-2.
# Every sample sharing an "I_long" value lands on the same side of the split.
split_dir = DATA_DIR / "curated_data" / "splits" / "1D_split"
# open(..., "w") does not create missing parent directories, so make sure the target directory exists
os.makedirs(split_dir, exist_ok=True)
sample_positions = list(range(len(df)))
for fold, (train_idx, test_idx) in enumerate(group_splitter.split(sample_positions, groups=df["I_long"])):
    # Same file layout for both subsets: header "index", then one row position per line
    for subset, subset_idx in (("train", train_idx), ("test", test_idx)):
        with open(split_dir / f"fold_{fold}_{subset}_idx.csv", "w") as f:
            f.write("index\n")
            f.write("\n".join(str(idx) for idx in subset_idx))

In [36]:
# 1D split grouped by the "M_long" column (presumably the monomer building block): folds 3-5.
# Every sample sharing an "M_long" value lands on the same side of the split.
split_dir = DATA_DIR / "curated_data" / "splits" / "1D_split"
# open(..., "w") does not create missing parent directories, so make sure the target directory exists
os.makedirs(split_dir, exist_ok=True)
sample_positions = list(range(len(df)))
# start=3 continues the fold numbering after the three "I_long" folds
for fold, (train_idx, test_idx) in enumerate(group_splitter.split(sample_positions, groups=df["M_long"]), start=3):
    # Same file layout for both subsets: header "index", then one row position per line
    for subset, subset_idx in (("train", train_idx), ("test", test_idx)):
        with open(split_dir / f"fold_{fold}_{subset}_idx.csv", "w") as f:
            f.write("index\n")
            f.write("\n".join(str(idx) for idx in subset_idx))

In [37]:
# 1D split grouped by the "T_long" column (presumably the terminator building block): folds 6-8.
# Every sample sharing a "T_long" value lands on the same side of the split.
split_dir = DATA_DIR / "curated_data" / "splits" / "1D_split"
# open(..., "w") does not create missing parent directories, so make sure the target directory exists
os.makedirs(split_dir, exist_ok=True)
sample_positions = list(range(len(df)))
# start=6 continues the fold numbering after the "I_long" and "M_long" folds
for fold, (train_idx, test_idx) in enumerate(group_splitter.split(sample_positions, groups=df["T_long"]), start=6):
    # Same file layout for both subsets: header "index", then one row position per line
    for subset, subset_idx in (("train", train_idx), ("test", test_idx)):
        with open(split_dir / f"fold_{fold}_{subset}_idx.csv", "w") as f:
            f.write("index\n")
            f.write("\n".join(str(idx) for idx in subset_idx))

## Show statistics for splits

In [64]:
# 0D split: report the split sizes and the 'major_A-C' class balance of each subset
split_dir = DATA_DIR / "curated_data" / "splits" / "0D_split"
train_idx = pd.read_csv(split_dir / "train_idx.csv")["index"].values
test_idx = pd.read_csv(split_dir / "test_idx.csv")["index"].values
n_train, n_test = len(train_idx), len(test_idx)

lines = [f"0D split: {n_train} train, {n_test} ({n_test/(n_test+n_train):.1%}) test"]

# Per-class fraction of each subset, ordered by class label
train_fractions = df["major_A-C"].iloc[train_idx].value_counts().sort_index() / n_train
lines.extend(
    f"\tTraining set 'major_A-C' class {label}: {fraction:.1%}"
    for label, fraction in train_fractions.items()
)
test_fractions = df["major_A-C"].iloc[test_idx].value_counts().sort_index() / n_test
lines.extend(
    f"\tTest set 'major_A-C' class {label}: {fraction:.1%}"
    for label, fraction in test_fractions.items()
)

# Store the report next to the split files and echo it in the notebook
report = "\n".join(lines)
with open(split_dir / "split_statistics.txt", "w") as f:
    f.write(report)
print(report)

0D split: 36389 train, 4044 (10.0%) test
	Training set 'major_A-C' class A: 53.2%
	Training set 'major_A-C' class B: 21.9%
	Training set 'major_A-C' class C: 8.6%
	Training set 'major_A-C' class no_product: 16.2%
	Test set 'major_A-C' class A: 53.9%
	Test set 'major_A-C' class B: 21.5%
	Test set 'major_A-C' class C: 8.9%
	Test set 'major_A-C' class no_product: 15.7%


In [63]:
# 1D split: report the split sizes and the 'major_A-C' class balance of every fold
split_dir = DATA_DIR / "curated_data" / "splits" / "1D_split"
lines = []
for fold in range(9):  # folds 0-8: three splits for each of the three grouping columns
    train_idx = pd.read_csv(split_dir / f"fold_{fold}_train_idx.csv")["index"].values
    test_idx = pd.read_csv(split_dir / f"fold_{fold}_test_idx.csv")["index"].values
    n_train, n_test = len(train_idx), len(test_idx)
    lines.append(f"Fold {fold}: {n_train} train, {n_test} ({n_test/(n_test+n_train):.1%}) test")

    # Per-class fraction of each subset, ordered by class label.
    # Distinct loop names keep the fold counter from being rebound inside the fold loop.
    train_fractions = df["major_A-C"].iloc[train_idx].value_counts().sort_index() / n_train
    lines.extend(
        f"\tTraining set 'major_A-C' class {label}: {fraction:.1%}"
        for label, fraction in train_fractions.items()
    )
    test_fractions = df["major_A-C"].iloc[test_idx].value_counts().sort_index() / n_test
    lines.extend(
        f"\tTest set 'major_A-C' class {label}: {fraction:.1%}"
        for label, fraction in test_fractions.items()
    )

# Store the report next to the split files and echo it in the notebook
report = "\n".join(lines)
with open(split_dir / "split_statistics.txt", "w") as f:
    f.write(report)
print(report)

Fold 0: 35657 train, 4776 (11.8%) test
	Training set 'major_A-C' class A: 52.3%
	Training set 'major_A-C' class B: 21.8%
	Training set 'major_A-C' class C: 9.3%
	Training set 'major_A-C' class no_product: 16.6%
	Test set 'major_A-C' class A: 60.5%
	Test set 'major_A-C' class B: 22.2%
	Test set 'major_A-C' class C: 4.0%
	Test set 'major_A-C' class no_product: 13.3%
Fold 1: 36166 train, 4267 (10.6%) test
	Training set 'major_A-C' class A: 53.6%
	Training set 'major_A-C' class B: 21.8%
	Training set 'major_A-C' class C: 8.8%
	Training set 'major_A-C' class no_product: 15.7%
	Test set 'major_A-C' class A: 50.1%
	Test set 'major_A-C' class B: 22.2%
	Test set 'major_A-C' class C: 7.5%
	Test set 'major_A-C' class no_product: 20.3%
Fold 2: 36895 train, 3538 (8.8%) test
	Training set 'major_A-C' class A: 53.1%
	Training set 'major_A-C' class B: 21.8%
	Training set 'major_A-C' class C: 9.2%
	Training set 'major_A-C' class no_product: 16.0%
	Test set 'major_A-C' class A: 54.9%
	Test set 'major_A-