# Data Splits Truncated

## Diastereomers
For the 0D split, this is not important, but for 1D/2D/3D, we want to define the groups such that diastereomers will always be in the same group. These splits will receive a `_dia` suffix.

## Synthetic data
For the 1D/2D/3D problems we use a synthetically amended data set. Splits of the synthetically amended data set will receive a `_synthetic` suffix.

In [None]:
import pathlib
import sys

sys.path.append(str(pathlib.Path().resolve().parents[1]))

import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, ShuffleSplit

from src.definitions import DATA_DIR
from src.util.train_test_split import GroupShuffleSplitND
from util import write_indices_and_stats

In [None]:
# Read the curated data set into the notebook-level DataFrame `df`.
data_filename = "synferm_dataset_2024-04-18_38586records.csv"
# data_name is the file name up to (not including) its last underscore.
data_name = data_filename.rpartition("_")[0]
df = pd.read_csv(DATA_DIR / "curated_data" / data_filename)
# Last expression of the cell: shows (n_rows, n_columns).
df.shape

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Diastereomeric monomers: each key is relabelled to its value, so both members
# of a pair carry the same M_long_dia label and therefore fall into the same
# group when M_long_dia is used as a grouping key.
diastereomers = dict(
    Mon001="Mon087",
    Mon003="Mon078",
    Mon011="Mon088",
    Mon013="Mon074",
    Mon014="Mon090",
    Mon015="Mon076",
    Mon016="Mon096",
    Mon017="Mon075",
    Mon019="Mon091",
    Mon020="Mon077",
    Mon080="Mon010",
)
df["M_long_dia"] = df["M_long"].replace(diastereomers)

## 0D split

In [None]:
def split_0d(splitter, inner_splitter, data=None):
    """Create random (0D) train/validation/test splits.

    Each outer fold yielded by ``splitter`` is split once more with
    ``inner_splitter`` to separate a validation set from the training set.

    Args:
        splitter: Object whose ``split(X)`` yields ``(train_val, test)`` arrays
            of positional row indices (e.g. ``ShuffleSplit``).
        inner_splitter: Object with the same interface; only its first split
            is used for every outer fold.
        data: DataFrame to split; must contain the columns ``binary_A``,
            ``binary_B`` and ``binary_C``. Defaults to the notebook-level ``df``.

    Returns:
        Tuple ``(indices, sizes, pos_class)`` of lists with one entry per outer
        fold. Every entry is a ``(train, val, test)`` tuple holding the
        positional row indices, the number of rows, and the per-column sums of
        the three ``binary_*`` columns, respectively.
    """
    if data is None:
        data = df  # fall back to the DataFrame loaded at the top of the notebook
    labels = data[["binary_A", "binary_B", "binary_C"]]

    indices = []
    sizes = []
    pos_class = []
    for idx_train_val, idx_test in splitter.split(data):
        # The inner split returns positions *within* idx_train_val, so map them
        # back to positions in the full DataFrame.
        train, val = next(inner_splitter.split(idx_train_val))
        folds = (idx_train_val[train], idx_train_val[val], idx_test)

        indices.append(folds)
        sizes.append(tuple(len(idx) for idx in folds))
        # Splitters yield positions, hence .iloc; axis=0 is spelled out because
        # reducing a DataFrame with axis=None (what np.sum does) is deprecated.
        pos_class.append(
            tuple(labels.iloc[idx].sum(axis=0).to_numpy() for idx in folds)
        )

    return indices, sizes, pos_class

In [None]:
# Outer splitter: 9 shuffled repeats with test_size=0.1.
splitter = ShuffleSplit(
    n_splits=9,
    test_size=0.1,
    random_state=10,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = ShuffleSplit(
    n_splits=1,
    test_size=0.1 / 0.9,
    random_state=np.random.RandomState(11),
)

indices, sizes, pos_class = split_0d(splitter=splitter, inner_splitter=inner_splitter)

print(sizes)

In [None]:
# Record the 0D split labelled train_size=80 through write_indices_and_stats.
write_indices_and_stats(
    indices,
    sizes,
    pos_class,
    data_name=data_name,
    total_size=len(df),
    train_size=80,
    split_dimension=0,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 shuffled repeats with test_size=0.3.
splitter = ShuffleSplit(
    n_splits=9,
    test_size=0.3,
    random_state=12,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = ShuffleSplit(
    n_splits=1,
    test_size=0.3 / 0.7,
    random_state=np.random.RandomState(13),
)

indices, sizes, pos_class = split_0d(splitter=splitter, inner_splitter=inner_splitter)

print(sizes)

In [None]:
# Record the 0D split labelled train_size=40 through write_indices_and_stats.
write_indices_and_stats(
    indices,
    sizes,
    pos_class,
    data_name=data_name,
    total_size=len(df),
    train_size=40,
    split_dimension=0,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 shuffled repeats with test_size=0.4.
splitter = ShuffleSplit(
    n_splits=9,
    test_size=0.4,
    random_state=14,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = ShuffleSplit(
    n_splits=1,
    test_size=0.4 / 0.6,
    random_state=np.random.RandomState(15),
)

indices, sizes, pos_class = split_0d(splitter=splitter, inner_splitter=inner_splitter)

print(sizes)

In [None]:
# Record the 0D split labelled train_size=20 through write_indices_and_stats.
write_indices_and_stats(
    indices,
    sizes,
    pos_class,
    data_name=data_name,
    total_size=len(df),
    train_size=20,
    split_dimension=0,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 shuffled repeats with test_size=0.45.
splitter = ShuffleSplit(
    n_splits=9,
    test_size=0.45,
    random_state=16,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = ShuffleSplit(
    n_splits=1,
    test_size=0.45 / 0.55,
    random_state=np.random.RandomState(17),
)

indices, sizes, pos_class = split_0d(splitter=splitter, inner_splitter=inner_splitter)

print(sizes)

In [None]:
# Record the 0D split labelled train_size=10 through write_indices_and_stats.
write_indices_and_stats(
    indices,
    sizes,
    pos_class,
    data_name=data_name,
    total_size=len(df),
    train_size=10,
    split_dimension=0,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 shuffled repeats with test_size=0.45.
splitter = ShuffleSplit(
    n_splits=9,
    test_size=0.45,
    random_state=18,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = ShuffleSplit(
    n_splits=1,
    test_size=0.5 / 0.55,
    random_state=np.random.RandomState(19),
)

indices, sizes, pos_class = split_0d(splitter=splitter, inner_splitter=inner_splitter)

print(sizes)

In [None]:
# Record the 0D split labelled train_size=5 through write_indices_and_stats.
write_indices_and_stats(
    indices,
    sizes,
    pos_class,
    data_name=data_name,
    total_size=len(df),
    train_size=5,
    split_dimension=0,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 shuffled repeats with test_size=0.475.
splitter = ShuffleSplit(
    n_splits=9,
    test_size=0.475,
    random_state=20,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = ShuffleSplit(
    n_splits=1,
    test_size=0.5 / 0.525,
    random_state=np.random.RandomState(21),
)

indices, sizes, pos_class = split_0d(splitter=splitter, inner_splitter=inner_splitter)

print(sizes)

In [None]:
# Record the 0D split labelled train_size=2.5 through write_indices_and_stats.
write_indices_and_stats(
    indices,
    sizes,
    pos_class,
    data_name=data_name,
    total_size=len(df),
    train_size=2.5,
    split_dimension=0,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 shuffled repeats with test_size=0.4875.
splitter = ShuffleSplit(
    n_splits=9,
    test_size=0.4875,
    random_state=22,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = ShuffleSplit(
    n_splits=1,
    test_size=0.5 / 0.5125,
    random_state=np.random.RandomState(23),
)

indices, sizes, pos_class = split_0d(splitter=splitter, inner_splitter=inner_splitter)

print(sizes)

In [None]:
# Record the 0D split labelled train_size=1.25 through write_indices_and_stats.
write_indices_and_stats(
    indices,
    sizes,
    pos_class,
    data_name=data_name,
    total_size=len(df),
    train_size=1.25,
    split_dimension=0,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 shuffled repeats with test_size=0.49375.
splitter = ShuffleSplit(
    n_splits=9,
    test_size=0.49375,
    random_state=24,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = ShuffleSplit(
    n_splits=1,
    test_size=0.5 / 0.50625,
    random_state=np.random.RandomState(25),
)

indices, sizes, pos_class = split_0d(splitter=splitter, inner_splitter=inner_splitter)

print(sizes)

In [None]:
# Record the 0D split labelled train_size=0.625 through write_indices_and_stats.
write_indices_and_stats(
    indices,
    sizes,
    pos_class,
    data_name=data_name,
    total_size=len(df),
    train_size=0.625,
    split_dimension=0,
    save_indices=True,
)

## 1D split

In [None]:
def split_1d(splitter, inner_splitter, data=None):
    """Create 1D group splits, using the ``I_long`` column as grouping key.

    Each outer fold yielded by ``splitter`` is split once more with
    ``inner_splitter`` (again grouped by ``I_long``) to separate a validation
    set from the training set.

    Args:
        splitter: Object whose ``split(X, groups=...)`` yields
            ``(train_val, test)`` arrays of positional row indices
            (e.g. ``GroupShuffleSplit``).
        inner_splitter: Object with the same interface; only its first split
            is used for every outer fold.
        data: DataFrame to split; must contain the columns ``I_long``,
            ``M_long``, ``T_long``, ``binary_A``, ``binary_B`` and
            ``binary_C``. Defaults to the notebook-level ``df``.

    Returns:
        Tuple ``(indices, sizes, pos_class, unique_initiators,
        unique_monomers, unique_terminators)`` of lists with one entry per
        outer fold. Every entry is a ``(train, val, test)`` tuple holding the
        positional row indices, the number of rows, the per-column sums of the
        ``binary_*`` columns, and the number of distinct ``I_long`` /
        ``M_long`` / ``T_long`` values, respectively.
    """
    if data is None:
        data = df  # fall back to the DataFrame loaded at the top of the notebook
    labels = data[["binary_A", "binary_B", "binary_C"]]
    initiators = data["I_long"]

    indices = []
    sizes = []
    pos_class = []
    unique_initiators = []
    unique_monomers = []
    unique_terminators = []
    for idx_train_val, idx_test in splitter.split(list(range(len(data))), groups=initiators):
        # The inner split returns positions *within* idx_train_val, so map them
        # back to positions in the full DataFrame.
        train, val = next(
            inner_splitter.split(idx_train_val, groups=initiators.iloc[idx_train_val])
        )
        folds = (idx_train_val[train], idx_train_val[val], idx_test)

        indices.append(folds)
        sizes.append(tuple(len(idx) for idx in folds))
        # Splitters yield positions, hence .iloc; axis=0 is spelled out because
        # reducing a DataFrame with axis=None (what np.sum does) is deprecated.
        pos_class.append(
            tuple(labels.iloc[idx].sum(axis=0).to_numpy() for idx in folds)
        )
        # dropna=False keeps a missing value counted as one distinct entry,
        # exactly like len(drop_duplicates()) does.
        unique_initiators.append(
            tuple(initiators.iloc[idx].nunique(dropna=False) for idx in folds)
        )
        unique_monomers.append(
            tuple(data["M_long"].iloc[idx].nunique(dropna=False) for idx in folds)
        )
        unique_terminators.append(
            tuple(data["T_long"].iloc[idx].nunique(dropna=False) for idx in folds)
        )

    return indices, sizes, pos_class, unique_initiators, unique_monomers, unique_terminators


In [None]:
# Outer splitter: 9 repeats with test_size=0.1.
splitter = GroupShuffleSplit(
    n_splits=9,
    test_size=0.1,
    random_state=26,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplit(
    n_splits=1,
    test_size=0.1 / 0.9,
    random_state=np.random.RandomState(27),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_1d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 1D split labelled train_size=80 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=1, train_size=80,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 repeats with test_size=0.3.
splitter = GroupShuffleSplit(
    n_splits=9,
    test_size=0.3,
    random_state=28,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplit(
    n_splits=1,
    test_size=0.3 / 0.7,
    random_state=np.random.RandomState(29),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_1d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 1D split labelled train_size=40 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=1, train_size=40,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 repeats with test_size=0.4.
splitter = GroupShuffleSplit(
    n_splits=9,
    test_size=0.4,
    random_state=30,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplit(
    n_splits=1,
    test_size=0.4 / 0.6,
    random_state=np.random.RandomState(31),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_1d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 1D split labelled train_size=20 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=1, train_size=20,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 repeats with test_size=0.45.
splitter = GroupShuffleSplit(
    n_splits=9,
    test_size=0.45,
    random_state=32,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplit(
    n_splits=1,
    test_size=0.45 / 0.55,
    random_state=np.random.RandomState(33),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_1d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 1D split labelled train_size=10 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=1, train_size=10,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 repeats with test_size=0.45.
splitter = GroupShuffleSplit(
    n_splits=9,
    test_size=0.45,
    random_state=34,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplit(
    n_splits=1,
    test_size=0.5 / 0.55,
    random_state=np.random.RandomState(35),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_1d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 1D split labelled train_size=5 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=1, train_size=5,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 repeats with test_size=0.475.
splitter = GroupShuffleSplit(
    n_splits=9,
    test_size=0.475,
    random_state=36,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplit(
    n_splits=1,
    test_size=0.5 / 0.525,
    random_state=np.random.RandomState(37),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_1d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 1D split labelled train_size=2.5 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=1, train_size=2.5,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

## 2D split

In [None]:
def split_2d(splitter, inner_splitter, data=None):
    """Create 2D group splits, grouped by ``I_long`` and ``M_long_dia``.

    Each outer fold yielded by ``splitter`` is split once more with
    ``inner_splitter`` (same grouping columns) to separate a validation set
    from the training set.

    Args:
        splitter: Object whose ``split(X, groups=...)`` yields
            ``(train_val, test)`` arrays of positional row indices
            (e.g. ``GroupShuffleSplitND``).
        inner_splitter: Object with the same interface; only its first split
            is used for every outer fold.
        data: DataFrame to split; must contain the columns ``I_long``,
            ``M_long_dia``, ``T_long``, ``binary_A``, ``binary_B`` and
            ``binary_C``. Defaults to the notebook-level ``df``.

    Returns:
        Tuple ``(indices, sizes, pos_class, unique_initiators,
        unique_monomers, unique_terminators)`` of lists with one entry per
        outer fold. Every entry is a ``(train, val, test)`` tuple holding the
        positional row indices, the number of rows, the per-column sums of the
        ``binary_*`` columns, and the number of distinct ``I_long`` /
        ``M_long_dia`` / ``T_long`` values, respectively.
    """
    if data is None:
        data = df  # fall back to the DataFrame loaded at the top of the notebook
    groups = data[["I_long", "M_long_dia"]]
    labels = data[["binary_A", "binary_B", "binary_C"]]

    indices = []
    sizes = []
    pos_class = []
    unique_initiators = []
    unique_monomers = []
    unique_terminators = []
    for idx_train_val, idx_test in splitter.split(data, groups=groups):
        # The inner split returns positions *within* idx_train_val, so map them
        # back to positions in the full DataFrame.
        train, val = next(
            inner_splitter.split(data.iloc[idx_train_val], groups=groups.iloc[idx_train_val])
        )
        folds = (idx_train_val[train], idx_train_val[val], idx_test)

        indices.append(folds)
        sizes.append(tuple(len(idx) for idx in folds))
        # Splitters yield positions, hence .iloc; axis=0 is spelled out because
        # reducing a DataFrame with axis=None (what np.sum does) is deprecated.
        pos_class.append(
            tuple(labels.iloc[idx].sum(axis=0).to_numpy() for idx in folds)
        )
        # dropna=False keeps a missing value counted as one distinct entry,
        # exactly like len(drop_duplicates()) does.
        unique_initiators.append(
            tuple(data["I_long"].iloc[idx].nunique(dropna=False) for idx in folds)
        )
        # Monomers are counted on the grouping key M_long_dia for all three
        # sets, so the train/val/test numbers are comparable.
        unique_monomers.append(
            tuple(data["M_long_dia"].iloc[idx].nunique(dropna=False) for idx in folds)
        )
        unique_terminators.append(
            tuple(data["T_long"].iloc[idx].nunique(dropna=False) for idx in folds)
        )

    return indices, sizes, pos_class, unique_initiators, unique_monomers, unique_terminators


In [None]:
# Outer splitter: 9 repeats with test_size=0.1.
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.1,
    random_state=38,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    test_size=0.1 / 0.9,
    random_state=np.random.RandomState(39),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_2d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 2D split labelled train_size=80 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=2, train_size=80,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 repeats with test_size=0.2.
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.2,
    random_state=40,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    test_size=0.2 / 0.8,
    random_state=np.random.RandomState(41),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_2d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 2D split labelled train_size=60 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=2, train_size=60,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 repeats with test_size=0.3.
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.3,
    random_state=42,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    test_size=0.3 / 0.7,
    random_state=np.random.RandomState(43),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_2d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 2D split labelled train_size=40 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=2, train_size=40,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 repeats with test_size=0.35.
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.35,
    random_state=44,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    test_size=0.35 / 0.65,
    random_state=np.random.RandomState(45),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_2d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 2D split labelled train_size=30 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=2, train_size=30,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 repeats with test_size=0.4.
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.4,
    random_state=46,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    test_size=0.4 / 0.6,
    random_state=np.random.RandomState(47),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_2d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 2D split labelled train_size=20 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=2, train_size=20,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 repeats with test_size=0.4.
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.4,
    random_state=48,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    test_size=0.45 / 0.6,
    random_state=np.random.RandomState(49),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_2d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 2D split labelled train_size=15 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=2, train_size=15,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 repeats with test_size=0.45.
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.45,
    random_state=50,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    test_size=0.45 / 0.55,
    random_state=np.random.RandomState(51),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_2d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 2D split labelled train_size=10 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=2, train_size=10,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 repeats with test_size=0.45.
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.45,
    random_state=52,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    test_size=0.475 / 0.55,
    random_state=np.random.RandomState(53),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_2d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 2D split labelled train_size=7.5 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=2, train_size=7.5,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 repeats with test_size=0.45.
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.45,
    random_state=54,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    test_size=0.5 / 0.55,
    random_state=np.random.RandomState(55),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_2d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 2D split labelled train_size=5 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=2, train_size=5,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 repeats with test_size=0.48.
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.48,
    random_state=56,
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
# NOTE(review): unlike the other configurations, this one passes an integer
# train_size=2 instead of a test_size fraction -- confirm this is intended.
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    train_size=2,
    random_state=np.random.RandomState(57),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_2d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 2D split labelled train_size=2 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=2, train_size=2,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

## 3D split

In [None]:
def split_3d(splitter, inner_splitter, data=None):
    """Create 3D group splits, grouped by ``I_long``, ``M_long_dia``, ``T_long``.

    Each outer fold yielded by ``splitter`` is split once more with
    ``inner_splitter`` (same grouping columns) to separate a validation set
    from the training set.

    Args:
        splitter: Object whose ``split(X, groups=...)`` yields
            ``(train_val, test)`` arrays of positional row indices
            (e.g. ``GroupShuffleSplitND``).
        inner_splitter: Object with the same interface; only its first split
            is used for every outer fold.
        data: DataFrame to split; must contain the columns ``I_long``,
            ``M_long_dia``, ``T_long``, ``binary_A``, ``binary_B`` and
            ``binary_C``. Defaults to the notebook-level ``df``.

    Returns:
        Tuple ``(indices, sizes, pos_class, unique_initiators,
        unique_monomers, unique_terminators)`` of lists with one entry per
        outer fold. Every entry is a ``(train, val, test)`` tuple holding the
        positional row indices, the number of rows, the per-column sums of the
        ``binary_*`` columns, and the number of distinct ``I_long`` /
        ``M_long_dia`` / ``T_long`` values, respectively.
    """
    if data is None:
        data = df  # fall back to the DataFrame loaded at the top of the notebook
    groups = data[["I_long", "M_long_dia", "T_long"]]
    labels = data[["binary_A", "binary_B", "binary_C"]]

    indices = []
    sizes = []
    pos_class = []
    unique_initiators = []
    unique_monomers = []
    unique_terminators = []
    for idx_train_val, idx_test in splitter.split(data, groups=groups):
        # The inner split returns positions *within* idx_train_val, so map them
        # back to positions in the full DataFrame.
        train, val = next(
            inner_splitter.split(data.iloc[idx_train_val], groups=groups.iloc[idx_train_val])
        )
        folds = (idx_train_val[train], idx_train_val[val], idx_test)

        indices.append(folds)
        sizes.append(tuple(len(idx) for idx in folds))
        # Splitters yield positions, hence .iloc; axis=0 is spelled out because
        # reducing a DataFrame with axis=None (what np.sum does) is deprecated.
        pos_class.append(
            tuple(labels.iloc[idx].sum(axis=0).to_numpy() for idx in folds)
        )
        # dropna=False keeps a missing value counted as one distinct entry,
        # exactly like len(drop_duplicates()) does.
        unique_initiators.append(
            tuple(data["I_long"].iloc[idx].nunique(dropna=False) for idx in folds)
        )
        # Monomers are counted on the grouping key M_long_dia for all three
        # sets, so the train/val/test numbers are comparable.
        unique_monomers.append(
            tuple(data["M_long_dia"].iloc[idx].nunique(dropna=False) for idx in folds)
        )
        unique_terminators.append(
            tuple(data["T_long"].iloc[idx].nunique(dropna=False) for idx in folds)
        )

    return indices, sizes, pos_class, unique_initiators, unique_monomers, unique_terminators


In [None]:
# Outer splitter: 9 repeats with test_size=0.1. It is reused as well, hence a
# RandomState instance here too.
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.1,
    random_state=np.random.RandomState(358),
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    test_size=0.1 / 0.9,
    random_state=np.random.RandomState(359),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_3d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 3D split labelled train_size=80 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=3, train_size=80,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 repeats with test_size=0.15. It is reused as well, hence a
# RandomState instance here too.
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.15,
    random_state=np.random.RandomState(60),
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    test_size=0.15 / 0.85,
    random_state=np.random.RandomState(61),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_3d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 3D split labelled train_size=70 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=3, train_size=70,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 repeats with test_size=0.2. It is reused as well, hence a
# RandomState instance here too.
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.2,
    random_state=np.random.RandomState(62),
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    test_size=0.2 / 0.8,
    random_state=np.random.RandomState(63),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_3d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 3D split labelled train_size=60 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=3, train_size=60,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 repeats with test_size=0.25. It is reused as well, hence a
# RandomState instance here too.
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.25,
    random_state=np.random.RandomState(64),
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    test_size=0.25 / 0.75,
    random_state=np.random.RandomState(65),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_3d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 3D split labelled train_size=50 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=3, train_size=50,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 repeats with test_size=0.3. It is reused as well, hence a
# RandomState instance here too.
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.3,
    random_state=np.random.RandomState(66),
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    test_size=0.3 / 0.7,
    random_state=np.random.RandomState(67),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_3d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 3D split labelled train_size=40 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=3, train_size=40,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 repeats with test_size=0.33. It is reused as well, hence a
# RandomState instance here too.
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.33,
    random_state=np.random.RandomState(68),
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    test_size=0.33 / 0.67,
    random_state=np.random.RandomState(69),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_3d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 3D split labelled train_size=34 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=3, train_size=34,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 repeats with test_size=0.35. It is reused as well, hence a
# RandomState instance here too.
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.35,
    random_state=np.random.RandomState(70),
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    test_size=0.35 / 0.65,
    random_state=np.random.RandomState(71),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_3d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 3D split labelled train_size=30 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=3, train_size=30,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: 9 repeats with test_size=0.35. It is reused as well, hence a
# RandomState instance here too.
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.35,
    random_state=np.random.RandomState(72),
)
# The inner splitter is reused for every outer fold, so it is seeded with a
# RandomState instance rather than an int.
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    test_size=0.40 / 0.65,
    random_state=np.random.RandomState(73),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_3d(splitter, inner_splitter)

print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Record the 3D split labelled train_size=25 through write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    split_dimension=3, train_size=25,
    total_size=len(df), data_name=data_name,
    n_initiators=unique_initiators, n_monomers=unique_monomers, n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: test_size=0.40. Inner splitter: test_size=0.40/0.60 of the train+val part,
# i.e. nominally 40 % of the total for validation, which leaves a nominal 20 % for training.
# Both splitters receive a RandomState instance, not an int, because both are reused
# several times (here the outer splitter as well).
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.40,
    random_state=np.random.RandomState(74),
)
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    test_size=0.40 / 0.60,
    random_state=np.random.RandomState(75),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_3d(splitter, inner_splitter)

# one statistic per output line
print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Hand the current 3D split (labelled train_size=20) to write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    # identification of the split
    data_name=data_name,
    total_size=len(df),
    split_dimension=3,
    train_size=20,
    # per-split building block counts
    n_initiators=unique_initiators,
    n_monomers=unique_monomers,
    n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: test_size=0.40. Inner splitter: test_size=0.45/0.60 of the train+val part,
# i.e. nominally 45 % of the total for validation, which leaves a nominal 15 % for training.
# Both splitters receive a RandomState instance, not an int, because both are reused
# several times (here the outer splitter as well).
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.40,
    random_state=np.random.RandomState(76),
)
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    test_size=0.45 / 0.60,
    random_state=np.random.RandomState(77),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_3d(splitter, inner_splitter)

# one statistic per output line
print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Hand the current 3D split (labelled train_size=15) to write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    # identification of the split
    data_name=data_name,
    total_size=len(df),
    split_dimension=3,
    train_size=15,
    # per-split building block counts
    n_initiators=unique_initiators,
    n_monomers=unique_monomers,
    n_terminators=unique_terminators,
    save_indices=True,
)

In [None]:
# Outer splitter: test_size=0.45. Inner splitter: test_size=0.45/0.55 of the train+val part,
# i.e. nominally 45 % of the total for validation, which leaves a nominal 10 % for training.
# Both splitters receive a RandomState instance, not an int, because both are reused
# several times (here the outer splitter as well).
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.45,
    random_state=np.random.RandomState(778),
)
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    test_size=0.45 / 0.55,
    random_state=np.random.RandomState(779),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_3d(splitter, inner_splitter)

# one statistic per output line
print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Hand the current 3D split (labelled train_size=10) to write_indices_and_stats.
write_indices_and_stats(
    indices, sizes, pos_class,
    # identification of the split
    data_name=data_name,
    total_size=len(df),
    split_dimension=3,
    train_size=10,
    # per-split building block counts
    n_initiators=unique_initiators,
    n_monomers=unique_monomers,
    n_terminators=unique_terminators,
    save_indices=True,
)

## Synthetic data splits

### Note: we skip this for the `2024-04-18` data, because synthetic data was ineffective before and we have no reason to believe this will change

How do we go about this?

For synthetic data, we want to evaluate on real data only. The validation and test sets may therefore contain only real data, while the training set combines real and synthetic data.

To do this, we split the combined data as usual, but remove all synthetic records from the validation and test sets before saving them. Real records are identified by their indices, which we saved to `synferm_dataset_2024-01-31_195037records_synthetic_real-indices.txt` when preparing the synthetic data.

In [None]:
# Read the synthetically amended data set and cast the three binary label columns to int.
data_filename = "synferm_dataset_2024-01-31_195037records_synthetic.csv"
# data_name is the file name up to (excluding) its last underscore
data_name = data_filename.rpartition("_")[0]
df = (
    pd.read_csv(DATA_DIR / "curated_data" / data_filename)
    .astype({label: int for label in ("binary_A", "binary_B", "binary_C")})
)
df.shape

In [None]:
# M_long_dia will be to sort diastereomers into the same group on group shuffle splits
# Each pair below names two diastereomeric monomers. In M_long_dia, the first label of a pair
# is replaced by the second, so both share one label and land in the same group.
diastereomers = dict(
    (
        ("Mon001", "Mon087"),
        ("Mon003", "Mon078"),
        ("Mon011", "Mon088"),
        ("Mon013", "Mon074"),
        ("Mon014", "Mon090"),
        ("Mon015", "Mon076"),
        ("Mon016", "Mon096"),
        ("Mon017", "Mon075"),
        ("Mon019", "Mon091"),
        ("Mon020", "Mon077"),
        ("Mon080", "Mon010"),
    )
)
df["M_long_dia"] = df["M_long"].replace(diastereomers)

In [None]:
# Preview the first five rows (including the new M_long_dia column).
df.head(n=5)

In [None]:
# load indices that tell which data points are real
# The real-indices file holds one integer per line.
with open(
    DATA_DIR / "curated_data" / "synferm_dataset_2024-01-31_195037records_synthetic_real-indices.txt",
    "r",
    encoding="utf-8",
) as f:
    # Skip blank lines (e.g. a trailing newline at the end of the file) instead of crashing on
    # int(""). dtype=int keeps the array usable as an index array even if the file is empty.
    real_idx = np.array([int(line) for line in f if line.strip()], dtype=int)
real_idx.shape

### 1D synthetic split

In [None]:
def split_1d_synthetic(splitter, inner_splitter):
    """Split the synthetic data set into train/val/test sets, grouped by initiator.

    The outer ``splitter`` separates test from train+val. For every outer split, the first
    split of ``inner_splitter`` divides train+val into train and val. Both splits use the
    ``I_long`` column of the global ``df`` as groups. Afterwards, validation and test indices
    are restricted to the entries of the global ``real_idx``, so that synthetic records only
    remain in the training set.

    Args:
        splitter: Object with a ``split(X, groups=...)`` method that yields
            ``(idx_train_val, idx_test)`` pairs of positional index arrays.
        inner_splitter: Object with the same interface. Only its first split is used for
            each outer split.

    Returns:
        Tuple ``(indices, sizes, pos_class, unique_initiators, unique_monomers,
        unique_terminators)``. Each element is a list with one ``(train, val, test)`` tuple per
        outer split, holding the index arrays, the set sizes, the column-wise sums of
        ``binary_A``/``binary_B``/``binary_C``, and the numbers of distinct ``I_long`` /
        ``M_long`` / ``T_long`` values, respectively.
    """
    label_columns = ["binary_A", "binary_B", "binary_C"]

    def _positives(idx):
        # Column-wise sums. axis=0 is explicit because np.sum(DataFrame) relies on the
        # deprecated axis=None behavior of DataFrame.sum. The splitters yield positions,
        # hence .iloc rather than .loc.
        return df[label_columns].iloc[idx].sum(axis=0).to_numpy()

    def _n_unique(column, subsets):
        # dropna=False counts NaN as a value, like len(Series.drop_duplicates()) does
        return tuple(df[column].iloc[idx].nunique(dropna=False) for idx in subsets)

    indices = []
    sizes = []
    pos_class = []
    unique_initiators = []
    unique_monomers = []
    unique_terminators = []
    for idx_train_val, idx_test in splitter.split(list(range(len(df))), groups=df["I_long"]):
        # inner split
        train, val = next(inner_splitter.split(idx_train_val, groups=df["I_long"].iloc[idx_train_val]))
        # the inner split returns positions within idx_train_val; map them back to positions in df
        idx_train = idx_train_val[train]
        # Eliminate all val and test indices that do not refer to real data.
        # n.b. a side effect of np.intersect1d is that the indices are sorted, but that does not
        # matter as shuffling is controlled in the dataloader
        idx_val = np.intersect1d(idx_train_val[val], real_idx)
        idx_test = np.intersect1d(idx_test, real_idx)
        subsets = (idx_train, idx_val, idx_test)

        indices.append(subsets)
        sizes.append(tuple(len(idx) for idx in subsets))
        pos_class.append(tuple(_positives(idx) for idx in subsets))
        unique_initiators.append(_n_unique("I_long", subsets))
        unique_monomers.append(_n_unique("M_long", subsets))
        unique_terminators.append(_n_unique("T_long", subsets))

    return indices, sizes, pos_class, unique_initiators, unique_monomers, unique_terminators


In [None]:
# Outer splitter: test_size=0.1 with an int seed. Inner splitter: test_size=0.1/0.9 of the
# train+val part, i.e. nominally 10 % of the total for validation and 80 % for training.
splitter = GroupShuffleSplit(
    n_splits=9,
    test_size=0.1,
    random_state=42,
)
# the inner splitter gets a RandomState instance, not an int, because it is reused several times
inner_splitter = GroupShuffleSplit(
    n_splits=1,
    test_size=0.1 / 0.9,
    random_state=np.random.RandomState(42),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_1d_synthetic(splitter, inner_splitter)

# sizes and pos_class are each followed by an empty line; the unique counts come one per line
print(sizes, "", pos_class, "", unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Hand the 1D synthetic split (labelled train_size=80) to write_indices_and_stats.
# The data name is given literally here instead of using the data_name variable.
write_indices_and_stats(
    indices, sizes, pos_class,
    # identification of the split
    data_name="synferm_dataset_2024-01-31_synthetic",
    total_size=len(df),
    split_dimension=1,
    train_size=80,
    # per-split building block counts
    n_initiators=unique_initiators,
    n_monomers=unique_monomers,
    n_terminators=unique_terminators,
    save_indices=True,
)

### 2D synthetic split

In [None]:
def split_2d_synthetic(splitter, inner_splitter):
    """Split the synthetic data set into train/val/test sets, grouped by initiator and monomer.

    The outer ``splitter`` separates test from train+val. For every outer split, the first
    split of ``inner_splitter`` divides train+val into train and val. Both splits use the
    ``I_long`` and ``M_long_dia`` columns of the global ``df`` as groups. Afterwards, validation
    and test indices are restricted to the entries of the global ``real_idx``, so that
    synthetic records only remain in the training set.

    Args:
        splitter: Object with a ``split(X, groups=...)`` method that yields
            ``(idx_train_val, idx_test)`` pairs of positional index arrays.
        inner_splitter: Object with the same interface. Only its first split is used for
            each outer split.

    Returns:
        Tuple ``(indices, sizes, pos_class, unique_initiators, unique_monomers,
        unique_terminators)``. Each element is a list with one ``(train, val, test)`` tuple per
        outer split, holding the index arrays, the set sizes, the column-wise sums of
        ``binary_A``/``binary_B``/``binary_C``, and the numbers of distinct ``I_long`` /
        ``M_long_dia`` / ``T_long`` values, respectively.
    """
    label_columns = ["binary_A", "binary_B", "binary_C"]
    group_columns = ["I_long", "M_long_dia"]

    def _positives(idx):
        # Column-wise sums. axis=0 is explicit because np.sum(DataFrame) relies on the
        # deprecated axis=None behavior of DataFrame.sum. The splitters yield positions,
        # hence .iloc rather than .loc.
        return df[label_columns].iloc[idx].sum(axis=0).to_numpy()

    def _n_unique(column, subsets):
        # dropna=False counts NaN as a value, like len(Series.drop_duplicates()) does
        return tuple(df[column].iloc[idx].nunique(dropna=False) for idx in subsets)

    indices = []
    sizes = []
    pos_class = []
    unique_initiators = []
    unique_monomers = []
    unique_terminators = []
    for idx_train_val, idx_test in splitter.split(df, groups=df[group_columns]):
        # inner split
        train, val = next(
            inner_splitter.split(df.iloc[idx_train_val], groups=df[group_columns].iloc[idx_train_val])
        )
        # the inner split returns positions within idx_train_val; map them back to positions in df
        idx_train = idx_train_val[train]
        # Eliminate all val and test indices that do not refer to real data.
        # n.b. a side effect of np.intersect1d is that the indices are sorted, but that does not
        # matter as shuffling is controlled in the dataloader
        idx_val = np.intersect1d(idx_train_val[val], real_idx)
        idx_test = np.intersect1d(idx_test, real_idx)
        subsets = (idx_train, idx_val, idx_test)

        indices.append(subsets)
        sizes.append(tuple(len(idx) for idx in subsets))
        pos_class.append(tuple(_positives(idx) for idx in subsets))
        unique_initiators.append(_n_unique("I_long", subsets))
        # Monomers are counted on the grouping column M_long_dia for all three sets; the test
        # set was previously counted on M_long, which is not comparable to train/val.
        unique_monomers.append(_n_unique("M_long_dia", subsets))
        unique_terminators.append(_n_unique("T_long", subsets))

    return indices, sizes, pos_class, unique_initiators, unique_monomers, unique_terminators


In [None]:
# Outer splitter: test_size=0.2 with an int seed. Inner splitter: test_size=0.2/0.8 of the
# train+val part, i.e. nominally 20 % of the total for validation and 60 % for training.
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.2,
    random_state=42,
)
# the inner splitter gets a RandomState instance, not an int, because it is reused several times
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    test_size=0.2 / 0.8,
    random_state=np.random.RandomState(4),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_2d_synthetic(splitter, inner_splitter)

# one statistic per output line
print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Hand the 2D synthetic split (labelled train_size=60) to write_indices_and_stats.
# The data name is given literally here instead of using the data_name variable.
write_indices_and_stats(
    indices, sizes, pos_class,
    # identification of the split
    data_name="synferm_dataset_2024-01-31_synthetic",
    total_size=len(df),
    split_dimension=2,
    train_size=60,
    # per-split building block counts
    n_initiators=unique_initiators,
    n_monomers=unique_monomers,
    n_terminators=unique_terminators,
    save_indices=True,
)

### 3D synthetic split

In [None]:
def split_3d_synthetic(splitter, inner_splitter):
    """Split the synthetic data set into train/val/test sets, grouped by all three building blocks.

    The outer ``splitter`` separates test from train+val. For every outer split, the first
    split of ``inner_splitter`` divides train+val into train and val. Both splits use the
    ``I_long``, ``M_long_dia`` and ``T_long`` columns of the global ``df`` as groups.
    Afterwards, validation and test indices are restricted to the entries of the global
    ``real_idx``, so that synthetic records only remain in the training set.

    Args:
        splitter: Object with a ``split(X, groups=...)`` method that yields
            ``(idx_train_val, idx_test)`` pairs of positional index arrays.
        inner_splitter: Object with the same interface. Only its first split is used for
            each outer split.

    Returns:
        Tuple ``(indices, sizes, pos_class, unique_initiators, unique_monomers,
        unique_terminators)``. Each element is a list with one ``(train, val, test)`` tuple per
        outer split, holding the index arrays, the set sizes, the column-wise sums of
        ``binary_A``/``binary_B``/``binary_C``, and the numbers of distinct ``I_long`` /
        ``M_long_dia`` / ``T_long`` values, respectively.
    """
    label_columns = ["binary_A", "binary_B", "binary_C"]
    group_columns = ["I_long", "M_long_dia", "T_long"]

    def _positives(idx):
        # Column-wise sums. axis=0 is explicit because np.sum(DataFrame) relies on the
        # deprecated axis=None behavior of DataFrame.sum. The splitters yield positions,
        # hence .iloc rather than .loc.
        return df[label_columns].iloc[idx].sum(axis=0).to_numpy()

    def _n_unique(column, subsets):
        # dropna=False counts NaN as a value, like len(Series.drop_duplicates()) does
        return tuple(df[column].iloc[idx].nunique(dropna=False) for idx in subsets)

    indices = []
    sizes = []
    pos_class = []
    unique_initiators = []
    unique_monomers = []
    unique_terminators = []
    for idx_train_val, idx_test in splitter.split(df, groups=df[group_columns]):
        # inner split
        train, val = next(
            inner_splitter.split(df.iloc[idx_train_val], groups=df[group_columns].iloc[idx_train_val])
        )
        # the inner split returns positions within idx_train_val; map them back to positions in df
        idx_train = idx_train_val[train]
        # Eliminate all val and test indices that do not refer to real data.
        # n.b. a side effect of np.intersect1d is that the indices are sorted, but that does not
        # matter as shuffling is controlled in the dataloader
        idx_val = np.intersect1d(idx_train_val[val], real_idx)
        idx_test = np.intersect1d(idx_test, real_idx)
        subsets = (idx_train, idx_val, idx_test)

        indices.append(subsets)
        sizes.append(tuple(len(idx) for idx in subsets))
        pos_class.append(tuple(_positives(idx) for idx in subsets))
        unique_initiators.append(_n_unique("I_long", subsets))
        # Monomers are counted on the grouping column M_long_dia for all three sets; the test
        # set was previously counted on M_long, which is not comparable to train/val.
        unique_monomers.append(_n_unique("M_long_dia", subsets))
        unique_terminators.append(_n_unique("T_long", subsets))

    return indices, sizes, pos_class, unique_initiators, unique_monomers, unique_terminators


In [None]:
# Outer splitter: test_size=0.2. Inner splitter: test_size=0.2/0.8 of the train+val part,
# i.e. nominally 20 % of the total for validation and 60 % for training.
# Both splitters receive a RandomState instance, not an int, because both are reused
# several times (here the outer splitter as well).
splitter = GroupShuffleSplitND(
    n_splits=9,
    test_size=0.2,
    random_state=np.random.RandomState(42),
)
inner_splitter = GroupShuffleSplitND(
    n_splits=1,
    test_size=0.2 / 0.8,
    random_state=np.random.RandomState(42),
)

(
    indices,
    sizes,
    pos_class,
    unique_initiators,
    unique_monomers,
    unique_terminators,
) = split_3d_synthetic(splitter, inner_splitter)

# one statistic per output line
print(sizes, pos_class, unique_initiators, unique_monomers, unique_terminators, sep="\n")

In [None]:
# Hand the 3D synthetic split (labelled train_size=60) to write_indices_and_stats.
# The data name is given literally here instead of using the data_name variable.
write_indices_and_stats(
    indices, sizes, pos_class,
    # identification of the split
    data_name="synferm_dataset_2024-01-31_synthetic",
    total_size=len(df),
    split_dimension=3,
    train_size=60,
    # per-split building block counts
    n_initiators=unique_initiators,
    n_monomers=unique_monomers,
    n_terminators=unique_terminators,
    save_indices=True,
)