## 2D data split

(adapted from canonical_data_split.ipynb)
We want a set of splits following this recipe:

- First take away the last 96 records, which are meant as an external validation set (this is the last plate ice-12-103)
- Of all remaining data, take away a 2D test set
- Of the remaining data A, take away a 2D validation set (10% of A)

The size of the 2D test set can only be chosen approximately, due to the nature of the split. We aim for a 2D test set that is of a similar size as the validation set.


In [None]:
import pathlib
import sys
import os
sys.path.append(os.path.abspath("../"))

import numpy as np
import pandas as pd

from src.train_test_split import GroupShuffleSplit2D
from src.data import SLAPData

In [None]:
# load data

# absolute path of the source CSV; the relative part is resolved against the current working directory
data_path = os.path.abspath("../data/Data S4.csv")

# SLAPData is a project class (src.data); the resulting `data` object is what the cells below index into
data = SLAPData(data_path)

# NOTE(review): presumably reads the CSV at data_path into the object — confirm in src.data
data.load_data_from_file()
# NOTE(review): presumably separates each reaction SMILES into its parts — confirm in src.data
data.split_reaction_smiles()

In [None]:
# inspect the group labels; these are what the 2D splitters below receive through `groups=`
print(data.groups)

In [None]:
# number of entries in data.all_X; only the first 763 of them are split below
len(data.all_X)

In [None]:
# Outer splitter that separates the 2D test set: 5 splits, fixed seed.
# NOTE(review): the unit of test_size is defined by GroupShuffleSplit2D in src.train_test_split — confirm there
splitter = GroupShuffleSplit2D(n_splits=5, test_size=10, random_state=42)

In [None]:
# We use only the first 763 records, as the external validation plate starts
# after that (can be checked in generate_ml_datasets.ipynb).
# Note that this is only applicable to the LCMS data set, not the isolated
# yields, which have fewer entries.
N_RECORDS = 763

# Set `save` to True to write the index files of every fold below `save_path`.
save = False
save_path = pathlib.Path("../data/dataset_splits/LCMS_split_763records_2Dsplit_v2")
if save:
    # the target directory does not depend on the fold, so create it once
    save_path.mkdir(parents=True, exist_ok=True)

# running totals over all folds, used for the summary statistics at the end
train_counter, val_counter, test_counter = 0, 0, 0
train_pos_class, val_pos_class, test_pos_class = 0, 0, 0

for i, (data_subset_A, test_2D) in enumerate(
    splitter.split(data.all_X[:N_RECORDS], groups=data.groups[:N_RECORDS])
):
    # We take a (2D) validation set out of subset A. The rest is the training set.
    # A new inner splitter with the same fixed seed is created for every fold.
    inner_splitter = GroupShuffleSplit2D(n_splits=1, test_size=11, random_state=123)
    train_idx, val_2D_idx = next(
        inner_splitter.split(data_subset_A, groups=data.groups[data_subset_A])
    )
    # It may be slightly confusing that we index an array of indices here, but
    # it is necessary, as the inner splitter returns indices for data_subset_A.
    train = data_subset_A[train_idx]
    val_2D = data_subset_A[val_2D_idx]

    # update counters
    train_counter += len(train)
    val_counter += len(val_2D)
    test_counter += len(test_2D)
    train_pos_class += np.sum(data.all_y[train])
    val_pos_class += np.sum(data.all_y[val_2D])
    test_pos_class += np.sum(data.all_y[test_2D])

    # per-fold report: size and fraction of positive labels of each subset
    print(f"Statistics for fold {i}:")
    print("ID \t\t num \t|\t %positive")
    print(f"Train: \t\t {len(train)} \t|\t {np.mean(data.all_y[train]):.0%}")
    print(f"Val_2D: \t {len(val_2D)} \t|\t {np.mean(data.all_y[val_2D]):.0%}")
    print(f"Test_2D: \t {len(test_2D)} \t|\t {np.mean(data.all_y[test_2D]):.0%}")

    # sanity check: the three subsets of a fold must not share any index
    assert np.intersect1d(val_2D, test_2D).size == 0
    assert np.intersect1d(train, test_2D).size == 0
    assert np.intersect1d(train, val_2D).size == 0

    # save the indices (one index per row, no header, no row labels)
    if save:
        pd.DataFrame(train).to_csv(save_path / f"fold{i}_train.csv", index=False, header=False)
        pd.DataFrame(val_2D).to_csv(save_path / f"fold{i}_val.csv", index=False, header=False)
        pd.DataFrame(test_2D).to_csv(save_path / f"fold{i}_test_2D.csv", index=False, header=False)

# summary statistics, pooled over all folds
n = train_counter + val_counter + test_counter
print("\nSummary statistics:")
print(f"Split sizes: {train_counter/n:.0%} train, {val_counter/n:.0%} val, {test_counter/n:.0%} test")
print(f"Class balance (positive class ratio): {train_pos_class/train_counter:.0%} train, {val_pos_class/val_counter:.0%} val, {test_pos_class/test_counter:.0%} test")


### 10-fold ShuffleSplit
Same as above, but with 10 folds

In [None]:
# Outer splitter that separates the 2D test set: 10 splits, fixed seed.
# NOTE(review): the unit of test_size is defined by GroupShuffleSplit2D in src.train_test_split — confirm there
splitter = GroupShuffleSplit2D(n_splits=10, test_size=10, random_state=42)

In [None]:
# We use only the first 763 records, as the external validation plate starts
# after that (can be checked in generate_ml_datasets.ipynb).
# Note that this is only applicable to the LCMS data set, not the isolated
# yields, which have fewer entries.
N_RECORDS = 763

# Set `save` to True to write the index files of every fold below `save_path`.
save = False
save_path = pathlib.Path("../data/dataset_splits/LCMS_split_763records_2Dsplit_10fold_v2")
if save:
    # the target directory does not depend on the fold, so create it once
    save_path.mkdir(parents=True, exist_ok=True)

# running totals over all folds, used for the summary statistics at the end
train_counter, val_counter, test_counter = 0, 0, 0
train_pos_class, val_pos_class, test_pos_class = 0, 0, 0

for i, (data_subset_A, test_2D) in enumerate(
    splitter.split(data.all_X[:N_RECORDS], groups=data.groups[:N_RECORDS])
):
    # We take a (2D) validation set out of subset A. The rest is the training set.
    # A new inner splitter with the same fixed seed is created for every fold.
    inner_splitter = GroupShuffleSplit2D(n_splits=1, test_size=11, random_state=123)
    train_idx, val_2D_idx = next(
        inner_splitter.split(data_subset_A, groups=data.groups[data_subset_A])
    )
    # It may be slightly confusing that we index an array of indices here, but
    # it is necessary, as the inner splitter returns indices for data_subset_A.
    train = data_subset_A[train_idx]
    val_2D = data_subset_A[val_2D_idx]

    # update counters
    train_counter += len(train)
    val_counter += len(val_2D)
    test_counter += len(test_2D)
    train_pos_class += np.sum(data.all_y[train])
    val_pos_class += np.sum(data.all_y[val_2D])
    test_pos_class += np.sum(data.all_y[test_2D])

    # per-fold report: size and fraction of positive labels of each subset
    print(f"Statistics for fold {i}:")
    print("ID \t\t num \t|\t %positive")
    print(f"Train: \t\t {len(train)} \t|\t {np.mean(data.all_y[train]):.0%}")
    print(f"Val_2D: \t {len(val_2D)} \t|\t {np.mean(data.all_y[val_2D]):.0%}")
    print(f"Test_2D: \t {len(test_2D)} \t|\t {np.mean(data.all_y[test_2D]):.0%}")

    # sanity check: the three subsets of a fold must not share any index
    assert np.intersect1d(val_2D, test_2D).size == 0
    assert np.intersect1d(train, test_2D).size == 0
    assert np.intersect1d(train, val_2D).size == 0

    # save the indices (one index per row, no header, no row labels)
    if save:
        pd.DataFrame(train).to_csv(save_path / f"fold{i}_train.csv", index=False, header=False)
        pd.DataFrame(val_2D).to_csv(save_path / f"fold{i}_val.csv", index=False, header=False)
        pd.DataFrame(test_2D).to_csv(save_path / f"fold{i}_test_2D.csv", index=False, header=False)

# summary statistics, pooled over all folds
n = train_counter + val_counter + test_counter
print("\nSummary statistics:")
print(f"Split sizes: {train_counter/n:.0%} train, {val_counter/n:.0%} val, {test_counter/n:.0%} test")
print(f"Class balance (positive class ratio): {train_pos_class/train_counter:.0%} train, {val_pos_class/val_counter:.0%} val, {test_pos_class/test_counter:.0%} test")
