# Splitting HECKTOR Dataset into folds
I filter out problematic examples and split the training set into 5 cross-validation (CV) train/val folds.

In [1]:
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import pickle
import os

In [2]:
# Defining data paths:
# MODE selects which endpoint CSV is loaded and where the filtered labels are
# written. Supported values: "train_data" and "test_data".
MODE = "test_data"

if MODE == "test_data":
    endpoint = pd.read_csv("/home/jakub/research/HECKTOR/Data/raw_data/hecktor2022/hecktor2022_testing/hecktor2022_endpoint_testing.csv")
    labels_path = "/home/jakub/research/HECKTOR/Data/filtered_labels/test_labels"

elif MODE == "train_data":
    endpoint = pd.read_csv("/home/jakub/research/HECKTOR/Data/raw_data/hecktor2022/hecktor2022_training/hecktor2022_patient_endpoint_training.csv")
    labels_path = "/home/jakub/research/HECKTOR/Data/filtered_labels/train_labels"

else:
    # Fail fast on a mistyped MODE. Otherwise `endpoint` / `labels_path` would
    # be undefined (NameError below) or, in a live notebook session, silently
    # keep the values left over from a previous run.
    raise ValueError(
        f"Unknown MODE {MODE!r}; expected 'train_data' or 'test_data'."
    )

# Make sure the output directory exists before anything is written to it.
os.makedirs(labels_path, exist_ok=True)

## Splitting data into 5CV folds.

Some examples are corrupted. In particular, there are examples with broken files, i.e. files containing zeroed slices.
There are also files that do not include mask labels corresponding to GTVp (primary Gross Tumor Volume). Such files were omitted and are listed below:

In [3]:
# Patient IDs excluded from the training data, grouped by ID prefix.
# Matched against the 'PatientID' column when filtering the endpoint table.
problematic_train_files = [
    # CHUM
    "CHUM-016", "CHUM-029", "CHUM-065",
    # CHUP
    "CHUP-029", "CHUP-032",
    # CHUS
    "CHUS-076",
    # CHUV
    "CHUV-008",
    # MDA
    "MDA-029", "MDA-036", "MDA-048", "MDA-061",
    "MDA-091", "MDA-107", "MDA-121", "MDA-124",
    "MDA-128", "MDA-166", "MDA-169", "MDA-179",
    "MDA-180", "MDA-192", "MDA-200", "MDA-201",
]

# Patient IDs excluded from the test data, grouped by ID prefix.
problematic_test_files = [
    # MDA
    "MDA-270", "MDA-298", "MDA-308", "MDA-309",
    "MDA-310", "MDA-319", "MDA-363", "MDA-368",
    "MDA-375", "MDA-381", "MDA-382", "MDA-388",
    "MDA-392",
    # CHB
    "CHB-013", "CHB-017", "CHB-026", "CHB-040", "CHB-058",
]

def _dump_indices(filename, indices):
    """Pickle ``indices`` into ``filename`` inside the ``labels_path`` directory."""
    with open(os.path.join(labels_path, filename), 'wb') as handle:
        pickle.dump(indices, handle)


if MODE == "train_data":
    # Drop the excluded training patients and persist the remaining labels.
    keep_mask = ~endpoint['PatientID'].isin(problematic_train_files)
    endpoint_filtered = endpoint[keep_mask]
    endpoint_filtered.to_csv(os.path.join(labels_path, "train_labels_filtered.csv"))

    # One extra "fold" covering every remaining training row.
    # NOTE: all saved indices are row positions within `endpoint_filtered`.
    train_indices = list(range(len(endpoint_filtered["PatientID"])))
    _dump_indices("train_fold_all.pkl", train_indices)

    # 5-fold CV stratified on the "Relapse" column; the fixed seed keeps the
    # split reproducible between runs.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    folds = skf.split(X=endpoint_filtered["PatientID"], y=endpoint_filtered["Relapse"])

    # Fold files are numbered from 1.
    for fold_number, (train_indices, val_indices) in enumerate(folds, start=1):
        _dump_indices(f"train_fold_{fold_number}.pkl", list(train_indices))
        _dump_indices(f"val_fold_{fold_number}.pkl", list(val_indices))
else:
    # Any other mode: only filter out the excluded test patients and save
    # the remaining labels; no folds are created.
    keep_mask = ~endpoint['PatientID'].isin(problematic_test_files)
    endpoint_filtered = endpoint[keep_mask]
    endpoint_filtered.to_csv(os.path.join(labels_path, "test_labels_filtered.csv"))