In [None]:
import math
import os

import numpy as np
import pandas as pd

# Notebook parameters.
# Every value is only assigned when the name is not defined yet, so values that
# already exist in the namespace take precedence over the defaults below
# (presumably so a calling notebook/script can inject them - TODO confirm).
if 'precision' not in locals():
  precision = "gene" # allele or gene

if 'input_file' not in locals():
  input_file = f"../../data_10x/customDatasets/{precision}/beta_concatenated.tsv"

if 'beta_output_folder' not in locals():
  beta_output_folder = f"../../data_10x/splitted_data/{precision}/beta"

if 'validation_file_name' not in locals():
  validation_file_name = "validation.tsv"

if 'train_file_name' not in locals():
  train_file_name = "train.tsv"

if 'aimed_test_ratio' not in locals():
  aimed_test_ratio = 0.2

# The rebalancing step further down compares against `aimed_validation_ratio`.
# It was never defined, which raised a NameError on a fresh kernel. Default it
# to `aimed_test_ratio` so existing callers that only set the old name keep working.
if 'aimed_validation_ratio' not in locals():
  aimed_validation_ratio = aimed_test_ratio

# Load the tab-separated input table; low_memory=False makes pandas read the
# file in one pass instead of chunk-wise dtype inference.
df = pd.read_csv(input_file, sep='\t', low_memory=False)



First, the data entries (without negative data) are analysed.

In [None]:
# Column whose value identifies a TCR in this table.
tcr_key = 'TRB_CDR3'

# "distinct": one row per TCR value (the first occurrence is kept).
distinct_tcrs = df.drop_duplicates(subset=[tcr_key], keep="first")

# "unique": keep=False removes *every* row whose value occurs more than once,
# so only rows with a value that appears exactly once in df remain.
unique_tcrs = df.drop_duplicates(subset=[tcr_key], keep=False)
unique_epitopes = df.drop_duplicates(subset=["Epitope"], keep=False)

total_rows = len(df)
print(f"distinct tcr's: {len(distinct_tcrs)} from {total_rows}")
print(f"unique tcr's: {len(unique_tcrs)} from {total_rows}")
print(f"unique epitopes: {len(unique_epitopes)} from {total_rows}")

Now the train and validation sets are created.

In [None]:
# Split into train and validation.
# Train receives every row whose TCR occurs more than once in df, i.e. exactly
# the rows that are NOT part of `unique_tcrs`. The duplicated() mask replaces the
# former all-column merge/indicator anti-join and selects the same rows.
# .copy() yields an independent frame, so adding the "task" column below does
# not trigger pandas' SettingWithCopyWarning.
df_train = df[df.duplicated(subset=[tcr_key], keep=False)].copy()
df_train["task"] = ""

train_epitopes = set(df_train["Epitope"])

# Validation starts as all rows whose TCR occurs exactly once, so none of these
# TCRs is present in train. The task label then only depends on the epitope:
#   epitope also in train -> 'TPP2', epitope not in train -> 'TPP3'.
df_validation = unique_tcrs.copy()
df_validation["task"] = np.where(df_validation["Epitope"].isin(train_epitopes), "TPP2", "TPP3")

total_entries = len(df_validation) + len(df_train)
if total_entries == 0:
    # Without this guard the ratio below fails with a bare ZeroDivisionError.
    raise ValueError("Input data is empty; cannot create a train/validation split.")
validation_ratio = len(df_validation) / total_entries

print(f"Train Data: {len(df_train)} entries")
print(f"Validation Data: {len(df_validation)} entries")
for task_name in ("TPP1", "TPP2", "TPP3", "TPP4"):
    print(f"Validation {task_name}: {(df_validation['task'] == task_name).sum()}")
print(f"Train/Validation Ratio: {(1-validation_ratio)}/{validation_ratio}")

If the validation ratio doesn't match the aimed ratio, entries are moved between the train and validation sets.

In [None]:
# If the ratio does not match the target, rebalance the split.

# `aimed_validation_ratio` may already exist in the namespace; otherwise fall
# back to the configured `aimed_test_ratio` (previously this name was undefined
# and the cell failed with a NameError).
if 'aimed_validation_ratio' not in locals():
    aimed_validation_ratio = aimed_test_ratio

total_entries = len(df_validation) + len(df_train)

if validation_ratio > aimed_validation_ratio:
    abundant_validation_count = math.ceil((validation_ratio - aimed_validation_ratio) * total_entries)
    print(f"{abundant_validation_count} Entries will be moved from Validation to Train")

    # Only 'TPP2' rows are moved: their epitope is already present in train, so
    # the set of train epitopes (and with it the remaining 'TPP3' labels) stays valid.
    filtered_rows = df_validation[df_validation["task"] == "TPP2"]
    if len(filtered_rows) < abundant_validation_count:
        raise ValueError("Not enough entries with 'task' == 'TPP2' to move.")

    rows_to_move = filtered_rows.head(abundant_validation_count)
    # NOTE(review): moved rows keep their 'TPP2' label inside the train set while
    # all other train rows have task == "" - confirm whether it should be cleared.
    df_train = pd.concat([df_train, rows_to_move], ignore_index=True)
    df_validation = df_validation.drop(rows_to_move.index)

elif validation_ratio < aimed_validation_ratio:
    missing_validation_count = math.ceil((aimed_validation_ratio - validation_ratio) * total_entries)
    print(f"{missing_validation_count} Entries need to be shifted from Train to Validation")

    # Move data from Train -> Validation.
    # The former filter `task == "TPP1"` never matched, because train rows are
    # created with task == "", so no row was ever moved. Instead pick rows that
    # are neither the first occurrence of their TCR nor of their epitope: the
    # first occurrences always stay in train, hence every moved row still has
    # its TCR and its epitope in train and can be labelled 'TPP1'.
    is_repeated_tcr = df_train.duplicated(subset=[tcr_key], keep="first")
    is_repeated_epitope = df_train.duplicated(subset=["Epitope"], keep="first")
    movable_rows = df_train[is_repeated_tcr & is_repeated_epitope]
    if len(movable_rows) < missing_validation_count:
        raise ValueError("Not enough train entries that can be moved as 'TPP1'.")

    filtered_rows = movable_rows.head(missing_validation_count).copy()
    filtered_rows["task"] = "TPP1"
    df_validation = pd.concat([df_validation, filtered_rows], ignore_index=True)
    df_train = df_train.drop(filtered_rows.index)

In [None]:
# Remove helper columns that must not end up in the output files.
# errors='ignore' makes this a no-op for columns that are not present.
# NOTE(review): "tcr_key" is a literal column name here, not the value of the
# `tcr_key` variable - confirm that such a column can actually occur.
helper_columns = ["_merge", "tcr_key"]
df_train = df_train.drop(columns=helper_columns, errors='ignore')
df_validation = df_validation.drop(columns=helper_columns, errors='ignore')

# Create the output folder if needed; to_csv does not create missing
# directories and would otherwise fail with an OSError.
os.makedirs(beta_output_folder, exist_ok=True)

# Write the new files as tab-separated tables without the index column.
df_train.to_csv(f"{beta_output_folder}/{train_file_name}", sep="\t", index=False)
df_validation.to_csv(f"{beta_output_folder}/{validation_file_name}", sep="\t", index=False)

print(f"Train: {len(df_train)} Einträge")
print(f"Validation: {len(df_validation)} Einträge")