In [None]:
import math
import os

import numpy as np
import pandas as pd

# Configuration.
# Each setting only receives its default when the name is not already defined
# in the current namespace, so a value assigned before this cell runs wins.

# Data set variant; it is part of the default input and output paths below.
if 'precision' not in locals():
    precision = "gene"  # allele or gene

# Tab-separated input table that is split in this notebook.
if 'input_file' not in locals():
    input_file = f"../../data_10x/customDatasets/{precision}/paired_concatenated.tsv"
df = pd.read_csv(input_file, sep='\t', low_memory=False)

# Folder that receives the train and validation files.
if 'paired_output_folder' not in locals():
    paired_output_folder = f"../../data_10x/splitted_data/{precision}/paired"

# File names used inside the output folder.
if 'validation_file_name' not in locals():
    validation_file_name = "validation.tsv"

if 'train_file_name' not in locals():
    train_file_name = "train.tsv"

# Minimum share of validation entries; compared against the measured
# validation ratio further down in the notebook.
if 'aimed_validation_ratio' not in locals():
    aimed_validation_ratio = 0.2

First, the data entries (without negative data) are analysed.

In [None]:
# Name of the helper column that identifies a TCR.
tcr_key = "tcr_key"

# The key is the string form of the TRA_CDR3 and TRB_CDR3 columns joined by "_".
df[tcr_key] = df['TRA_CDR3'].astype(str) + '_' + df['TRB_CDR3'].astype(str)

# "distinct": one row per key, keeping the first occurrence of every key.
repeated_tcr_mask = df.duplicated(subset=[tcr_key], keep="first")
distinct_tcrs = df[~repeated_tcr_mask]

# "unique": only rows whose value occurs exactly once in the whole frame.
shared_epitope_mask = df.duplicated(subset=["Epitope"], keep=False)
unique_epitopes = df[~shared_epitope_mask]

shared_tcr_mask = df.duplicated(subset=[tcr_key], keep=False)
unique_tcrs = df[~shared_tcr_mask]

print(f"distinct tcr's: {len(distinct_tcrs)} from {len(df)}")
print(f"unique tcr's: {len(unique_tcrs)} from {len(df)}")
print(f"unique epitopes: {len(unique_epitopes['Epitope'])} from {len(df)}")

Now the train and validation sets are created.

In [None]:
# Training candidates: all rows of df that have no counterpart in unique_tcrs.
# The merge has no `on` argument, so it joins on every column the two frames
# share; the indicator column `_merge` marks unmatched rows as 'left_only'.
df_train = pd.merge(df, unique_tcrs, how='left', indicator=True)

# .copy() turns the filtered selection into an independent frame. Columns are
# added to df_train later on; without the copy pandas may emit a
# SettingWithCopyWarning for that assignment.
df_train = df_train[df_train['_merge'] == 'left_only'].copy()

# Everything that occurs in the training rows counts as "seen".
seen_epitopes = set(df_train["Epitope"])
seen_tcrs = set(df_train["tcr_key"])

def assign_tpp(row):
    """Return the TPP category of a row.

    The category depends on whether the row's "Epitope" value is contained in
    ``seen_epitopes`` and whether its "tcr_key" value is contained in
    ``seen_tcrs`` (both are looked up in the enclosing namespace at call time).

    Args:
        row: Mapping-like object (e.g. a DataFrame row) that can be indexed
            with "Epitope" and "tcr_key".

    Returns:
        "TPP1" (TCR and epitope seen), "TPP2" (epitope seen, TCR unseen),
        "TPP3" (neither seen) or "TPP4" (TCR seen, epitope unseen).
    """
    # Keyed by (epitope seen, TCR seen); the four combinations are exhaustive.
    category_by_flags = {
        (True, True): "TPP1",
        (True, False): "TPP2",
        (False, False): "TPP3",
        (False, True): "TPP4",
    }
    epitope_known = row["Epitope"] in seen_epitopes
    tcr_known = row["tcr_key"] in seen_tcrs
    return category_by_flags[(epitope_known, tcr_known)]

# Label every training row with its TPP category.
df_train["task"] = df_train.apply(assign_tpp, axis=1)

# Draw the validation set as a reproducible 20 % sample of the full table.
# NOTE(review): the sample is taken from df, and df_train is also built from
# rows of df, so the same entry can end up in both sets. The fraction is
# hard-coded and independent of aimed_validation_ratio. Confirm that both are
# intended before relying on the validation metrics.
df_validation = df.sample(frac=0.2, random_state=42)
df_validation["task"] = df_validation.apply(assign_tpp, axis=1)

# Number of validation entries per TPP category.
number_of_TPP4 = (df_validation['task'] == 'TPP4').sum()
number_of_TPP3 = (df_validation['task'] == 'TPP3').sum()
number_of_TPP2 = (df_validation['task'] == 'TPP2').sum()
number_of_TPP1 = (df_validation['task'] == 'TPP1').sum()
validation_ratio = len(df_validation) / (len(df_train) + len(df_validation))

# The category descriptions follow the definition in assign_tpp:
# TPP1 = TCR and epitope seen, TPP2 = only epitope seen,
# TPP3 = neither seen, TPP4 = only TCR seen.
print(f"train data has {len(df_train)} entries")
print(f"validation data has {len(df_validation)} entries")
print(f"validation data has {number_of_TPP1} TPP1 tasks (seen tcr & seen epitopes).")
print(f"validation data has {number_of_TPP2} TPP2 tasks (unseen tcr & seen epitopes).")
print(f"validation data has {number_of_TPP3} TPP3 tasks (unseen tcr & unseen epitope).")
print(f"validation data has {number_of_TPP4} TPP4 tasks (seen tcr & unseen epitope).")
print(f"the train/validation ratio is {(1-validation_ratio)}/{validation_ratio}")

In [None]:
# Top up the validation set with training rows when it is smaller than aimed for.
if validation_ratio < aimed_validation_ratio:
    total_entries = len(df_validation) + len(df_train)
    # Number of rows that must change sides to reach the aimed ratio.
    missing_validation_count = math.ceil((aimed_validation_ratio - validation_ratio) * total_entries)
    print(f"{missing_validation_count} Entries need to be shifted from Train to Validation")

    # Prefer TPP1 entries for the shift.
    filtered_rows = df_train[df_train["task"] == "TPP1"].head(missing_validation_count)

    # If there are not enough TPP1 entries, fall back to TPP2 entries.
    if len(filtered_rows) < missing_validation_count:
        print(f"Not enough TPP1 entries available ({len(filtered_rows)} found), using TPP2 as fallback!")
        additional_needed = missing_validation_count - len(filtered_rows)
        fallback_rows = df_train[df_train["task"] == "TPP2"].head(additional_needed)
        # Keep the df_train index labels here (no ignore_index): they are used
        # below to remove exactly these rows from df_train.
        filtered_rows = pd.concat([filtered_rows, fallback_rows])

    # Move the selected TPP1 (and possibly TPP2) rows from train to validation.
    print("**Verschiebe von Train → Validation:**")
    print(f"Train (vorher): {len(df_train)} entries")
    print(f"Validation (vorher): {len(df_validation)} entries")

    df_validation = pd.concat([df_validation, filtered_rows], ignore_index=True)
    df_train = df_train.drop(filtered_rows.index)

    # Refresh the ratio so later cells (and a re-run of this cell) see the
    # state after the shift and do not act on the stale value.
    validation_ratio = len(df_validation) / (len(df_train) + len(df_validation))

    print(f"Moved {len(filtered_rows)} entries from Train to Validation")
    print(f"Train (nachher): {len(df_train)} entries")
    print(f"Validation (nachher): {len(df_validation)} entries")


In [None]:
# Save the split data.
# The helper columns "_merge" and "tcr_key" are removed before writing;
# errors='ignore' tolerates frames that do not contain them. Reassigning
# (not dropping in place) keeps the cell safe to re-run.
df_train = df_train.drop(columns=["_merge", "tcr_key"], errors='ignore')
df_validation = df_validation.drop(columns=["_merge", "tcr_key"], errors='ignore')

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(paired_output_folder, exist_ok=True)

df_train.to_csv(f"{paired_output_folder}/{train_file_name}", sep="\t", index=False)
df_validation.to_csv(f"{paired_output_folder}/{validation_file_name}", sep="\t", index=False)

In [None]:
# Final summary of the split.
# The statistics are recomputed from the current frames because rows may have
# been moved from train to validation after the counts were first calculated.
number_of_TPP1 = (df_validation['task'] == 'TPP1').sum()
number_of_TPP2 = (df_validation['task'] == 'TPP2').sum()
number_of_TPP3 = (df_validation['task'] == 'TPP3').sum()
number_of_TPP4 = (df_validation['task'] == 'TPP4').sum()
validation_ratio = len(df_validation) / (len(df_train) + len(df_validation))

# Category descriptions: TPP1 = TCR and epitope seen, TPP2 = only epitope seen,
# TPP3 = neither seen, TPP4 = only TCR seen.
print(f"train data has {len(df_train)} entries")
print(f"validation data has {len(df_validation)} entries")
print(f"validation data has {number_of_TPP1} TPP1 tasks (seen tcr & seen epitopes).")
print(f"validation data has {number_of_TPP2} TPP2 tasks (unseen tcr & seen epitopes).")
print(f"validation data has {number_of_TPP3} TPP3 tasks (unseen tcr & unseen epitope).")
print(f"validation data has {number_of_TPP4} TPP4 tasks (seen tcr & unseen epitope).")
print(f"the train/validation ratio is {(1-validation_ratio)}/{validation_ratio}")