In [138]:
import os
import pandas as pd

In [3]:
import sys
!"{sys.executable}" -m pip install tidytcells



In [4]:
# set precision of mhc and V/J values (gene or allele)
precision = 'allele'

In [5]:
# this function is not thread safe
def create_folders_if_not_exists(folders):
  for path in folders:
    if not os.path.exists(path):
      os.makedirs(path)

In [6]:
pipeline_data = '../../../../data'
pipeline_data_plain = f'{pipeline_data}/plain_datasets'
pipeline_data_cleaned = f'{pipeline_data}/cleaned_datasets'
pipeline_data_concatenated = f'{pipeline_data}/concatenated_datasets'
pipeline_data_splitted = f'{pipeline_data}/splitted_datasets'
pipeline_data_temp_bucket = f'{pipeline_data}/temp'

pipeline_folders = [pipeline_data, pipeline_data_plain, pipeline_data_cleaned, pipeline_data_concatenated, pipeline_data_splitted, pipeline_data_temp_bucket]

create_folders_if_not_exists(pipeline_folders)

## Data Preparation

### VDJdb

In [7]:
# prepare directories
VDJdb_data_plain = f'{pipeline_data_plain}/VDJdb'
VDJdb_data_cleaned = f'{pipeline_data_cleaned}/VDJdb'
VDJdb_data_fitted = f'{pipeline_data_temp_bucket}/VDJdb'

VDJdb_folders = [VDJdb_data_plain, VDJdb_data_cleaned, VDJdb_data_fitted]
create_folders_if_not_exists(VDJdb_folders)

fitted_beta_file = 'VDJdb_beta_fitted.tsv'
fitted_paired_file = 'VDJdb_paired_fitted.tsv'

In [8]:
# prepare parameters for notebook VDJdb fit data paired
input_file = f'{VDJdb_data_plain}/VDJdb_paired_only.tsv'
path_prefix_fitted = VDJdb_data_fitted
fitted_file = fitted_paired_file

# fit paired VDJdb data
%run ../VDJdb/fit_data_vdjdb_paired.ipynb

In [9]:
# prepare parameters for notebook VDJdb fit data beta
input_file = f'{VDJdb_data_plain}/VDJdb_beta_only.tsv'
path_prefix_fitted = VDJdb_data_fitted
fitted_file = fitted_beta_file

# fit beta VDJdb data
%run ../VDJdb/fit_data_vdjdb_beta.ipynb

In [10]:
# prepare parameters for notebook VDJdb clean data paired
input_file = f'{VDJdb_data_fitted}/{fitted_paired_file}'
cleaned_file_paired = 'VDJdb_cleaned_data_paired.tsv'
output_file = f'{VDJdb_data_cleaned}/{cleaned_file_paired}'

# clean paired VDJdb data
%run ../VDJdb/clean_data_vdjdb_paired.ipynb

MHC Class I has 27414 entries
whole dataframe has 28119 entries
filtered to only use MHC Class I. Length of dataset: 27414


In [11]:
# prepare parameters for notebook VDJdb clean data beta
input_file = f'{VDJdb_data_fitted}/{fitted_beta_file}'
cleaned_file_beta = 'VDJdb_cleaned_data_beta.tsv'
output_file = f'{VDJdb_data_cleaned}/{cleaned_file_beta}'

# clean beta VDJdb data
%run ../VDJdb/clean_data_vdjdb_beta.ipynb

MHC Class I has 46507 entries
whole dataframe has 49042 entries
filtered to only use MHC Class I. Length of dataset: 46507


In [12]:
VDJdb_cleaned_beta_output = f'{VDJdb_data_cleaned}/{cleaned_file_beta}'
VDJdb_cleaned_paired_output = f'{VDJdb_data_cleaned}/{cleaned_file_paired}'

In [13]:
# prepare parameters for concatenation
custom_dataset_path = f'{pipeline_data_concatenated}/{precision}/'

vdjdb_beta_read_path = VDJdb_cleaned_beta_output
vdjdb_paired_read_path = VDJdb_cleaned_paired_output

output_file_beta = 'beta_concatenated_test.tsv'
output_file_paired = 'paired_concatenated_test.tsv'

create_folders_if_not_exists([custom_dataset_path])

%run ../concatDatasets_onlytest.ipynb

length of beta_df: 46507




The following script removes a lot of rows. They are kept and some of them get added again later
distinct entries (all columns, keep=first). 7188 entries removed.
removed all duplicates (CDR3, Epitope) from distinct values (most_important_columns, keep=False). 1435 entries removed.
beta removed entries df length: 1435


Number of groups formed: 655
1435 can be re-added to the no-duplicated dataframe
from the plain dataset which has 46507 entries, 7188 entries have been removed.
for beta dataset :
size difference is: 7188
  39319 information score cleaned: 6.0
  46507 information score dropout: 6.0
✅ Nach Duplikat-Filter (Train/Val): final_beta_df enthält 9105 Einträge.
final_beta_df length = 9105
length of paired_df: 27414




The following script removes a lot of rows. They are kept and some of them get added again later
distinct entries (all columns, keep=first). 687 entries removed.
removed all duplicates from distinct values (cultivated columns, keep=False). 246 entries removed.
paired removed entries df length: 246


246 can be re-added to the no-duplicated dataframe
from the plain dataset which has 27414 entries, 687 entries have been removed.
for paired dataset:
size difference is: 687
  26727 information score cleaned: 8.976241254162458
  27414 information score dropout: 8.975888232290071
final_paired_df length: 26727


In [24]:
import shutil
import os

# Define source folder where the files are currently stored
source_folder = f'{pipeline_data_concatenated}/{precision}/'

# Define file names
output_file_beta = 'beta_concatenated_test.tsv'
output_file_paired = 'paired_concatenated_test.tsv'

# Define destination folders
destination_beta_folder = f'{pipeline_data_splitted}/{precision}/beta/'
destination_paired_folder = f'{pipeline_data_splitted}/{precision}/paired/'

# Ensure destination folders exist
os.makedirs(destination_beta_folder, exist_ok=True)
os.makedirs(destination_paired_folder, exist_ok=True)

# Copy files
shutil.copy(os.path.join(source_folder, output_file_beta), os.path.join(destination_beta_folder, 'test_prenegsamples.tsv'))
shutil.copy(os.path.join(source_folder, output_file_paired), os.path.join(destination_paired_folder, 'test_prenegsamples.tsv'))

print(f'Beta file copied successfully to {destination_beta_folder}test_prenegsamples.tsv')
print(f'Paired file copied successfully to {destination_paired_folder}test_prenegsamples.tsv')

Beta file copied successfully to ../../../../data/splitted_datasets/allele/beta/test_prenegsamples.tsv
Paired file copied successfully to ../../../../data/splitted_datasets/allele/paired/test_prenegsamples.tsv


In [25]:
# Define file paths
beta_file_path = f'{pipeline_data_splitted}/{precision}/beta/test_prenegsamples.tsv'
paired_file_path = f'{pipeline_data_splitted}/{precision}/paired/test_prenegsamples.tsv'

# Load beta dataset
beta_df = pd.read_csv(beta_file_path, sep='\t')

# Load paired dataset
paired_df = pd.read_csv(paired_file_path, sep='\t')

# Calculate unique values for beta dataset
unique_tcr_beta = beta_df['TRB_CDR3'].nunique()
unique_epitope_beta = beta_df['Epitope'].nunique()

# Calculate unique values for paired dataset
unique_tcr_paired = paired_df['TRB_CDR3'].nunique()
unique_epitope_paired = paired_df['Epitope'].nunique()

# Print results for beta dataset
print("\nBeta Dataset:")
print(f"- Unique TCRs: {unique_tcr_beta}")
print(f"- Unique Epitope: {unique_epitope_beta}")

# Print results for paired dataset
print("\nPaired Dataset:")
print(f"- Unique TCRs: {unique_tcr_paired}")
print(f"- Unique Epitope: {unique_epitope_paired}")


Beta Dataset:
- Unique TCRs: 8464
- Unique Epitope: 293

Paired Dataset:
- Unique TCRs: 21101
- Unique Epitope: 825


## Negative Data

In [62]:
#Daten einlesen

combined_donors_path = f'{pipeline_data_plain}/10x/combined_donors_consensus_annotations.csv'
all_donors_consensus = pd.read_csv(combined_donors_path, sep=',')

print("Consensus: ", all_donors_consensus.head())

all_donors_meta_path = f'{pipeline_data_plain}/10x/meta.csv'
all_donors_meta = pd.read_csv(all_donors_meta_path, sep=',')

print("Meta: ", all_donors_meta.head())

Consensus:                 barcode   donor  \
0   AAACCTGAGACAAAGG-4  donor1   
1  AAACCTGAGACTGTAA-34  donor1   
2   AAACCTGAGAGCCCAA-5  donor1   
3  AAACCTGAGAGCTGCA-24  donor1   
4   AAACCTGAGAGGGATA-8  donor1   

                                  cell_clono_cdr3_aa  \
0  TRA:CAASVSIWTGTASKLTF;TRA:CAAWDMEYGNKLVF;TRB:C...   
1                                    TRB:CASDTPVGQFF   
2                 TRA:CASYTDKLIF;TRB:CASSGGSISTDTQYF   
3                                 TRB:CASSGGQSSYEQYF   
4          TRA:CAASGYGNTGRRALTF;TRB:CASSQDPAGGYNEQFF   

                                  cell_clono_cdr3_nt     CD3  CD19  CD45RA  \
0  TRA:TGTGCAGCAAGCGTTAGTATTTGGACCGGCACTGCCAGTAAA...  2125.0   0.0   912.0   
1              TRB:TGTGCCAGCGATACCCCGGTTGGGCAGTTCTTC  1023.0   0.0  2028.0   
2  TRA:TGTGCTTCCTACACCGACAAGCTCATCTTT;TRB:TGCGCCA...  1598.0   3.0  3454.0   
3     TRB:TGCGCCAGCAGTGGCGGACAGAGCTCCTACGAGCAGTACTTC   298.0   1.0   880.0   
4  TRA:TGTGCAGCAAGCGGGTATGGAAACACGGGCAGGAGAGCACTT...  10

  all_donors_meta = pd.read_csv(all_donors_meta_path, sep=',')


### Beta

In [66]:
#Dieser Code für ganzen Datensatz laufen lassen
import re
import pandas as pd

# Festlegen der Batch-Größe für die Verarbeitung
batch_size = 1000  # Passe diese Zahl je nach Speicherressourcen an

# Identifizieren von Epitope-Spalten, aber ohne "NR(B0801)_AAKGRGAAL_NC_binder"
epitope_columns = [col for col in all_donors_consensus.columns if '_binder' in col and col != "NR(B0801)_AAKGRGAAL_NC_binder"]

# Liste für alle Batch-Ergebnisse
all_batches = []

# Verarbeite `all_donors_consensus` in Batches
for batch_start in range(0, len(all_donors_consensus), batch_size):
    print("Batch Start: ", batch_start)
    # Batch definieren
    batch = all_donors_consensus.iloc[batch_start:batch_start + batch_size]
    
    # Filtern auf Zeilen, die 'TRB:' enthalten
    batch_trb = batch[batch['cell_clono_cdr3_aa'].str.contains("TRB:", na=False)]

    # Liste, um Zeilen für diesen Batch zu speichern
    expanded_rows = []
    
    # Iteriere durch jede Zeile im Batch
    for _, row in batch_trb.iterrows():
        for col in epitope_columns:
            # Extrahiere MHC und Epitope
            match = re.match(r'([A-Z0-9]+)_([A-Z]+)_.*_binder', col)
            if match:
                mhc_raw, epitope = match.groups()
                mhc_formatted = f'HLA-{mhc_raw[0]}*{mhc_raw[1:3]}:{mhc_raw[3:]}'

                # Füge `Epitope` und `MHC` zur Zeile hinzu
                new_row = row.copy()
                new_row['Epitope'] = epitope
                new_row['MHC'] = mhc_formatted

                # Füge neue Zeile zur Batch-Liste hinzu
                expanded_rows.append(new_row)
    
    # Erstelle einen DataFrame aus dem Batch
    batch_df = pd.DataFrame(expanded_rows)
    all_batches.append(batch_df)  # Speichere den Batch in der Liste

# Kombiniere alle Batch-Ergebnisse zu einem DataFrame
expanded_df = pd.concat(all_batches, ignore_index=True)

# Nur die TRB-Chain-Einträge in `all_donors_meta` beibehalten
all_donors_meta_trb = all_donors_meta[all_donors_meta['chain'] == 'TRB']

# Zusammenführen der beiden DataFrames basierend auf der 'barcode' Spalte
merged_df = pd.merge(all_donors_meta_trb, expanded_df[['barcode', 'Epitope', 'MHC']], on='barcode', how='inner')

# Spalten umbenennen und Format anpassen
merged_df = merged_df.rename(columns={
    'barcode': 'TCR_name',
    'v_gene': 'TRBV',
    'j_gene': 'TRBJ',
    'c_gene': 'TRBC',
    'cdr3': 'TRB_CDR3'
})

# Fehlende Spalten auffüllen
desired_columns = ['TCR_name', 'TRBV', 'TRBJ', 'TRB_CDR3', 'TRBC', 'Epitope', 'MHC', 'Binding', 'task']
for col in desired_columns:
    if col not in merged_df.columns:
        merged_df[col] = 'nan' if col == 'task' else '0'

# Nur die gewünschten Spalten beibehalten und Zeilen mit `None` in `TRB_CDR3` entfernen
final_df = merged_df[desired_columns]
final_df = final_df[final_df['TRB_CDR3'] != 'None']

final_df = final_df[final_df['TRB_CDR3'].notna() & (final_df['TRB_CDR3'] != '')]

# Ausgabe des ersten Teils des Ergebnisses zur Überprüfung
print(final_df.head())

# Speichern des kombinierten DataFrames
output_path = f'{pipeline_data_plain}/10x/combined_output_with_epitope_mhc_TRB_only_expanded-all.csv'
final_df.to_csv(output_path, index=False)


Batch Start:  0
Batch Start:  1000
Batch Start:  2000
Batch Start:  3000
Batch Start:  4000
Batch Start:  5000
Batch Start:  6000
Batch Start:  7000
Batch Start:  8000
Batch Start:  9000
Batch Start:  10000
Batch Start:  11000
Batch Start:  12000
Batch Start:  13000
Batch Start:  14000
Batch Start:  15000
Batch Start:  16000
Batch Start:  17000
Batch Start:  18000
Batch Start:  19000
Batch Start:  20000
Batch Start:  21000
Batch Start:  22000
Batch Start:  23000
Batch Start:  24000
Batch Start:  25000
Batch Start:  26000
Batch Start:  27000
Batch Start:  28000
Batch Start:  29000
Batch Start:  30000
Batch Start:  31000
Batch Start:  32000
Batch Start:  33000
Batch Start:  34000
Batch Start:  35000
Batch Start:  36000
Batch Start:  37000
Batch Start:  38000
Batch Start:  39000
Batch Start:  40000
Batch Start:  41000
Batch Start:  42000
Batch Start:  43000
Batch Start:  44000
Batch Start:  45000
Batch Start:  46000
Batch Start:  47000
Batch Start:  48000
Batch Start:  49000
Batch Start: 

### Beta Samples generieren für Test File

In [141]:
# prepare parameters for beta dataset
output_train_path = f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv"
output_val_path = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"
test_output_path = f'{pipeline_data_splitted}/{precision}/beta/new/test.tsv'

read_path_train = f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv"
read_path_test = f'{pipeline_data_splitted}/{precision}/beta/new/test.tsv'
read_path_validation = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"
temp_path = f'{pipeline_data_temp_bucket}/negative_samples/beta/'
output_path = f"{pipeline_data_splitted}/{precision}/beta/new/negatives"
train_output_name = "train_neg.tsv"
validation_output_name = "val_neg.tsv"
test_output_name = "test_neg.tsv"

create_folders_if_not_exists([temp_path])

%run ../negative_samples/negative_samples_beta.ipynb

Using device: cuda:0
Loading: Rostlab/prot_t5_xl_half_uniref50-enc


In [142]:
import pandas as pd

# --- Dateipfade ---
read_path_train = f"{pipeline_data_splitted}/{precision}/beta/new/negatives/{train_output_name}"
read_path_val = f"{pipeline_data_splitted}/{precision}/beta/new/negatives/{validation_output_name}"
read_path_test = f"{pipeline_data_splitted}/{precision}/beta/new/negatives/{test_output_name}"

# --- Dateien laden ---
train_neg = pd.read_csv(read_path_train, sep='\t')
val_neg = pd.read_csv(read_path_val, sep='\t')
test_neg = pd.read_csv(read_path_test, sep='\t')

# --- Spalte "source" setzen ---
for df in [train_neg, val_neg, test_neg]:
    df["source"] = "generated"

# --- Zurückspeichern ---
train_neg.to_csv(read_path_train, sep='\t', index=False)
val_neg.to_csv(read_path_val, sep='\t', index=False)
test_neg.to_csv(read_path_test, sep='\t', index=False)

print("✅ Spalte 'source' auf 'generated' gesetzt und gespeichert.")

✅ Spalte 'source' auf 'generated' gesetzt und gespeichert.


In [144]:
# 🔁 Wiederverwendbare Funktion zum Entfernen
def remove_conflicting_negatives(pos_df, neg_df, name):
    dupe_cols = ["Epitope", "TRB_CDR3"]
    merged = pd.merge(neg_df, pos_df[dupe_cols], on=dupe_cols, how="inner")
    print(f"❌ {name.upper()}: Entferne {len(merged)} Duplikate mit Konflikt.")
    
    # Nur die, die NICHT im positiven vorkommen → behalten
    cleaned_neg = neg_df.merge(merged[dupe_cols], on=dupe_cols, how="left", indicator=True)
    cleaned_neg = cleaned_neg[cleaned_neg["_merge"] == "left_only"].drop(columns=["_merge"])
    print(f"✅ {name.upper()}: Übrig nach Cleaning: {len(cleaned_neg)}")
    return cleaned_neg
# Neue Clean-Dateipfade
neg_base_path = f"{pipeline_data_splitted}/{precision}/beta/new/negatives"

train_neg_clean = remove_conflicting_negatives(train_pos, train_neg, "train")
train_neg_clean.to_csv(f"{neg_base_path}/train_neg_clean.tsv", sep="\t", index=False)

val_neg_clean = remove_conflicting_negatives(val_pos, val_neg, "validation")
val_neg_clean.to_csv(f"{neg_base_path}/val_neg_clean.tsv", sep="\t", index=False)

test_neg_clean = remove_conflicting_negatives(test_pos, test_neg, "test")
test_neg_clean.to_csv(f"{neg_base_path}/test_neg_clean.tsv", sep="\t", index=False)


❌ TRAIN: Entferne 157965 Duplikate mit Konflikt.
✅ TRAIN: Übrig nach Cleaning: 624111
❌ VALIDATION: Entferne 31437 Duplikate mit Konflikt.
✅ VALIDATION: Übrig nach Cleaning: 189566
❌ TEST: Entferne 1085 Duplikate mit Konflikt.
✅ TEST: Übrig nach Cleaning: 51705


In [146]:
import pandas as pd

# === Load negative datasets ===
train_neg = pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/negatives/train_neg_clean.tsv", sep='\t')
val_neg = pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/negatives/val_neg_clean.tsv", sep='\t')
test_neg = pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/negatives/test_neg_clean.tsv", sep='\t')

# === Load positive datasets ===
train_pos = pd.read_csv(output_train_path, sep='\t')
val_pos = pd.read_csv(output_val_path, sep='\t')
test_pos = pd.read_csv(test_output_path, sep='\t')

# === Spalten zur Duplikatprüfung ===
dupe_cols = ["Epitope", "TRB_CDR3"]

# === Funktion zum Vergleich ===
def check_duplicates(pos_df, neg_df, name):
    merged = pd.merge(pos_df, neg_df, on=dupe_cols, suffixes=('_pos', '_neg'))
    print(f"\n🔎 {name.upper()}:")
    print(f"→ Duplikate (gesamt): {len(merged)}")
    if not merged.empty:
        print("→ Aufschlüsselung nach Binding (pos/neg):")
        print(merged[["Binding_pos", "Binding_neg"]].value_counts().sort_index())

# === Checks durchführen ===
check_duplicates(train_pos, train_neg, "train")
check_duplicates(val_pos, val_neg, "validation")
check_duplicates(test_pos, test_neg, "test")



🔎 TRAIN:
→ Duplikate (gesamt): 0

🔎 VALIDATION:
→ Duplikate (gesamt): 0

🔎 TEST:
→ Duplikate (gesamt): 0


In [47]:
import pandas as pd

def create_balanced_negatives_with_all_epitopes(
    neg_source_1,
    neg_source_2,
    target_neg_count,
    used_pairs=set(),
    vdjdb_tcrs=set(),
    vdjdb_epitopes=set(),
    ensure_all_neg_epitopes=True
):
    neg_source_1 = neg_source_1.copy()
    neg_source_2 = neg_source_2.copy()
    neg_source_1['source'] = '10X'
    neg_source_2['source'] = 'generated'

    # --- VDJdb & verwendete Paare filtern ---
    def filter_df(df, remove_epitopes=False):
        df = df[~df['TRB_CDR3'].isin(vdjdb_tcrs)].copy()
        if remove_epitopes:
            df = df[~df['Epitope'].isin(vdjdb_epitopes)]
        df['Pair'] = list(map(tuple, df[['Epitope', 'TRB_CDR3']].values))
        df = df[~df['Pair'].isin(used_pairs)]
        return df

    neg_source_1 = filter_df(neg_source_1)
    neg_source_2 = filter_df(neg_source_2)

    # --- Vorverarbeitung für TPP-Filterung ---
    trainval_tcrs = set(pd.concat([train_final, val_final])['TRB_CDR3'])
    trainval_epitopes = set(pd.concat([train_final, val_final])['Epitope'])

    def is_task(row, task):
        return classify_task(row['TRB_CDR3'], row['Epitope'], trainval_tcrs, trainval_epitopes) == task
    
    # TPP3 & TPP4 vollständig übernehmen
    tpp3_10x = neg_source_1[neg_source_1.apply(lambda r: is_task(r, 'TPP3'), axis=1)]
    tpp4_10x = neg_source_1[neg_source_1.apply(lambda r: is_task(r, 'TPP4'), axis=1)]
    
    # TPP2 begrenzen auf max. 7000
    tpp2_full = neg_source_1[neg_source_1.apply(lambda r: is_task(r, 'TPP2'), axis=1)]
    tpp2_10x = tpp2_full.sample(n=min(7000, len(tpp2_full)), random_state=42)
    
    # TPP1 auffüllen mit Rest
    used_pairs_in_10x = set(pd.concat([tpp3_10x, tpp4_10x, tpp2_10x])[['Epitope', 'TRB_CDR3']].apply(tuple, axis=1))
    
    tpp1_pool = neg_source_1[
        neg_source_1.apply(lambda r: is_task(r, 'TPP1'), axis=1) &
        ~neg_source_1[['Epitope', 'TRB_CDR3']].apply(tuple, axis=1).isin(used_pairs_in_10x)
    ]
    
    # Wieviele TPP1 wir brauchen, um Gesamtziel zu erreichen:
    already_selected = len(tpp3_10x) + len(tpp4_10x) + len(tpp2_10x)
    tpp1_needed = max(target_neg_count - already_selected, 0)
    
    tpp1_10x = tpp1_pool.sample(n=min(tpp1_needed, len(tpp1_pool)), random_state=42)
    
    # Kombinieren zu neuer neg_source_1 (10X)
    neg_source_1 = pd.concat([tpp3_10x, tpp4_10x, tpp2_10x, tpp1_10x], ignore_index=True)
    print(f"✅ Neue neg_source_1 aus 10X enthält {len(neg_source_1)} Beispiele (TPP3: {len(tpp3_10x)}, TPP4: {len(tpp4_10x)}, TPP2: {len(tpp2_10x)}, TPP1: {len(tpp1_10x)})")


    # --- Mindestens 1x alle Epitope aus beiden Quellen übernehmen ---
    def ensure_epitope_coverage(df):
        guaranteed = []
        for epitope in df['Epitope'].unique():
            group = df[df['Epitope'] == epitope]
            if not group.empty:
                guaranteed.append(group.sample(1, random_state=42))
        return pd.concat(guaranteed, ignore_index=True)

    guaranteed_1 = ensure_epitope_coverage(neg_source_1)
    guaranteed_2 = ensure_epitope_coverage(neg_source_2)
    guaranteed_df = pd.concat([guaranteed_1, guaranteed_2], ignore_index=True)

    # Begrenze garantierte, falls sie zu groß geworden sind
    if len(guaranteed_df) > target_neg_count:
        print(f"Zu viele garantierte Negative ({len(guaranteed_df)}), trimme auf Zielmenge {target_neg_count}")
        guaranteed_df = guaranteed_df.sample(n=target_neg_count, random_state=42)

    # --- Stratified Sampling für Restauffüllung ---
    def stratified_sample(df, n):
        epitope_groups = df.groupby('Epitope')
        unique_epitopes = list(epitope_groups.groups.keys())
        print(f"→ Stratified sampling from {len(df)} rows | {len(unique_epitopes)} unique epitopes | need {n} samples")
    
        # Schritt 1: Garantiert 1 Sample pro Epitope
        guaranteed = [group.sample(1, random_state=42) for _, group in epitope_groups]
        guaranteed_df = pd.concat(guaranteed, ignore_index=True)
    
        remaining_n = n - len(guaranteed_df)
        if remaining_n <= 0:
            return guaranteed_df.sample(n=n, random_state=42)
    
        # Schritt 2: Aufstocken durch gewichtetes Sampling
        remaining_pool = df.drop(index=guaranteed_df.index, errors='ignore')
    
        # Gewichte: Häufigkeit pro Epitope → normalize
        epitope_counts = remaining_pool['Epitope'].value_counts()
        remaining_pool = remaining_pool.copy()
        remaining_pool['weight'] = remaining_pool['Epitope'].map(epitope_counts)
        total = remaining_pool['weight'].sum()
        remaining_pool['weight'] = remaining_pool['weight'] / total
    
        print(f"→ Stratified fill-in: drawing {remaining_n} samples weighted by epitope frequency")
    
        replace = len(remaining_pool) < remaining_n
        if replace:
            print(f"Achtung: Sampling mit Replacement (n={remaining_n}, pool={len(remaining_pool)})")
        
        sampled_rest = remaining_pool.sample(
            n=remaining_n,
            weights='weight',
            replace=replace,
            random_state=42
        )
        
        final_df = pd.concat([guaranteed_df, sampled_rest], ignore_index=True)
        return final_df

    remaining_needed = target_neg_count - len(guaranteed_df)
    if remaining_needed <= 0:
        print(f"Es wurden bereits {len(guaranteed_df)} garantierte Samples übernommen (mehr als benötigt).")
        final_df = guaranteed_df.sample(n=target_neg_count, random_state=42)
    else:
        # Garantierte rausnehmen
        used_idx_1 = guaranteed_1.index if not guaranteed_1.empty else []
        used_idx_2 = guaranteed_2.index if not guaranteed_2.empty else []

        remaining_1 = neg_source_1
        remaining_2 = neg_source_2

        half = remaining_needed // 2
        rest = remaining_needed - half

        sample_1 = stratified_sample(remaining_1, half)
        sample_2 = stratified_sample(remaining_2, rest)

        final_df = pd.concat([guaranteed_df, sample_1, sample_2], ignore_index=True)

    return final_df.drop(columns=['Pair'])

def classify_task(tcr, epitope, train_tcrs, train_epitopes):
    seen_tcr = tcr in train_tcrs
    seen_epi = epitope in train_epitopes
    if seen_tcr and seen_epi:
        return 'TPP1'
    elif not seen_tcr and seen_epi:
        return 'TPP2'
    elif not seen_tcr and not seen_epi:
        return 'TPP3'
    elif seen_tcr and not seen_epi:
        return 'TPP4'

In [149]:
# === Datei- und Pfadangaben ===

# Originaldaten (10X)
beta = pd.read_csv(f'{pipeline_data_plain}/10x/combined_output_with_epitope_mhc_TRB_only_expanded-all.csv', sep=',')

# Generierte Negativdaten
neg_ba_train = pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/negatives/train_neg_clean.tsv", sep='\t')
neg_ba_val = pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/negatives/val_neg_clean.tsv", sep='\t')
neg_ba_test = pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/negatives/test_neg_clean.tsv", sep='\t')

# Positive Beispiele
train_pos = pd.read_csv(f'{pipeline_data_splitted}/{precision}/beta/train_prenegsamples.tsv', sep='\t')
val_pos = pd.read_csv(f'{pipeline_data_splitted}/{precision}/beta/validation_prenegsamples.tsv', sep='\t')
test_preneg = pd.read_csv(f'{pipeline_data_splitted}/{precision}/beta/test_prenegsamples.tsv', sep='\t')

# Output-Ziele
output_train_path = f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv"
output_val_path = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"
test_output_path = f'{pipeline_data_splitted}/{precision}/beta/new/test.tsv'

# VDJdb zum Herausfiltern
vdjdb_df = pd.read_csv(VDJdb_cleaned_beta_output, sep='\t')
vdjdb_tcrs = set(vdjdb_df['TRB_CDR3'])
vdjdb_epitopes = set(vdjdb_df['Epitope'])

  train_pos = pd.read_csv(f'{pipeline_data_splitted}/{precision}/beta/train_prenegsamples.tsv', sep='\t')


In [154]:
# === TESTDATEN VERARBEITUNG ===
# Zielmenge: 1:5 Verhältnis
num_test_pos = len(test_preneg)
test_neg_target = num_test_pos * 5

neg_10x = beta[beta['Binding'] == 0]
train_final = pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv", sep='\t')
val_final = pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv", sep='\t')

# Paare aus Train/Val ausschließen
used_trainval_pairs = set(
    map(tuple, pd.concat([train_final, val_final])[['Epitope', 'TRB_CDR3']].values)
)

# Negative Samples für Test generieren
test_neg = create_balanced_negatives_with_all_epitopes(
    neg_source_1=neg_10x,
    neg_source_2=neg_ba_test,
    target_neg_count=test_neg_target,
    used_pairs=used_trainval_pairs,
    vdjdb_tcrs=vdjdb_tcrs,
    vdjdb_epitopes=vdjdb_epitopes
)

# Combine & save
test_final = pd.concat([test_preneg, test_neg], ignore_index=True).sample(frac=1, random_state=42)
test_final.to_csv(test_output_path, sep='\t', index=False)

df_check = pd.read_csv(test_output_path, sep='\t')
print(df_check['Binding'].value_counts())

# Ausgabe
print("✅ Testset erfolgreich erstellt & gespeichert.")
print(f"Test: {len(test_final)} Beispiele")
print(f"- Binding=1: {test_final['Binding'].value_counts().get(1, 0)}")
print(f"- Binding=0: {test_final['Binding'].value_counts().get(0, 0)}")
print(f"- Unique Epitope: {test_final['Epitope'].nunique()}")
print(f"- Unique TCRs: {test_final['TRB_CDR3'].nunique()}")

✅ Neue neg_source_1 aus 10X enthält 44500 Beispiele (TPP3: 0, TPP4: 0, TPP2: 5047, TPP1: 39453)
→ Stratified sampling from 44500 rows | 49 unique epitopes | need 21900 samples
→ Stratified fill-in: drawing 21851 samples weighted by epitope frequency
→ Stratified sampling from 37996 rows | 650 unique epitopes | need 21901 samples
→ Stratified fill-in: drawing 21251 samples weighted by epitope frequency
Binding
0    44500
1     8900
Name: count, dtype: int64
✅ Testset erfolgreich erstellt & gespeichert.
Test: 53400 Beispiele
- Binding=1: 8900
- Binding=0: 44500
- Unique Epitope: 724
- Unique TCRs: 34479


In [155]:
import pandas as pd

# --- Pfade zu finalen Splits ---
output_train_path = f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv"
output_val_path   = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"
test_output_path  = f"{pipeline_data_splitted}/{precision}/beta/new/test.tsv"

# --- Dateien einlesen ---
train_df = pd.read_csv(output_train_path, sep='\t')
val_df   = pd.read_csv(output_val_path, sep='\t')
test_df  = pd.read_csv(test_output_path, sep='\t')

# --- Funktion zur Bearbeitung ---
def clean_and_update(df):
    if 'weight' in df.columns:
        df = df.drop(columns=['weight'])
    if 'Epitope MHC MHC class' in df.columns:
        df = df.drop(columns=['Epitope MHC MHC class'])
    if 'pair_count' in df.columns:
        df = df.drop(columns=['pair_count'])
    if 'epi_count' in df.columns:
        df = df.drop(columns=['epi_count'])
    if 'pair' in df.columns:
        df = df.drop(columns=['pair'])
    if 'source' not in df.columns:
        df['source'] = ''
    df.loc[df['Binding'] == 1, 'source'] = 'datasets'
    return df

# --- Anwenden ---
train_df = clean_and_update(train_df)
val_df   = clean_and_update(val_df)
test_df  = clean_and_update(test_df)

# --- Zurückschreiben (überschreibt die Dateien direkt) ---
train_df.to_csv(output_train_path, sep='\t', index=False)
val_df.to_csv(output_val_path, sep='\t', index=False)
test_df.to_csv(test_output_path, sep='\t', index=False)

print("✅ Gewünschte Änderungen in den finalen Splits wurden vorgenommen und gespeichert.")

✅ Gewünschte Änderungen in den finalen Splits wurden vorgenommen und gespeichert.


In [137]:
'''# === Shift von TPP3 und TPP4 aus Val zu Test, weil dort zu wenig
output_val_path   = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"
test_output_path  = f"{pipeline_data_splitted}/{precision}/beta/new/test.tsv"

val_final   = pd.read_csv(output_val_path, sep='\t')
test_final  = pd.read_csv(test_output_path, sep='\t')

print("\n📊 Alte Testverteilung:")
print(test_final.groupby(['task', 'Binding']).size().unstack(fill_value=0))

print("\n📊 Alte Validationverteilung:")
print(val_final.groupby(['task', 'Binding']).size().unstack(fill_value=0))

# === 1. Finde TPP3/TPP4 NON-BINDER in val_final
val_tpp3_neg = val_final[
    (val_final['Binding'] == 0) & (val_final['task'] == 'TPP3')
]

val_tpp4_neg = val_final[
    (val_final['Binding'] == 0) & (val_final['task'] == 'TPP4')
]

# === 2. Ziehe jeweils 50%
# === Für TPP3: (Epitope, TRB_CDR3) einmalig
val_tpp3_neg["pair"] = val_tpp3_neg[['Epitope', 'TRB_CDR3']].apply(tuple, axis=1)
pair_counts = val_tpp3_neg["pair"].value_counts()
val_tpp3_neg["pair_count"] = val_tpp3_neg["pair"].map(pair_counts)

unique_pairs = val_tpp3_neg[val_tpp3_neg["pair_count"] == 1]
dupe_pairs   = val_tpp3_neg[val_tpp3_neg["pair_count"] > 1]

target_tpp3 = int(len(val_tpp3_neg) * 0.5)

val_tpp3_half = pd.concat([
    unique_pairs,
    dupe_pairs.sample(n=max(0, target_tpp3 - len(unique_pairs)), random_state=42)
], ignore_index=True)

# === Für TPP4: Epitope einmalig
epi_counts = val_tpp4_neg["Epitope"].value_counts()
val_tpp4_neg["epi_count"] = val_tpp4_neg["Epitope"].map(epi_counts)

unique_epi = val_tpp4_neg[val_tpp4_neg["epi_count"] == 1]
dupe_epi   = val_tpp4_neg[val_tpp4_neg["epi_count"] > 1]

target_tpp4 = int(len(val_tpp4_neg) * 0.5)

val_tpp4_half = pd.concat([
    unique_epi,
    dupe_epi.sample(n=max(0, target_tpp4 - len(unique_epi)), random_state=42)
], ignore_index=True)


tpp3_4_half = pd.concat([val_tpp3_half, val_tpp4_half])  # kein ignore_index

print(f"✅ Verschiebe {len(tpp3_4_half)} TPP3/4-Non-Binder von Validation → Test")

# === 3. In Testset einfügen
test_final_updated = pd.concat([test_final, tpp3_4_half], ignore_index=True)

# === 4. Entferne aus val_final alle Zeilen mit denselben (Epitope, TRB_CDR3)
moved_pairs = set(tpp3_4_half[['Epitope', 'TRB_CDR3']].apply(tuple, axis=1))

val_final_updated = val_final[
    ~val_final[['Epitope', 'TRB_CDR3']].apply(tuple, axis=1).isin(moved_pairs)
]

# === 5. Speichern
test_final_updated.to_csv(test_output_path, sep='\t', index=False)
val_final_updated.to_csv(output_val_path, sep='\t', index=False)

# === 6. Kontrolle
df_test_check = pd.read_csv(test_output_path, sep='\t')
df_val_check = pd.read_csv(output_val_path, sep='\t')

print("\n📊 Neue Testverteilung:")
print(df_test_check.groupby(['task', 'Binding']).size().unstack(fill_value=0))

print("\n📊 Neue Validationverteilung:")
print(df_val_check.groupby(['task', 'Binding']).size().unstack(fill_value=0))
'''

'# === Shift von TPP3 und TPP4 aus Val zu Test, weil dort zu wenig\noutput_val_path   = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"\ntest_output_path  = f"{pipeline_data_splitted}/{precision}/beta/new/test.tsv"\n\nval_final   = pd.read_csv(output_val_path, sep=\'\t\')\ntest_final  = pd.read_csv(test_output_path, sep=\'\t\')\n\nprint("\n📊 Alte Testverteilung:")\nprint(test_final.groupby([\'task\', \'Binding\']).size().unstack(fill_value=0))\n\nprint("\n📊 Alte Validationverteilung:")\nprint(val_final.groupby([\'task\', \'Binding\']).size().unstack(fill_value=0))\n\n# === 1. Finde TPP3/TPP4 NON-BINDER in val_final\nval_tpp3_neg = val_final[\n    (val_final[\'Binding\'] == 0) & (val_final[\'task\'] == \'TPP3\')\n]\n\nval_tpp4_neg = val_final[\n    (val_final[\'Binding\'] == 0) & (val_final[\'task\'] == \'TPP4\')\n]\n\n# === 2. Ziehe jeweils 50%\n# === Für TPP3: (Epitope, TRB_CDR3) einmalig\nval_tpp3_neg["pair"] = val_tpp3_neg[[\'Epitope\', \'TRB_CDR3\']].apply(tuple, ax

## Task Classification 
The classification in the split notebook correct for positive only data. After adding negative data, some classifications might be wrong.

In [103]:
paired_output_folder = f'{pipeline_data_splitted}/{precision}/paired'
beta_output_folder = f'{pipeline_data_splitted}/{precision}/beta/new'
test_file_name = 'test.tsv'
validation_file_name = 'validation.tsv'
train_file_name = 'train.tsv'

In [None]:
# do the classification for paired data
paired = True
test_data_path = f'{paired_output_folder}/{test_file_name}'
validation_data_path = f'{paired_output_folder}/{validation_file_name}'
train_data_path = f'{paired_output_folder}/{train_file_name}'

%run ../data_preparation/classification.ipynb

  df_test = pd.read_csv(test_data_path, sep="\t")


test data has 82726 TPP1 tasks (seen tcr & seen epitopes).
test data has 76994 TPP2 tasks (unseen tcr & seen epitopes).
test data has 600 TPP3 tasks (unseen tcr & unseen epitope).
test data has 42 TPP4 tasks (seen tcr & unseen epitope).


In [None]:
# extended classification for paired data
train_path = f'{paired_output_folder}/{train_file_name}'
validation_path = f'{paired_output_folder}/{validation_file_name}'
test_path = f'{paired_output_folder}/{test_file_name}'
output_path = f'{paired_output_folder}/test_reclassified_paired_specific.tsv'
paired_data_path = paired_output_folder
alpha_cdr3_name = 'TRA_CDR3'
beta_cdr3_name = 'TRB_CDR3'
epitope_name = 'Epitope'
task_name = 'task'

%run ../data_preparation/paired_reclassification_testonly.ipynb

allele
../../data/splitted_datasets/allele/paired/train.tsv


  df_test = pd.read_csv(test_path, sep="\t", index_col=False)


train+validate data has 72656 entries
test data has 160362 entries
test data has 128885 TPP1 tasks (old value: 82726) (seen tcr & seen epitopes).
test data has 30835 TPP2 tasks (old value: 76994) (unseen tcr & seen epitopes).
test data has 508 TPP3 tasks (old value: 600) (unseen tcr & unseen epitope).
test data has 134 TPP4 tasks (old value: 42) (seen tcr & unseen epitope).
the train/test ratio is 0.31180423829918724/0.6881957617008128
../../data/splitted_datasets/allele/paired/test_reclassified_paired_specific.tsv
/home/ubuntu/arina/BA-Cancer-Immunotherapy
uploading dataset to dataset-allele


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33marina-frohofer[0m ([33mba_cancerimmunotherapy[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Adding directory to artifact (./../../data/splitted_datasets/allele/paired)... Done. 0.2s


VBox(children=(Label(value='0.009 MB of 0.009 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [156]:
# do the classification for beta data
paired = False
train_data_path = f'{beta_output_folder}/{train_file_name}'
validation_data_path = f'{beta_output_folder}/{validation_file_name}'
test_data_path = f'{beta_output_folder}/{test_file_name}'

%run ../data_preparation/classification.ipynb


📊 Bevor Testverteilung:
Binding      0
task          
TPP1     21859
TPP2       665
TPP4        27
TPP1  TPP4 0 IVTDFSVIK CASSVDGNTEAFF
TPP1  TPP4 0 FLYALALLL CASRSLAPARPSEQFF
TPP1  TPP4 0 ELAGIGILTV CASSLLAGGPNEQFF
TPP1  TPP4 0 IYSKHTPINL CASSYRTGSSYNEQFF
TPP1  TPP4 0 AVFDRKSDAK CASSYSKAGGPGEDTQYF
TPP1  TPP4 0 MIELSLIDFYLCFLAFLLFLVLIML CASSEGTGLYEQYF
TPP1  TPP4 0 ELAGIGILTV CASRGNSYEQYF
TPP1  TPP4 0 ELAGIGILTV CASRGNSYEQYF
TPP1  TPP4 0 QYDPVAALF CASSSQEGIEAFF
TPP1  TPP4 0 YLNDHLEPWI CASSLPRAGGTYEQYF
TPP1  TPP4 0 AVFDRKSDAK CASSLDTLSYNEQFF
TPP1  TPP4 0 QPRAPIRPI CASSLGGLAKQETQYF
TPP1  TPP4 0 RIAAWMATY CASSLASGGEQFF
TPP1  TPP4 0 ELAGIGILTV CASSLPWAGVLNTEAFF
TPP1  TPP4 0 ELAGIGILTV CASSLLAGAGETQYF
TPP1  TPP4 0 SLEGGGLGY CASSLLLANSYNEQFF
TPP1  TPP4 0 QPRAPIRPI CASSLPWAGVLNTEAFF
TPP1  TPP4 0 KLGGALQAK CAFQEASYGYTF
TPP1  TPP4 0 RIAAWMATY CASSEMAGGLEAFF
TPP1  TPP4 0 ELAGIGILTV CASSEMTGNTEAFF
TPP1  TPP4 0 IYSKHTPINL CASSYRTGSSYNEQFF
TPP1  TPP4 0 KVLEYVIKV CASSEMTGNTEAFF
TPP1  TPP4 0 GILGFVFT

In the next two cells the classification is checked. If the output says "Classification is correct", everything is fine.

In [None]:
# check task classification paired
splitted_data_path = paired_output_folder

%run ../data_preparation/check_task_classification_paired.ipynb

  df_test = pd.read_csv(test_file, sep="\t")


train+validate data has 72656 entries
test data has 160362 entries
test data has 82726 TPP1 tasks (seen tcr & seen epitopes).
test data has 76994 TPP2 tasks (unseen tcr & seen epitopes).
test data has 600 TPP3 tasks (unseen tcr & unseen epitope).
test data has 42 TPP4 tasks (seen tcr & unseen epitope).
the train/test ratio is 0.31180423829918724/0.6881957617008128
Classification is correct.
Correctness summary:
is_correct
True    160362
Name: count, dtype: int64


In [157]:
# check task classification beta
splitted_data_path = beta_output_folder

%run ../data_preparation/check_task_classification_beta.ipynb

train data has 840756 entries
test data has 53400 entries
test data has 42335 TPP1 tasks (seen tcr & seen epitopes).
test data has 10863 TPP2 tasks (unseen tcr & seen epitopes).
test data has 98 TPP3 tasks (unseen tcr & unseen epitope).
test data has 104 TPP4 tasks (seen tcr & unseen epitope).
the train/test ratio is 0.9516456769061926/0.04835432309380739
Classification is correct.
Correctness summary:
is_correct
True    53400
Name: count, dtype: int64


## Upload dataset

In [38]:
import os
print(os.listdir(path_to_data))


['.ipynb_checkpoints', 'validation_prenegsamples.tsv', 'test.tsv', 'train.tsv', 'test_reclassified_paired_specific.tsv', 'validation.tsv', 'train_prenegsamples.tsv', 'validate_reclassified_paired_specific.tsv', 'test_prenegsamples.tsv']


In [159]:
from dotenv import load_dotenv, find_dotenv
load_dotenv()

# upload paired data
path_to_data = f'{pipeline_data_splitted}/{precision}/paired'
dataset_name = f'paired_{precision}'
#main_project_name = os.getenv("MAIN_PROJECT_NAME")
main_project_name = f"dataset-{precision}"

%run ../upload_datasets.ipynb

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


uploading dataset to dataset-allele


[34m[1mwandb[0m: Currently logged in as: [33marina-frohofer[0m ([33mba_cancerimmunotherapy[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Adding directory to artifact (./../../../../data/splitted_datasets/allele/paired)... Done. 0.2s


In [161]:
# upload beta data
path_to_data = f'{pipeline_data_splitted}/{precision}/beta'
dataset_name = f'beta_{precision}'

%run ../upload_datasets.ipynb

uploading dataset to dataset-allele


[34m[1mwandb[0m: Adding directory to artifact (./../../../../data/splitted_datasets/allele/beta)... Done. 1.4s


VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

## Create Embeddings >> ProtBert

In [162]:
import torch
print(torch.cuda.is_available())  # Sollte True zurückgeben
print(torch.version.cuda)  # Sollte die richtige CUDA-Version anzeigen

True
12.4


In [None]:
path_paired_test = f"{pipeline_data_splitted}/{precision}/paired/test.tsv"
path_paired_validation = f"{pipeline_data_splitted}/{precision}/paired/validation.tsv"
path_paired_train = f"{pipeline_data_splitted}/{precision}/paired/train.tsv"
path_beta_test = f"{pipeline_data_splitted}/{precision}/beta/test.tsv"
path_beta_validation = f"{pipeline_data_splitted}/{precision}/beta/validation.tsv"
path_beta_train = f"{pipeline_data_splitted}/{precision}/beta/train.tsv"


path_paired = f"{pipeline_data}/embeddings/temp/{precision}/paired_concatenated.tsv"
create_folders_if_not_exists([os.path.dirname(path_paired)])
df_paired_test = pd.read_csv(path_paired_test, sep="\t", index_col=False)
df_paired_validation = pd.read_csv(path_paired_validation, sep="\t", index_col=False)
df_paired_train = pd.read_csv(path_paired_train, sep="\t", index_col=False)
df_paired = pd.concat([df_paired_test, df_paired_validation, df_paired_train])
df_paired.to_csv(path_paired, sep="\t", index=False)

# paired
#%run ../generateEmbeddingsProtBERT.py paired {path_paired} {pipeline_data}/embeddings/paired/{precision}/TRA_paired_embeddings.npz TRA_CDR3
#%run ../generateEmbeddingsProtBERT.py paired {path_paired} {pipeline_data}/embeddings/paired/{precision}/TRB_paired_embeddings.npz TRB_CDR3
#%run ../generateEmbeddingsProtBERT.py paired {path_paired} {pipeline_data}/embeddings/paired/{precision}/Epitope_paired_embeddings.npz Epitope

path_beta = f"{pipeline_data}/embeddings/temp/{precision}/beta_concatenated.tsv"
create_folders_if_not_exists([os.path.dirname(path_beta)])
df_beta_test = pd.read_csv(path_beta_test, sep="\t", index_col=False)
df_beta_validation = pd.read_csv(path_beta_validation, sep="\t", index_col=False)
df_beta_train = pd.read_csv(path_beta_train, sep="\t", index_col=False)
df_beta = pd.concat([df_beta_test, df_beta_validation, df_beta_train])
df_beta.to_csv(path_beta, sep="\t", index=False)

# beta
%run ../generateEmbeddingsProtBERT.py beta {path_beta} {pipeline_data}/embeddings/beta/{precision}/TRB_beta_embeddings.npz TRB_CDR3
%run ../generateEmbeddingsProtBERT.py beta {path_beta} {pipeline_data}/embeddings/beta/{precision}/Epitope_beta_embeddings.npz Epitope

  df_paired_test = pd.read_csv(path_paired_test, sep="\t", index_col=False)


Using GPU: Tesla T4
Loading: Rostlab/prot_bert
Model is on device: cuda:0
Processing Batch:  0 64
Processing Batch:  64 128
Processing Batch:  128 192
Processing Batch:  192 256
Processing Batch:  256 320
Processing Batch:  320 384
Processing Batch:  384 448
Processing Batch:  448 512
Processing Batch:  512 576
Processing Batch:  576 640
Processing Batch:  640 704
Processing Batch:  704 768
Processing Batch:  768 832
Processing Batch:  832 896
Processing Batch:  896 960
Processing Batch:  960 1024
Processing Batch:  1024 1088
Processing Batch:  1088 1152
Processing Batch:  1152 1216
Processing Batch:  1216 1280
Processing Batch:  1280 1344
Processing Batch:  1344 1408
Processing Batch:  1408 1472
Processing Batch:  1472 1536
Processing Batch:  1536 1600
Processing Batch:  1600 1664
Processing Batch:  1664 1728
Processing Batch:  1728 1792
Processing Batch:  1792 1856
Processing Batch:  1856 1920
Processing Batch:  1920 1984
Processing Batch:  1984 2048
Processing Batch:  2048 2112
Proc

In [9]:
import numpy as np

# Funktion, um Embeddings korrekt zu laden
def load_embeddings(file_path):
    npz_data = np.load(file_path)
    all_keys = list(npz_data.keys())

    # Falls Embeddings als einzelne Sequenzen gespeichert sind
    if len(all_keys) > 1:
        all_values = [npz_data[k] for k in all_keys]
        return np.vstack(all_values)  # Alles zusammenfügen
    else:
        return npz_data[all_keys[0]]

# Embeddings für TRA, TRB und Epitope laden
tra_embeddings = load_embeddings(f"{pipeline_data}/embeddings/paired/{precision}/TRA_paired_embeddings.npz")
trb_embeddings = load_embeddings(f"{pipeline_data}/embeddings/paired/{precision}/TRB_paired_embeddings.npz")
epitope_embeddings = load_embeddings(f"{pipeline_data}/embeddings/paired/{precision}/Epitope_paired_embeddings.npz")

# Ausgabe der finalen Shapes
print(f"📌 TRA Embedding Shape: {tra_embeddings.shape}")
print(f"📌 TRB Embedding Shape: {trb_embeddings.shape}")
print(f"📌 Epitope Embedding Shape: {epitope_embeddings.shape}")


📌 TRA Embedding Shape: (596417, 1024)
📌 TRB Embedding Shape: (694427, 1024)
📌 Epitope Embedding Shape: (12870, 1024)


In [11]:
import pandas as pd

# Beispielpfade für Train-, Test-, und Validierungsdatensätze für alle vier Kategorien
base_path = pipeline_data_splitted

# Definierte Pfade für alle vier Kategorien
datasets = {
    "paired_gene": {
        "train": f"{base_path}/gene/paired/train.tsv",
        "test": f"{base_path}/gene/paired/test.tsv",
        "validation": f"{base_path}/gene/paired/validation.tsv"
    },
    "beta_gene": {
        "train": f"{base_path}/gene/beta/train.tsv",
        "test": f"{base_path}/gene/beta/test.tsv",
        "validation": f"{base_path}/gene/beta/validation.tsv"
    }
}

# Berechnung der Anzahl der Zeilen für jedes Set
results = {}
for dataset_name, paths in datasets.items():
    # Daten laden
    train_df = pd.read_csv(paths["train"], sep='\t')
    test_df = pd.read_csv(paths["test"], sep='\t')
    validation_df = pd.read_csv(paths["validation"], sep='\t')
    
    # Anzahl der Zeilen berechnen
    train_length = len(train_df)
    test_length = len(test_df)
    validation_length = len(validation_df)
    total_length = train_length + test_length + validation_length
    
    # Zähle die Anzahl der Bindings 1 und 0 in jedem Datensatz
    train_binding_counts = train_df['Binding'].value_counts()
    test_binding_counts = test_df['Binding'].value_counts()
    validation_binding_counts = validation_df['Binding'].value_counts()
    
    # Zähle die Anzahl der TPP1, TPP2, TPP3 Einträge in jedem Datensatz
    train_task_counts = train_df['task'].value_counts()
    test_task_counts = test_df['task'].value_counts()
    validation_task_counts = validation_df['task'].value_counts()

    # Ergebnisse speichern
    results[dataset_name] = {
        "Train": train_length,
        "Train_Binding_1": train_binding_counts.get(1, 0),
        "Train_Binding_0": train_binding_counts.get(0, 0),
        "Train_TPP1": train_task_counts.get("TPP1", 0),
        "Train_TPP2": train_task_counts.get("TPP2", 0),
        "Train_TPP3": train_task_counts.get("TPP3", 0),
        "Train_TPP4": train_task_counts.get("TPP4", 0),
        "Test": test_length,
        "Test_Binding_1": test_binding_counts.get(1, 0),
        "Test_Binding_0": test_binding_counts.get(0, 0),
        "Test_TPP1": test_task_counts.get("TPP1", 0),
        "Test_TPP2": test_task_counts.get("TPP2", 0),
        "Test_TPP3": test_task_counts.get("TPP3", 0),
        "Test_TPP4": test_task_counts.get("TPP4", 0),
        "Validation": validation_length,
        "Validation_Binding_1": validation_binding_counts.get(1, 0),
        "Validation_Binding_0": validation_binding_counts.get(0, 0),
        "Validation_TPP1": validation_task_counts.get("TPP1", 0),
        "Validation_TPP2": validation_task_counts.get("TPP2", 0),
        "Validation_TPP3": validation_task_counts.get("TPP3", 0),
        "Validation_TPP4": validation_task_counts.get("TPP4", 0),
        "Total": total_length
    }

# Ergebnisse anzeigen
for dataset, lengths in results.items():
    print(f'--- {dataset.replace("_", " ").title()} ---')
    print(f'Anzahl der Zeilen im Trainingsdatensatz: {lengths["Train"]} (Binding=1: {lengths["Train_Binding_1"]}, Binding=0: {lengths["Train_Binding_0"]}, TPP1: {lengths["Train_TPP1"]}, TPP2: {lengths["Train_TPP2"]}, TPP3: {lengths["Train_TPP3"]})')
    print(f'Anzahl der Zeilen im Testdatensatz: {lengths["Test"]} (Binding=1: {lengths["Test_Binding_1"]}, Binding=0: {lengths["Test_Binding_0"]}, TPP1: {lengths["Test_TPP1"]}, TPP2: {lengths["Test_TPP2"]}, TPP3: {lengths["Test_TPP3"]})')
    print(f'Anzahl der Zeilen im Validierungsdatensatz: {lengths["Validation"]} (Binding=1: {lengths["Validation_Binding_1"]}, Binding=0: {lengths["Validation_Binding_0"]}, TPP1: {lengths["Validation_TPP1"]}, TPP2: {lengths["Validation_TPP2"]}, TPP3: {lengths["Validation_TPP3"]})')
    print(f'Gesamtanzahl der Zeilen (Train + Test + Validation): {lengths["Total"]}\n')

# Optional: Ergebnisse in einer Übersichtstabelle darstellen
summary_data = []
for dataset, lengths in results.items():
    summary_data.append({
        "Dataset": dataset.replace("_", " ").title(),
        "Train": lengths["Train"],
        "Train_Binding_1": lengths["Train_Binding_1"],
        "Train_Binding_0": lengths["Train_Binding_0"],
        "Train_TPP1": lengths["Train_TPP1"],
        "Train_TPP2": lengths["Train_TPP2"],
        "Train_TPP3": lengths["Train_TPP3"],
        "Train_TPP4": lengths["Train_TPP4"],
        "Test": lengths["Test"],
        "Test_Binding_1": lengths["Test_Binding_1"],
        "Test_Binding_0": lengths["Test_Binding_0"],
        "Test_TPP1": lengths["Test_TPP1"],
        "Test_TPP2": lengths["Test_TPP2"],
        "Test_TPP3": lengths["Test_TPP3"],
        "Test_TPP4": lengths["Test_TPP4"],
        "Validation": lengths["Validation"],
        "Validation_Binding_1": lengths["Validation_Binding_1"],
        "Validation_Binding_0": lengths["Validation_Binding_0"],
        "Validation_TPP1": lengths["Validation_TPP1"],
        "Validation_TPP2": lengths["Validation_TPP2"],
        "Validation_TPP3": lengths["Validation_TPP3"],
        "Validation_TPP4": lengths["Validation_TPP4"],
        "Total": lengths["Total"]
    })

summary_df = pd.DataFrame(summary_data)
print(summary_df)


  train_df = pd.read_csv(paths["train"], sep='\t')


--- Paired Gene ---
Anzahl der Zeilen im Trainingsdatensatz: 67422 (Binding=1: 33711, Binding=0: 33711, TPP1: 0, TPP2: 0, TPP3: 0)
Anzahl der Zeilen im Testdatensatz: 43356 (Binding=1: 7226, Binding=0: 36130, TPP1: 27972, TPP2: 15095, TPP3: 289)
Anzahl der Zeilen im Validierungsdatensatz: 43344 (Binding=1: 7224, Binding=0: 36120, TPP1: 0, TPP2: 0, TPP3: 0)
Gesamtanzahl der Zeilen (Train + Test + Validation): 154122

--- Beta Gene ---
Anzahl der Zeilen im Trainingsdatensatz: 251750 (Binding=1: 125875, Binding=0: 125875, TPP1: 0, TPP2: 0, TPP3: 0)
Anzahl der Zeilen im Testdatensatz: 161844 (Binding=1: 26974, Binding=0: 134870, TPP1: 140896, TPP2: 20645, TPP3: 299)
Anzahl der Zeilen im Validierungsdatensatz: 161838 (Binding=1: 26973, Binding=0: 134865, TPP1: 0, TPP2: 0, TPP3: 0)
Gesamtanzahl der Zeilen (Train + Test + Validation): 575432

       Dataset   Train  Train_Binding_1  Train_Binding_0  Train_TPP1  \
0  Paired Gene   67422            33711            33711           0   
1    Bet

In [10]:
import pandas as pd

# Beispielpfade für Train-, Test-, und Validierungsdatensätze für alle vier Kategorien
base_path = pipeline_data_splitted

# Definierte Pfade für alle vier Kategorien
datasets = {
    "paired_gene": {
        "train": f"{base_path}/gene/paired/train.tsv",
        "test": f"{base_path}/gene/paired/test.tsv",
        "validation": f"{base_path}/gene/paired/validation.tsv"
    },
    "paired_allele": {
        "train": f"{base_path}/allele/paired/train.tsv",
        "test": f"{base_path}/allele/paired/test.tsv",
        "validation": f"{base_path}/allele/paired/validation.tsv"
    },
    "beta_gene": {
        "train": f"{base_path}/gene/beta/train.tsv",
        "test": f"{base_path}/gene/beta/test.tsv",
        "validation": f"{base_path}/gene/beta/validation.tsv"
    },
    "beta_allele": {
        "train": f"{base_path}/allele/beta/train.tsv",
        "test": f"{base_path}/allele/beta/test.tsv",
        "validation": f"{base_path}/allele/beta/validation.tsv"
    }
}

# Berechnung der Anzahl der Zeilen für jedes Set
results = {}
for dataset_name, paths in datasets.items():
    # Daten laden
    train_df = pd.read_csv(paths["train"], sep='\t')
    test_df = pd.read_csv(paths["test"], sep='\t')
    validation_df = pd.read_csv(paths["validation"], sep='\t')
    
    # Anzahl der Zeilen berechnen
    train_length = len(train_df)
    test_length = len(test_df)
    validation_length = len(validation_df)
    total_length = train_length + test_length + validation_length
    
    # Zähle die Anzahl der Bindings 1 und 0 in jedem Datensatz
    train_binding_counts = train_df['Binding'].value_counts()
    test_binding_counts = test_df['Binding'].value_counts()
    validation_binding_counts = validation_df['Binding'].value_counts()
    
    # Zähle die Anzahl der TPP1, TPP2, TPP3 Einträge in jedem Datensatz
    train_task_counts = train_df['task'].value_counts()
    test_task_counts = test_df['task'].value_counts()
    validation_task_counts = validation_df['task'].value_counts()

    # Ergebnisse speichern
    results[dataset_name] = {
        "Train": train_length,
        "Train_Binding_1": train_binding_counts.get(1, 0),
        "Train_Binding_0": train_binding_counts.get(0, 0),
        "Train_TPP1": train_task_counts.get("TPP1", 0),
        "Train_TPP2": train_task_counts.get("TPP2", 0),
        "Train_TPP3": train_task_counts.get("TPP3", 0),
        "Train_TPP4": train_task_counts.get("TPP4", 0),
        "Test": test_length,
        "Test_Binding_1": test_binding_counts.get(1, 0),
        "Test_Binding_0": test_binding_counts.get(0, 0),
        "Test_TPP1": test_task_counts.get("TPP1", 0),
        "Test_TPP2": test_task_counts.get("TPP2", 0),
        "Test_TPP3": test_task_counts.get("TPP3", 0),
        "Test_TPP4": test_task_counts.get("TPP4", 0),
        "Validation": validation_length,
        "Validation_Binding_1": validation_binding_counts.get(1, 0),
        "Validation_Binding_0": validation_binding_counts.get(0, 0),
        "Validation_TPP1": validation_task_counts.get("TPP1", 0),
        "Validation_TPP2": validation_task_counts.get("TPP2", 0),
        "Validation_TPP3": validation_task_counts.get("TPP3", 0),
        "Validation_TPP4": validation_task_counts.get("TPP4", 0),
        "Total": total_length
    }

# Ergebnisse anzeigen
for dataset, lengths in results.items():
    print(f'--- {dataset.replace("_", " ").title()} ---')
    print(f'Anzahl der Zeilen im Trainingsdatensatz: {lengths["Train"]} (Binding=1: {lengths["Train_Binding_1"]}, Binding=0: {lengths["Train_Binding_0"]}, TPP1: {lengths["Train_TPP1"]}, TPP2: {lengths["Train_TPP2"]}, TPP3: {lengths["Train_TPP3"]})')
    print(f'Anzahl der Zeilen im Testdatensatz: {lengths["Test"]} (Binding=1: {lengths["Test_Binding_1"]}, Binding=0: {lengths["Test_Binding_0"]}, TPP1: {lengths["Test_TPP1"]}, TPP2: {lengths["Test_TPP2"]}, TPP3: {lengths["Test_TPP3"]})')
    print(f'Anzahl der Zeilen im Validierungsdatensatz: {lengths["Validation"]} (Binding=1: {lengths["Validation_Binding_1"]}, Binding=0: {lengths["Validation_Binding_0"]}, TPP1: {lengths["Validation_TPP1"]}, TPP2: {lengths["Validation_TPP2"]}, TPP3: {lengths["Validation_TPP3"]})')
    print(f'Gesamtanzahl der Zeilen (Train + Test + Validation): {lengths["Total"]}\n')

# Optional: Ergebnisse in einer Übersichtstabelle darstellen
summary_data = []
for dataset, lengths in results.items():
    summary_data.append({
        "Dataset": dataset.replace("_", " ").title(),
        "Train": lengths["Train"],
        "Train_Binding_1": lengths["Train_Binding_1"],
        "Train_Binding_0": lengths["Train_Binding_0"],
        "Train_TPP1": lengths["Train_TPP1"],
        "Train_TPP2": lengths["Train_TPP2"],
        "Train_TPP3": lengths["Train_TPP3"],
        "Train_TPP4": lengths["Train_TPP4"],
        "Test": lengths["Test"],
        "Test_Binding_1": lengths["Test_Binding_1"],
        "Test_Binding_0": lengths["Test_Binding_0"],
        "Test_TPP1": lengths["Test_TPP1"],
        "Test_TPP2": lengths["Test_TPP2"],
        "Test_TPP3": lengths["Test_TPP3"],
        "Test_TPP4": lengths["Test_TPP4"],
        "Validation": lengths["Validation"],
        "Validation_Binding_1": lengths["Validation_Binding_1"],
        "Validation_Binding_0": lengths["Validation_Binding_0"],
        "Validation_TPP1": lengths["Validation_TPP1"],
        "Validation_TPP2": lengths["Validation_TPP2"],
        "Validation_TPP3": lengths["Validation_TPP3"],
        "Validation_TPP4": lengths["Validation_TPP4"],
        "Total": lengths["Total"]
    })

summary_df = pd.DataFrame(summary_data)
print(summary_df)


FileNotFoundError: [Errno 2] No such file or directory: '../../data/splitted_datasets/allele/paired/train.tsv'