In [1]:
import os
import pandas as pd

In [2]:
import sys
!"{sys.executable}" -m pip install tidytcells



In [3]:
# set precision of mhc and V/J values (gene or allele)
precision = 'allele'

In [4]:
# this function is not thread safe
def create_folders_if_not_exists(folders):
  for path in folders:
    if not os.path.exists(path):
      os.makedirs(path)

In [5]:
pipeline_data = '../../../../data'
pipeline_data_plain = f'{pipeline_data}/plain_datasets'
pipeline_data_cleaned = f'{pipeline_data}/cleaned_datasets'
pipeline_data_concatenated = f'{pipeline_data}/concatenated_datasets'
pipeline_data_splitted = f'{pipeline_data}/splitted_datasets'
pipeline_data_temp_bucket = f'{pipeline_data}/temp'

pipeline_folders = [pipeline_data, pipeline_data_plain, pipeline_data_cleaned, pipeline_data_concatenated, pipeline_data_splitted, pipeline_data_temp_bucket]

create_folders_if_not_exists(pipeline_folders)

## Data Preparation

### VDJdb

In [27]:
# prepare directories
VDJdb_data_plain = f'{pipeline_data_plain}/VDJdb'
VDJdb_data_cleaned = f'{pipeline_data_cleaned}/VDJdb'
VDJdb_data_fitted = f'{pipeline_data_temp_bucket}/VDJdb'

VDJdb_folders = [VDJdb_data_plain, VDJdb_data_cleaned, VDJdb_data_fitted]
create_folders_if_not_exists(VDJdb_folders)

fitted_beta_file = 'VDJdb_beta_fitted.tsv'
fitted_paired_file = 'VDJdb_paired_fitted.tsv'

In [28]:
# prepare parameters for notebook VDJdb fit data paired
input_file = f'{VDJdb_data_plain}/VDJdb_paired_only.tsv'
path_prefix_fitted = VDJdb_data_fitted
fitted_file = fitted_paired_file

# fit paired VDJdb data
%run ../VDJdb/fit_data_vdjdb_paired.ipynb

In [29]:
# prepare parameters for notebook VDJdb fit data beta
input_file = f'{VDJdb_data_plain}/VDJdb_beta_only.tsv'
path_prefix_fitted = VDJdb_data_fitted
fitted_file = fitted_beta_file

# fit beta VDJdb data
%run ../VDJdb/fit_data_vdjdb_beta.ipynb

In [30]:
# prepare parameters for notebook VDJdb clean data paired
input_file = f'{VDJdb_data_fitted}/{fitted_paired_file}'
cleaned_file_paired = 'VDJdb_cleaned_data_paired.tsv'
output_file = f'{VDJdb_data_cleaned}/{cleaned_file_paired}'

# clean paired VDJdb data
%run ../VDJdb/clean_data_vdjdb_paired.ipynb

MHC Class I has 27414 entries
whole dataframe has 28119 entries
filtered to only use MHC Class I. Length of dataset: 27414


In [31]:
# prepare parameters for notebook VDJdb clean data beta
input_file = f'{VDJdb_data_fitted}/{fitted_beta_file}'
cleaned_file_beta = 'VDJdb_cleaned_data_beta.tsv'
output_file = f'{VDJdb_data_cleaned}/{cleaned_file_beta}'

# clean beta VDJdb data
%run ../VDJdb/clean_data_vdjdb_beta.ipynb

MHC Class I has 46507 entries
whole dataframe has 49042 entries
filtered to only use MHC Class I. Length of dataset: 46507


In [32]:
VDJdb_cleaned_beta_output = f'{VDJdb_data_cleaned}/{cleaned_file_beta}'
VDJdb_cleaned_paired_output = f'{VDJdb_data_cleaned}/{cleaned_file_paired}'

In [12]:
# prepare parameters for concatenation
custom_dataset_path = f'{pipeline_data_concatenated}/{precision}/'

vdjdb_beta_read_path = VDJdb_cleaned_beta_output
vdjdb_paired_read_path = VDJdb_cleaned_paired_output

output_file_beta = 'beta_concatenated_test.tsv'
output_file_paired = 'paired_concatenated_test.tsv'

create_folders_if_not_exists([custom_dataset_path])

%run ../concatDatasets_onlytest.ipynb

length of beta_df: 46507




The following script removes a lot of rows. They are kept and some of them get added again later
distinct entries (all columns, keep=first). 7188 entries removed.
removed all duplicates (CDR3, Epitope) from distinct values (most_important_columns, keep=False). 1435 entries removed.
beta removed entries df length: 1435


Number of groups formed: 655
1435 can be re-added to the no-duplicated dataframe
from the plain dataset which has 46507 entries, 7188 entries have been removed.
for beta dataset :
size difference is: 7188
  39319 information score cleaned: 6.0
  46507 information score dropout: 6.0
✅ Nach Duplikat-Filter (Train/Val): final_beta_df enthält 9105 Einträge.
final_beta_df length = 9105
length of paired_df: 27414




The following script removes a lot of rows. They are kept and some of them get added again later
distinct entries (all columns, keep=first). 687 entries removed.
removed all duplicates from distinct values (cultivated columns, keep=False). 246 entries removed.
paired removed entries df length: 246


246 can be re-added to the no-duplicated dataframe
from the plain dataset which has 27414 entries, 687 entries have been removed.
for paired dataset:
size difference is: 687
  26727 information score cleaned: 8.976241254162458
  27414 information score dropout: 8.975888232290071
final_paired_df length: 26727


In [13]:
import shutil
import os

# Define source folder where the files are currently stored
source_folder = f'{pipeline_data_concatenated}/{precision}/'

# Define file names
output_file_beta = 'beta_concatenated_test.tsv'
output_file_paired = 'paired_concatenated_test.tsv'

# Define destination folders
destination_beta_folder = f'{pipeline_data_splitted}/{precision}/beta/'
destination_paired_folder = f'{pipeline_data_splitted}/{precision}/paired/'

# Ensure destination folders exist
os.makedirs(destination_beta_folder, exist_ok=True)
os.makedirs(destination_paired_folder, exist_ok=True)

# Copy files
shutil.copy(os.path.join(source_folder, output_file_beta), os.path.join(destination_beta_folder, 'test_prenegsamples.tsv'))
shutil.copy(os.path.join(source_folder, output_file_paired), os.path.join(destination_paired_folder, 'test_prenegsamples.tsv'))

print(f'Beta file copied successfully to {destination_beta_folder}test_prenegsamples.tsv')
print(f'Paired file copied successfully to {destination_paired_folder}test_prenegsamples.tsv')

Beta file copied successfully to ../../../../data/splitted_datasets/allele/beta/test_prenegsamples.tsv
Paired file copied successfully to ../../../../data/splitted_datasets/allele/paired/test_prenegsamples.tsv


In [14]:
# Define file paths
beta_file_path = f'{pipeline_data_splitted}/{precision}/beta/test_prenegsamples.tsv'
paired_file_path = f'{pipeline_data_splitted}/{precision}/paired/test_prenegsamples.tsv'

# Load beta dataset
beta_df = pd.read_csv(beta_file_path, sep='\t')

# Load paired dataset
paired_df = pd.read_csv(paired_file_path, sep='\t')

# Calculate unique values for beta dataset
unique_tcr_beta = beta_df['TRB_CDR3'].nunique()
unique_epitope_beta = beta_df['Epitope'].nunique()

# Calculate unique values for paired dataset
unique_tcr_paired = paired_df['TRB_CDR3'].nunique()
unique_epitope_paired = paired_df['Epitope'].nunique()

# Print results for beta dataset
print("\nBeta Dataset:")
print(f"- Unique TCRs: {unique_tcr_beta}")
print(f"- Unique Epitope: {unique_epitope_beta}")

# Print results for paired dataset
print("\nPaired Dataset:")
print(f"- Unique TCRs: {unique_tcr_paired}")
print(f"- Unique Epitope: {unique_epitope_paired}")


Beta Dataset:
- Unique TCRs: 8612
- Unique Epitope: 293

Paired Dataset:
- Unique TCRs: 21101
- Unique Epitope: 825


In [124]:
import pandas as pd

# Pfade
train_path = f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv"
val_path = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"
test_path = f"{pipeline_data_splitted}/{precision}/beta/test_prenegsamples.tsv"

# Einlesen
train_df = pd.read_csv(train_path, sep="\t")
val_df = pd.read_csv(val_path, sep="\t")
test_df = pd.read_csv(test_path, sep="\t")

# Kombiniere bekannte TCRs & Epitopes aus Train + Validation
known_tcrs = set(train_df["TRB_CDR3"]) | set(val_df["TRB_CDR3"])
known_epitopes = set(train_df["Epitope"]) | set(val_df["Epitope"])

# Task-Tagging-Funktion
def calculate_task(row):
    epitope_exists = row['Epitope'] in known_epitopes
    trb_cdr3_exists = row['TRB_CDR3'] in known_tcrs

    if epitope_exists and trb_cdr3_exists:
        return 'TPP1'
    elif epitope_exists and not trb_cdr3_exists:
        return 'TPP2'
    elif not epitope_exists and not trb_cdr3_exists:
        return 'TPP3'
    elif not epitope_exists and trb_cdr3_exists:
        return 'TPP4'
    return "UNDEFINED"

# TPP-Spalte hinzufügen
test_df["task"] = test_df.apply(calculate_task, axis=1)

# Ergebnis anzeigen
print(test_df["task"].value_counts())

# speichern
test_df.to_csv(test_path, sep="\t", index=False)

  val_df = pd.read_csv(val_path, sep="\t")


task
TPP2    6803
TPP1    1129
TPP3    1047
TPP4     126
Name: count, dtype: int64


## Negative Data

In [62]:
#Daten einlesen

combined_donors_path = f'{pipeline_data_plain}/10x/combined_donors_consensus_annotations.csv'
all_donors_consensus = pd.read_csv(combined_donors_path, sep=',')

print("Consensus: ", all_donors_consensus.head())

all_donors_meta_path = f'{pipeline_data_plain}/10x/meta.csv'
all_donors_meta = pd.read_csv(all_donors_meta_path, sep=',')

print("Meta: ", all_donors_meta.head())

Consensus:                 barcode   donor  \
0   AAACCTGAGACAAAGG-4  donor1   
1  AAACCTGAGACTGTAA-34  donor1   
2   AAACCTGAGAGCCCAA-5  donor1   
3  AAACCTGAGAGCTGCA-24  donor1   
4   AAACCTGAGAGGGATA-8  donor1   

                                  cell_clono_cdr3_aa  \
0  TRA:CAASVSIWTGTASKLTF;TRA:CAAWDMEYGNKLVF;TRB:C...   
1                                    TRB:CASDTPVGQFF   
2                 TRA:CASYTDKLIF;TRB:CASSGGSISTDTQYF   
3                                 TRB:CASSGGQSSYEQYF   
4          TRA:CAASGYGNTGRRALTF;TRB:CASSQDPAGGYNEQFF   

                                  cell_clono_cdr3_nt     CD3  CD19  CD45RA  \
0  TRA:TGTGCAGCAAGCGTTAGTATTTGGACCGGCACTGCCAGTAAA...  2125.0   0.0   912.0   
1              TRB:TGTGCCAGCGATACCCCGGTTGGGCAGTTCTTC  1023.0   0.0  2028.0   
2  TRA:TGTGCTTCCTACACCGACAAGCTCATCTTT;TRB:TGCGCCA...  1598.0   3.0  3454.0   
3     TRB:TGCGCCAGCAGTGGCGGACAGAGCTCCTACGAGCAGTACTTC   298.0   1.0   880.0   
4  TRA:TGTGCAGCAAGCGGGTATGGAAACACGGGCAGGAGAGCACTT...  10

  all_donors_meta = pd.read_csv(all_donors_meta_path, sep=',')


### Beta

In [66]:
#Dieser Code für ganzen Datensatz laufen lassen
import re
import pandas as pd

# Festlegen der Batch-Größe für die Verarbeitung
batch_size = 1000  # Passe diese Zahl je nach Speicherressourcen an

# Identifizieren von Epitope-Spalten, aber ohne "NR(B0801)_AAKGRGAAL_NC_binder"
epitope_columns = [col for col in all_donors_consensus.columns if '_binder' in col and col != "NR(B0801)_AAKGRGAAL_NC_binder"]

# Liste für alle Batch-Ergebnisse
all_batches = []

# Verarbeite `all_donors_consensus` in Batches
for batch_start in range(0, len(all_donors_consensus), batch_size):
    print("Batch Start: ", batch_start)
    # Batch definieren
    batch = all_donors_consensus.iloc[batch_start:batch_start + batch_size]
    
    # Filtern auf Zeilen, die 'TRB:' enthalten
    batch_trb = batch[batch['cell_clono_cdr3_aa'].str.contains("TRB:", na=False)]

    # Liste, um Zeilen für diesen Batch zu speichern
    expanded_rows = []
    
    # Iteriere durch jede Zeile im Batch
    for _, row in batch_trb.iterrows():
        for col in epitope_columns:
            # Extrahiere MHC und Epitope
            match = re.match(r'([A-Z0-9]+)_([A-Z]+)_.*_binder', col)
            if match:
                mhc_raw, epitope = match.groups()
                mhc_formatted = f'HLA-{mhc_raw[0]}*{mhc_raw[1:3]}:{mhc_raw[3:]}'

                # Füge `Epitope` und `MHC` zur Zeile hinzu
                new_row = row.copy()
                new_row['Epitope'] = epitope
                new_row['MHC'] = mhc_formatted

                # Füge neue Zeile zur Batch-Liste hinzu
                expanded_rows.append(new_row)
    
    # Erstelle einen DataFrame aus dem Batch
    batch_df = pd.DataFrame(expanded_rows)
    all_batches.append(batch_df)  # Speichere den Batch in der Liste

# Kombiniere alle Batch-Ergebnisse zu einem DataFrame
expanded_df = pd.concat(all_batches, ignore_index=True)

# Nur die TRB-Chain-Einträge in `all_donors_meta` beibehalten
all_donors_meta_trb = all_donors_meta[all_donors_meta['chain'] == 'TRB']

# Zusammenführen der beiden DataFrames basierend auf der 'barcode' Spalte
merged_df = pd.merge(all_donors_meta_trb, expanded_df[['barcode', 'Epitope', 'MHC']], on='barcode', how='inner')

# Spalten umbenennen und Format anpassen
merged_df = merged_df.rename(columns={
    'barcode': 'TCR_name',
    'v_gene': 'TRBV',
    'j_gene': 'TRBJ',
    'c_gene': 'TRBC',
    'cdr3': 'TRB_CDR3'
})

# Fehlende Spalten auffüllen
desired_columns = ['TCR_name', 'TRBV', 'TRBJ', 'TRB_CDR3', 'TRBC', 'Epitope', 'MHC', 'Binding', 'task']
for col in desired_columns:
    if col not in merged_df.columns:
        merged_df[col] = 'nan' if col == 'task' else '0'

# Nur die gewünschten Spalten beibehalten und Zeilen mit `None` in `TRB_CDR3` entfernen
final_df = merged_df[desired_columns]
final_df = final_df[final_df['TRB_CDR3'] != 'None']

final_df = final_df[final_df['TRB_CDR3'].notna() & (final_df['TRB_CDR3'] != '')]

# Ausgabe des ersten Teils des Ergebnisses zur Überprüfung
print(final_df.head())

# Speichern des kombinierten DataFrames
output_path = f'{pipeline_data_plain}/10x/combined_output_with_epitope_mhc_TRB_only_expanded-all.csv'
final_df.to_csv(output_path, index=False)


Batch Start:  0
Batch Start:  1000
Batch Start:  2000
Batch Start:  3000
Batch Start:  4000
Batch Start:  5000
Batch Start:  6000
Batch Start:  7000
Batch Start:  8000
Batch Start:  9000
Batch Start:  10000
Batch Start:  11000
Batch Start:  12000
Batch Start:  13000
Batch Start:  14000
Batch Start:  15000
Batch Start:  16000
Batch Start:  17000
Batch Start:  18000
Batch Start:  19000
Batch Start:  20000
Batch Start:  21000
Batch Start:  22000
Batch Start:  23000
Batch Start:  24000
Batch Start:  25000
Batch Start:  26000
Batch Start:  27000
Batch Start:  28000
Batch Start:  29000
Batch Start:  30000
Batch Start:  31000
Batch Start:  32000
Batch Start:  33000
Batch Start:  34000
Batch Start:  35000
Batch Start:  36000
Batch Start:  37000
Batch Start:  38000
Batch Start:  39000
Batch Start:  40000
Batch Start:  41000
Batch Start:  42000
Batch Start:  43000
Batch Start:  44000
Batch Start:  45000
Batch Start:  46000
Batch Start:  47000
Batch Start:  48000
Batch Start:  49000
Batch Start: 

### Beta Samples generieren für Test File

In [61]:
# prepare parameters for beta dataset
output_train_path = f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv"
output_val_path = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"
test_output_path = f'{pipeline_data_splitted}/{precision}/beta/new/test.tsv'

read_path_train = f"{pipeline_data_splitted}/{precision}/beta/train_prenegsamples.tsv" #fürs erste mal durchführen train_prenegsamples und new weg
read_path_test = f'{pipeline_data_splitted}/{precision}/beta/test_prenegsamples.tsv' #fürs erste mal durchführen test_prenegsamples und new weg
read_path_validation = f"{pipeline_data_splitted}/{precision}/beta/validation_prenegsamples.tsv" #fürs erste mal durchführen validation_prenegsamples und new weg
temp_path = f'{pipeline_data_temp_bucket}/negative_samples/beta/'
output_path = f"{pipeline_data_splitted}/{precision}/beta/new/negatives"
train_output_name = "train_neg5.tsv"
validation_output_name = "val_neg5.tsv"
test_output_name = "test_neg5.tsv"

create_folders_if_not_exists([temp_path])

%run ../negative_samples/negative_samples_beta.ipynb

  beta_train_df = pd.read_csv(read_path_train, sep="\t")


Using device: cuda:0
Loading: Rostlab/prot_t5_xl_half_uniref50-enc


In [62]:
import pandas as pd

# --- Dateipfade ---
read_path_train = f"{pipeline_data_splitted}/{precision}/beta/new/negatives/{train_output_name}"
read_path_val = f"{pipeline_data_splitted}/{precision}/beta/new/negatives/{validation_output_name}"
read_path_test = f"{pipeline_data_splitted}/{precision}/beta/new/negatives/{test_output_name}"

# --- Dateien laden ---
train_neg = pd.read_csv(read_path_train, sep='\t')
val_neg = pd.read_csv(read_path_val, sep='\t')
test_neg = pd.read_csv(read_path_test, sep='\t')

# --- Spalte "source" setzen ---
for df in [train_neg, val_neg, test_neg]:
    df["source"] = "generated"

# --- Zurückspeichern ---
train_neg.to_csv(read_path_train, sep='\t', index=False)
val_neg.to_csv(read_path_val, sep='\t', index=False)
test_neg.to_csv(read_path_test, sep='\t', index=False)

print("✅ Spalte 'source' auf 'generated' gesetzt und gespeichert.")

✅ Spalte 'source' auf 'generated' gesetzt und gespeichert.


In [63]:
# === Load negative datasets ===
train_neg = pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/negatives/train_neg5.tsv", sep='\t')
val_neg = pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/negatives/val_neg5.tsv", sep='\t')
test_neg = pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/negatives/test_neg5.tsv", sep='\t')

# === Load positive datasets ===
train_pos = pd.read_csv(output_train_path, sep='\t')
val_pos = pd.read_csv(output_val_path, sep='\t')
test_pos = pd.read_csv(test_output_path, sep='\t')

# 🔁 Wiederverwendbare Funktion zum Entfernen
def remove_conflicting_negatives(pos_df, neg_df, name):
    dupe_cols = ["Epitope", "TRB_CDR3"]
    merged = pd.merge(neg_df, pos_df[dupe_cols], on=dupe_cols, how="inner")
    print(f"❌ {name.upper()}: Entferne {len(merged)} Duplikate mit Konflikt.")
    
    # Nur die, die NICHT im positiven vorkommen → behalten
    cleaned_neg = neg_df.merge(merged[dupe_cols], on=dupe_cols, how="left", indicator=True)
    cleaned_neg = cleaned_neg[cleaned_neg["_merge"] == "left_only"].drop(columns=["_merge"])
    print(f"✅ {name.upper()}: Übrig nach Cleaning: {len(cleaned_neg)}")
    return cleaned_neg
# Neue Clean-Dateipfade
neg_base_path = f"{pipeline_data_splitted}/{precision}/beta/new/negatives"

train_neg_clean = remove_conflicting_negatives(train_pos, train_neg, "train")
train_neg_clean.to_csv(f"{neg_base_path}/train_neg_clean5.tsv", sep="\t", index=False)

val_neg_clean = remove_conflicting_negatives(val_pos, val_neg, "validation")
val_neg_clean.to_csv(f"{neg_base_path}/val_neg_clean5.tsv", sep="\t", index=False)

test_neg_clean = remove_conflicting_negatives(test_pos, test_neg, "test")
test_neg_clean.to_csv(f"{neg_base_path}/test_neg_clean5.tsv", sep="\t", index=False)


❌ TRAIN: Entferne 14469 Duplikate mit Konflikt.
✅ TRAIN: Übrig nach Cleaning: 125409
❌ VALIDATION: Entferne 4459 Duplikate mit Konflikt.
✅ VALIDATION: Übrig nach Cleaning: 30401
❌ TEST: Entferne 12 Duplikate mit Konflikt.
✅ TEST: Übrig nach Cleaning: 9055


In [67]:
import pandas as pd
import os

dupe_cols = ["Epitope", "TRB_CDR3"]

# === Negativ-Dateipfade pro Split ===
paths = {
    "train": [f"{pipeline_data_splitted}/{precision}/beta/new/negatives/train_neg_clean{i}.tsv" for i in ["", "2", "3", "4", "5"]],
    "validation": [f"{pipeline_data_splitted}/{precision}/beta/new/negatives/val_neg_clean{i}.tsv" for i in ["", "2", "3", "4", "5"]],
    "test": [f"{pipeline_data_splitted}/{precision}/beta/new/negatives/test_neg_clean{i}.tsv" for i in ["", "2", "3", "4", "5"]],
}

# === Positivdaten laden ===
train_pos = pd.read_csv(read_path_train, sep="\t")
val_pos   = pd.read_csv(read_path_validation, sep="\t")
test_pos  = pd.read_csv(read_path_test,  sep="\t")

positives = {
    "train": train_pos,
    "validation": val_pos,
    "test": test_pos
}

# === Helper-Funktion: alle N Files kombinieren, deduplizieren, auf Leaks prüfen ===
def check_negatives_multiple(split_name, file_list, pos_df):
    print(f"\n=== {split_name.upper()} ===")
    
    neg_list = []
    for path in file_list:
        if os.path.exists(path):
            df = pd.read_csv(path, sep="\t")
            print(f"📄 Gelesen: {os.path.basename(path)} → {len(df)} Einträge")
            neg_list.append(df)
        else:
            print(f"⚠️ Datei nicht gefunden: {path}")

    # Alle kombinieren
    combined = pd.concat(neg_list, ignore_index=True)
    print(f"🔢 Gesamt vor Deduplikation: {len(combined)}")

    # Deduplikation
    combined.drop_duplicates(subset=dupe_cols, inplace=True)
    print(f"✅ Nach Deduplikation: {len(combined)}")

    # Leak-Check gegen Positives
    merged = pd.merge(combined, pos_df, on=dupe_cols)
    print(f"🚨 Überschneidungen mit Positiven: {len(merged)}")
    if not merged.empty:
        print(merged[dupe_cols].head(3))

    return combined

# === Ausführen für alle Splits ===
combined_negatives = {}
for split in ["train", "validation", "test"]:
    combined_negatives[split] = check_negatives_multiple(
        split,
        paths[split],
        positives[split]
    )

# Speicherpfade definieren
output_dir = f"{pipeline_data_splitted}/{precision}/beta/new/negatives/combined_no_duplicates"
os.makedirs(output_dir, exist_ok=True)

# Exportieren pro Split
combined_negatives["train"].to_csv(f"{output_dir}/train_neg_combined.tsv", sep="\t", index=False)
combined_negatives["validation"].to_csv(f"{output_dir}/val_neg_combined.tsv", sep="\t", index=False)
combined_negatives["test"].to_csv(f"{output_dir}/test_neg_combined.tsv", sep="\t", index=False)

print("✅ Kombinierte Negative wurden gespeichert ohne Duplikate.")


=== TRAIN ===
📄 Gelesen: train_neg_clean.tsv → 134756 Einträge
📄 Gelesen: train_neg_clean2.tsv → 131262 Einträge
📄 Gelesen: train_neg_clean3.tsv → 125544 Einträge
📄 Gelesen: train_neg_clean4.tsv → 125795 Einträge
📄 Gelesen: train_neg_clean5.tsv → 125409 Einträge
🔢 Gesamt vor Deduplikation: 642766
✅ Nach Deduplikation: 597003
🚨 Überschneidungen mit Positiven: 139737
           Epitope        TRB_CDR3
0  VLPFNDGVYFASTEK  CASIWAGLGQPQHF
1        GLCTLVAML  CASSVQGLGQPQHF
2        GLCTLVAML   CASTPPGSETQYF

=== VALIDATION ===
📄 Gelesen: val_neg_clean.tsv → 32805 Einträge
📄 Gelesen: val_neg_clean2.tsv → 32349 Einträge
📄 Gelesen: val_neg_clean3.tsv → 30375 Einträge
📄 Gelesen: val_neg_clean4.tsv → 30329 Einträge
📄 Gelesen: val_neg_clean5.tsv → 30401 Einträge
🔢 Gesamt vor Deduplikation: 156259
✅ Nach Deduplikation: 130637
🚨 Überschneidungen mit Positiven: 0

=== TEST ===
📄 Gelesen: test_neg_clean.tsv → 9059 Einträge
📄 Gelesen: test_neg_clean2.tsv → 9048 Einträge
📄 Gelesen: test_neg_clean3.tsv

In [68]:
import pandas as pd

def create_balanced_negatives_with_all_epitopes(
    neg_source_1,
    neg_source_2,
    target_neg_count,
    used_pairs=set(),
    vdjdb_tcrs=set(),
    vdjdb_epitopes=set(),
    ensure_all_neg_epitopes=True,
    tpp_target_distribution=None
):
    neg_source_1 = neg_source_1.copy()
    neg_source_2 = neg_source_2.copy()
    neg_source_1['source'] = '10X'
    neg_source_2['source'] = 'generated'

    # --- VDJdb & verwendete Paare filtern ---
    def filter_df(df, remove_epitopes=False):
        df = df[~df['TRB_CDR3'].isin(vdjdb_tcrs)].copy()
        if remove_epitopes:
            df = df[~df['Epitope'].isin(vdjdb_epitopes)]
    
        df['Pair'] = list(map(tuple, df[['Epitope', 'TRB_CDR3']].values))
    
        # Entferne negative Paare, die in positiven Beispielen (used_pairs) vorkommen
        df = df[~df['Pair'].isin(used_pairs)]
    
        # Entferne Duplikate innerhalb der negativen Quellen
        df = df.drop_duplicates(subset=["Epitope", "TRB_CDR3"])
    
        return df

    neg_source_1 = filter_df(neg_source_1)
    neg_source_2 = filter_df(neg_source_2)

    # --- Vorverarbeitung für TPP-Filterung ---
    trainval_tcrs = set(pd.concat([train_final, val_final])['TRB_CDR3'])
    trainval_epitopes = set(pd.concat([train_final, val_final])['Epitope'])

    # Mapping beider Quellen nach TPP-Typ
    combined_sources = pd.concat([neg_source_1, neg_source_2], ignore_index=True)
    
    task_map = {
        "TPP1": [],
        "TPP2": [],
        "TPP3": [],
        "TPP4": []
    }
    
    for _, row in combined_sources.iterrows():
        task = classify_task(row["TRB_CDR3"], row["Epitope"], trainval_tcrs, trainval_epitopes)
        task_map[task].append(row)

    # Konvertieren zu DataFrames
    task_dfs = {task: pd.DataFrame(rows) for task, rows in task_map.items()}

    neg_selected = []
    for task, ratio in tpp_target_distribution.items():
        count = int(target_neg_count * ratio)
        df_pool = task_dfs.get(task, pd.DataFrame())
        if not df_pool.empty:
            count = min(count, len(df_pool))
            neg_selected.append(df_pool.sample(n=count, random_state=42))

    neg_source_1 = pd.concat(neg_selected, ignore_index=True)
    print("✅ Neue Verteilung:")
    for task, df in task_dfs.items():
        print(f"{task}: {len(df)} → verwendet: {len([d for d in neg_selected if d.shape[0] and classify_task(d.iloc[0]['TRB_CDR3'], d.iloc[0]['Epitope'], trainval_tcrs, trainval_epitopes) == task])}")


    # --- Mindestens 1x alle Epitope aus beiden Quellen übernehmen ---
    def ensure_epitope_coverage(df):
        guaranteed = []
        for epitope in df['Epitope'].unique():
            group = df[df['Epitope'] == epitope]
            if not group.empty:
                guaranteed.append(group.sample(1, random_state=42))
        return pd.concat(guaranteed, ignore_index=True)

    guaranteed_1 = ensure_epitope_coverage(neg_source_1)
    guaranteed_2 = ensure_epitope_coverage(neg_source_2)
    guaranteed_df = pd.concat([guaranteed_1, guaranteed_2], ignore_index=True)

    # Begrenze garantierte, falls sie zu groß geworden sind
    if len(guaranteed_df) > target_neg_count:
        print(f"Zu viele garantierte Negative ({len(guaranteed_df)}), trimme auf Zielmenge {target_neg_count}")
        guaranteed_df = guaranteed_df.sample(n=target_neg_count, random_state=42)

    # --- Stratified Sampling für Restauffüllung ---
    def stratified_sample(df, n):
        epitope_groups = df.groupby('Epitope')
        unique_epitopes = list(epitope_groups.groups.keys())
        print(f"→ Stratified sampling from {len(df)} rows | {len(unique_epitopes)} unique epitopes | need {n} samples")
    
        # Schritt 1: Garantiert 1 Sample pro Epitope
        guaranteed = [group.sample(1, random_state=42) for _, group in epitope_groups]
        guaranteed_df = pd.concat(guaranteed, ignore_index=True)
    
        remaining_n = n - len(guaranteed_df)
        if remaining_n <= 0:
            return guaranteed_df.sample(n=n, random_state=42)
    
        # Schritt 2: Aufstocken durch gewichtetes Sampling
        remaining_pool = df.drop(index=guaranteed_df.index, errors='ignore')
    
        # Gewichte: Häufigkeit pro Epitope → normalize
        epitope_counts = remaining_pool['Epitope'].value_counts()
        remaining_pool = remaining_pool.copy()
        remaining_pool['weight'] = remaining_pool['Epitope'].map(epitope_counts)
        total = remaining_pool['weight'].sum()
        remaining_pool['weight'] = remaining_pool['weight'] / total
    
        print(f"→ Stratified fill-in: drawing {remaining_n} samples weighted by epitope frequency")
    
        replace = len(remaining_pool) < remaining_n
        if replace:
            print(f"Achtung: Sampling mit Replacement (n={remaining_n}, pool={len(remaining_pool)})")
        
        sampled_rest = remaining_pool.sample(
            n=remaining_n,
            weights='weight',
            replace=replace,
            random_state=42
        )
        
        final_df = pd.concat([guaranteed_df, sampled_rest], ignore_index=True)
        return final_df

    remaining_needed = target_neg_count - len(guaranteed_df)
    if remaining_needed <= 0:
        print(f"Es wurden bereits {len(guaranteed_df)} garantierte Samples übernommen (mehr als benötigt).")
        final_df = guaranteed_df.sample(n=target_neg_count, random_state=42)
    else:
        # Garantierte rausnehmen
        used_idx_1 = guaranteed_1.index if not guaranteed_1.empty else []
        used_idx_2 = guaranteed_2.index if not guaranteed_2.empty else []

        remaining_1 = neg_source_1
        remaining_2 = neg_source_2

        half = remaining_needed // 2
        rest = remaining_needed - half

        sample_1 = stratified_sample(remaining_1, half)
        sample_2 = stratified_sample(remaining_2, rest)

        final_df = pd.concat([guaranteed_df, sample_1, sample_2], ignore_index=True)

    return final_df.drop(columns=['Pair'])

def classify_task(tcr, epitope, train_tcrs, train_epitopes):
    seen_tcr = tcr in train_tcrs
    seen_epi = epitope in train_epitopes
    if seen_tcr and seen_epi:
        return 'TPP1'
    elif not seen_tcr and seen_epi:
        return 'TPP2'
    elif not seen_tcr and not seen_epi:
        return 'TPP3'
    elif seen_tcr and not seen_epi:
        return 'TPP4'

In [69]:
# === Datei- und Pfadangaben ===

# Originaldaten (10X)
beta = pd.read_csv(f'{pipeline_data_plain}/10x/combined_output_with_epitope_mhc_TRB_only_expanded-all.csv', sep=',')

# Generierte Negativdaten
neg_ba_train = pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/negatives/combined_no_duplicates/train_neg_combined.tsv", sep='\t')
neg_ba_val   = pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/negatives/combined_no_duplicates/val_neg_combined.tsv", sep='\t')
neg_ba_test  = pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/negatives/combined_no_duplicates/test_neg_combined.tsv", sep='\t')

# Positive Beispiele
train_pos = pd.read_csv(f'{pipeline_data_splitted}/{precision}/beta/train_prenegsamples.tsv', sep='\t')
val_pos = pd.read_csv(f'{pipeline_data_splitted}/{precision}/beta/validation_prenegsamples.tsv', sep='\t')
test_preneg = pd.read_csv(f'{pipeline_data_splitted}/{precision}/beta/test_prenegsamples.tsv', sep='\t')

# Output-Ziele
output_train_path = f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv"
output_val_path = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"
test_output_path = f'{pipeline_data_splitted}/{precision}/beta/new/test.tsv'

# VDJdb zum Herausfiltern
vdjdb_df = pd.read_csv(VDJdb_cleaned_beta_output, sep='\t')
vdjdb_tcrs = set(vdjdb_df['TRB_CDR3'])
vdjdb_epitopes = set(vdjdb_df['Epitope'])

  train_pos = pd.read_csv(f'{pipeline_data_splitted}/{precision}/beta/train_prenegsamples.tsv', sep='\t')


In [73]:
# === TESTDATEN VERARBEITUNG ===
# Zielmenge: 1:5 Verhältnis
num_test_pos = len(test_preneg)
test_neg_target = num_test_pos * 5

neg_10x = beta[beta['Binding'] == 0]
train_final = pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv", sep='\t')
val_final = pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv", sep='\t')

# Paare aus Train/Val ausschließen
used_trainval_pairs = set(
    map(tuple, pd.concat([train_final, val_final])[['Epitope', 'TRB_CDR3']].values)
)

# Zielverteilung Test
tpp_dist = {
    "TPP1": 0.4,
    "TPP2": 0.3,
    "TPP3": 0.2,
    "TPP4": 0.1
}

test_neg = create_balanced_negatives_with_all_epitopes(
    neg_source_1=neg_10x,
    neg_source_2=neg_ba_test,
    target_neg_count=test_neg_target,
    used_pairs=used_trainval_pairs,
    vdjdb_tcrs=vdjdb_tcrs,
    vdjdb_epitopes=vdjdb_epitopes,
    tpp_target_distribution=tpp_dist  
)

# Combine & save
test_final = pd.concat([test_preneg, test_neg], ignore_index=True).sample(frac=1, random_state=42)
test_final.to_csv(test_output_path, sep='\t', index=False)

df_check = pd.read_csv(test_output_path, sep='\t')
print(df_check['Binding'].value_counts())

# Ausgabe
print("✅ Testset erfolgreich erstellt & gespeichert.")
print(f"Test: {len(test_final)} Beispiele")
print(f"- Binding=1: {test_final['Binding'].value_counts().get(1, 0)}")
print(f"- Binding=0: {test_final['Binding'].value_counts().get(0, 0)}")
print(f"- Unique Epitope: {test_final['Epitope'].nunique()}")
print(f"- Unique TCRs: {test_final['TRB_CDR3'].nunique()}")

✅ Neue Verteilung:
TPP1: 2710167 → verwendet: 1
TPP2: 5684 → verwendet: 1
TPP3: 0 → verwendet: 0
TPP4: 0 → verwendet: 0


ValueError: No objects to concatenate

In [56]:
import pandas as pd

# --- Pfade zu finalen Splits ---
output_train_path = f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv"
output_val_path   = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"
test_output_path  = f"{pipeline_data_splitted}/{precision}/beta/new/test.tsv"

# --- Dateien einlesen ---
train_df = pd.read_csv(output_train_path, sep='\t')
val_df   = pd.read_csv(output_val_path, sep='\t')
test_df  = pd.read_csv(test_output_path, sep='\t')

# --- Funktion zur Bearbeitung ---
def clean_and_update(df):
    if 'weight' in df.columns:
        df = df.drop(columns=['weight'])
    if 'Epitope MHC MHC class' in df.columns:
        df = df.drop(columns=['Epitope MHC MHC class'])
    if 'pair_count' in df.columns:
        df = df.drop(columns=['pair_count'])
    if 'epi_count' in df.columns:
        df = df.drop(columns=['epi_count'])
    if 'pair' in df.columns:
        df = df.drop(columns=['pair'])
    if 'source' not in df.columns:
        df['source'] = ''
    df.loc[df['Binding'] == 1, 'source'] = 'datasets'
    return df

# --- Anwenden ---
train_df = clean_and_update(train_df)
val_df   = clean_and_update(val_df)
test_df  = clean_and_update(test_df)

# --- Zurückschreiben (überschreibt die Dateien direkt) ---
train_df.to_csv(output_train_path, sep='\t', index=False)
val_df.to_csv(output_val_path, sep='\t', index=False)
test_df.to_csv(test_output_path, sep='\t', index=False)

print("✅ Gewünschte Änderungen in den finalen Splits wurden vorgenommen und gespeichert.")

✅ Gewünschte Änderungen in den finalen Splits wurden vorgenommen und gespeichert.


In [57]:
import pandas as pd

# Pfad zur Test-Datei
test_path = f"{pipeline_data_splitted}/{precision}/beta/new/test.tsv"

# Laden
df_test = pd.read_csv(test_path, sep="\t")

# Separiere Binding = 0 und = 1
df_neg = df_test[df_test["Binding"] == 0]
df_pos = df_test[df_test["Binding"] == 1]

# Entferne exakte Duplikate unter den negativen Beispielen
df_neg_cleaned = df_neg.drop_duplicates(subset=["Epitope", "TRB_CDR3", "Binding"])

# Kombiniere wieder
df_test_cleaned = pd.concat([df_pos, df_neg_cleaned], ignore_index=True)

# Speichern
df_test_cleaned.to_csv(test_path, sep="\t", index=False)

# Check zur Sicherheit
remaining_dups = df_test_cleaned.duplicated(subset=["Epitope", "TRB_CDR3", "Binding"]).sum()
print(f"✅ Exakte Duplikate im Testset entfernt. Verbleibend: {remaining_dups}")


✅ Exakte Duplikate im Testset entfernt. Verbleibend: 97


In [62]:
import pandas as pd

# Lade die Negativdatenquellen
beta = pd.read_csv(f'{pipeline_data_plain}/10x/combined_output_with_epitope_mhc_TRB_only_expanded-all.csv', sep=',')
neg_ba_train = pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/negatives/train_neg_clean.tsv", sep='\t')
neg_ba_val = pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/negatives/val_neg_clean.tsv", sep='\t')
neg_ba_test = pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/negatives/test_neg_clean.tsv", sep='\t')


train_file = f'{pipeline_data_splitted}/{precision}/beta/new/train.tsv'
validation_file = f'{pipeline_data_splitted}/{precision}/beta/new/validation.tsv'
test_file = f'{pipeline_data_splitted}/{precision}/beta/new/test.tsv'

# Lade die bestehenden train/val/test-Sets
train_df = pd.read_csv(train_file, sep="\t")
val_df = pd.read_csv(validation_file, sep="\t")
test_df = pd.read_csv(test_file, sep="\t")

# Already used (TRB_CDR3, Epitope) pairs
used_pairs = set(pd.concat([train_df, val_df, test_df])[["TRB_CDR3", "Epitope"]].apply(tuple, axis=1))

# Klassifizierungsfunktion
def classify_task(tcr, epitope, train_tcrs, train_epitopes):
    seen_tcr = tcr in train_tcrs
    seen_epi = epitope in train_epitopes
    if seen_tcr and seen_epi:
        return 'TPP1'
    elif not seen_tcr and seen_epi:
        return 'TPP2'
    elif not seen_tcr and not seen_epi:
        return 'TPP3'
    elif seen_tcr and not seen_epi:
        return 'TPP4'

# Kontext für Klassifizierung
trainval_tcrs = set(pd.concat([train_df, val_df])["TRB_CDR3"])
trainval_epitopes = set(pd.concat([train_df, val_df])["Epitope"])

# Kombinierte Quellen
all_neg_sources = pd.concat([beta, neg_ba_train, neg_ba_val, neg_ba_test], ignore_index=True)

# Finde TPP3 inline (ohne neue Spalte)
def is_tpp3_not_used(row):
    return (
        classify_task(row["TRB_CDR3"], row["Epitope"], trainval_tcrs, trainval_epitopes) == "TPP3" and
        (row["TRB_CDR3"], row["Epitope"]) not in used_pairs
    )

tpp3_new = all_neg_sources[all_neg_sources.apply(is_tpp3_not_used, axis=1)]

# Füge zu Test hinzu
test_df_extended = pd.concat([test_df, tpp3_new], ignore_index=True)

# Speichern
test_df_extended.to_csv(test_file, sep="\t", index=False)

# Ausgabe: Anzahl neuer TPP3
print(f"✅ Hinzugefügte neue TPP3-Beispiele: {len(tpp3_new)}")

✅ Hinzugefügte neue TPP3-Beispiele: 53


### Task Klassifikation an Validation Angleichung

#### Für Test File

In [190]:
# prepare parameters for beta dataset
output_train_path = f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv"
output_val_path = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"
test_output_path = f'{pipeline_data_splitted}/{precision}/beta/new/test.tsv'

read_path_train = f"{pipeline_data_splitted}/{precision}/beta/train_prenegsamples.tsv" #fürs erste mal durchführen train_prenegsamples und new weg
read_path_test = f'{pipeline_data_splitted}/{precision}/beta/test_prenegsamples.tsv' #fürs erste mal durchführen test_prenegsamples und new weg
read_path_validation = f"{pipeline_data_splitted}/{precision}/beta/validation_prenegsamples.tsv" #fürs erste mal durchführen validation_prenegsamples und new weg
temp_path = f'{pipeline_data_temp_bucket}/negative_samples/beta/'
output_path = f"{pipeline_data_splitted}/{precision}/beta/new/negatives"
train_output_name = "train_neg_tpp3.tsv"
validation_output_name = "val_neg_tpp3.tsv"
test_output_name = "test_neg_tpp3.tsv"

create_folders_if_not_exists([temp_path])

%run ../negative_samples/negative_samples_beta_tpp_test.ipynb

Using device: cuda:0
Loading: Rostlab/prot_t5_xl_half_uniref50-enc


In [191]:
import pandas as pd

# --- Dateipfade ---
read_path_val = f"{pipeline_data_splitted}/{precision}/beta/new/negatives/{validation_output_name}"
read_path_test = f"{pipeline_data_splitted}/{precision}/beta/new/negatives/{test_output_name}"

# --- Dateien laden ---
val_neg = pd.read_csv(read_path_val, sep='\t')
test_neg = pd.read_csv(read_path_test, sep='\t')

# --- Spalte "source" setzen ---
for df in [val_neg, test_neg]:
    df["source"] = "generated"

# --- Zurückspeichern ---
val_neg.to_csv(read_path_val, sep='\t', index=False)
test_neg.to_csv(read_path_test, sep='\t', index=False)

print("✅ Spalte 'source' auf 'generated' gesetzt und gespeichert.")

✅ Spalte 'source' auf 'generated' gesetzt und gespeichert.


In [192]:
import pandas as pd

# --- Pfade ---
train_path = f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv"
val_path = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"
test_path = f"{pipeline_data_splitted}/{precision}/beta/new/test.tsv"

neg_val_path = f"{output_path}/{validation_output_name}"
neg_test_path = f"{output_path}/{test_output_name}"

# --- Daten laden ---
train_df = pd.read_csv(train_path, sep='\t')
val_df = pd.read_csv(val_path, sep='\t')
test_df = pd.read_csv(test_path, sep='\t')

neg_val_df = pd.read_csv(neg_val_path, sep='\t')
neg_test_df = pd.read_csv(neg_test_path, sep='\t')

# --- Existing train/val/test Kontext für Klassifizierung ---
trainval_tcrs = set(pd.concat([train_df, val_df])['TRB_CDR3'])
trainval_epitopes = set(pd.concat([train_df, val_df])['Epitope'])

# --- Klassifikationsfunktion ---
def classify_task(tcr, epitope):
    seen_tcr = tcr in trainval_tcrs
    seen_epi = epitope in trainval_epitopes
    if seen_tcr and seen_epi:
        return 'TPP1'
    elif not seen_tcr and seen_epi:
        return 'TPP2'
    elif not seen_tcr and not seen_epi:
        return 'TPP3'
    elif seen_tcr and not seen_epi:
        return 'TPP4'

# --- Tasks für neue Negatives zuweisen ---
neg_val_df['task_predicted'] = neg_val_df.apply(lambda row: classify_task(row['TRB_CDR3'], row['Epitope']), axis=1)
neg_test_df['task_predicted'] = neg_test_df.apply(lambda row: classify_task(row['TRB_CDR3'], row['Epitope']), axis=1)

# --- Prüfen wie viele wirklich TPP3 sind ---
print("\n📊 Predicted Tasks Verteilung für neue Validation-Negative:")
print(neg_val_df['task_predicted'].value_counts())

print("\n📊 Predicted Tasks Verteilung für neue Test-Negative:")
print(neg_test_df['task_predicted'].value_counts())

# --- Prüfen auf Duplikate (Epitope, TCR) gegen bestehende Beispiele ---
used_pairs = set(pd.concat([train_df, val_df, test_df])[['TRB_CDR3', 'Epitope']].apply(tuple, axis=1))

neg_val_pairs = set(neg_val_df[['TRB_CDR3', 'Epitope']].apply(tuple, axis=1))
neg_test_pairs = set(neg_test_df[['TRB_CDR3', 'Epitope']].apply(tuple, axis=1))

dups_val = neg_val_pairs.intersection(used_pairs)
dups_test = neg_test_pairs.intersection(used_pairs)

print(f"\\n🔎 Validation: {len(dups_val)} Duplikate gefunden.")
print(f"🔎 Test: {len(dups_test)} Duplikate gefunden.")


  val_df = pd.read_csv(val_path, sep='\t')
  test_df = pd.read_csv(test_path, sep='\t')



📊 Predicted Tasks Verteilung für neue Validation-Negative:
task_predicted
TPP1    2426
Name: count, dtype: int64

📊 Predicted Tasks Verteilung für neue Test-Negative:
task_predicted
TPP3    1030
Name: count, dtype: int64
\n🔎 Validation: 122 Duplikate gefunden.
🔎 Test: 645 Duplikate gefunden.


In [193]:
# --- Lade bestehende Testdaten ---
test_path = f"{pipeline_data_splitted}/{precision}/beta/new/test.tsv"
test_df = pd.read_csv(test_path, sep='\t')

# --- Lade neu generierte Negative ---
neg_test_path = f"{output_path}/{test_output_name}"
neg_test_df = pd.read_csv(neg_test_path, sep='\t')

# --- Positive Paare (TRB_CDR3, Epitope) aus bestehenden Daten ---
existing_pairs = set(pd.concat([
    pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv", sep='\t'),
    pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv", sep='\t'),
    test_df
])[['TRB_CDR3', 'Epitope']].apply(tuple, axis=1))

# --- Prüfe neue Negative auf Duplikate ---
neg_test_df['pair'] = list(zip(neg_test_df['TRB_CDR3'], neg_test_df['Epitope']))

# --- Nur behalten, was kein Duplikat ist ---
neg_test_df_clean = neg_test_df[~neg_test_df['pair'].isin(existing_pairs)].drop(columns=['pair'])

print(f"✅ {len(neg_test_df_clean)} saubere neue TPP2-Negative im Test übrig.")

# --- Test aktualisieren ---
test_df_final = pd.concat([test_df, neg_test_df_clean], ignore_index=True)
test_df_final.to_csv(test_path, sep='\t', index=False)

print("✅ Testset erfolgreich aktualisiert.")

  test_df = pd.read_csv(test_path, sep='\t')
  pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv", sep='\t'),


✅ 375 saubere neue TPP2-Negative im Test übrig.
✅ Testset erfolgreich aktualisiert.


In [195]:
# --- Test laden ---
test_path = f"{pipeline_data_splitted}/{precision}/beta/new/test.tsv"
test_df = pd.read_csv(test_path, sep='\t')

# --- TPP1-Daten herausfiltern ---
tpp1_df = test_df[test_df['task'] == 'TPP1']

# --- Non-Binder und Binder trennen ---
tpp1_nonbinder = tpp1_df[tpp1_df['Binding'] == 0]
tpp1_binder = tpp1_df[tpp1_df['Binding'] == 1]

print(f"🔎 Aktuell TPP1 Test: {len(tpp1_binder)} Binder und {len(tpp1_nonbinder)} Non-Binder")

# --- Zielverhältnis ---
target_binder_percentage = 18  # 18% Binder

# --- Berechne gewünschte Gesamtanzahl an Samples für TPP1 ---
target_total = int(len(tpp1_binder) / (target_binder_percentage / 100))
target_nonbinder = target_total - len(tpp1_binder)

print(f"🎯 Ziel: {len(tpp1_binder)} Binder und {target_nonbinder} Non-Binder")

# --- Reduziere Non-Binder auf Zielmenge ---
tpp1_nonbinder_reduced = tpp1_nonbinder.sample(n=target_nonbinder, random_state=42)

# --- Neues TPP1-Set bauen ---
tpp1_final = pd.concat([tpp1_binder, tpp1_nonbinder_reduced], ignore_index=True)

print(f"✅ Neue TPP1 Größe: {len(tpp1_final)} Beispiele (Binder: {tpp1_final['Binding'].sum()}, Non-Binder: {len(tpp1_final) - tpp1_final['Binding'].sum()})")

# --- Den Rest der Test behalten ---
test_df_rest = test_df[test_df['task'] != 'TPP1']

# --- Neues Test-Set bauen ---
test_df_new = pd.concat([test_df_rest, tpp1_final], ignore_index=True)

# --- Speichern ---
test_df_new.to_csv(test_path, sep='\t', index=False)

print("✅ Test-Set erfolgreich aktualisiert mit neuem TPP1-Verhältnis.")

  test_df = pd.read_csv(test_path, sep='\t')


🔎 Aktuell TPP1 Test: 1129 Binder und 15981 Non-Binder
🎯 Ziel: 1129 Binder und 5143 Non-Binder
✅ Neue TPP1 Größe: 6272 Beispiele (Binder: 1129, Non-Binder: 5143)
✅ Test-Set erfolgreich aktualisiert mit neuem TPP1-Verhältnis.


#### Für Validation File

In [113]:
# prepare parameters for beta dataset
output_train_path = f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv"
output_val_path = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"
test_output_path = f'{pipeline_data_splitted}/{precision}/beta/new/test.tsv'

read_path_train = f"{pipeline_data_splitted}/{precision}/beta/train_prenegsamples.tsv" #fürs erste mal durchführen train_prenegsamples und new weg
read_path_test = f'{pipeline_data_splitted}/{precision}/beta/new/test.tsv' 
read_path_validation = f"{pipeline_data_splitted}/{precision}/beta/validation_prenegsamples.tsv" #fürs erste mal durchführen validation_prenegsamples und new weg
temp_path = f'{pipeline_data_temp_bucket}/negative_samples/beta/'
output_path = f"{pipeline_data_splitted}/{precision}/beta/new/negatives"
train_output_name = "train_neg_tpp2.tsv"
validation_output_name = "val_neg_tpp2.tsv"
test_output_name = "test_neg_tpp2.tsv"

create_folders_if_not_exists([temp_path])

%run ../negative_samples/negative_samples_beta_task_val.ipynb

  beta_train_df = pd.read_csv(read_path_train, sep="\t")


Using device: cuda:0
Loading: Rostlab/prot_t5_xl_half_uniref50-enc


In [114]:
import pandas as pd

# --- Dateipfade ---
read_path_val = f"{pipeline_data_splitted}/{precision}/beta/new/negatives/{validation_output_name}"
read_path_test = f"{pipeline_data_splitted}/{precision}/beta/new/negatives/{test_output_name}"

# --- Dateien laden ---
val_neg = pd.read_csv(read_path_val, sep='\t')
test_neg = pd.read_csv(read_path_test, sep='\t')

# --- Spalte "source" setzen ---
for df in [val_neg, test_neg]:
    df["source"] = "generated"

# --- Zurückspeichern ---
val_neg.to_csv(read_path_val, sep='\t', index=False)
test_neg.to_csv(read_path_test, sep='\t', index=False)

print("✅ Spalte 'source' auf 'generated' gesetzt und gespeichert.")

✅ Spalte 'source' auf 'generated' gesetzt und gespeichert.


In [115]:
import pandas as pd

# --- Pfade ---
train_path = f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv"
val_path = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"
test_path = f"{pipeline_data_splitted}/{precision}/beta/new/test.tsv"

neg_val_path = f"{output_path}/{validation_output_name}"
neg_test_path = f"{output_path}/{test_output_name}"

# --- Daten laden ---
train_df = pd.read_csv(train_path, sep='\t')
val_df = pd.read_csv(val_path, sep='\t')
test_df = pd.read_csv(test_path, sep='\t')

neg_val_df = pd.read_csv(neg_val_path, sep='\t')
neg_test_df = pd.read_csv(neg_test_path, sep='\t')

# --- Paare erstellen ---
used_pairs = set(pd.concat([train_df, val_df, test_df])[['TRB_CDR3', 'Epitope']].apply(tuple, axis=1))

neg_val_df['pair'] = list(zip(neg_val_df['TRB_CDR3'], neg_val_df['Epitope']))
neg_test_df['pair'] = list(zip(neg_test_df['TRB_CDR3'], neg_test_df['Epitope']))

# --- Nur behalten, was kein Duplikat ist ---
neg_val_df = neg_val_df[~neg_val_df['pair'].isin(used_pairs)].drop(columns=['pair'])
neg_test_df = neg_test_df[~neg_test_df['pair'].isin(used_pairs)].drop(columns=['pair'])

# --- Existing train/val Kontext für Klassifizierung 
trainval_tcrs = set(pd.concat([train_df, test_df])['TRB_CDR3'])
trainval_epitopes = set(pd.concat([train_df, test_df])['Epitope'])

# --- Klassifikationsfunktion ---
def classify_task(tcr, epitope):
    seen_tcr = tcr in trainval_tcrs
    seen_epi = epitope in trainval_epitopes
    if seen_tcr and seen_epi:
        return 'TPP1'
    elif not seen_tcr and seen_epi:
        return 'TPP2'
    elif not seen_tcr and not seen_epi:
        return 'TPP3'
    elif seen_tcr and not seen_epi:
        return 'TPP4'

# --- Tasks für neue saubere Negatives zuweisen ---
neg_val_df['task_predicted'] = neg_val_df.apply(lambda row: classify_task(row['TRB_CDR3'], row['Epitope']), axis=1)
neg_test_df['task_predicted'] = neg_test_df.apply(lambda row: classify_task(row['TRB_CDR3'], row['Epitope']), axis=1)

# --- Prüfen wie viele wirklich TPP3 sind ---
print("\n📊 Predicted Tasks Verteilung für neue Validation-Negative:")
print(neg_val_df['task_predicted'].value_counts())

print("\n📊 Predicted Tasks Verteilung für neue Test-Negative:")
print(neg_test_df['task_predicted'].value_counts())

  val_df = pd.read_csv(val_path, sep='\t')



📊 Predicted Tasks Verteilung für neue Validation-Negative:
task_predicted
TPP2    8986
TPP1    1854
Name: count, dtype: int64

📊 Predicted Tasks Verteilung für neue Test-Negative:
task_predicted
TPP1    7128
Name: count, dtype: int64


In [116]:
# --- Pfade ---
train_path = f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv"
val_path = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"
test_path = f"{pipeline_data_splitted}/{precision}/beta/new/test.tsv"
neg_test_path = f"{output_path}/{test_output_name}"

# --- Daten laden ---
train_df = pd.read_csv(train_path, sep='\t')
val_df = pd.read_csv(val_path, sep='\t')
test_df = pd.read_csv(test_path, sep='\t')
neg_test_df = pd.read_csv(neg_test_path, sep='\t')

# --- Paare erstellen ---
existing_trainval_pairs = set(pd.concat([train_df, val_df])[['TRB_CDR3', 'Epitope']].apply(tuple, axis=1))
existing_test_pairs = set(test_df[['TRB_CDR3', 'Epitope']].apply(tuple, axis=1))

neg_val_df['pair'] = list(zip(neg_val_df['TRB_CDR3'], neg_val_df['Epitope']))

# --- Step 1: Nur neue Paare, die noch nicht in Train+Val existieren
neg_not_in_trainval = neg_val_df[~neg_val_df['pair'].isin(existing_trainval_pairs)]

# --- Step 2: Von denen nur die behalten, die auch NICHT im Test existieren
neg_safe_for_val = neg_not_in_trainval[~neg_not_in_trainval['pair'].isin(existing_test_pairs)].drop(columns=['pair'])

# --- Step 3: Nur TPP3 Negatives auswählen ---
neg_safe_for_val_tpp3 = neg_safe_for_val[neg_safe_for_val['task_predicted'] == 'TPP2']

print(f"✅ {len(neg_safe_for_val_tpp3)} saubere neue TPP2-Negative für Validation übrig.")

# --- Validation aktualisieren ---
val_df_final = pd.concat([val_df, neg_safe_for_val_tpp3], ignore_index=True)
val_df_final.to_csv(val_path, sep='\t', index=False)

print("✅ Validation-Set erfolgreich aktualisiert nur mit TPP3-Negatives.")

  val_df = pd.read_csv(val_path, sep='\t')


✅ 8986 saubere neue TPP2-Negative für Validation übrig.
✅ Validation-Set erfolgreich aktualisiert nur mit TPP3-Negatives.


In [119]:
# --- Validation laden ---
val_path = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"
val_df = pd.read_csv(val_path, sep='\t')

# --- TPP4-Daten herausfiltern ---
tpp4_df = val_df[val_df['task'] == 'TPP4']

# --- Non-Binder und Binder trennen ---
tpp4_nonbinder = tpp4_df[tpp4_df['Binding'] == 0]
tpp4_binder = tpp4_df[tpp4_df['Binding'] == 1]

print(f"🔎 Aktuell TPP4 Validation: {len(tpp4_binder)} Binder und {len(tpp4_nonbinder)} Non-Binder")

# --- Zielverhältnis ---
target_binder_percentage = 16  # 16% Binder, 84% Non-Binder

# --- Berechne gewünschte Gesamtanzahl an Samples für TPP4 ---
target_total = int(len(tpp4_binder) / (target_binder_percentage / 100))
target_nonbinder = target_total - len(tpp4_binder)

print(f"🎯 Ziel: {len(tpp4_binder)} Binder und {target_nonbinder} Non-Binder")

# --- Reduziere Non-Binder auf Zielmenge ---
tpp4_nonbinder_reduced = tpp4_nonbinder.sample(n=target_nonbinder, random_state=42)

# --- Neues TPP4-Set bauen ---
tpp4_final = pd.concat([tpp4_binder, tpp4_nonbinder_reduced], ignore_index=True)

print(f"✅ Neue TPP4 Größe: {len(tpp4_final)} Beispiele (Binder: {tpp4_final['Binding'].sum()}, Non-Binder: {len(tpp4_final) - tpp4_final['Binding'].sum()})")

# --- Den Rest der Validation behalten ---
val_df_rest = val_df[val_df['task'] != 'TPP4']

# --- Neues Validation-Set bauen ---
val_df_new = pd.concat([val_df_rest, tpp4_final], ignore_index=True)

# --- Speichern ---
val_df_new.to_csv(val_path, sep='\t', index=False)

print("✅ Validation-Set erfolgreich aktualisiert mit neuem TPP4-Verhältnis.")


  val_df = pd.read_csv(val_path, sep='\t')


🔎 Aktuell TPP4 Validation: 16 Binder und 753 Non-Binder
🎯 Ziel: 16 Binder und 84 Non-Binder
✅ Neue TPP4 Größe: 100 Beispiele (Binder: 16, Non-Binder: 84)
✅ Validation-Set erfolgreich aktualisiert mit neuem TPP4-Verhältnis.


In [120]:
# --- Validation laden ---
val_path = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"
val_df = pd.read_csv(val_path, sep='\t')

# --- TPP3-Daten herausfiltern ---
tpp3_df = val_df[val_df['task'] == 'TPP1']

# --- Non-Binder und Binder trennen ---
tpp3_nonbinder = tpp3_df[tpp3_df['Binding'] == 0]
tpp3_binder = tpp3_df[tpp3_df['Binding'] == 1]

print(f"🔎 Aktuell TPP3 Validation: {len(tpp3_binder)} Binder und {len(tpp3_nonbinder)} Non-Binder")

# --- Zielverhältnis ---
target_binder_percentage = 16  # 20% Binder, 80% Non-Binder

# --- Berechne gewünschte Gesamtanzahl an Samples für TPP3 ---
target_total = int(len(tpp3_binder) / (target_binder_percentage / 100))
target_nonbinder = target_total - len(tpp3_binder)

print(f"🎯 Ziel: {len(tpp3_binder)} Binder und {target_nonbinder} Non-Binder")

# --- Reduziere Non-Binder auf Zielmenge ---
tpp3_nonbinder_reduced = tpp3_nonbinder.sample(n=target_nonbinder, random_state=42)

# --- Neues TPP3-Set bauen ---
tpp3_final = pd.concat([tpp3_binder, tpp3_nonbinder_reduced], ignore_index=True)

print(f"✅ Neue TPP1 Größe: {len(tpp3_final)} Beispiele (Binder: {tpp3_final['Binding'].sum()}, Non-Binder: {len(tpp3_final) - tpp3_final['Binding'].sum()})")

# --- Den Rest der Validation behalten ---
val_df_rest = val_df[val_df['task'] != 'TPP1']

# --- Neues Validation-Set bauen ---
val_df_new = pd.concat([val_df_rest, tpp3_final], ignore_index=True)

# --- Speichern ---
val_df_new.to_csv(val_path, sep='\t', index=False)

print("✅ Validation-Set erfolgreich aktualisiert mit neuem TPP3-Verhältnis.")

  val_df = pd.read_csv(val_path, sep='\t')


🔎 Aktuell TPP3 Validation: 12792 Binder und 96877 Non-Binder
🎯 Ziel: 12792 Binder und 67158 Non-Binder
✅ Neue TPP1 Größe: 79950 Beispiele (Binder: 12792, Non-Binder: 67158)
✅ Validation-Set erfolgreich aktualisiert mit neuem TPP3-Verhältnis.


In [200]:
import pandas as pd
import os

# --- Pfade ---
datasets = {
    "beta_allele": {
        "train": f'{pipeline_data_splitted}/{precision}/beta/new/train.tsv',
        "test": f'{pipeline_data_splitted}/{precision}/beta/new/test.tsv',
        "validation": f'{pipeline_data_splitted}/{precision}/beta/new/validation.tsv'
    }
}

# === Verarbeitung: positive Bindings markieren ===
for name, paths in datasets.items():
    for split, path in paths.items():
        if os.path.exists(path):
            df = pd.read_csv(path, sep="\t")
            
            # Nur Binding==1 filtern
            mask = df["Binding"] == 1
            df.loc[mask, "source"] = "dataset"
            
            # Datei überschreiben
            df.to_csv(path, sep="\t", index=False)
            print(f"✅ '{split}' aktualisiert: 'source' für {mask.sum()} positive Einträge auf 'dataset' gesetzt.")
        else:
            print(f"⚠️ Datei nicht gefunden: {path}")


✅ 'train' aktualisiert: 'source' für 126463 positive Einträge auf 'dataset' gesetzt.
✅ 'test' aktualisiert: 'source' für 9105 positive Einträge auf 'dataset' gesetzt.


  df = pd.read_csv(path, sep="\t")


✅ 'validation' aktualisiert: 'source' für 29559 positive Einträge auf 'dataset' gesetzt.


In [201]:
import pandas as pd
import os

# Pfade
datasets = {
    "train": f'{pipeline_data_splitted}/{precision}/beta/new/train.tsv',
    "validation": f'{pipeline_data_splitted}/{precision}/beta/new/validation.tsv',
    "test": f'{pipeline_data_splitted}/{precision}/beta/new/test.tsv'
}

# Check der source-Verteilung
for split, path in datasets.items():
    if os.path.exists(path):
        df = pd.read_csv(path, sep="\t")
        print(f"\n--- {split.upper()} ---")
        if "source" in df.columns:
            print(df["source"].value_counts(dropna=False))
        else:
            print("⚠️ Keine 'source'-Spalte vorhanden.")
    else:
        print(f"⚠️ Datei nicht gefunden: {path}")



--- TRAIN ---
source
10X          315652
generated    315617
dataset      126463
Name: count, dtype: int64

--- VALIDATION ---
source
generated    85279
10X          54753
dataset      29559
Name: count, dtype: int64

--- TEST ---
source
generated    31728
dataset       9105
10X           4279
Name: count, dtype: int64


  df = pd.read_csv(path, sep="\t")


## Task Classification 
The classification in the split notebook correct for positive only data. After adding negative data, some classifications might be wrong.

In [137]:
paired_output_folder = f'{pipeline_data_splitted}/{precision}/paired'
beta_output_folder = f'{pipeline_data_splitted}/{precision}/beta/new'
test_file_name = 'test.tsv'
validation_file_name = 'validation.tsv'
train_file_name = 'train.tsv'

In [None]:
# do the classification for paired data
paired = True
test_data_path = f'{paired_output_folder}/{test_file_name}'
validation_data_path = f'{paired_output_folder}/{validation_file_name}'
train_data_path = f'{paired_output_folder}/{train_file_name}'

%run ../data_preparation/classification.ipynb

  df_test = pd.read_csv(test_data_path, sep="\t")


test data has 82726 TPP1 tasks (seen tcr & seen epitopes).
test data has 76994 TPP2 tasks (unseen tcr & seen epitopes).
test data has 600 TPP3 tasks (unseen tcr & unseen epitope).
test data has 42 TPP4 tasks (seen tcr & unseen epitope).


In [None]:
# extended classification for paired data
train_path = f'{paired_output_folder}/{train_file_name}'
validation_path = f'{paired_output_folder}/{validation_file_name}'
test_path = f'{paired_output_folder}/{test_file_name}'
output_path = f'{paired_output_folder}/test_reclassified_paired_specific.tsv'
paired_data_path = paired_output_folder
alpha_cdr3_name = 'TRA_CDR3'
beta_cdr3_name = 'TRB_CDR3'
epitope_name = 'Epitope'
task_name = 'task'

%run ../data_preparation/paired_reclassification_testonly.ipynb

allele
../../data/splitted_datasets/allele/paired/train.tsv


  df_test = pd.read_csv(test_path, sep="\t", index_col=False)


train+validate data has 72656 entries
test data has 160362 entries
test data has 128885 TPP1 tasks (old value: 82726) (seen tcr & seen epitopes).
test data has 30835 TPP2 tasks (old value: 76994) (unseen tcr & seen epitopes).
test data has 508 TPP3 tasks (old value: 600) (unseen tcr & unseen epitope).
test data has 134 TPP4 tasks (old value: 42) (seen tcr & unseen epitope).
the train/test ratio is 0.31180423829918724/0.6881957617008128
../../data/splitted_datasets/allele/paired/test_reclassified_paired_specific.tsv
/home/ubuntu/arina/BA-Cancer-Immunotherapy
uploading dataset to dataset-allele


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33marina-frohofer[0m ([33mba_cancerimmunotherapy[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Adding directory to artifact (./../../data/splitted_datasets/allele/paired)... Done. 0.2s


VBox(children=(Label(value='0.009 MB of 0.009 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [196]:
# do the classification for beta data
paired = False
train_data_path = f'{beta_output_folder}/{train_file_name}'
validation_data_path = f'{beta_output_folder}/{validation_file_name}'
test_data_path = f'{beta_output_folder}/{test_file_name}'

%run ../data_preparation/classification.ipynb

  df_validation = pd.read_csv(validation_data_path, sep="\t")


test data has 6272 TPP1 tasks (seen tcr & seen epitopes).
test data has 32753 TPP2 tasks (unseen tcr & seen epitopes).
test data has 5416 TPP3 tasks (unseen tcr & unseen epitope).
test data has 671 TPP4 tasks (seen tcr & unseen epitope).


In the next two cells the classification is checked. If the output says "Classification is correct", everything is fine.

In [None]:
# check task classification paired
splitted_data_path = paired_output_folder

%run ../data_preparation/check_task_classification_paired.ipynb

  df_test = pd.read_csv(test_file, sep="\t")


train+validate data has 72656 entries
test data has 160362 entries
test data has 82726 TPP1 tasks (seen tcr & seen epitopes).
test data has 76994 TPP2 tasks (unseen tcr & seen epitopes).
test data has 600 TPP3 tasks (unseen tcr & unseen epitope).
test data has 42 TPP4 tasks (seen tcr & unseen epitope).
the train/test ratio is 0.31180423829918724/0.6881957617008128
Classification is correct.
Correctness summary:
is_correct
True    160362
Name: count, dtype: int64


In [197]:
# check task classification beta
splitted_data_path = beta_output_folder

%run ../data_preparation/check_task_classification_beta.ipynb

  df_validate = pd.read_csv(f"{splitted_data_path}/{validation_file_name}", sep="\t")


train data has 757732 entries
test data has 45112 entries
test data has 6272 TPP1 tasks (seen tcr & seen epitopes).
test data has 32753 TPP2 tasks (unseen tcr & seen epitopes).
test data has 5416 TPP3 tasks (unseen tcr & unseen epitope).
test data has 671 TPP4 tasks (seen tcr & unseen epitope).
the train/test ratio is 0.9536092386637667/0.04639076133623327
Classification is correct.
Correctness summary:
is_correct
True    45112
Name: count, dtype: int64


## Upload dataset

In [13]:
import os
print(os.listdir(path_to_data))


['.ipynb_checkpoints', 'validation_prenegsamples.tsv', 'backup_res13042025', 'test.tsv', 'archiv', 'new', 'train.tsv', 'TCRPeg_data', 'validation.tsv', 'train_prenegsamples.tsv', 'test_prenegsamples.tsv']


In [198]:
from dotenv import load_dotenv, find_dotenv
load_dotenv()

# upload paired data
path_to_data = f'{pipeline_data_splitted}/{precision}/paired'
dataset_name = f'paired_{precision}'
#main_project_name = os.getenv("MAIN_PROJECT_NAME")
main_project_name = f"dataset-{precision}"

%run ../upload_datasets.ipynb

uploading dataset to dataset-allele


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33marina-frohofer[0m ([33mba_cancerimmunotherapy[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Adding directory to artifact (./../../../../data/splitted_datasets/allele/paired)... Done. 0.3s


In [202]:
# upload beta data
path_to_data = f'{pipeline_data_splitted}/{precision}/beta'
dataset_name = f'beta_{precision}'

%run ../upload_datasets.ipynb

uploading dataset to dataset-allele


[34m[1mwandb[0m: Adding directory to artifact (./../../../../data/splitted_datasets/allele/beta)... Done. 2.2s


VBox(children=(Label(value='1.245 MB of 177.852 MB uploaded\r'), FloatProgress(value=0.006998664355910054, max…

## Create Embeddings >> ProtBert

In [203]:
import torch
print(torch.cuda.is_available())  # Sollte True zurückgeben
print(torch.version.cuda)  # Sollte die richtige CUDA-Version anzeigen

True
12.4


In [None]:
path_paired_test = f"{pipeline_data_splitted}/{precision}/paired/test.tsv"
path_paired_validation = f"{pipeline_data_splitted}/{precision}/paired/validation.tsv"
path_paired_train = f"{pipeline_data_splitted}/{precision}/paired/train.tsv"
path_beta_test = f"{pipeline_data_splitted}/{precision}/beta/test.tsv"
path_beta_validation = f"{pipeline_data_splitted}/{precision}/beta/validation.tsv"
path_beta_train = f"{pipeline_data_splitted}/{precision}/beta/train.tsv"


path_paired = f"{pipeline_data}/embeddings/temp/{precision}/paired_concatenated.tsv"
create_folders_if_not_exists([os.path.dirname(path_paired)])
df_paired_test = pd.read_csv(path_paired_test, sep="\t", index_col=False)
df_paired_validation = pd.read_csv(path_paired_validation, sep="\t", index_col=False)
df_paired_train = pd.read_csv(path_paired_train, sep="\t", index_col=False)
df_paired = pd.concat([df_paired_test, df_paired_validation, df_paired_train])
df_paired.to_csv(path_paired, sep="\t", index=False)

# paired
#%run ../generateEmbeddingsProtBERT.py paired {path_paired} {pipeline_data}/embeddings/paired/{precision}/TRA_paired_embeddings.npz TRA_CDR3
#%run ../generateEmbeddingsProtBERT.py paired {path_paired} {pipeline_data}/embeddings/paired/{precision}/TRB_paired_embeddings.npz TRB_CDR3
#%run ../generateEmbeddingsProtBERT.py paired {path_paired} {pipeline_data}/embeddings/paired/{precision}/Epitope_paired_embeddings.npz Epitope

path_beta = f"{pipeline_data}/embeddings/temp/{precision}/beta_concatenated.tsv"
create_folders_if_not_exists([os.path.dirname(path_beta)])
df_beta_test = pd.read_csv(path_beta_test, sep="\t", index_col=False)
df_beta_validation = pd.read_csv(path_beta_validation, sep="\t", index_col=False)
df_beta_train = pd.read_csv(path_beta_train, sep="\t", index_col=False)
df_beta = pd.concat([df_beta_test, df_beta_validation, df_beta_train])
df_beta.to_csv(path_beta, sep="\t", index=False)

# beta
%run ../generateEmbeddings.py beta {path_beta} {pipeline_data}/embeddings/beta/{precision}/TRB_beta_embeddings.npz TRB_CDR3
%run ../generateEmbeddings.py beta {path_beta} {pipeline_data}/embeddings/beta/{precision}/Epitope_beta_embeddings.npz Epitope

  df_paired_test = pd.read_csv(path_paired_test, sep="\t", index_col=False)
  df_beta_validation = pd.read_csv(path_beta_validation, sep="\t", index_col=False)


Using GPU: Tesla T4
Loading: Rostlab/prot_t5_xl_half_uniref50-enc
Model is on device: cuda:0
Processing Batch:  0 64
Processing Batch:  64 128
Processing Batch:  128 192
Processing Batch:  192 256
Processing Batch:  256 320
Processing Batch:  320 384
Processing Batch:  384 448
Processing Batch:  448 512
Processing Batch:  512 576
Processing Batch:  576 640
Processing Batch:  640 704
Processing Batch:  704 768
Processing Batch:  768 832
Processing Batch:  832 896
Processing Batch:  896 960
Processing Batch:  960 1024
Processing Batch:  1024 1088
Processing Batch:  1088 1152
Processing Batch:  1152 1216
Processing Batch:  1216 1280
Processing Batch:  1280 1344
Processing Batch:  1344 1408
Processing Batch:  1408 1472
Processing Batch:  1472 1536
Processing Batch:  1536 1600
Processing Batch:  1600 1664
Processing Batch:  1664 1728
Processing Batch:  1728 1792
Processing Batch:  1792 1856
Processing Batch:  1856 1920
Processing Batch:  1920 1984
Processing Batch:  1984 2048
Processing Bat