In [1]:
import os
import pandas as pd

In [2]:
import sys
!"{sys.executable}" -m pip install tidytcells



In [3]:
# set precision of mhc and V/J values (gene or allele)
precision = 'allele'

In [4]:
# this function is not thread safe
def create_folders_if_not_exists(folders):
  for path in folders:
    if not os.path.exists(path):
      os.makedirs(path)

In [5]:
pipeline_data = '../../../../data'
pipeline_data_plain = f'{pipeline_data}/plain_datasets'
pipeline_data_cleaned = f'{pipeline_data}/cleaned_datasets'
pipeline_data_concatenated = f'{pipeline_data}/concatenated_datasets'
pipeline_data_splitted = f'{pipeline_data}/splitted_datasets'
pipeline_data_temp_bucket = f'{pipeline_data}/temp'

pipeline_folders = [pipeline_data, pipeline_data_plain, pipeline_data_cleaned, pipeline_data_concatenated, pipeline_data_splitted, pipeline_data_temp_bucket]

create_folders_if_not_exists(pipeline_folders)

## Data Preparation

### VDJdb

In [27]:
# prepare directories
VDJdb_data_plain = f'{pipeline_data_plain}/VDJdb'
VDJdb_data_cleaned = f'{pipeline_data_cleaned}/VDJdb'
VDJdb_data_fitted = f'{pipeline_data_temp_bucket}/VDJdb'

VDJdb_folders = [VDJdb_data_plain, VDJdb_data_cleaned, VDJdb_data_fitted]
create_folders_if_not_exists(VDJdb_folders)

fitted_beta_file = 'VDJdb_beta_fitted.tsv'
fitted_paired_file = 'VDJdb_paired_fitted.tsv'

In [28]:
# prepare parameters for notebook VDJdb fit data paired
input_file = f'{VDJdb_data_plain}/VDJdb_paired_only.tsv'
path_prefix_fitted = VDJdb_data_fitted
fitted_file = fitted_paired_file

# fit paired VDJdb data
%run ../VDJdb/fit_data_vdjdb_paired.ipynb

In [29]:
# prepare parameters for notebook VDJdb fit data beta
input_file = f'{VDJdb_data_plain}/VDJdb_beta_only.tsv'
path_prefix_fitted = VDJdb_data_fitted
fitted_file = fitted_beta_file

# fit beta VDJdb data
%run ../VDJdb/fit_data_vdjdb_beta.ipynb

In [30]:
# prepare parameters for notebook VDJdb clean data paired
input_file = f'{VDJdb_data_fitted}/{fitted_paired_file}'
cleaned_file_paired = 'VDJdb_cleaned_data_paired.tsv'
output_file = f'{VDJdb_data_cleaned}/{cleaned_file_paired}'

# clean paired VDJdb data
%run ../VDJdb/clean_data_vdjdb_paired.ipynb

MHC Class I has 27414 entries
whole dataframe has 28119 entries
filtered to only use MHC Class I. Length of dataset: 27414


In [31]:
# prepare parameters for notebook VDJdb clean data beta
input_file = f'{VDJdb_data_fitted}/{fitted_beta_file}'
cleaned_file_beta = 'VDJdb_cleaned_data_beta.tsv'
output_file = f'{VDJdb_data_cleaned}/{cleaned_file_beta}'

# clean beta VDJdb data
%run ../VDJdb/clean_data_vdjdb_beta.ipynb

MHC Class I has 46507 entries
whole dataframe has 49042 entries
filtered to only use MHC Class I. Length of dataset: 46507


In [32]:
VDJdb_cleaned_beta_output = f'{VDJdb_data_cleaned}/{cleaned_file_beta}'
VDJdb_cleaned_paired_output = f'{VDJdb_data_cleaned}/{cleaned_file_paired}'

In [12]:
# prepare parameters for concatenation
custom_dataset_path = f'{pipeline_data_concatenated}/{precision}/'

vdjdb_beta_read_path = VDJdb_cleaned_beta_output
vdjdb_paired_read_path = VDJdb_cleaned_paired_output

output_file_beta = 'beta_concatenated_test.tsv'
output_file_paired = 'paired_concatenated_test.tsv'

create_folders_if_not_exists([custom_dataset_path])

%run ../concatDatasets_onlytest.ipynb

length of beta_df: 46507




The following script removes a lot of rows. They are kept and some of them get added again later
distinct entries (all columns, keep=first). 7188 entries removed.
removed all duplicates (CDR3, Epitope) from distinct values (most_important_columns, keep=False). 1435 entries removed.
beta removed entries df length: 1435


Number of groups formed: 655
1435 can be re-added to the no-duplicated dataframe
from the plain dataset which has 46507 entries, 7188 entries have been removed.
for beta dataset :
size difference is: 7188
  39319 information score cleaned: 6.0
  46507 information score dropout: 6.0
✅ Nach Duplikat-Filter (Train/Val): final_beta_df enthält 9105 Einträge.
final_beta_df length = 9105
length of paired_df: 27414




The following script removes a lot of rows. They are kept and some of them get added again later
distinct entries (all columns, keep=first). 687 entries removed.
removed all duplicates from distinct values (cultivated columns, keep=False). 246 entries removed.
paired removed entries df length: 246


246 can be re-added to the no-duplicated dataframe
from the plain dataset which has 27414 entries, 687 entries have been removed.
for paired dataset:
size difference is: 687
  26727 information score cleaned: 8.976241254162458
  27414 information score dropout: 8.975888232290071
final_paired_df length: 26727


In [13]:
import shutil
import os

# Define source folder where the files are currently stored
source_folder = f'{pipeline_data_concatenated}/{precision}/'

# Define file names
output_file_beta = 'beta_concatenated_test.tsv'
output_file_paired = 'paired_concatenated_test.tsv'

# Define destination folders
destination_beta_folder = f'{pipeline_data_splitted}/{precision}/beta/'
destination_paired_folder = f'{pipeline_data_splitted}/{precision}/paired/'

# Ensure destination folders exist
os.makedirs(destination_beta_folder, exist_ok=True)
os.makedirs(destination_paired_folder, exist_ok=True)

# Copy files
shutil.copy(os.path.join(source_folder, output_file_beta), os.path.join(destination_beta_folder, 'test_prenegsamples.tsv'))
shutil.copy(os.path.join(source_folder, output_file_paired), os.path.join(destination_paired_folder, 'test_prenegsamples.tsv'))

print(f'Beta file copied successfully to {destination_beta_folder}test_prenegsamples.tsv')
print(f'Paired file copied successfully to {destination_paired_folder}test_prenegsamples.tsv')

Beta file copied successfully to ../../../../data/splitted_datasets/allele/beta/test_prenegsamples.tsv
Paired file copied successfully to ../../../../data/splitted_datasets/allele/paired/test_prenegsamples.tsv


In [14]:
# Define file paths
beta_file_path = f'{pipeline_data_splitted}/{precision}/beta/test_prenegsamples.tsv'
paired_file_path = f'{pipeline_data_splitted}/{precision}/paired/test_prenegsamples.tsv'

# Load beta dataset
beta_df = pd.read_csv(beta_file_path, sep='\t')

# Load paired dataset
paired_df = pd.read_csv(paired_file_path, sep='\t')

# Calculate unique values for beta dataset
unique_tcr_beta = beta_df['TRB_CDR3'].nunique()
unique_epitope_beta = beta_df['Epitope'].nunique()

# Calculate unique values for paired dataset
unique_tcr_paired = paired_df['TRB_CDR3'].nunique()
unique_epitope_paired = paired_df['Epitope'].nunique()

# Print results for beta dataset
print("\nBeta Dataset:")
print(f"- Unique TCRs: {unique_tcr_beta}")
print(f"- Unique Epitope: {unique_epitope_beta}")

# Print results for paired dataset
print("\nPaired Dataset:")
print(f"- Unique TCRs: {unique_tcr_paired}")
print(f"- Unique Epitope: {unique_epitope_paired}")


Beta Dataset:
- Unique TCRs: 8612
- Unique Epitope: 293

Paired Dataset:
- Unique TCRs: 21101
- Unique Epitope: 825


In [124]:
import pandas as pd

# Pfade
train_path = f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv"
val_path = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"
test_path = f"{pipeline_data_splitted}/{precision}/beta/test_prenegsamples.tsv"

# Einlesen
train_df = pd.read_csv(train_path, sep="\t")
val_df = pd.read_csv(val_path, sep="\t")
test_df = pd.read_csv(test_path, sep="\t")

# Kombiniere bekannte TCRs & Epitopes aus Train + Validation
known_tcrs = set(train_df["TRB_CDR3"]) | set(val_df["TRB_CDR3"])
known_epitopes = set(train_df["Epitope"]) | set(val_df["Epitope"])

# Task-Tagging-Funktion
def calculate_task(row):
    epitope_exists = row['Epitope'] in known_epitopes
    trb_cdr3_exists = row['TRB_CDR3'] in known_tcrs

    if epitope_exists and trb_cdr3_exists:
        return 'TPP1'
    elif epitope_exists and not trb_cdr3_exists:
        return 'TPP2'
    elif not epitope_exists and not trb_cdr3_exists:
        return 'TPP3'
    elif not epitope_exists and trb_cdr3_exists:
        return 'TPP4'
    return "UNDEFINED"

# TPP-Spalte hinzufügen
test_df["task"] = test_df.apply(calculate_task, axis=1)

# Ergebnis anzeigen
print(test_df["task"].value_counts())

# speichern
test_df.to_csv(test_path, sep="\t", index=False)

  val_df = pd.read_csv(val_path, sep="\t")


task
TPP2    6803
TPP1    1129
TPP3    1047
TPP4     126
Name: count, dtype: int64


## Negative Data

In [62]:
#Daten einlesen

combined_donors_path = f'{pipeline_data_plain}/10x/combined_donors_consensus_annotations.csv'
all_donors_consensus = pd.read_csv(combined_donors_path, sep=',')

print("Consensus: ", all_donors_consensus.head())

all_donors_meta_path = f'{pipeline_data_plain}/10x/meta.csv'
all_donors_meta = pd.read_csv(all_donors_meta_path, sep=',')

print("Meta: ", all_donors_meta.head())

Consensus:                 barcode   donor  \
0   AAACCTGAGACAAAGG-4  donor1   
1  AAACCTGAGACTGTAA-34  donor1   
2   AAACCTGAGAGCCCAA-5  donor1   
3  AAACCTGAGAGCTGCA-24  donor1   
4   AAACCTGAGAGGGATA-8  donor1   

                                  cell_clono_cdr3_aa  \
0  TRA:CAASVSIWTGTASKLTF;TRA:CAAWDMEYGNKLVF;TRB:C...   
1                                    TRB:CASDTPVGQFF   
2                 TRA:CASYTDKLIF;TRB:CASSGGSISTDTQYF   
3                                 TRB:CASSGGQSSYEQYF   
4          TRA:CAASGYGNTGRRALTF;TRB:CASSQDPAGGYNEQFF   

                                  cell_clono_cdr3_nt     CD3  CD19  CD45RA  \
0  TRA:TGTGCAGCAAGCGTTAGTATTTGGACCGGCACTGCCAGTAAA...  2125.0   0.0   912.0   
1              TRB:TGTGCCAGCGATACCCCGGTTGGGCAGTTCTTC  1023.0   0.0  2028.0   
2  TRA:TGTGCTTCCTACACCGACAAGCTCATCTTT;TRB:TGCGCCA...  1598.0   3.0  3454.0   
3     TRB:TGCGCCAGCAGTGGCGGACAGAGCTCCTACGAGCAGTACTTC   298.0   1.0   880.0   
4  TRA:TGTGCAGCAAGCGGGTATGGAAACACGGGCAGGAGAGCACTT...  10

  all_donors_meta = pd.read_csv(all_donors_meta_path, sep=',')


### Beta

In [66]:
#Dieser Code für ganzen Datensatz laufen lassen
import re
import pandas as pd

# Festlegen der Batch-Größe für die Verarbeitung
batch_size = 1000  # Passe diese Zahl je nach Speicherressourcen an

# Identifizieren von Epitope-Spalten, aber ohne "NR(B0801)_AAKGRGAAL_NC_binder"
epitope_columns = [col for col in all_donors_consensus.columns if '_binder' in col and col != "NR(B0801)_AAKGRGAAL_NC_binder"]

# Liste für alle Batch-Ergebnisse
all_batches = []

# Verarbeite `all_donors_consensus` in Batches
for batch_start in range(0, len(all_donors_consensus), batch_size):
    print("Batch Start: ", batch_start)
    # Batch definieren
    batch = all_donors_consensus.iloc[batch_start:batch_start + batch_size]
    
    # Filtern auf Zeilen, die 'TRB:' enthalten
    batch_trb = batch[batch['cell_clono_cdr3_aa'].str.contains("TRB:", na=False)]

    # Liste, um Zeilen für diesen Batch zu speichern
    expanded_rows = []
    
    # Iteriere durch jede Zeile im Batch
    for _, row in batch_trb.iterrows():
        for col in epitope_columns:
            # Extrahiere MHC und Epitope
            match = re.match(r'([A-Z0-9]+)_([A-Z]+)_.*_binder', col)
            if match:
                mhc_raw, epitope = match.groups()
                mhc_formatted = f'HLA-{mhc_raw[0]}*{mhc_raw[1:3]}:{mhc_raw[3:]}'

                # Füge `Epitope` und `MHC` zur Zeile hinzu
                new_row = row.copy()
                new_row['Epitope'] = epitope
                new_row['MHC'] = mhc_formatted

                # Füge neue Zeile zur Batch-Liste hinzu
                expanded_rows.append(new_row)
    
    # Erstelle einen DataFrame aus dem Batch
    batch_df = pd.DataFrame(expanded_rows)
    all_batches.append(batch_df)  # Speichere den Batch in der Liste

# Kombiniere alle Batch-Ergebnisse zu einem DataFrame
expanded_df = pd.concat(all_batches, ignore_index=True)

# Nur die TRB-Chain-Einträge in `all_donors_meta` beibehalten
all_donors_meta_trb = all_donors_meta[all_donors_meta['chain'] == 'TRB']

# Zusammenführen der beiden DataFrames basierend auf der 'barcode' Spalte
merged_df = pd.merge(all_donors_meta_trb, expanded_df[['barcode', 'Epitope', 'MHC']], on='barcode', how='inner')

# Spalten umbenennen und Format anpassen
merged_df = merged_df.rename(columns={
    'barcode': 'TCR_name',
    'v_gene': 'TRBV',
    'j_gene': 'TRBJ',
    'c_gene': 'TRBC',
    'cdr3': 'TRB_CDR3'
})

# Fehlende Spalten auffüllen
desired_columns = ['TCR_name', 'TRBV', 'TRBJ', 'TRB_CDR3', 'TRBC', 'Epitope', 'MHC', 'Binding', 'task']
for col in desired_columns:
    if col not in merged_df.columns:
        merged_df[col] = 'nan' if col == 'task' else '0'

# Nur die gewünschten Spalten beibehalten und Zeilen mit `None` in `TRB_CDR3` entfernen
final_df = merged_df[desired_columns]
final_df = final_df[final_df['TRB_CDR3'] != 'None']

final_df = final_df[final_df['TRB_CDR3'].notna() & (final_df['TRB_CDR3'] != '')]

# Ausgabe des ersten Teils des Ergebnisses zur Überprüfung
print(final_df.head())

# Speichern des kombinierten DataFrames
output_path = f'{pipeline_data_plain}/10x/combined_output_with_epitope_mhc_TRB_only_expanded-all.csv'
final_df.to_csv(output_path, index=False)


Batch Start:  0
Batch Start:  1000
Batch Start:  2000
Batch Start:  3000
Batch Start:  4000
Batch Start:  5000
Batch Start:  6000
Batch Start:  7000
Batch Start:  8000
Batch Start:  9000
Batch Start:  10000
Batch Start:  11000
Batch Start:  12000
Batch Start:  13000
Batch Start:  14000
Batch Start:  15000
Batch Start:  16000
Batch Start:  17000
Batch Start:  18000
Batch Start:  19000
Batch Start:  20000
Batch Start:  21000
Batch Start:  22000
Batch Start:  23000
Batch Start:  24000
Batch Start:  25000
Batch Start:  26000
Batch Start:  27000
Batch Start:  28000
Batch Start:  29000
Batch Start:  30000
Batch Start:  31000
Batch Start:  32000
Batch Start:  33000
Batch Start:  34000
Batch Start:  35000
Batch Start:  36000
Batch Start:  37000
Batch Start:  38000
Batch Start:  39000
Batch Start:  40000
Batch Start:  41000
Batch Start:  42000
Batch Start:  43000
Batch Start:  44000
Batch Start:  45000
Batch Start:  46000
Batch Start:  47000
Batch Start:  48000
Batch Start:  49000
Batch Start: 

### Beta Samples generieren für Test File

### Task Klassifikation an Validation Angleichung

#### Für Test File

In [182]:
# prepare parameters for beta dataset
output_train_path = f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv"
output_val_path = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"
test_output_path = f'{pipeline_data_splitted}/{precision}/beta/new/test.tsv'

read_path_train = f"{pipeline_data_splitted}/{precision}/beta/train_prenegsamples.tsv" 
read_path_test = f'{pipeline_data_splitted}/{precision}/beta/test_prenegsamples.tsv' 
read_path_validation = f"{pipeline_data_splitted}/{precision}/beta/validation_prenegsamples.tsv"
temp_path = f'{pipeline_data_temp_bucket}/negative_samples/beta/'
output_path = f"{pipeline_data_splitted}/{precision}/beta/new/negatives"
train_output_name = "train_neg_tpp2.tsv"
validation_output_name = "val_neg_tpp2.tsv"
test_output_name = "test_neg_tpp2.tsv"

create_folders_if_not_exists([temp_path])

%run ../negative_samples/negative_samples_beta_tpp_test.ipynb

Using device: cuda:0
Loading: Rostlab/prot_t5_xl_half_uniref50-enc


In [183]:
import pandas as pd

# --- Dateipfade ---
read_path_val = f"{pipeline_data_splitted}/{precision}/beta/new/negatives/{validation_output_name}"
read_path_test = f"{pipeline_data_splitted}/{precision}/beta/new/negatives/{test_output_name}"

# --- Dateien laden ---
val_neg = pd.read_csv(read_path_val, sep='\t')
test_neg = pd.read_csv(read_path_test, sep='\t')

# --- Spalte "source" setzen ---
for df in [val_neg, test_neg]:
    df["source"] = "generated"

# --- Zurückspeichern ---
val_neg.to_csv(read_path_val, sep='\t', index=False)
test_neg.to_csv(read_path_test, sep='\t', index=False)

print("✅ Spalte 'source' auf 'generated' gesetzt und gespeichert.")

✅ Spalte 'source' auf 'generated' gesetzt und gespeichert.


In [184]:
import pandas as pd

# --- Pfade ---
train_path = f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv"
val_path = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"
test_path = f"{pipeline_data_splitted}/{precision}/beta/new/test.tsv"

neg_val_path = f"{output_path}/{validation_output_name}"
neg_test_path = f"{output_path}/{test_output_name}"

# --- Daten laden ---
train_df = pd.read_csv(train_path, sep='\t')
val_df = pd.read_csv(val_path, sep='\t')
test_df = pd.read_csv(test_path, sep='\t')

neg_val_df = pd.read_csv(neg_val_path, sep='\t')
neg_test_df = pd.read_csv(neg_test_path, sep='\t')

# --- Existing train/val/test Kontext für Klassifizierung ---
trainval_tcrs = set(pd.concat([train_df, val_df])['TRB_CDR3'])
trainval_epitopes = set(pd.concat([train_df, val_df])['Epitope'])

# --- Klassifikationsfunktion ---
def classify_task(tcr, epitope):
    seen_tcr = tcr in trainval_tcrs
    seen_epi = epitope in trainval_epitopes
    if seen_tcr and seen_epi:
        return 'TPP1'
    elif not seen_tcr and seen_epi:
        return 'TPP2'
    elif not seen_tcr and not seen_epi:
        return 'TPP3'
    elif seen_tcr and not seen_epi:
        return 'TPP4'

# --- Tasks für neue Negatives zuweisen ---
neg_val_df['task_predicted'] = neg_val_df.apply(lambda row: classify_task(row['TRB_CDR3'], row['Epitope']), axis=1)
neg_test_df['task_predicted'] = neg_test_df.apply(lambda row: classify_task(row['TRB_CDR3'], row['Epitope']), axis=1)

# --- Prüfen wie viele wirklich TPP3 sind ---
print("\n📊 Predicted Tasks Verteilung für neue Validation-Negative:")
print(neg_val_df['task_predicted'].value_counts())

print("\n📊 Predicted Tasks Verteilung für neue Test-Negative:")
print(neg_test_df['task_predicted'].value_counts())

# --- Prüfen auf Duplikate (Epitope, TCR) gegen bestehende Beispiele ---
used_pairs = set(pd.concat([train_df, val_df, test_df])[['TRB_CDR3', 'Epitope']].apply(tuple, axis=1))

neg_val_pairs = set(neg_val_df[['TRB_CDR3', 'Epitope']].apply(tuple, axis=1))
neg_test_pairs = set(neg_test_df[['TRB_CDR3', 'Epitope']].apply(tuple, axis=1))

dups_val = neg_val_pairs.intersection(used_pairs)
dups_test = neg_test_pairs.intersection(used_pairs)

print(f"\\n🔎 Validation: {len(dups_val)} Duplikate gefunden.")
print(f"🔎 Test: {len(dups_test)} Duplikate gefunden.")

  train_df = pd.read_csv(train_path, sep='\t')
  val_df = pd.read_csv(val_path, sep='\t')



📊 Predicted Tasks Verteilung für neue Validation-Negative:
task_predicted
TPP1    16303
Name: count, dtype: int64

📊 Predicted Tasks Verteilung für neue Test-Negative:
task_predicted
TPP2    5089
TPP1    1693
Name: count, dtype: int64
\n🔎 Validation: 4855 Duplikate gefunden.
🔎 Test: 3583 Duplikate gefunden.


In [185]:
# --- Lade bestehende Testdaten ---
test_path = f"{pipeline_data_splitted}/{precision}/beta/new/test.tsv"
test_df = pd.read_csv(test_path, sep='\t')

# --- Lade neu generierte Negative ---
neg_test_path = f"{output_path}/{test_output_name}"
neg_test_df = pd.read_csv(neg_test_path, sep='\t')

# --- Positive Paare (TRB_CDR3, Epitope) aus bestehenden Daten ---
existing_pairs = set(pd.concat([
    pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv", sep='\t'),
    pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv", sep='\t'),
    test_df
])[['TRB_CDR3', 'Epitope']].apply(tuple, axis=1))

# --- Prüfe neue Negative auf Duplikate ---
neg_test_df['pair'] = list(zip(neg_test_df['TRB_CDR3'], neg_test_df['Epitope']))

# --- Nur behalten, was kein Duplikat ist ---
neg_test_df_clean = neg_test_df[~neg_test_df['pair'].isin(existing_pairs)].drop(columns=['pair'])

print(f"✅ {len(neg_test_df_clean)} saubere neue TPP2-Negative im Test übrig.")

# --- Test aktualisieren ---
test_df_final = pd.concat([test_df, neg_test_df_clean], ignore_index=True)
test_df_final.to_csv(test_path, sep='\t', index=False)

print("✅ Testset erfolgreich aktualisiert.")

  pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv", sep='\t'),
  pd.read_csv(f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv", sep='\t'),


✅ 3188 saubere neue TPP2-Negative im Test übrig.
✅ Testset erfolgreich aktualisiert.


#### Für Validation File

In [198]:
# prepare parameters for beta dataset
output_train_path = f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv"
output_val_path = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"
test_output_path = f'{pipeline_data_splitted}/{precision}/beta/new/test.tsv'

read_path_train = f"{pipeline_data_splitted}/{precision}/beta/train_prenegsamples.tsv" 
read_path_test = f'{pipeline_data_splitted}/{precision}/beta/test_prenegsamples.tsv' 
read_path_validation = f"{pipeline_data_splitted}/{precision}/beta/validation_prenegsamples.tsv" 
temp_path = f'{pipeline_data_temp_bucket}/negative_samples/beta/'
output_path = f"{pipeline_data_splitted}/{precision}/beta/new/negatives"
train_output_name = "train_neg_tpp1.tsv"
validation_output_name = "val_neg_tpp1.tsv"
test_output_name = "test_neg_tpp1.tsv"

create_folders_if_not_exists([temp_path])

%run ../negative_samples/negative_samples_beta_task_val.ipynb

  beta_train_df = pd.read_csv(read_path_train, sep="\t")


Using device: cuda:0
Loading: Rostlab/prot_t5_xl_half_uniref50-enc


In [199]:
import pandas as pd

# --- Dateipfade ---
read_path_val = f"{pipeline_data_splitted}/{precision}/beta/new/negatives/{validation_output_name}"
read_path_test = f"{pipeline_data_splitted}/{precision}/beta/new/negatives/{test_output_name}"

# --- Dateien laden ---
val_neg = pd.read_csv(read_path_val, sep='\t')
test_neg = pd.read_csv(read_path_test, sep='\t')

# --- Spalte "source" setzen ---
for df in [val_neg, test_neg]:
    df["source"] = "generated"

# --- Zurückspeichern ---
val_neg.to_csv(read_path_val, sep='\t', index=False)
test_neg.to_csv(read_path_test, sep='\t', index=False)

print("✅ Spalte 'source' auf 'generated' gesetzt und gespeichert.")

✅ Spalte 'source' auf 'generated' gesetzt und gespeichert.


In [200]:
import pandas as pd

# --- Pfade ---
train_path = f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv"
val_path = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"
test_path = f"{pipeline_data_splitted}/{precision}/beta/new/test.tsv"

neg_val_path = f"{output_path}/{validation_output_name}"
neg_test_path = f"{output_path}/{test_output_name}"

# --- Daten laden ---
train_df = pd.read_csv(train_path, sep='\t')
val_df = pd.read_csv(val_path, sep='\t')
test_df = pd.read_csv(test_path, sep='\t')

neg_val_df = pd.read_csv(neg_val_path, sep='\t')
neg_test_df = pd.read_csv(neg_test_path, sep='\t')

# --- Paare erstellen ---
used_pairs = set(pd.concat([train_df, val_df, test_df])[['TRB_CDR3', 'Epitope']].apply(tuple, axis=1))

neg_val_df['pair'] = list(zip(neg_val_df['TRB_CDR3'], neg_val_df['Epitope']))
neg_test_df['pair'] = list(zip(neg_test_df['TRB_CDR3'], neg_test_df['Epitope']))

# --- Nur behalten, was kein Duplikat ist ---
neg_val_df = neg_val_df[~neg_val_df['pair'].isin(used_pairs)].drop(columns=['pair'])
neg_test_df = neg_test_df[~neg_test_df['pair'].isin(used_pairs)].drop(columns=['pair'])

# --- Existing train/val Kontext für Klassifizierung 
trainval_tcrs = set(pd.concat([train_df, test_df])['TRB_CDR3'])
trainval_epitopes = set(pd.concat([train_df, test_df])['Epitope'])

# --- Klassifikationsfunktion ---
def classify_task(tcr, epitope):
    seen_tcr = tcr in trainval_tcrs
    seen_epi = epitope in trainval_epitopes
    if seen_tcr and seen_epi:
        return 'TPP1'
    elif not seen_tcr and seen_epi:
        return 'TPP2'
    elif not seen_tcr and not seen_epi:
        return 'TPP3'
    elif seen_tcr and not seen_epi:
        return 'TPP4'

# --- Tasks für neue saubere Negatives zuweisen ---
neg_val_df['task_predicted'] = neg_val_df.apply(lambda row: classify_task(row['TRB_CDR3'], row['Epitope']), axis=1)
neg_test_df['task_predicted'] = neg_test_df.apply(lambda row: classify_task(row['TRB_CDR3'], row['Epitope']), axis=1)

# --- Prüfen wie viele wirklich TPP3 sind ---
print("\n📊 Predicted Tasks Verteilung für neue Validation-Negative:")
print(neg_val_df['task_predicted'].value_counts())

print("\n📊 Predicted Tasks Verteilung für neue Test-Negative:")
print(neg_test_df['task_predicted'].value_counts())

  train_df = pd.read_csv(train_path, sep='\t')
  val_df = pd.read_csv(val_path, sep='\t')



📊 Predicted Tasks Verteilung für neue Validation-Negative:
task_predicted
TPP1    13796
Name: count, dtype: int64

📊 Predicted Tasks Verteilung für neue Test-Negative:
task_predicted
TPP1    939
Name: count, dtype: int64


In [201]:
# --- Pfade ---
train_path = f"{pipeline_data_splitted}/{precision}/beta/new/train.tsv"
val_path = f"{pipeline_data_splitted}/{precision}/beta/new/validation.tsv"
test_path = f"{pipeline_data_splitted}/{precision}/beta/new/test.tsv"
neg_test_path = f"{output_path}/{test_output_name}"

# --- Daten laden ---
train_df = pd.read_csv(train_path, sep='\t')
val_df = pd.read_csv(val_path, sep='\t')
test_df = pd.read_csv(test_path, sep='\t')
neg_test_df = pd.read_csv(neg_test_path, sep='\t')

# --- Paare erstellen ---
existing_trainval_pairs = set(pd.concat([train_df, val_df])[['TRB_CDR3', 'Epitope']].apply(tuple, axis=1))
existing_test_pairs = set(test_df[['TRB_CDR3', 'Epitope']].apply(tuple, axis=1))

neg_val_df['pair'] = list(zip(neg_val_df['TRB_CDR3'], neg_val_df['Epitope']))

# --- Step 1: Nur neue Paare, die noch nicht in Train+Val existieren
neg_not_in_trainval = neg_val_df[~neg_val_df['pair'].isin(existing_trainval_pairs)]

# --- Step 2: Von denen nur die behalten, die auch NICHT im Test existieren
neg_safe_for_val = neg_not_in_trainval[~neg_not_in_trainval['pair'].isin(existing_test_pairs)].drop(columns=['pair'])

# --- Step 3: Nur TPP3 Negatives auswählen ---
neg_safe_for_val_tpp3 = neg_safe_for_val[neg_safe_for_val['task_predicted'] == 'TPP1']

print(f"✅ {len(neg_safe_for_val_tpp3)} saubere neue TPP2-Negative für Validation übrig.")

# --- Validation aktualisieren ---
val_df_final = pd.concat([val_df, neg_safe_for_val_tpp3], ignore_index=True)
val_df_final.to_csv(val_path, sep='\t', index=False)

print("✅ Validation-Set erfolgreich aktualisiert nur mit TPP3-Negatives.")

  train_df = pd.read_csv(train_path, sep='\t')
  val_df = pd.read_csv(val_path, sep='\t')


✅ 13796 saubere neue TPP2-Negative für Validation übrig.
✅ Validation-Set erfolgreich aktualisiert nur mit TPP3-Negatives.


### rauslöschen von zu vielen non-binders in TPPs

In [202]:
import pandas as pd

# --- Validation laden ---
val_path = f"{pipeline_data_splitted}/{precision}/beta/new/test.tsv"
val_df = pd.read_csv(val_path, sep='\t')

# --- TPP1-Daten herausfiltern ---
tpp1_df = val_df[val_df['task'] == 'TPP1']

# --- Non-Binder und Binder trennen ---
tpp1_binder = tpp1_df[tpp1_df['Binding'] == 1]
tpp1_nonbinder = tpp1_df[tpp1_df['Binding'] == 0]

print(f"🔎 Aktuell TPP1 Validation: {len(tpp1_binder)} Binder und {len(tpp1_nonbinder)} Non-Binder")

# --- Zielverhältnis ---
target_binder_percentage = 14 

# --- Zielanzahl berechnen ---
target_total = int(len(tpp1_binder) / (target_binder_percentage / 100))
target_nonbinder = target_total - len(tpp1_binder)
print(f"🎯 Ziel: {len(tpp1_binder)} Binder und {target_nonbinder} Non-Binder")

# --- Nach Quelle sortieren ---
generated_neg = tpp1_nonbinder[tpp1_nonbinder['source'] == 'generated']
tenx_neg = tpp1_nonbinder[tpp1_nonbinder['source'] == '10X']
other_neg = tpp1_nonbinder[~tpp1_nonbinder['source'].isin(['generated', '10X'])]

# --- Sampling in Reihenfolge: generated → 10X → andere ---
neg_pool = pd.concat([generated_neg, tenx_neg, other_neg], ignore_index=True)

if len(neg_pool) < target_nonbinder:
    print(f"⚠️ Nur {len(neg_pool)} Non-Binder verfügbar (benötigt: {target_nonbinder}) – alle verwendet.")
    tpp1_nonbinder_reduced = neg_pool
else:
    tpp1_nonbinder_reduced = neg_pool.sample(n=target_nonbinder, random_state=42)

# --- Neues TPP1-Subset bauen ---
tpp1_final = pd.concat([tpp1_binder, tpp1_nonbinder_reduced], ignore_index=True)

print(f"✅ Neue TPP1 Größe: {len(tpp1_final)} Beispiele (Binder: {tpp1_final['Binding'].sum()}, Non-Binder: {len(tpp1_final) - tpp1_final['Binding'].sum()})")

# --- Rest der Validation behalten ---
val_df_rest = val_df[val_df['task'] != 'TPP1']

# --- Neues Validation-Set zusammenbauen ---
val_df_new = pd.concat([val_df_rest, tpp1_final], ignore_index=True)

# --- Speichern ---
val_df_new.to_csv(val_path, sep='\t', index=False)

print("✅ Validation-Set erfolgreich aktualisiert mit neuem TPP1-Verhältnis und bevorzugtem Entfernen von 'generated'.")


🔎 Aktuell TPP1 Validation: 2541 Binder und 22422 Non-Binder
🎯 Ziel: 2541 Binder und 15609 Non-Binder
✅ Neue TPP1 Größe: 18150 Beispiele (Binder: 2541, Non-Binder: 15609)
✅ Validation-Set erfolgreich aktualisiert mit neuem TPP1-Verhältnis und bevorzugtem Entfernen von 'generated'.


In [180]:
import pandas as pd
import os

# --- Pfade ---
datasets = {
    "beta_allele": {
        "train": f'{pipeline_data_splitted}/{precision}/beta/new/train.tsv',
        "test": f'{pipeline_data_splitted}/{precision}/beta/new/test.tsv',
        "validation": f'{pipeline_data_splitted}/{precision}/beta/new/validation.tsv'
    }
}

# === Verarbeitung: positive Bindings markieren ===
for name, paths in datasets.items():
    for split, path in paths.items():
        if os.path.exists(path):
            df = pd.read_csv(path, sep="\t")
            
            # Nur Binding==1 filtern
            mask = df["Binding"] == 1
            df.loc[mask, "source"] = "dataset"
            
            # Datei überschreiben
            df.to_csv(path, sep="\t", index=False)
            print(f"✅ '{split}' aktualisiert: 'source' für {mask.sum()} positive Einträge auf 'dataset' gesetzt.")
        else:
            print(f"⚠️ Datei nicht gefunden: {path}")


  df = pd.read_csv(path, sep="\t")


✅ 'train' aktualisiert: 'source' für 126286 positive Einträge auf 'dataset' gesetzt.
✅ 'test' aktualisiert: 'source' für 8745 positive Einträge auf 'dataset' gesetzt.


  df = pd.read_csv(path, sep="\t")


✅ 'validation' aktualisiert: 'source' für 28266 positive Einträge auf 'dataset' gesetzt.


In [177]:
import pandas as pd
import os

# Pfade
datasets = {
    "train": f'{pipeline_data_splitted}/{precision}/beta/new/train.tsv',
    "validation": f'{pipeline_data_splitted}/{precision}/beta/new/validation.tsv',
    "test": f'{pipeline_data_splitted}/{precision}/beta/new/test.tsv'
}

# Check der source-Verteilung
for split, path in datasets.items():
    if os.path.exists(path):
        df = pd.read_csv(path, sep="\t")
        print(f"\n--- {split.upper()} ---")
        if "source" in df.columns:
            print(df["source"].value_counts(dropna=False))
        else:
            print("⚠️ Keine 'source'-Spalte vorhanden.")
    else:
        print(f"⚠️ Datei nicht gefunden: {path}")


  df = pd.read_csv(path, sep="\t")



--- TRAIN ---
source
10X          315980
generated    315450
dataset      126286
Name: count, dtype: int64

--- VALIDATION ---
source
10X          79110
generated    56915
dataset      28266
Name: count, dtype: int64

--- TEST ---
source
generated    25770
10X          13513
dataset       8745
Name: count, dtype: int64


  df = pd.read_csv(path, sep="\t")


## Task Classification 
The classification in the split notebook correct for positive only data. After adding negative data, some classifications might be wrong.

In [12]:
paired_output_folder = f'{pipeline_data_splitted}/{precision}/paired'
beta_output_folder = f'{pipeline_data_splitted}/{precision}/beta/new'
test_file_name = 'test.tsv'
validation_file_name = 'validation.tsv'
train_file_name = 'train.tsv'

In [203]:
# do the classification for beta data
paired = False
train_data_path = f'{beta_output_folder}/{train_file_name}'
validation_data_path = f'{beta_output_folder}/{validation_file_name}'
test_data_path = f'{beta_output_folder}/{test_file_name}'

%run ../data_preparation/classification.ipynb

  df_train = pd.read_csv(train_data_path, sep="\t")
  df_validation = pd.read_csv(validation_data_path, sep="\t")


test data has 18150 TPP1 tasks (seen tcr & seen epitopes).
test data has 29788 TPP2 tasks (unseen tcr & seen epitopes).
test data has 5375 TPP3 tasks (unseen tcr & unseen epitope).
test data has 813 TPP4 tasks (seen tcr & unseen epitope).


In the next two cells the classification is checked. If the output says "Classification is correct", everything is fine.

In [204]:
# check task classification beta
splitted_data_path = beta_output_folder

%run ../data_preparation/check_task_classification_beta.ipynb

  df_train = pd.read_csv(f"{splitted_data_path}/{train_file_name}", sep="\t")
  df_validate = pd.read_csv(f"{splitted_data_path}/{validation_file_name}", sep="\t")


train data has 755758 entries
test data has 54126 entries
test data has 18150 TPP1 tasks (seen tcr & seen epitopes).
test data has 29788 TPP2 tasks (unseen tcr & seen epitopes).
test data has 5375 TPP3 tasks (unseen tcr & unseen epitope).
test data has 813 TPP4 tasks (seen tcr & unseen epitope).
the train/test ratio is 0.9447080588366893/0.05529194116331073
Classification is correct.
Correctness summary:
is_correct
True    54126
Name: count, dtype: int64


## Upload dataset

In [205]:
import os
print(os.listdir(path_to_data))


NameError: name 'path_to_data' is not defined

In [206]:
from dotenv import load_dotenv, find_dotenv
load_dotenv()

# upload paired data
path_to_data = f'{pipeline_data_splitted}/{precision}/paired'
dataset_name = f'paired_{precision}'
#main_project_name = os.getenv("MAIN_PROJECT_NAME")
main_project_name = f"dataset-{precision}"

%run ../upload_datasets.ipynb

uploading dataset to dataset-allele


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33marina-frohofer[0m ([33mba_cancerimmunotherapy[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Adding directory to artifact (./../../../../data/splitted_datasets/allele/paired)... Done. 0.2s


In [207]:
# upload beta data
path_to_data = f'{pipeline_data_splitted}/{precision}/beta'
dataset_name = f'beta_{precision}'

%run ../upload_datasets.ipynb

uploading dataset to dataset-allele


[34m[1mwandb[0m: Adding directory to artifact (./../../../../data/splitted_datasets/allele/beta)... Done. 2.7s


VBox(children=(Label(value='1.561 MB of 558.558 MB uploaded\r'), FloatProgress(value=0.0027942574582421036, ma…

## Create Embeddings >> ProtBert

In [210]:
import torch
print(torch.cuda.is_available())  # Sollte True zurückgeben
print(torch.version.cuda)  # Sollte die richtige CUDA-Version anzeigen

True
12.4


In [211]:
path_paired_test = f"{pipeline_data_splitted}/{precision}/paired/test.tsv"
path_paired_validation = f"{pipeline_data_splitted}/{precision}/paired/validation.tsv"
path_paired_train = f"{pipeline_data_splitted}/{precision}/paired/train.tsv"
path_beta_test = f"{pipeline_data_splitted}/{precision}/beta/test.tsv"
path_beta_validation = f"{pipeline_data_splitted}/{precision}/beta/validation.tsv"
path_beta_train = f"{pipeline_data_splitted}/{precision}/beta/train.tsv"


path_paired = f"{pipeline_data}/embeddings/temp/{precision}/paired_concatenated.tsv"
create_folders_if_not_exists([os.path.dirname(path_paired)])
df_paired_test = pd.read_csv(path_paired_test, sep="\t", index_col=False)
df_paired_validation = pd.read_csv(path_paired_validation, sep="\t", index_col=False)
df_paired_train = pd.read_csv(path_paired_train, sep="\t", index_col=False)
df_paired = pd.concat([df_paired_test, df_paired_validation, df_paired_train])
df_paired.to_csv(path_paired, sep="\t", index=False)

# paired
#%run ../generateEmbeddingsProtBERT.py paired {path_paired} {pipeline_data}/embeddings/paired/{precision}/TRA_paired_embeddings.npz TRA_CDR3
#%run ../generateEmbeddingsProtBERT.py paired {path_paired} {pipeline_data}/embeddings/paired/{precision}/TRB_paired_embeddings.npz TRB_CDR3
#%run ../generateEmbeddingsProtBERT.py paired {path_paired} {pipeline_data}/embeddings/paired/{precision}/Epitope_paired_embeddings.npz Epitope

path_beta = f"{pipeline_data}/embeddings/temp/{precision}/beta_concatenated.tsv"
create_folders_if_not_exists([os.path.dirname(path_beta)])
df_beta_test = pd.read_csv(path_beta_test, sep="\t", index_col=False)
df_beta_validation = pd.read_csv(path_beta_validation, sep="\t", index_col=False)
df_beta_train = pd.read_csv(path_beta_train, sep="\t", index_col=False)
df_beta = pd.concat([df_beta_test, df_beta_validation, df_beta_train])
df_beta.to_csv(path_beta, sep="\t", index=False)

# beta
%run ../generateEmbeddings.py beta {path_beta} {pipeline_data}/embeddings/beta/{precision}/TRB_beta_embeddings.npz TRB_CDR3
%run ../generateEmbeddings.py beta {path_beta} {pipeline_data}/embeddings/beta/{precision}/Epitope_beta_embeddings.npz Epitope

  df_paired_test = pd.read_csv(path_paired_test, sep="\t", index_col=False)
  df_beta_validation = pd.read_csv(path_beta_validation, sep="\t", index_col=False)
  df_beta_train = pd.read_csv(path_beta_train, sep="\t", index_col=False)


Using GPU: Tesla T4
Loading: Rostlab/prot_t5_xl_half_uniref50-enc
Model is on device: cuda:0
Processing Batch:  0 64
Processing Batch:  64 128


OutOfMemoryError: CUDA out of memory. Tried to allocate 84.00 MiB. GPU 0 has a total capacity of 14.58 GiB of which 37.62 MiB is free. Process 124678 has 5.10 GiB memory in use. Including non-PyTorch memory, this process has 9.44 GiB memory in use. Of the allocated memory 9.18 GiB is allocated by PyTorch, and 145.15 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

Using GPU: Tesla T4
Loading: Rostlab/prot_t5_xl_half_uniref50-enc
Model is on device: cuda:0
Processing Batch:  0 64


OutOfMemoryError: CUDA out of memory. Tried to allocate 44.00 MiB. GPU 0 has a total capacity of 14.58 GiB of which 5.62 MiB is free. Process 124678 has 5.10 GiB memory in use. Including non-PyTorch memory, this process has 9.47 GiB memory in use. Of the allocated memory 9.26 GiB is allocated by PyTorch, and 90.15 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)