In [1]:
import os
import pandas as pd

In [2]:
import sys
!"{sys.executable}" -m pip install tidytcells



In [3]:
# set precision of mhc and V/J values (gene or allele)
precision = 'allele'

In [4]:
# this function is not thread safe
def create_folders_if_not_exists(folders):
  for path in folders:
    if not os.path.exists(path):
      os.makedirs(path)

In [5]:
pipeline_data = '../../../../data'
pipeline_data_plain = f'{pipeline_data}/plain_datasets'
pipeline_data_cleaned = f'{pipeline_data}/cleaned_datasets'
pipeline_data_concatenated = f'{pipeline_data}/concatenated_datasets'
pipeline_data_splitted = f'{pipeline_data}/splitted_datasets'
pipeline_data_temp_bucket = f'{pipeline_data}/temp'

pipeline_folders = [pipeline_data, pipeline_data_plain, pipeline_data_cleaned, pipeline_data_concatenated, pipeline_data_splitted, pipeline_data_temp_bucket]

create_folders_if_not_exists(pipeline_folders)

## Data Preparation

### VDJdb

In [6]:
# prepare directories
VDJdb_data_plain = f'{pipeline_data_plain}/VDJdb'
VDJdb_data_cleaned = f'{pipeline_data_cleaned}/VDJdb'
VDJdb_data_fitted = f'{pipeline_data_temp_bucket}/VDJdb'

VDJdb_folders = [VDJdb_data_plain, VDJdb_data_cleaned, VDJdb_data_fitted]
create_folders_if_not_exists(VDJdb_folders)

fitted_beta_file = 'VDJdb_beta_fitted.tsv'
fitted_paired_file = 'VDJdb_paired_fitted.tsv'

In [7]:
# prepare parameters for notebook VDJdb fit data paired
input_file = f'{VDJdb_data_plain}/VDJdb_paired_only.tsv'
path_prefix_fitted = VDJdb_data_fitted
fitted_file = fitted_paired_file

# fit paired VDJdb data
%run ../VDJdb/fit_data_vdjdb_paired.ipynb

In [8]:
# prepare parameters for notebook VDJdb fit data beta
input_file = f'{VDJdb_data_plain}/VDJdb_beta_only.tsv'
path_prefix_fitted = VDJdb_data_fitted
fitted_file = fitted_beta_file

# fit beta VDJdb data
%run ../VDJdb/fit_data_vdjdb_beta.ipynb

In [9]:
# prepare parameters for notebook VDJdb clean data paired
input_file = f'{VDJdb_data_fitted}/{fitted_paired_file}'
cleaned_file_paired = 'VDJdb_cleaned_data_paired.tsv'
output_file = f'{VDJdb_data_cleaned}/{cleaned_file_paired}'

# clean paired VDJdb data
%run ../VDJdb/clean_data_vdjdb_paired.ipynb

MHC Class I has 27414 entries
whole dataframe has 28119 entries
filtered to only use MHC Class I. Length of dataset: 27414


In [10]:
# prepare parameters for notebook VDJdb clean data beta
input_file = f'{VDJdb_data_fitted}/{fitted_beta_file}'
cleaned_file_beta = 'VDJdb_cleaned_data_beta.tsv'
output_file = f'{VDJdb_data_cleaned}/{cleaned_file_beta}'

# clean beta VDJdb data
%run ../VDJdb/clean_data_vdjdb_beta.ipynb

MHC Class I has 46507 entries
whole dataframe has 49042 entries
filtered to only use MHC Class I. Length of dataset: 46507


In [11]:
VDJdb_cleaned_beta_output = f'{VDJdb_data_cleaned}/{cleaned_file_beta}'
VDJdb_cleaned_paired_output = f'{VDJdb_data_cleaned}/{cleaned_file_paired}'

In [None]:
# prepare parameters for concatenation
custom_dataset_path = f'{pipeline_data_concatenated}/{precision}/'

vdjdb_beta_read_path = VDJdb_cleaned_beta_output
vdjdb_paired_read_path = VDJdb_cleaned_paired_output

output_file_beta = 'beta_concatenated_test.tsv'
output_file_paired = 'paired_concatenated_test.tsv'

create_folders_if_not_exists([custom_dataset_path])

%run ../concatDatasets_onlytest.ipynb

length of beta_df: 46507




The following script removes a lot of rows. They are kept and some of them get added again later
distinct entries (all columns, keep=first). 7188 entries removed.
removed all duplicates (CDR3, Epitope) from distinct values (most_important_columns, keep=False). 1435 entries removed.
beta removed entries df length: 1435


Number of groups formed: 655
1435 can be re-added to the no-duplicated dataframe
from the plain dataset which has 46507 entries, 7188 entries have been removed.
for beta dataset :
size difference is: 7188
  39319 information score cleaned: 6.0
  46507 information score dropout: 6.0
final_beta_df length = 39319
length of paired_df: 27414




The following script removes a lot of rows. They are kept and some of them get added again later
distinct entries (all columns, keep=first). 687 entries removed.
removed all duplicates from distinct values (cultivated columns, keep=False). 246 entries removed.
paired removed entries df length: 246


246 can be re-added to the no-duplicated dataframe
from the plain dataset which has 27414 entries, 687 entries have been removed.
for paired dataset:
size difference is: 687
  26727 information score cleaned: 8.976241254162458
  27414 information score dropout: 8.975888232290071
final_paired_df length: 26727


In [29]:
import shutil
import os

# Define source folder where the files are currently stored
source_folder = f'{pipeline_data_concatenated}/{precision}/'

# Define file names
output_file_beta = 'beta_concatenated_test.tsv'
output_file_paired = 'paired_concatenated_test.tsv'

# Define destination folders
destination_beta_folder = f'{pipeline_data_splitted}/{precision}/beta/'
destination_paired_folder = f'{pipeline_data_splitted}/{precision}/paired/'

# Ensure destination folders exist
os.makedirs(destination_beta_folder, exist_ok=True)
os.makedirs(destination_paired_folder, exist_ok=True)

# Copy files
shutil.copy(os.path.join(source_folder, output_file_beta), os.path.join(destination_beta_folder, 'test_prenegsamples.tsv'))
shutil.copy(os.path.join(source_folder, output_file_paired), os.path.join(destination_paired_folder, 'test_prenegsamples.tsv'))

print(f'Beta file copied successfully to {destination_beta_folder}test_prenegsamples.tsv')
print(f'Paired file copied successfully to {destination_paired_folder}test_prenegsamples.tsv')

Beta file copied successfully to ../../data/splitted_datasets/allele/beta/test_prenegsamples.tsv
Paired file copied successfully to ../../data/splitted_datasets/allele/paired/test_prenegsamples.tsv


In [15]:
# Define file paths
beta_file_path = f'{pipeline_data_splitted}/{precision}/beta/test_prenegsamples.tsv'
paired_file_path = f'{pipeline_data_splitted}/{precision}/paired/test_prenegsamples.tsv'

# Load beta dataset
beta_df = pd.read_csv(beta_file_path, sep='\t')

# Load paired dataset
paired_df = pd.read_csv(paired_file_path, sep='\t')

# Calculate unique values for beta dataset
unique_tcr_beta = beta_df['TRB_CDR3'].nunique()
unique_epitope_beta = beta_df['Epitope'].nunique()

# Calculate unique values for paired dataset
unique_tcr_paired = paired_df['TRB_CDR3'].nunique()
unique_epitope_paired = paired_df['Epitope'].nunique()

# Print results for beta dataset
print("\nBeta Dataset:")
print(f"- Unique TCRs: {unique_tcr_beta}")
print(f"- Unique Epitope: {unique_epitope_beta}")

# Print results for paired dataset
print("\nPaired Dataset:")
print(f"- Unique TCRs: {unique_tcr_paired}")
print(f"- Unique Epitope: {unique_epitope_paired}")


Beta Dataset:
- Unique TCRs: 36039
- Unique Epitope: 1003

Paired Dataset:
- Unique TCRs: 21101
- Unique Epitope: 825


## Negative Data

In [62]:
#Daten einlesen

combined_donors_path = f'{pipeline_data_plain}/10x/combined_donors_consensus_annotations.csv'
all_donors_consensus = pd.read_csv(combined_donors_path, sep=',')

print("Consensus: ", all_donors_consensus.head())

all_donors_meta_path = f'{pipeline_data_plain}/10x/meta.csv'
all_donors_meta = pd.read_csv(all_donors_meta_path, sep=',')

print("Meta: ", all_donors_meta.head())

Consensus:                 barcode   donor  \
0   AAACCTGAGACAAAGG-4  donor1   
1  AAACCTGAGACTGTAA-34  donor1   
2   AAACCTGAGAGCCCAA-5  donor1   
3  AAACCTGAGAGCTGCA-24  donor1   
4   AAACCTGAGAGGGATA-8  donor1   

                                  cell_clono_cdr3_aa  \
0  TRA:CAASVSIWTGTASKLTF;TRA:CAAWDMEYGNKLVF;TRB:C...   
1                                    TRB:CASDTPVGQFF   
2                 TRA:CASYTDKLIF;TRB:CASSGGSISTDTQYF   
3                                 TRB:CASSGGQSSYEQYF   
4          TRA:CAASGYGNTGRRALTF;TRB:CASSQDPAGGYNEQFF   

                                  cell_clono_cdr3_nt     CD3  CD19  CD45RA  \
0  TRA:TGTGCAGCAAGCGTTAGTATTTGGACCGGCACTGCCAGTAAA...  2125.0   0.0   912.0   
1              TRB:TGTGCCAGCGATACCCCGGTTGGGCAGTTCTTC  1023.0   0.0  2028.0   
2  TRA:TGTGCTTCCTACACCGACAAGCTCATCTTT;TRB:TGCGCCA...  1598.0   3.0  3454.0   
3     TRB:TGCGCCAGCAGTGGCGGACAGAGCTCCTACGAGCAGTACTTC   298.0   1.0   880.0   
4  TRA:TGTGCAGCAAGCGGGTATGGAAACACGGGCAGGAGAGCACTT...  10

  all_donors_meta = pd.read_csv(all_donors_meta_path, sep=',')


### Beta

In [66]:
#Dieser Code für ganzen Datensatz laufen lassen
import re
import pandas as pd

# Festlegen der Batch-Größe für die Verarbeitung
batch_size = 1000  # Passe diese Zahl je nach Speicherressourcen an

# Identifizieren von Epitope-Spalten, aber ohne "NR(B0801)_AAKGRGAAL_NC_binder"
epitope_columns = [col for col in all_donors_consensus.columns if '_binder' in col and col != "NR(B0801)_AAKGRGAAL_NC_binder"]

# Liste für alle Batch-Ergebnisse
all_batches = []

# Verarbeite `all_donors_consensus` in Batches
for batch_start in range(0, len(all_donors_consensus), batch_size):
    print("Batch Start: ", batch_start)
    # Batch definieren
    batch = all_donors_consensus.iloc[batch_start:batch_start + batch_size]
    
    # Filtern auf Zeilen, die 'TRB:' enthalten
    batch_trb = batch[batch['cell_clono_cdr3_aa'].str.contains("TRB:", na=False)]

    # Liste, um Zeilen für diesen Batch zu speichern
    expanded_rows = []
    
    # Iteriere durch jede Zeile im Batch
    for _, row in batch_trb.iterrows():
        for col in epitope_columns:
            # Extrahiere MHC und Epitope
            match = re.match(r'([A-Z0-9]+)_([A-Z]+)_.*_binder', col)
            if match:
                mhc_raw, epitope = match.groups()
                mhc_formatted = f'HLA-{mhc_raw[0]}*{mhc_raw[1:3]}:{mhc_raw[3:]}'

                # Füge `Epitope` und `MHC` zur Zeile hinzu
                new_row = row.copy()
                new_row['Epitope'] = epitope
                new_row['MHC'] = mhc_formatted

                # Füge neue Zeile zur Batch-Liste hinzu
                expanded_rows.append(new_row)
    
    # Erstelle einen DataFrame aus dem Batch
    batch_df = pd.DataFrame(expanded_rows)
    all_batches.append(batch_df)  # Speichere den Batch in der Liste

# Kombiniere alle Batch-Ergebnisse zu einem DataFrame
expanded_df = pd.concat(all_batches, ignore_index=True)

# Nur die TRB-Chain-Einträge in `all_donors_meta` beibehalten
all_donors_meta_trb = all_donors_meta[all_donors_meta['chain'] == 'TRB']

# Zusammenführen der beiden DataFrames basierend auf der 'barcode' Spalte
merged_df = pd.merge(all_donors_meta_trb, expanded_df[['barcode', 'Epitope', 'MHC']], on='barcode', how='inner')

# Spalten umbenennen und Format anpassen
merged_df = merged_df.rename(columns={
    'barcode': 'TCR_name',
    'v_gene': 'TRBV',
    'j_gene': 'TRBJ',
    'c_gene': 'TRBC',
    'cdr3': 'TRB_CDR3'
})

# Fehlende Spalten auffüllen
desired_columns = ['TCR_name', 'TRBV', 'TRBJ', 'TRB_CDR3', 'TRBC', 'Epitope', 'MHC', 'Binding', 'task']
for col in desired_columns:
    if col not in merged_df.columns:
        merged_df[col] = 'nan' if col == 'task' else '0'

# Nur die gewünschten Spalten beibehalten und Zeilen mit `None` in `TRB_CDR3` entfernen
final_df = merged_df[desired_columns]
final_df = final_df[final_df['TRB_CDR3'] != 'None']

final_df = final_df[final_df['TRB_CDR3'].notna() & (final_df['TRB_CDR3'] != '')]

# Ausgabe des ersten Teils des Ergebnisses zur Überprüfung
print(final_df.head())

# Speichern des kombinierten DataFrames
output_path = f'{pipeline_data_plain}/10x/combined_output_with_epitope_mhc_TRB_only_expanded-all.csv'
final_df.to_csv(output_path, index=False)


Batch Start:  0
Batch Start:  1000
Batch Start:  2000
Batch Start:  3000
Batch Start:  4000
Batch Start:  5000
Batch Start:  6000
Batch Start:  7000
Batch Start:  8000
Batch Start:  9000
Batch Start:  10000
Batch Start:  11000
Batch Start:  12000
Batch Start:  13000
Batch Start:  14000
Batch Start:  15000
Batch Start:  16000
Batch Start:  17000
Batch Start:  18000
Batch Start:  19000
Batch Start:  20000
Batch Start:  21000
Batch Start:  22000
Batch Start:  23000
Batch Start:  24000
Batch Start:  25000
Batch Start:  26000
Batch Start:  27000
Batch Start:  28000
Batch Start:  29000
Batch Start:  30000
Batch Start:  31000
Batch Start:  32000
Batch Start:  33000
Batch Start:  34000
Batch Start:  35000
Batch Start:  36000
Batch Start:  37000
Batch Start:  38000
Batch Start:  39000
Batch Start:  40000
Batch Start:  41000
Batch Start:  42000
Batch Start:  43000
Batch Start:  44000
Batch Start:  45000
Batch Start:  46000
Batch Start:  47000
Batch Start:  48000
Batch Start:  49000
Batch Start: 

## Unterhalb dieser Zeile gibt es noch eine Datenbereingung, die man zuerst durchführen muss

In [12]:
import pandas as pd

# Pfade definieren
train_file = f'{pipeline_data_splitted}/{precision}/beta/train.tsv'
validation_file = f'{pipeline_data_splitted}/{precision}/beta/validation.tsv'
test_preneg_file = f'{pipeline_data_splitted}/{precision}/beta/test_prenegsamples.tsv'
beta_data_file = f'{pipeline_data_plain}/10x/combined_output_with_epitope_mhc_TRB_only_expanded-all.csv'
vdjdb_beta_read_path = f'{pipeline_data_cleaned}/VDJdb/VDJdb_cleaned_data_beta.tsv'

# Train und Validation laden
train_df = pd.read_csv(train_file, sep='\t')
validation_df = pd.read_csv(validation_file, sep='\t')

# Zusammenführen zu Train+Validation
trainval = pd.concat([train_df, validation_df], ignore_index=True)

# Extrahiere alle TCR-Epitope-Paare aus Train+Validation (nur negative Samples)
trainval_negatives = set(zip(
    trainval.loc[trainval['Binding'] == 0, 'TRB_CDR3'], 
    trainval.loc[trainval['Binding'] == 0, 'Epitope']
))

# Original VDJdb-Testset laden, um TPP3 sicherzustellen
vdjdb_df = pd.read_csv(vdjdb_beta_read_path, sep='\t')
vdjdb_tcrs = set(vdjdb_df['TRB_CDR3'])
vdjdb_epitopes = set(vdjdb_df['Epitope'])

# Negative Daten aus der Beta-Datei laden
beta = pd.read_csv(beta_data_file, sep=',')

# Positive Test-Samples laden
test_preneg = pd.read_csv(test_preneg_file, sep='\t')

# Anzahl positiver Samples
num_test_pos = len(test_preneg)
test_neg_needed = num_test_pos * 5

# Nur negative Samples, die NICHT in Trainval und NICHT in VDJdb vorkommen
beta_negatives = beta[beta['Binding'] == 0].copy()
beta_negatives_filtered = beta_negatives[
    ~beta_negatives[['TRB_CDR3', 'Epitope']].apply(tuple, axis=1).isin(trainval_negatives) & 
    ~beta_negatives['TRB_CDR3'].isin(vdjdb_tcrs) &
    ~beta_negatives['Epitope'].isin(vdjdb_epitopes)
]

# Falls zu wenige negative übrig sind, mit Replacement auffüllen
if len(beta_negatives_filtered) < test_neg_needed:
    test_negatives = beta_negatives_filtered.sample(test_neg_needed, replace=True, random_state=42)
else:
    test_negatives = beta_negatives_filtered.sample(test_neg_needed, random_state=42)

# Positive und negative Samples kombinieren
test_combined = pd.concat([test_preneg, test_negatives], ignore_index=True)

# Speichern des kombinierten Test-Datensatzes
output_dir = f'{pipeline_data_splitted}/{precision}/beta/'
test_combined.to_csv(output_dir + "test.tsv", sep='\t', index=False)

# Berechne Unique Werte
unique_tcr_test = test_combined['TRB_CDR3'].nunique()
unique_epitope_test = test_combined['Epitope'].nunique()

# Berechne Anzahl positiver und negativer Einträge
test_binding_counts = test_combined['Binding'].value_counts()

# Finale Ausgabe
print("\nAlle Datensätze wurden erfolgreich gespeichert.")
print(f"Test: {len(test_combined)} Einträge")
print(f"- Unique TCRs: {unique_tcr_test}")
print(f"- Unique Epitope: {unique_epitope_test}")
print(f"- Positive (binding == 1): {test_binding_counts.get(1, 0)}")
print(f"- Negative (binding == 0): {test_binding_counts.get(0, 0)}")


  train_df = pd.read_csv(train_file, sep='\t')



✅ Alle Datensätze wurden erfolgreich gespeichert.
Test: 235914 Einträge
- Unique TCRs: 96964
- Unique Epitope: 1009
- Positive (binding == 1): 39319
- Negative (binding == 0): 196595


In [12]:
import pandas as pd

# File paths
train_path = '../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../data/splitted_datasets/allele/beta/validation.tsv'
test_path = '../../../../data/splitted_datasets/allele/beta/test.tsv'

# Load datasets
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t')
test_df = pd.read_csv(test_path, sep='\t')

# Relevante Spalten
target_columns = ['Epitope', 'TRB_CDR3']

# Paare aus Train- und Valid-Set
train_pairs = set(map(tuple, train_df[target_columns].values))
valid_pairs = set(map(tuple, valid_df[target_columns].values))

# Paare aus Test-Set
test_pairs = set(map(tuple, test_df[target_columns].values))

# Schnittmengen berechnen
test_in_train = test_pairs & train_pairs
test_in_valid = test_pairs & valid_pairs

# Ausgabe
print(f"Anzahl (Epitope, TRB_CDR3) Paare im Testset, die auch im Trainingsset vorkommen: {len(test_in_train)}")
print(f"Anzahl (Epitope, TRB_CDR3) Paare im Testset, die auch im Validierungsset vorkommen: {len(test_in_valid)}")

  train_df = pd.read_csv(train_path, sep='\t')
  test_df = pd.read_csv(test_path, sep='\t')


Anzahl (Epitope, TRB_CDR3) Paare im Testset, die auch im Trainingsset vorkommen: 14419
Anzahl (Epitope, TRB_CDR3) Paare im Testset, die auch im Validierungsset vorkommen: 23658


In [13]:
import pandas as pd

# File paths
train_path = '../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../data/splitted_datasets/allele/beta/validation.tsv'
test_path = '../../../../data/splitted_datasets/allele/beta/test.tsv'

# Load datasets
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t')
test_df = pd.read_csv(test_path, sep='\t')

# Relevante Spalten
target_columns = ['Epitope', 'TRB_CDR3']

# Sets aus Train und Valid
train_pairs = set(map(tuple, train_df[target_columns].values))
valid_pairs = set(map(tuple, valid_df[target_columns].values))

# Kombinierte Referenzmenge
train_valid_pairs = train_pairs.union(valid_pairs)

# Hole alle Test-Zeilen, deren Paare in Train oder Valid vorkommen
test_duplicates = test_df[test_df[target_columns].apply(tuple, axis=1).isin(train_valid_pairs)]

# Gruppiere nach Binding und zähle
binding_counts = test_duplicates['Binding'].value_counts()

# Ausgabe
print("Verteilung der Binding-Labels unter den Duplikaten im Testset:")
print(binding_counts)


  train_df = pd.read_csv(train_path, sep='\t')
  test_df = pd.read_csv(test_path, sep='\t')


Verteilung der Binding-Labels unter den Duplikaten im Testset:
Binding
1    30214
0     9189
Name: count, dtype: int64


In [14]:
# Speicherpfad
duplicate_path = '../../../../data/splitted_datasets/allele/beta/duplicates.tsv'

# Speichern mit allen Spalten
test_duplicates.to_csv(duplicate_path, sep='\t', index=False)

print(f"\n✅ Duplikate erfolgreich gespeichert unter: {duplicate_path}")
print(f"Anzahl gespeicherter Duplikat-Zeilen: {len(test_duplicates)}")


✅ Duplikate erfolgreich gespeichert unter: ../../../../data/splitted_datasets/allele/beta/duplicates.tsv
Anzahl gespeicherter Duplikat-Zeilen: 39403


In [19]:
import pandas as pd

paired_output_folder = f'{pipeline_data_splitted}/{precision}/paired'
beta_output_folder = f'{pipeline_data_splitted}/{precision}/beta'
splitted_data_path = beta_output_folder
test_file_name = 'test.tsv'
validation_file_name = 'validation.tsv'
train_file_name = 'train.tsv'

# Lade Trainings- und Validierungsdaten, um Referenzmengen zu erstellen
df_train = pd.read_csv(f"{splitted_data_path}/{train_file_name}", sep="\t")
df_validate = pd.read_csv(f"{splitted_data_path}/{validation_file_name}", sep="\t")
df_train = pd.concat([df_train, df_validate])

# Lade die Duplikate aus dem Testset
duplicate_path = f"{splitted_data_path}/duplicates.tsv"
df_duplicates = pd.read_csv(duplicate_path, sep="\t")

# Erstelle Sets für schnelle Nachschlageprüfung
epitopes_in_train = set(df_train['Epitope'])
trb_cdr3_in_train = set(df_train['TRB_CDR3'])

# Task-Logik zur Einordnung der Duplikate
def get_task(row):
    epitope_seen = row['Epitope'] in epitopes_in_train
    tcr_seen = row['TRB_CDR3'] in trb_cdr3_in_train

    if epitope_seen and tcr_seen:
        return 'TPP1'
    elif epitope_seen and not tcr_seen:
        return 'TPP2'
    elif not epitope_seen and not tcr_seen:
        return 'TPP3'
    elif not epitope_seen and tcr_seen:
        return 'TPP4'
    return 'UNKNOWN'

# Aufgabe berechnen
df_duplicates['task'] = df_duplicates.apply(get_task, axis=1)

# Aufgaben zählen
task_counts = df_duplicates['task'].value_counts()

print("Verteilung der Aufgabenklassen in den Duplikaten:")
print(task_counts)


  df_train = pd.read_csv(f"{splitted_data_path}/{train_file_name}", sep="\t")


Verteilung der Aufgabenklassen in den Duplikaten:
task
TPP1    39403
Name: count, dtype: int64


In [28]:
import pandas as pd

# File paths
train_path = '../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../data/splitted_datasets/allele/beta/validation.tsv'
test_path = '../../../../data/splitted_datasets/allele/beta/test_filtered.tsv'

# Lade die Daten
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t')
test_df = pd.read_csv(test_path, sep='\t')

# Relevante Spalten
target_columns = ['Epitope', 'TRB_CDR3']

# Kombinierte Paare aus Train und Valid
train_valid_pairs = set(map(tuple, pd.concat([train_df, valid_df])[target_columns].values))

# Markiere Duplikate mit Binding == 1
is_duplicate_binding1 = test_df[target_columns].apply(tuple, axis=1).isin(train_valid_pairs) & (test_df['Binding'] == 1)

# Filter: nur Zeilen behalten, die NICHT Binding-1-Duplikate sind
test_filtered = test_df[~is_duplicate_binding1]

# Pfad zum Speichern
filtered_path = '../../../../data/splitted_datasets/allele/beta/test_filtered.tsv'
test_filtered.to_csv(filtered_path, sep='\t', index=False)

# Statistik
print(f"\n✅ Gefilterter Testdatensatz gespeichert unter: {filtered_path}")
print(f"Original Testset: {len(test_df)} Einträge")
print(f"Entfernte Binding==1 Duplikate: {is_duplicate_binding1.sum()}")
print(f"Neuer Testset: {len(test_filtered)} Einträge")


  train_df = pd.read_csv(train_path, sep='\t')
  test_df = pd.read_csv(test_path, sep='\t')



✅ Gefilterter Testdatensatz gespeichert unter: ../../../../data/splitted_datasets/allele/beta/test_filtered.tsv
Original Testset: 196511 Einträge
Entfernte Binding==1 Duplikate: 0
Neuer Testset: 196511 Einträge


In [30]:
import pandas as pd

# File paths
train_path = '../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../data/splitted_datasets/allele/beta/validation.tsv'
test_path = '../../../../data/splitted_datasets/allele/beta/test_filtered.tsv'

# Load datasets
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t')
test_df = pd.read_csv(test_path, sep='\t')

# Relevante Spalten
target_columns = ['Epitope', 'TRB_CDR3']

# Sets aus Train und Valid
train_pairs = set(map(tuple, train_df[target_columns].values))
valid_pairs = set(map(tuple, valid_df[target_columns].values))

# Kombinierte Referenzmenge
train_valid_pairs = train_pairs.union(valid_pairs)

# Hole alle Test-Zeilen, deren Paare in Train oder Valid vorkommen
test_duplicates = test_df[test_df[target_columns].apply(tuple, axis=1).isin(train_valid_pairs)]

# Gruppiere nach Binding und zähle
binding_counts = test_duplicates['Binding'].value_counts()

# Ausgabe
print("Verteilung der Binding-Labels unter den Duplikaten im Testset:")
print(binding_counts)


  train_df = pd.read_csv(train_path, sep='\t')
  test_df = pd.read_csv(test_path, sep='\t')


Verteilung der Binding-Labels unter den Duplikaten im Testset:
Series([], Name: count, dtype: int64)


### Paired

In [68]:
import re
import pandas as pd

# Annahme: all_donors_consensus und all_donors_meta sind bereits geladen und gefiltert

# Festlegen der Batch-Größe für die Verarbeitung
batch_size = 1000

# Identifizieren von Epitope-Spalten, aber ohne "NR(B0801)_AAKGRGAAL_NC_binder"
epitope_columns = [col for col in all_donors_consensus.columns if '_binder' in col and col != "NR(B0801)_AAKGRGAAL_NC_binder"]

# Ausgabe-Datei für Batch-Ergebnisse
output_batch_file = f'{pipeline_data_plain}/10x/expanded_batches.csv'

# Stelle sicher, dass die Ausgabedatei leer ist
with open(output_batch_file, 'w') as f:
    f.write('')  # Leere Datei erstellen

# Verarbeite `all_donors_consensus` in Batches
for batch_start in range(0, len(all_donors_consensus), batch_size):
    print(f"Batch Start: {batch_start}")
    # Definiere Batch
    batch = all_donors_consensus.iloc[batch_start:batch_start + batch_size]
    
    # Filtern auf Zeilen, die sowohl 'TRA:' als auch 'TRB:' in 'cell_clono_cdr3_aa' enthalten
    batch_paired = batch[
        batch['cell_clono_cdr3_aa'].str.contains("TRA:", na=False) &
        batch['cell_clono_cdr3_aa'].str.contains("TRB:", na=False)
    ]

    # Liste, um Zeilen für diesen Batch zu speichern
    expanded_rows = []
    
    # Iteriere durch jede Zeile im Batch
    for _, row in batch_paired.iterrows():
        for col in epitope_columns:
            # Extrahiere MHC und Epitope
            match = re.match(r'([A-Z0-9]+)_([A-Z]+)_.*_binder', col)
            if match:
                mhc_raw, epitope = match.groups()
                mhc_formatted = f'HLA-{mhc_raw[0]}*{mhc_raw[1:3]}:{mhc_raw[3:]}'

                # Füge `Epitope` und `MHC` zur Zeile hinzu
                new_row = row.copy()
                new_row['Epitope'] = epitope
                new_row['MHC'] = mhc_formatted

                # Neue Zeile zur Batch-Liste hinzufügen
                expanded_rows.append(new_row)

    # Erstelle einen DataFrame aus dem Batch
    batch_df = pd.DataFrame(expanded_rows)

    # Füge den Batch direkt in die Datei ein
    batch_df.to_csv(output_batch_file, mode='a', index=False, header=not batch_start, sep=',')
    print(f"Batch {batch_start} gespeichert.")

# Laden der gespeicherten Batch-Ergebnisse
expanded_df = pd.read_csv(output_batch_file)

# Nur die Paired-Einträge in `all_donors_meta` beibehalten
# Filtern auf Barcodes, die sowohl eine TRA- als auch eine TRB-Kette haben
paired_barcodes = all_donors_meta.groupby('barcode').filter(
    lambda x: set(x['chain']) == {'TRA', 'TRB'}
)['barcode'].unique()
all_donors_meta_paired = all_donors_meta[all_donors_meta['barcode'].isin(paired_barcodes)]

# Split `all_donors_meta_paired` nach `chain` in separate DataFrames für TRA und TRB
alpha_chain = all_donors_meta_paired[all_donors_meta_paired['chain'] == 'TRA'].rename(
    columns={'v_gene': 'TRAV', 'j_gene': 'TRAJ', 'cdr3': 'TRA_CDR3', 'c_gene': 'TRAC'}
)
beta_chain = all_donors_meta_paired[all_donors_meta_paired['chain'] == 'TRB'].rename(
    columns={'v_gene': 'TRBV', 'j_gene': 'TRBJ', 'cdr3': 'TRB_CDR3', 'c_gene': 'TRBC'}
)

# Zusammenführen von alpha_chain und beta_chain anhand der gemeinsamen 'barcode'-Spalte
paired_meta = pd.merge(alpha_chain, beta_chain, on='barcode', suffixes=('_alpha', '_beta'))

# Zusammenführen von `paired_meta` mit `expanded_df` anhand der 'barcode'-Spalte
merged_df = pd.merge(paired_meta, expanded_df[['barcode', 'Epitope', 'MHC']], on='barcode', how='inner')

# Spalten umbenennen und Format anpassen
merged_df = merged_df.rename(columns={'barcode': 'TCR_name'})

# Fehlende Spalten auffüllen
desired_columns = [
    'TCR_name', 'TRAV', 'TRAJ', 'TRA_CDR3', 'TRBV', 'TRBJ', 'TRB_CDR3', 'TRAC', 'TRBC', 
    'Epitope', 'MHC', 'Binding', 'task'
]
for col in desired_columns:
    if col not in merged_df.columns:
        merged_df[col] = 'nan' if col == 'task' else '0'

# Nur die gewünschten Spalten beibehalten und Zeilen mit `None` in `TRB_CDR3` entfernen
final_df = merged_df[desired_columns]
final_df = final_df[final_df['TRB_CDR3'] != 'None']

final_df = final_df[
    final_df['TRB_CDR3'].notna() & (final_df['TRB_CDR3'] != '') &
    final_df['TRA_CDR3'].notna() & (final_df['TRA_CDR3'] != '')
]

# Ausgabe des ersten Teils des Ergebnisses zur Überprüfung
print(final_df.head())

# Optional: Speichern des kombinierten DataFrames
output_path = f'{pipeline_data_plain}/10x/combined_output_with_epitope_mhc_paired_only_expanded-all.csv'
final_df.to_csv(output_path, index=False)
print("Datei erfolgreich gespeichert!")


Batch Start: 0
Batch 0 gespeichert.
Batch Start: 1000
Batch 1000 gespeichert.
Batch Start: 2000
Batch 2000 gespeichert.
Batch Start: 3000
Batch 3000 gespeichert.
Batch Start: 4000
Batch 4000 gespeichert.
Batch Start: 5000
Batch 5000 gespeichert.
Batch Start: 6000
Batch 6000 gespeichert.
Batch Start: 7000
Batch 7000 gespeichert.
Batch Start: 8000
Batch 8000 gespeichert.
Batch Start: 9000
Batch 9000 gespeichert.
Batch Start: 10000
Batch 10000 gespeichert.
Batch Start: 11000
Batch 11000 gespeichert.
Batch Start: 12000
Batch 12000 gespeichert.
Batch Start: 13000
Batch 13000 gespeichert.
Batch Start: 14000
Batch 14000 gespeichert.
Batch Start: 15000
Batch 15000 gespeichert.
Batch Start: 16000
Batch 16000 gespeichert.
Batch Start: 17000
Batch 17000 gespeichert.
Batch Start: 18000
Batch 18000 gespeichert.
Batch Start: 19000
Batch 19000 gespeichert.
Batch Start: 20000
Batch 20000 gespeichert.
Batch Start: 21000
Batch 21000 gespeichert.
Batch Start: 22000
Batch 22000 gespeichert.
Batch Start: 2

In [14]:
import pandas as pd

train_file = f'{pipeline_data_splitted}/{precision}/paired/train.tsv'
validation_file = f'{pipeline_data_splitted}/{precision}/paired/validation.tsv'

trainval = pd.concat([
    pd.read_csv(train_file, sep='\t'),
    pd.read_csv(validation_file, sep='\t')
], ignore_index=True)

trainval_negatives = set(zip(trainval.loc[trainval['Binding'] == 0, 'TRA_CDR3'], 
                             trainval.loc[trainval['Binding'] == 0, 'TRB_CDR3'], 
                             trainval.loc[trainval['Binding'] == 0, 'Epitope']))

# Negative Daten laden
paired = pd.read_csv(f'{pipeline_data_plain}/10x/combined_output_with_epitope_mhc_paired_only_expanded-all.csv', sep=',')

# Positive Samples laden
test_preneg = pd.read_csv(f'{pipeline_data_splitted}/{precision}/paired/test_prenegsamples.tsv', sep='\t')

# Anzahl positiver Samples
num_test_pos = len(test_preneg)

# Zielgrößen für negative Samples
test_neg_needed = num_test_pos * 5

# Stelle sicher, dass ALLE positiven Epitope enthalten sind
positive_epitopes = set(test_preneg["Epitope"])
paired_with_epitopes = paired[paired["Epitope"].isin(positive_epitopes)]

# Nur negative Samples, die NICHT in trainval existieren
paired_negatives = paired[paired['Binding'] == 0].copy()
paired_negatives_filtered = paired_negatives[~paired_negatives[['TRA_CDR3', 'TRB_CDR3', 'Epitope']].apply(tuple, axis=1).isin(trainval_negatives)]

# Falls zu wenige negative übrig sind, mit Replacement auffüllen
if len(paired_negatives_filtered) < test_neg_needed:
    test_negatives = paired_negatives_filtered.sample(test_neg_needed, replace=True, random_state=42)
else:
    test_negatives = paired_negatives_filtered.sample(test_neg_needed, random_state=42)

# Positive und negative Samples kombinieren
test_combined = pd.concat([test_preneg, test_negatives], ignore_index=True)

# Speichern der kombinierten Datensätze
output_dir = f'{pipeline_data_splitted}/{precision}/paired/'
test_combined.to_csv(output_dir + "test.tsv", sep='\t', index=False)

# Berechne Unique Werte
unique_tcr_test = test_combined['TRB_CDR3'].nunique()
unique_epitope_test = test_combined['Epitope'].nunique()

# Berechne Anzahl positiver und negativer Einträge
test_binding_counts = test_combined['Binding'].value_counts()

# Finale Ausgabe
print("\nAlle Datensätze wurden erfolgreich gespeichert.")
print(f"Test: {len(test_combined)} Einträge")
print(f"- Unique TCRs: {unique_tcr_test}")
print(f"- Unique Epitope: {unique_epitope_test}")
print(f"- Positive (binding == 1): {test_binding_counts.get(1, 0)}")
print(f"- Negative (binding == 0): {test_binding_counts.get(0, 0)}")

  pd.read_csv(train_file, sep='\t'),



Alle Datensätze wurden erfolgreich gespeichert.
Test: 160362 Einträge
- Unique TCRs: 52994
- Unique Epitope: 831
- Positive (binding == 1): 26727
- Negative (binding == 0): 133635


## Task Classification 
The classification in the split notebook correct for positive only data. After adding negative data, some classifications might be wrong.

In [33]:
paired_output_folder = f'{pipeline_data_splitted}/{precision}/paired'
beta_output_folder = f'{pipeline_data_splitted}/{precision}/beta'
test_file_name = 'test.tsv'
validation_file_name = 'validation.tsv'
train_file_name = 'train.tsv'

In [None]:
# do the classification for paired data
paired = True
test_data_path = f'{paired_output_folder}/{test_file_name}'
validation_data_path = f'{paired_output_folder}/{validation_file_name}'
train_data_path = f'{paired_output_folder}/{train_file_name}'

%run ../data_preparation/classification.ipynb

  df_test = pd.read_csv(test_data_path, sep="\t")


test data has 82726 TPP1 tasks (seen tcr & seen epitopes).
test data has 76994 TPP2 tasks (unseen tcr & seen epitopes).
test data has 600 TPP3 tasks (unseen tcr & unseen epitope).
test data has 42 TPP4 tasks (seen tcr & unseen epitope).


In [None]:
# extended classification for paired data
train_path = f'{paired_output_folder}/{train_file_name}'
validation_path = f'{paired_output_folder}/{validation_file_name}'
test_path = f'{paired_output_folder}/{test_file_name}'
output_path = f'{paired_output_folder}/test_reclassified_paired_specific.tsv'
paired_data_path = paired_output_folder
alpha_cdr3_name = 'TRA_CDR3'
beta_cdr3_name = 'TRB_CDR3'
epitope_name = 'Epitope'
task_name = 'task'

%run ../data_preparation/paired_reclassification_testonly.ipynb

allele
../../data/splitted_datasets/allele/paired/train.tsv


  df_test = pd.read_csv(test_path, sep="\t", index_col=False)


train+validate data has 72656 entries
test data has 160362 entries
test data has 128885 TPP1 tasks (old value: 82726) (seen tcr & seen epitopes).
test data has 30835 TPP2 tasks (old value: 76994) (unseen tcr & seen epitopes).
test data has 508 TPP3 tasks (old value: 600) (unseen tcr & unseen epitope).
test data has 134 TPP4 tasks (old value: 42) (seen tcr & unseen epitope).
the train/test ratio is 0.31180423829918724/0.6881957617008128
../../data/splitted_datasets/allele/paired/test_reclassified_paired_specific.tsv
/home/ubuntu/arina/BA-Cancer-Immunotherapy
uploading dataset to dataset-allele


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33marina-frohofer[0m ([33mba_cancerimmunotherapy[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Adding directory to artifact (./../../data/splitted_datasets/allele/paired)... Done. 0.2s


VBox(children=(Label(value='0.009 MB of 0.009 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [34]:
# do the classification for beta data
paired = False
train_data_path = f'{beta_output_folder}/{train_file_name}'
validation_data_path = f'{beta_output_folder}/{validation_file_name}'
test_data_path = f'{beta_output_folder}/{test_file_name}'

%run ../data_preparation/classification.ipynb

  df_train = pd.read_csv(train_data_path, sep="\t")
  df_test = pd.read_csv(test_data_path, sep="\t")


test data has 178814 TPP1 tasks (seen tcr & seen epitopes).
test data has 16485 TPP2 tasks (unseen tcr & seen epitopes).
test data has 1060 TPP3 tasks (unseen tcr & unseen epitope).
test data has 152 TPP4 tasks (seen tcr & unseen epitope).


In the next two cells the classification is checked. If the output says "Classification is correct", everything is fine.

In [None]:
# check task classification paired
splitted_data_path = paired_output_folder

%run ../data_preparation/check_task_classification_paired.ipynb

  df_test = pd.read_csv(test_file, sep="\t")


train+validate data has 72656 entries
test data has 160362 entries
test data has 82726 TPP1 tasks (seen tcr & seen epitopes).
test data has 76994 TPP2 tasks (unseen tcr & seen epitopes).
test data has 600 TPP3 tasks (unseen tcr & unseen epitope).
test data has 42 TPP4 tasks (seen tcr & unseen epitope).
the train/test ratio is 0.31180423829918724/0.6881957617008128
Classification is correct.
Correctness summary:
is_correct
True    160362
Name: count, dtype: int64


In [35]:
# check task classification beta
splitted_data_path = beta_output_folder

%run ../data_preparation/check_task_classification_beta.ipynb

  df_train = pd.read_csv(f"{splitted_data_path}/{train_file_name}", sep="\t")
  df_test = pd.read_csv(f"{splitted_data_path}/{test_file_name}", sep="\t")


train data has 280252 entries
test data has 196511 entries
test data has 178814 TPP1 tasks (seen tcr & seen epitopes).
test data has 16485 TPP2 tasks (unseen tcr & seen epitopes).
test data has 1060 TPP3 tasks (unseen tcr & unseen epitope).
test data has 152 TPP4 tasks (seen tcr & unseen epitope).
the train/test ratio is 0.7139390498649838/0.28606095013501615
Classification is correct.
Correctness summary:
is_correct
True    196511
Name: count, dtype: int64


## Upload dataset

In [38]:
import os
print(os.listdir(path_to_data))


['.ipynb_checkpoints', 'validation_prenegsamples.tsv', 'test.tsv', 'train.tsv', 'test_reclassified_paired_specific.tsv', 'validation.tsv', 'train_prenegsamples.tsv', 'validate_reclassified_paired_specific.tsv', 'test_prenegsamples.tsv']


In [37]:
from dotenv import load_dotenv, find_dotenv
load_dotenv()

# upload paired data
path_to_data = f'{pipeline_data_splitted}/{precision}/paired'
dataset_name = f'paired_{precision}'
#main_project_name = os.getenv("MAIN_PROJECT_NAME")
main_project_name = f"dataset-{precision}"

%run ../upload_datasets.ipynb

uploading dataset to dataset-allele


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33marina-frohofer[0m ([33mba_cancerimmunotherapy[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Adding directory to artifact (./../../../../data/splitted_datasets/allele/paired)... Done. 0.3s


In [39]:
# upload beta data
path_to_data = f'{pipeline_data_splitted}/{precision}/beta'
dataset_name = f'beta_{precision}'

%run ../upload_datasets.ipynb

uploading dataset to dataset-allele


[34m[1mwandb[0m: Adding directory to artifact (./../../../../data/splitted_datasets/allele/beta)... Done. 0.2s


VBox(children=(Label(value='1.088 MB of 16.265 MB uploaded\r'), FloatProgress(value=0.06688385651491562, max=1…

## Create Embeddings >> ProtBert

In [40]:
import torch
print(torch.cuda.is_available())  # Sollte True zurückgeben
print(torch.version.cuda)  # Sollte die richtige CUDA-Version anzeigen

True
12.4


In [None]:
path_paired_test = f"{pipeline_data_splitted}/{precision}/paired/test.tsv"
path_paired_validation = f"{pipeline_data_splitted}/{precision}/paired/validation.tsv"
path_paired_train = f"{pipeline_data_splitted}/{precision}/paired/train.tsv"
path_beta_test = f"{pipeline_data_splitted}/{precision}/beta/test.tsv"
path_beta_validation = f"{pipeline_data_splitted}/{precision}/beta/validation.tsv"
path_beta_train = f"{pipeline_data_splitted}/{precision}/beta/train.tsv"


path_paired = f"{pipeline_data}/embeddings/temp/{precision}/paired_concatenated.tsv"
create_folders_if_not_exists([os.path.dirname(path_paired)])
df_paired_test = pd.read_csv(path_paired_test, sep="\t", index_col=False)
df_paired_validation = pd.read_csv(path_paired_validation, sep="\t", index_col=False)
df_paired_train = pd.read_csv(path_paired_train, sep="\t", index_col=False)
df_paired = pd.concat([df_paired_test, df_paired_validation, df_paired_train])
df_paired.to_csv(path_paired, sep="\t", index=False)

# paired
#%run ../generateEmbeddingsProtBERT.py paired {path_paired} {pipeline_data}/embeddings/paired/{precision}/TRA_paired_embeddings.npz TRA_CDR3
#%run ../generateEmbeddingsProtBERT.py paired {path_paired} {pipeline_data}/embeddings/paired/{precision}/TRB_paired_embeddings.npz TRB_CDR3
#%run ../generateEmbeddingsProtBERT.py paired {path_paired} {pipeline_data}/embeddings/paired/{precision}/Epitope_paired_embeddings.npz Epitope

path_beta = f"{pipeline_data}/embeddings/temp/{precision}/beta_concatenated.tsv"
create_folders_if_not_exists([os.path.dirname(path_beta)])
df_beta_test = pd.read_csv(path_beta_test, sep="\t", index_col=False)
df_beta_validation = pd.read_csv(path_beta_validation, sep="\t", index_col=False)
df_beta_train = pd.read_csv(path_beta_train, sep="\t", index_col=False)
df_beta = pd.concat([df_beta_test, df_beta_validation, df_beta_train])
df_beta.to_csv(path_beta, sep="\t", index=False)

# beta
%run ../generateEmbeddingsProtBERT.py beta {path_beta} {pipeline_data}/embeddings/beta/{precision}/TRB_beta_embeddings.npz TRB_CDR3
%run ../generateEmbeddingsProtBERT.py beta {path_beta} {pipeline_data}/embeddings/beta/{precision}/Epitope_beta_embeddings.npz Epitope

  df_paired_test = pd.read_csv(path_paired_test, sep="\t", index_col=False)
  df_beta_test = pd.read_csv(path_beta_test, sep="\t", index_col=False)
  df_beta_train = pd.read_csv(path_beta_train, sep="\t", index_col=False)


Using GPU: Tesla T4
Loading: Rostlab/prot_bert
Model is on device: cuda:0
Processing Batch:  0 64
Processing Batch:  64 128
Processing Batch:  128 192
Processing Batch:  192 256
Processing Batch:  256 320
Processing Batch:  320 384
Processing Batch:  384 448
Processing Batch:  448 512
Processing Batch:  512 576
Processing Batch:  576 640
Processing Batch:  640 704
Processing Batch:  704 768
Processing Batch:  768 832
Processing Batch:  832 896
Processing Batch:  896 960
Processing Batch:  960 1024
Processing Batch:  1024 1088
Processing Batch:  1088 1152
Processing Batch:  1152 1216
Processing Batch:  1216 1280
Processing Batch:  1280 1344
Processing Batch:  1344 1408
Processing Batch:  1408 1472
Processing Batch:  1472 1536
Processing Batch:  1536 1600
Processing Batch:  1600 1664
Processing Batch:  1664 1728
Processing Batch:  1728 1792
Processing Batch:  1792 1856
Processing Batch:  1856 1920
Processing Batch:  1920 1984
Processing Batch:  1984 2048
Processing Batch:  2048 2112
Proc

In [9]:
import numpy as np

# Funktion, um Embeddings korrekt zu laden
def load_embeddings(file_path):
    npz_data = np.load(file_path)
    all_keys = list(npz_data.keys())

    # Falls Embeddings als einzelne Sequenzen gespeichert sind
    if len(all_keys) > 1:
        all_values = [npz_data[k] for k in all_keys]
        return np.vstack(all_values)  # Alles zusammenfügen
    else:
        return npz_data[all_keys[0]]

# Embeddings für TRA, TRB und Epitope laden
tra_embeddings = load_embeddings(f"{pipeline_data}/embeddings/paired/{precision}/TRA_paired_embeddings.npz")
trb_embeddings = load_embeddings(f"{pipeline_data}/embeddings/paired/{precision}/TRB_paired_embeddings.npz")
epitope_embeddings = load_embeddings(f"{pipeline_data}/embeddings/paired/{precision}/Epitope_paired_embeddings.npz")

# Ausgabe der finalen Shapes
print(f"📌 TRA Embedding Shape: {tra_embeddings.shape}")
print(f"📌 TRB Embedding Shape: {trb_embeddings.shape}")
print(f"📌 Epitope Embedding Shape: {epitope_embeddings.shape}")


📌 TRA Embedding Shape: (596417, 1024)
📌 TRB Embedding Shape: (694427, 1024)
📌 Epitope Embedding Shape: (12870, 1024)


In [11]:
import pandas as pd

# Beispielpfade für Train-, Test-, und Validierungsdatensätze für alle vier Kategorien
base_path = pipeline_data_splitted

# Definierte Pfade für alle vier Kategorien
datasets = {
    "paired_gene": {
        "train": f"{base_path}/gene/paired/train.tsv",
        "test": f"{base_path}/gene/paired/test.tsv",
        "validation": f"{base_path}/gene/paired/validation.tsv"
    },
    "beta_gene": {
        "train": f"{base_path}/gene/beta/train.tsv",
        "test": f"{base_path}/gene/beta/test.tsv",
        "validation": f"{base_path}/gene/beta/validation.tsv"
    }
}

# Berechnung der Anzahl der Zeilen für jedes Set
results = {}
for dataset_name, paths in datasets.items():
    # Daten laden
    train_df = pd.read_csv(paths["train"], sep='\t')
    test_df = pd.read_csv(paths["test"], sep='\t')
    validation_df = pd.read_csv(paths["validation"], sep='\t')
    
    # Anzahl der Zeilen berechnen
    train_length = len(train_df)
    test_length = len(test_df)
    validation_length = len(validation_df)
    total_length = train_length + test_length + validation_length
    
    # Zähle die Anzahl der Bindings 1 und 0 in jedem Datensatz
    train_binding_counts = train_df['Binding'].value_counts()
    test_binding_counts = test_df['Binding'].value_counts()
    validation_binding_counts = validation_df['Binding'].value_counts()
    
    # Zähle die Anzahl der TPP1, TPP2, TPP3 Einträge in jedem Datensatz
    train_task_counts = train_df['task'].value_counts()
    test_task_counts = test_df['task'].value_counts()
    validation_task_counts = validation_df['task'].value_counts()

    # Ergebnisse speichern
    results[dataset_name] = {
        "Train": train_length,
        "Train_Binding_1": train_binding_counts.get(1, 0),
        "Train_Binding_0": train_binding_counts.get(0, 0),
        "Train_TPP1": train_task_counts.get("TPP1", 0),
        "Train_TPP2": train_task_counts.get("TPP2", 0),
        "Train_TPP3": train_task_counts.get("TPP3", 0),
        "Train_TPP4": train_task_counts.get("TPP4", 0),
        "Test": test_length,
        "Test_Binding_1": test_binding_counts.get(1, 0),
        "Test_Binding_0": test_binding_counts.get(0, 0),
        "Test_TPP1": test_task_counts.get("TPP1", 0),
        "Test_TPP2": test_task_counts.get("TPP2", 0),
        "Test_TPP3": test_task_counts.get("TPP3", 0),
        "Test_TPP4": test_task_counts.get("TPP4", 0),
        "Validation": validation_length,
        "Validation_Binding_1": validation_binding_counts.get(1, 0),
        "Validation_Binding_0": validation_binding_counts.get(0, 0),
        "Validation_TPP1": validation_task_counts.get("TPP1", 0),
        "Validation_TPP2": validation_task_counts.get("TPP2", 0),
        "Validation_TPP3": validation_task_counts.get("TPP3", 0),
        "Validation_TPP4": validation_task_counts.get("TPP4", 0),
        "Total": total_length
    }

# Ergebnisse anzeigen
for dataset, lengths in results.items():
    print(f'--- {dataset.replace("_", " ").title()} ---')
    print(f'Anzahl der Zeilen im Trainingsdatensatz: {lengths["Train"]} (Binding=1: {lengths["Train_Binding_1"]}, Binding=0: {lengths["Train_Binding_0"]}, TPP1: {lengths["Train_TPP1"]}, TPP2: {lengths["Train_TPP2"]}, TPP3: {lengths["Train_TPP3"]})')
    print(f'Anzahl der Zeilen im Testdatensatz: {lengths["Test"]} (Binding=1: {lengths["Test_Binding_1"]}, Binding=0: {lengths["Test_Binding_0"]}, TPP1: {lengths["Test_TPP1"]}, TPP2: {lengths["Test_TPP2"]}, TPP3: {lengths["Test_TPP3"]})')
    print(f'Anzahl der Zeilen im Validierungsdatensatz: {lengths["Validation"]} (Binding=1: {lengths["Validation_Binding_1"]}, Binding=0: {lengths["Validation_Binding_0"]}, TPP1: {lengths["Validation_TPP1"]}, TPP2: {lengths["Validation_TPP2"]}, TPP3: {lengths["Validation_TPP3"]})')
    print(f'Gesamtanzahl der Zeilen (Train + Test + Validation): {lengths["Total"]}\n')

# Optional: Ergebnisse in einer Übersichtstabelle darstellen
summary_data = []
for dataset, lengths in results.items():
    summary_data.append({
        "Dataset": dataset.replace("_", " ").title(),
        "Train": lengths["Train"],
        "Train_Binding_1": lengths["Train_Binding_1"],
        "Train_Binding_0": lengths["Train_Binding_0"],
        "Train_TPP1": lengths["Train_TPP1"],
        "Train_TPP2": lengths["Train_TPP2"],
        "Train_TPP3": lengths["Train_TPP3"],
        "Train_TPP4": lengths["Train_TPP4"],
        "Test": lengths["Test"],
        "Test_Binding_1": lengths["Test_Binding_1"],
        "Test_Binding_0": lengths["Test_Binding_0"],
        "Test_TPP1": lengths["Test_TPP1"],
        "Test_TPP2": lengths["Test_TPP2"],
        "Test_TPP3": lengths["Test_TPP3"],
        "Test_TPP4": lengths["Test_TPP4"],
        "Validation": lengths["Validation"],
        "Validation_Binding_1": lengths["Validation_Binding_1"],
        "Validation_Binding_0": lengths["Validation_Binding_0"],
        "Validation_TPP1": lengths["Validation_TPP1"],
        "Validation_TPP2": lengths["Validation_TPP2"],
        "Validation_TPP3": lengths["Validation_TPP3"],
        "Validation_TPP4": lengths["Validation_TPP4"],
        "Total": lengths["Total"]
    })

summary_df = pd.DataFrame(summary_data)
print(summary_df)


  train_df = pd.read_csv(paths["train"], sep='\t')


--- Paired Gene ---
Anzahl der Zeilen im Trainingsdatensatz: 67422 (Binding=1: 33711, Binding=0: 33711, TPP1: 0, TPP2: 0, TPP3: 0)
Anzahl der Zeilen im Testdatensatz: 43356 (Binding=1: 7226, Binding=0: 36130, TPP1: 27972, TPP2: 15095, TPP3: 289)
Anzahl der Zeilen im Validierungsdatensatz: 43344 (Binding=1: 7224, Binding=0: 36120, TPP1: 0, TPP2: 0, TPP3: 0)
Gesamtanzahl der Zeilen (Train + Test + Validation): 154122

--- Beta Gene ---
Anzahl der Zeilen im Trainingsdatensatz: 251750 (Binding=1: 125875, Binding=0: 125875, TPP1: 0, TPP2: 0, TPP3: 0)
Anzahl der Zeilen im Testdatensatz: 161844 (Binding=1: 26974, Binding=0: 134870, TPP1: 140896, TPP2: 20645, TPP3: 299)
Anzahl der Zeilen im Validierungsdatensatz: 161838 (Binding=1: 26973, Binding=0: 134865, TPP1: 0, TPP2: 0, TPP3: 0)
Gesamtanzahl der Zeilen (Train + Test + Validation): 575432

       Dataset   Train  Train_Binding_1  Train_Binding_0  Train_TPP1  \
0  Paired Gene   67422            33711            33711           0   
1    Bet

In [10]:
import pandas as pd

# Beispielpfade für Train-, Test-, und Validierungsdatensätze für alle vier Kategorien
base_path = pipeline_data_splitted

# Definierte Pfade für alle vier Kategorien
datasets = {
    "paired_gene": {
        "train": f"{base_path}/gene/paired/train.tsv",
        "test": f"{base_path}/gene/paired/test.tsv",
        "validation": f"{base_path}/gene/paired/validation.tsv"
    },
    "paired_allele": {
        "train": f"{base_path}/allele/paired/train.tsv",
        "test": f"{base_path}/allele/paired/test.tsv",
        "validation": f"{base_path}/allele/paired/validation.tsv"
    },
    "beta_gene": {
        "train": f"{base_path}/gene/beta/train.tsv",
        "test": f"{base_path}/gene/beta/test.tsv",
        "validation": f"{base_path}/gene/beta/validation.tsv"
    },
    "beta_allele": {
        "train": f"{base_path}/allele/beta/train.tsv",
        "test": f"{base_path}/allele/beta/test.tsv",
        "validation": f"{base_path}/allele/beta/validation.tsv"
    }
}

# Berechnung der Anzahl der Zeilen für jedes Set
results = {}
for dataset_name, paths in datasets.items():
    # Daten laden
    train_df = pd.read_csv(paths["train"], sep='\t')
    test_df = pd.read_csv(paths["test"], sep='\t')
    validation_df = pd.read_csv(paths["validation"], sep='\t')
    
    # Anzahl der Zeilen berechnen
    train_length = len(train_df)
    test_length = len(test_df)
    validation_length = len(validation_df)
    total_length = train_length + test_length + validation_length
    
    # Zähle die Anzahl der Bindings 1 und 0 in jedem Datensatz
    train_binding_counts = train_df['Binding'].value_counts()
    test_binding_counts = test_df['Binding'].value_counts()
    validation_binding_counts = validation_df['Binding'].value_counts()
    
    # Zähle die Anzahl der TPP1, TPP2, TPP3 Einträge in jedem Datensatz
    train_task_counts = train_df['task'].value_counts()
    test_task_counts = test_df['task'].value_counts()
    validation_task_counts = validation_df['task'].value_counts()

    # Ergebnisse speichern
    results[dataset_name] = {
        "Train": train_length,
        "Train_Binding_1": train_binding_counts.get(1, 0),
        "Train_Binding_0": train_binding_counts.get(0, 0),
        "Train_TPP1": train_task_counts.get("TPP1", 0),
        "Train_TPP2": train_task_counts.get("TPP2", 0),
        "Train_TPP3": train_task_counts.get("TPP3", 0),
        "Train_TPP4": train_task_counts.get("TPP4", 0),
        "Test": test_length,
        "Test_Binding_1": test_binding_counts.get(1, 0),
        "Test_Binding_0": test_binding_counts.get(0, 0),
        "Test_TPP1": test_task_counts.get("TPP1", 0),
        "Test_TPP2": test_task_counts.get("TPP2", 0),
        "Test_TPP3": test_task_counts.get("TPP3", 0),
        "Test_TPP4": test_task_counts.get("TPP4", 0),
        "Validation": validation_length,
        "Validation_Binding_1": validation_binding_counts.get(1, 0),
        "Validation_Binding_0": validation_binding_counts.get(0, 0),
        "Validation_TPP1": validation_task_counts.get("TPP1", 0),
        "Validation_TPP2": validation_task_counts.get("TPP2", 0),
        "Validation_TPP3": validation_task_counts.get("TPP3", 0),
        "Validation_TPP4": validation_task_counts.get("TPP4", 0),
        "Total": total_length
    }

# Ergebnisse anzeigen
for dataset, lengths in results.items():
    print(f'--- {dataset.replace("_", " ").title()} ---')
    print(f'Anzahl der Zeilen im Trainingsdatensatz: {lengths["Train"]} (Binding=1: {lengths["Train_Binding_1"]}, Binding=0: {lengths["Train_Binding_0"]}, TPP1: {lengths["Train_TPP1"]}, TPP2: {lengths["Train_TPP2"]}, TPP3: {lengths["Train_TPP3"]})')
    print(f'Anzahl der Zeilen im Testdatensatz: {lengths["Test"]} (Binding=1: {lengths["Test_Binding_1"]}, Binding=0: {lengths["Test_Binding_0"]}, TPP1: {lengths["Test_TPP1"]}, TPP2: {lengths["Test_TPP2"]}, TPP3: {lengths["Test_TPP3"]})')
    print(f'Anzahl der Zeilen im Validierungsdatensatz: {lengths["Validation"]} (Binding=1: {lengths["Validation_Binding_1"]}, Binding=0: {lengths["Validation_Binding_0"]}, TPP1: {lengths["Validation_TPP1"]}, TPP2: {lengths["Validation_TPP2"]}, TPP3: {lengths["Validation_TPP3"]})')
    print(f'Gesamtanzahl der Zeilen (Train + Test + Validation): {lengths["Total"]}\n')

# Optional: Ergebnisse in einer Übersichtstabelle darstellen
summary_data = []
for dataset, lengths in results.items():
    summary_data.append({
        "Dataset": dataset.replace("_", " ").title(),
        "Train": lengths["Train"],
        "Train_Binding_1": lengths["Train_Binding_1"],
        "Train_Binding_0": lengths["Train_Binding_0"],
        "Train_TPP1": lengths["Train_TPP1"],
        "Train_TPP2": lengths["Train_TPP2"],
        "Train_TPP3": lengths["Train_TPP3"],
        "Train_TPP4": lengths["Train_TPP4"],
        "Test": lengths["Test"],
        "Test_Binding_1": lengths["Test_Binding_1"],
        "Test_Binding_0": lengths["Test_Binding_0"],
        "Test_TPP1": lengths["Test_TPP1"],
        "Test_TPP2": lengths["Test_TPP2"],
        "Test_TPP3": lengths["Test_TPP3"],
        "Test_TPP4": lengths["Test_TPP4"],
        "Validation": lengths["Validation"],
        "Validation_Binding_1": lengths["Validation_Binding_1"],
        "Validation_Binding_0": lengths["Validation_Binding_0"],
        "Validation_TPP1": lengths["Validation_TPP1"],
        "Validation_TPP2": lengths["Validation_TPP2"],
        "Validation_TPP3": lengths["Validation_TPP3"],
        "Validation_TPP4": lengths["Validation_TPP4"],
        "Total": lengths["Total"]
    })

summary_df = pd.DataFrame(summary_data)
print(summary_df)


FileNotFoundError: [Errno 2] No such file or directory: '../../data/splitted_datasets/allele/paired/train.tsv'