In [2]:
import os
import pandas as pd

In [3]:
import sys
!"{sys.executable}" -m pip install tidytcells



In [4]:
# set precision of mhc and V/J values (gene or allele)
precision = 'allele'

In [5]:
# this function is not thread safe
def create_folders_if_not_exists(folders):
  for path in folders:
    if not os.path.exists(path):
      os.makedirs(path)

In [6]:
pipeline_data = '../../data'
pipeline_data_plain = f'{pipeline_data}/plain_datasets'
pipeline_data_cleaned = f'{pipeline_data}/cleaned_datasets'
pipeline_data_concatenated = f'{pipeline_data}/concatenated_datasets'
pipeline_data_splitted = f'{pipeline_data}/splitted_datasets'
pipeline_data_temp_bucket = f'{pipeline_data}/temp'

pipeline_folders = [pipeline_data, pipeline_data_plain, pipeline_data_cleaned, pipeline_data_concatenated, pipeline_data_splitted, pipeline_data_temp_bucket]

create_folders_if_not_exists(pipeline_folders)

## Data Preparation

### IEDB

In [15]:
# prepare directories
IEDB_data_plain = f'{pipeline_data_plain}/IEDB'
IEDB_data_cleaned = f'{pipeline_data_cleaned}/IEDB'
IEDB_data_fitted = f'{pipeline_data_temp_bucket}/IEDB'

IEDB_folders = [IEDB_data_plain, IEDB_data_cleaned, IEDB_data_fitted]
create_folders_if_not_exists(IEDB_folders)

In [17]:
# prepare parameters for notebook IEDB fit data
path_prefix_plain = IEDB_data_plain
path_prefix_fitted = IEDB_data_fitted
mhc_I_input_beta = f"{path_prefix_plain}/MHCI_IEDB_beta_export.csv"
mhc_I_output_beta = f"{path_prefix_fitted}/IEDB_beta_fitted.csv"
mhc_I_input_paired = f"{path_prefix_plain}/MHCI_IEDB_paired_export.csv"
mhc_I_output_paired = f"{path_prefix_fitted}/IEDB_paired_fitted.csv"

# fit IEDB data
%run data_scripts/IEDB/IEDB_fitted_dataset.ipynb

In [18]:
# prepare parameters for notebook IEDB clean data
path_prefix_fitted = IEDB_data_fitted
path_prefix_cleaned =  IEDB_data_cleaned
fitted_file_beta = "IEDB_beta_fitted.csv"
fitted_file_paired = "IEDB_paired_fitted.csv"
cleaned_file_beta = "IEDB_cleaned_data_beta.csv"
cleaned_file_paired = "IEDB_cleaned_data_paired.csv"

# clean IEDB data
%run data_scripts/IEDB/IEDB_clean_dataset.ipynb

In [19]:
IEDB_cleaned_beta_output = f'{IEDB_data_cleaned}/{cleaned_file_beta}'
IEDB_cleaned_paired_output = f'{IEDB_data_cleaned}/{cleaned_file_paired}'

### McPAS

In [20]:
# prepare directories
McPas_data_plain = f'{pipeline_data_plain}/McPas'
McPas_data_cleaned = f'{pipeline_data_cleaned}/McPas'
McPas_data_fitted = f'{pipeline_data_temp_bucket}/McPas'

McPas_folders = [McPas_data_plain, McPas_data_cleaned, McPas_data_fitted]
create_folders_if_not_exists(McPas_folders)

In [21]:
# prepare parameters for notebook McPAS fit data
input_file = f'{McPas_data_plain}/McPAS-TCR.csv'
path_prefix_fitted = McPas_data_fitted
fitted_file = 'McPAS_fitted.tsv'

# fit McPAS data
%run data_scripts/McPas-TCR/fit_data_mcpastcr_both.ipynb

In [22]:
# prepare parameters for notebook McPAS clean data
fitted_input_file = f'{McPas_data_fitted}/{fitted_file}'
path_prefix_cleaned = McPas_data_cleaned
cleaned_file_paired = 'McPAS_cleaned_data_paired.tsv'
cleaned_file_beta = 'McPAS_cleaned_data_beta.tsv'

# clean McPAS data
%run data_scripts/McPas-TCR/clean_data_mcpastcr_both.ipynb

MHC Class I has 10078 entries
whole dataframe has 13701 entries
filtered to only use MHC Class I. Length of dataset: 10078


  mcpastcr_cleaned_both_df = mcpastcr_cleaned_both_df[~mask]


In [23]:
McPAS_cleaned_beta_output = f'{McPas_data_cleaned}/{cleaned_file_beta}'
McPAS_cleaned_paired_output = f'{McPas_data_cleaned}/{cleaned_file_paired}'

### VDJdb

In [24]:
# prepare directories
VDJdb_data_plain = f'{pipeline_data_plain}/VDJdb'
VDJdb_data_cleaned = f'{pipeline_data_cleaned}/VDJdb'
VDJdb_data_fitted = f'{pipeline_data_temp_bucket}/VDJdb'

VDJdb_folders = [VDJdb_data_plain, VDJdb_data_cleaned, VDJdb_data_fitted]
create_folders_if_not_exists(VDJdb_folders)

fitted_beta_file = 'VDJdb_beta_fitted.tsv'
fitted_paired_file = 'VDJdb_paired_fitted.tsv'

In [25]:
# prepare parameters for notebook VDJdb fit data paired
input_file = f'{VDJdb_data_plain}/VDJdb_paired_only.tsv'
path_prefix_fitted = VDJdb_data_fitted
fitted_file = fitted_paired_file

# fit paired VDJdb data
%run data_scripts/VDJdb/fit_data_vdjdb_paired.ipynb

In [26]:
# prepare parameters for notebook VDJdb fit data beta
input_file = f'{VDJdb_data_plain}/VDJdb_beta_only.tsv'
path_prefix_fitted = VDJdb_data_fitted
fitted_file = fitted_beta_file

# fit beta VDJdb data
%run data_scripts/VDJdb/fit_data_vdjdb_beta.ipynb

In [27]:
# prepare parameters for notebook VDJdb clean data paired
input_file = f'{VDJdb_data_fitted}/{fitted_paired_file}'
cleaned_file_paired = 'VDJdb_cleaned_data_paired.tsv'
output_file = f'{VDJdb_data_cleaned}/{cleaned_file_paired}'

# clean paired VDJdb data
%run data_scripts/VDJdb/clean_data_vdjdb_paired.ipynb

MHC Class I has 27414 entries
whole dataframe has 28119 entries
filtered to only use MHC Class I. Length of dataset: 27414


In [28]:
# prepare parameters for notebook VDJdb clean data beta
input_file = f'{VDJdb_data_fitted}/{fitted_beta_file}'
cleaned_file_beta = 'VDJdb_cleaned_data_beta.tsv'
output_file = f'{VDJdb_data_cleaned}/{cleaned_file_beta}'

# clean beta VDJdb data
%run data_scripts/VDJdb/clean_data_vdjdb_beta.ipynb

MHC Class I has 46507 entries
whole dataframe has 49042 entries
filtered to only use MHC Class I. Length of dataset: 46507


In [29]:
VDJdb_cleaned_beta_output = f'{VDJdb_data_cleaned}/{cleaned_file_beta}'
VDJdb_cleaned_paired_output = f'{VDJdb_data_cleaned}/{cleaned_file_paired}'

### pMTnet (beta data only!)

In [30]:
# Dateipfade
train_path = "../../data/plain_datasets/pMTnet/training_data.csv"
test_path = "../../data/plain_datasets/pMTnet/testing_data.csv"
output_path = "../../data/plain_datasets/pMTnet/beta_data.csv"

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_combined = pd.concat([df_train, df_test], ignore_index=True)

df_combined.to_csv(output_path, index=False)

print(f"Kombinierte Datei gespeichert unter: {output_path}")

Kombinierte Datei gespeichert unter: ../../data/plain_datasets/pMTnet/beta_data.csv


In [31]:
# prepare directories
pMTnet_data_plain = f'{pipeline_data_plain}/pMTnet'
pMTnet_data_cleaned = f'{pipeline_data_cleaned}/pMTnet'
pMTnet_data_fitted = f'{pipeline_data_temp_bucket}/pMTnet'

pMTnet_folders = [pMTnet_data_plain, pMTnet_data_cleaned, pMTnet_data_fitted]
create_folders_if_not_exists(pMTnet_folders)
fitted_combined_file = 'pMTnet_fitted_data_beta.tsv'

In [32]:
# prepare parameters for notebook pMTnet fit data combined
input_file = f'{pMTnet_data_plain}/beta_data.csv'
path_prefix_fitted = pMTnet_data_fitted
fitted_file = fitted_combined_file

# fit combined pMTnet data
%run data_scripts/pMTnet/fit_data_pMTnet_beta.ipynb

Gefittete Datei gespeichert unter: ../../data/temp/pMTnet/pMTnet_fitted_data_beta.tsv


In [33]:
# prepare parameters for notebook pMTnet clean data combined
input_file = f'{pMTnet_data_fitted}/{fitted_combined_file}'
cleaned_file_combined = 'pMTnet_cleaned_data_beta.tsv'
output_file = f'{pMTnet_data_cleaned}/{cleaned_file_combined}'

# clean combined pMTnet data
%run data_scripts/pMTnet/clean_data_pMTnet_beta.ipynb

Bereinigte Datei gespeichert unter: ../../data/cleaned_datasets/pMTnet/pMTnet_cleaned_data_beta.tsv


In [34]:
pMTnet_cleaned_beta_output = f'{pMTnet_data_cleaned}/{cleaned_file_combined}'

## ImmuneCODE

In [None]:
#vorerst auslassen, weil riesiger Datensatz, aber nicht schön beschrieben

In [93]:
'''# prepare directories
ImmCode_data_plain = f'{pipeline_data_plain}/ImmCode'
ImmCode_data_cleaned = f'{pipeline_data_cleaned}/ImmCode'
ImmCode_data_fitted = f'{pipeline_data_temp_bucket}/ImmCode'

ImmCode_folders = [ImmCode_data_plain, ImmCode_data_cleaned, ImmCode_data_fitted]
create_folders_if_not_exists(ImmCode_folders)
fitted_combined_file = 'ImmCode_fitted_data_beta.tsv'
fitted_paired_file = 'ImmCode_paired_fitted.tsv'

In [None]:
'''# prepare parameters for notebook ImmCode fit data combined
input_file = f'{ImmCode_data_plain}/beta_data.csv'
path_prefix_fitted = ImmCode_data_fitted
fitted_file = fitted_combined_file

# fit combined ImmCode data
%run data_scripts/ImmCode/fit_data_ImmCode_beta.ipynb

In [None]:
'''# prepare parameters for notebook ImmCode clean data combined
input_file = f'{ImmCode_data_fitted}/{fitted_combined_file}'
cleaned_file_combined = 'ImmCode_cleaned_data_beta.tsv'
output_file = f'{ImmCode_data_cleaned}/{cleaned_file_combined}'

# clean combined ImmCode data
%run data_scripts/ImmCode/clean_data_ImmCode_beta.ipynb

In [None]:
'''# prepare parameters for notebook ImmCode fit data paired
input_file = f'{ImmCode_data_plain}/ImmCode_paired_only.tsv'
path_prefix_fitted = ImmCode_data_fitted
fitted_file = fitted_paired_file

# fit paired ImmCode data
%run data_scripts/ImmCode/fit_data_ImmCode_paired.ipynb

In [None]:
'''# prepare parameters for notebook ImmCode clean data paired
input_file = f'{ImmCode_data_fitted}/{fitted_paired_file}'
cleaned_file_paired = 'ImmCode_cleaned_data_paired.tsv'
output_file = f'{ImmCode_data_cleaned}/{cleaned_file_paired}'

# clean paired ImmCode data
%run data_scripts/ImmCode/clean_data_ImmCode_paired.ipynb

In [None]:
'''ImmCode_cleaned_beta_output = f'{ImmCode_data_cleaned}/{cleaned_file_beta}'
ImmCode_cleaned_paired_output = f'{ImmCode_data_cleaned}/{cleaned_file_paired}'

## Data check per database

In [35]:
# prepare parameters for concatenation
custom_dataset_path = f'{pipeline_data_concatenated}/{precision}/'

# beta input files
vdjdb_beta_read_path = VDJdb_cleaned_beta_output
mcpastcr_beta_read_path = McPAS_cleaned_beta_output
iedb_beta_read_path = IEDB_cleaned_beta_output
pmtnet_beta_read_path = pMTnet_cleaned_beta_output
#immcode_beta_read_path = ImmCode_cleaned_beta_output
# paired input files
vdjdb_paired_read_path = VDJdb_cleaned_paired_output
mcpastcr_paired_read_path = McPAS_cleaned_paired_output
iedb_paired_read_path = IEDB_cleaned_paired_output
#immcode_paired_read_path = ImmCode_cleaned_paired_output

In [70]:
%run datacheck_for_testfile.ipynb

VDJdb_beta geladen mit 46507 Einträgen.
McPAS_beta geladen mit 9458 Einträgen.
IEDB_beta geladen mit 175662 Einträgen.
pMTnet_beta geladen mit 32663 Einträgen.
VDJdb_paired geladen mit 27414 Einträgen.
McPAS_paired geladen mit 1904 Einträgen.
IEDB_paired geladen mit 25020 Einträgen.

**Wenn VDJdb_beta als Testset verwendet wird:**
  - TPP3-Paare im Testset: 176
  - Gesamt Test-Paare: 46507

**Wenn McPAS_beta als Testset verwendet wird:**
  - TPP3-Paare im Testset: 27
  - Gesamt Test-Paare: 9458

**Wenn IEDB_beta als Testset verwendet wird:**
  - TPP3-Paare im Testset: 73837
  - Gesamt Test-Paare: 175662

**Wenn pMTnet_beta als Testset verwendet wird:**
  - TPP3-Paare im Testset: 372
  - Gesamt Test-Paare: 32663

**Wenn VDJdb_paired als Testset verwendet wird:**
  - TPP3-Paare im Testset: 0
  - Gesamt Test-Paare: 27414

**Wenn McPAS_paired als Testset verwendet wird:**
  - TPP3-Paare im Testset: 0
  - Gesamt Test-Paare: 1904

**Wenn IEDB_paired als Testset verwendet wird:**
  - TPP3-Paa

  df_train = pd.read_csv(train_file, sep='\t')
  df_test = pd.read_csv(test_file, sep='\t')



**TPP Analysis for VDJdb with Train + Validation**
  - TPP3-Paare im Testset: 0
  - Gesamt Test-Paare: 46507
Anzahl der ursprünglichen TPP3-Paare, die jetzt in negativen Daten des Train/Validation-Sets vorkommen: 0
Keine der ursprünglichen TPP3-Paare wurden in den negativen Daten gefunden.
Anzahl der ursprünglichen TPP3-Paare, die jetzt in den positiven Train/Validation-Daten vorkommen: 0


  concatenated_beta_df = pd.read_csv(concatenated_beta_file, sep='\t')



**TPP Analysis for VDJdb with Concatenated Beta Data**
  - TPP3-Paare im Testset: 1338
  - Gesamt Test-Paare: 46507
VDJdb TPP3-Paare, deren TCR jetzt in negativen Daten aus Train/Validation vorkommt: 0
VDJdb TPP3-Paare, deren Epitope jetzt in negativen Daten aus Train/Validation vorkommt: 0
TPP3-Paare von cleaned data, die noch im finalen Testset vorhanden sind: 1421

✅ TPP3-Paare von cleaned data, die noch im finalen Testset vorhanden sind: 1421
🔄 Aktuelle TPP-Klassen dieser Paare im finalen Testset:
task
TPP1    1421
Name: count, dtype: int64
⚠️ Anzahl TPP3-Paare, deren TCRs in den negativen Train/Validation-Daten vorkommen: 0
⚠️ Anzahl TPP3-Paare, deren Epitope in den negativen Train/Validation-Daten vorkommen: 0
✅ Anzahl TPP3-Paare, deren TCRs in den positiven Train/Validation-Daten vorkommen: 1338
✅ Anzahl TPP3-Paare, deren Epitope in den positiven Train/Validation-Daten vorkommen: 1338


## Data Concatenation
The concatenation includes further cleaning and advanced removal of duplicated rows.

In [71]:
# prepare parameters for concatenation
custom_dataset_path = f'{pipeline_data_concatenated}/{precision}/'

# beta input files
#vdjdb_beta_read_path = VDJdb_cleaned_beta_output >> rausgenommen, weil für seaprates testfile nutzen
mcpastcr_beta_read_path = McPAS_cleaned_beta_output
iedb_beta_read_path = IEDB_cleaned_beta_output 
pmtnet_beta_read_path = pMTnet_cleaned_beta_output
# paired input files
#vdjdb_paired_read_path = VDJdb_cleaned_paired_output >> rausgenommen, weil für seaprates testfile nutzen
mcpastcr_paired_read_path = McPAS_cleaned_paired_output
iedb_paired_read_path = IEDB_cleaned_paired_output 
# output files
output_file_beta = 'beta_concatenated.tsv'
output_file_paired = 'paired_concatenated.tsv'

create_folders_if_not_exists([custom_dataset_path])

%run data_scripts/concatDatasets.ipynb

length of beta_df: 217783




The following script removes a lot of rows. They are kept and some of them get added again later
distinct entries (all columns, keep=first). 9620 entries removed.
removed all duplicates (CDR3, Epitope) from distinct values (most_important_columns, keep=False). 26543 entries removed.
beta removed entries df length: 26543


Number of groups formed: 7582


  duplicates_to_add = pd.concat([duplicates_to_add, group[group['is_duplicated'] == False]])


26206 can be re-added to the no-duplicated dataframe
from the plain dataset which has 185115 entries, 9957 entries have been removed.
for beta dataset :
size difference is: 9957
  175158 information score cleaned: 4.8346521426369335
  185115 information score dropout: 4.809572427950193
final_beta_df length = 175158
length of paired_df: 26924




The following script removes a lot of rows. They are kept and some of them get added again later
distinct entries (all columns, keep=first). 915 entries removed.
removed all duplicates from distinct values (cultivated columns, keep=False). 1961 entries removed.
paired removed entries df length: 1961




  duplicates_to_add = pd.concat([duplicates_to_add, group[group['is_duplicated'] == False]])


1945 can be re-added to the no-duplicated dataframe
from the plain dataset which has 26881 entries, 931 entries have been removed.
for paired dataset:
size difference is: 931
  25950 information score cleaned: 5.406319845857418
  26881 information score dropout: 5.47635876641494
final_paired_df length: 25950


In [72]:
# prepare parameters for concatenation
custom_dataset_path = f'{pipeline_data_concatenated}/{precision}/'
# output files
output_file_beta = 'beta_concatenated.tsv'
output_file_paired = 'paired_concatenated.tsv'

concatenated_paired = f'{custom_dataset_path}/{output_file_paired}'
concatenated_beta = f'{custom_dataset_path}/{output_file_beta}'

## Data split
The split creates 3 datasets. Train, Validation and Test. 

In [84]:
# prepare parameters for split of paired dataset
input_file = concatenated_paired
paired_output_folder = f'{pipeline_data_splitted}/{precision}/paired'
validation_file_name = 'validation.tsv'
train_file_name = 'train.tsv'
aimed_validation_ratio = 0.2 # this means 20% of the concatenated dataset will be for validation

create_folders_if_not_exists([paired_output_folder])

# do the split
%run data_scripts/data_preparation/split_paired_trainval.ipynb

distinct tcr's: 23068 from 25950
unique tcr's: 21070 from 25950
unique epitopes: 565 from 25950
train data has 4880 entries
validation data has 21070 entries
validation data has 0 TPP1 tasks (unseen tcr & seen epitopes).
validation data has 17391 TPP2 tasks (unseen tcr & seen epitopes).
validation data has 3679 TPP3 tasks (unseen tcr & unseen epitope).
the train/validation ratio is 0.18805394990366087/0.8119460500963391
15881 entries will be shifted from test to train so the train/validation ratio can be 0.8/0.2
755 entries will be shifted from test to train so the tpp1/tpp2 ratio can be 0.5/0.5
755 entries need to be shifted from train to test so the tpp1/tpp2 ratio can be 0.5/0.5
755 entries from train will be moved to test (TPP1)
df_train size before: 21516
number of tpp1 before: 0
number of tpp2 before: 755
df_train size after: 20761
number of tpp1 after: 755
number of tpp2 after: 755
train data has 20761 entries
validation data has 5189 entries
validation data has 755 TPP1 tasks (

In [85]:
# prepare parameters for split of beta dataset
input_file = concatenated_beta
beta_output_folder = f'{pipeline_data_splitted}/{precision}/beta'
aimed_validation_ratio = 0.2 # this means 20% of the concatenated dataset will be for validation

create_folders_if_not_exists([beta_output_folder])

# do the split
%run data_scripts/data_preparation/split_beta_trainval.ipynb

distinct tcr's: 144650 from 175158
unique tcr's: 131596 from 175158
unique epitopes: 597 from 175158
train data has 43562 entries
validation data has 131596 entries
validation data has 0 TPP1 tasks (unseen tcr & seen epitopes).
validation data has 129170 TPP2 tasks (unseen tcr & seen epitopes).
validation data has 2426 TPP3 tasks (unseen tcr & unseen epitope).
the train/validation ratio is 0.24870117265554526/0.7512988273444547
96565 entries will be shifted from validation to train so the train/validation ratio can be 0.8/0.2
16302 entries will be shifted from validation to train so the tpp1/tpp2 ratio can be 0.5/0.5
16303 entries need to be shifted from train to validation so the tpp1/tpp2 ratio can be 0.5/0.5
train data has 140126 entries
validation data has 35032 entries
validation data has 16303 TPP1 tasks (seen tcr & seen epitopes).
validation data has 16303 TPP2 tasks (unseen tcr & seen epitopes).
validation data has 2426 TPP3 tasks (unseen tcr & unseen epitope).
the train/valida

## Negative Data

In [62]:
#Daten einlesen

combined_donors_path = f'{pipeline_data_plain}/10x/combined_donors_consensus_annotations.csv'
all_donors_consensus = pd.read_csv(combined_donors_path, sep=',')

print("Consensus: ", all_donors_consensus.head())

all_donors_meta_path = f'{pipeline_data_plain}/10x/meta.csv'
all_donors_meta = pd.read_csv(all_donors_meta_path, sep=',')

print("Meta: ", all_donors_meta.head())

Consensus:                 barcode   donor  \
0   AAACCTGAGACAAAGG-4  donor1   
1  AAACCTGAGACTGTAA-34  donor1   
2   AAACCTGAGAGCCCAA-5  donor1   
3  AAACCTGAGAGCTGCA-24  donor1   
4   AAACCTGAGAGGGATA-8  donor1   

                                  cell_clono_cdr3_aa  \
0  TRA:CAASVSIWTGTASKLTF;TRA:CAAWDMEYGNKLVF;TRB:C...   
1                                    TRB:CASDTPVGQFF   
2                 TRA:CASYTDKLIF;TRB:CASSGGSISTDTQYF   
3                                 TRB:CASSGGQSSYEQYF   
4          TRA:CAASGYGNTGRRALTF;TRB:CASSQDPAGGYNEQFF   

                                  cell_clono_cdr3_nt     CD3  CD19  CD45RA  \
0  TRA:TGTGCAGCAAGCGTTAGTATTTGGACCGGCACTGCCAGTAAA...  2125.0   0.0   912.0   
1              TRB:TGTGCCAGCGATACCCCGGTTGGGCAGTTCTTC  1023.0   0.0  2028.0   
2  TRA:TGTGCTTCCTACACCGACAAGCTCATCTTT;TRB:TGCGCCA...  1598.0   3.0  3454.0   
3     TRB:TGCGCCAGCAGTGGCGGACAGAGCTCCTACGAGCAGTACTTC   298.0   1.0   880.0   
4  TRA:TGTGCAGCAAGCGGGTATGGAAACACGGGCAGGAGAGCACTT...  10

  all_donors_meta = pd.read_csv(all_donors_meta_path, sep=',')


### Beta

In [66]:
#Dieser Code für ganzen Datensatz laufen lassen
import re
import pandas as pd

# Festlegen der Batch-Größe für die Verarbeitung
batch_size = 1000  # Passe diese Zahl je nach Speicherressourcen an

# Identifizieren von Epitope-Spalten, aber ohne "NR(B0801)_AAKGRGAAL_NC_binder"
epitope_columns = [col for col in all_donors_consensus.columns if '_binder' in col and col != "NR(B0801)_AAKGRGAAL_NC_binder"]

# Liste für alle Batch-Ergebnisse
all_batches = []

# Verarbeite `all_donors_consensus` in Batches
for batch_start in range(0, len(all_donors_consensus), batch_size):
    print("Batch Start: ", batch_start)
    # Batch definieren
    batch = all_donors_consensus.iloc[batch_start:batch_start + batch_size]
    
    # Filtern auf Zeilen, die 'TRB:' enthalten
    batch_trb = batch[batch['cell_clono_cdr3_aa'].str.contains("TRB:", na=False)]

    # Liste, um Zeilen für diesen Batch zu speichern
    expanded_rows = []
    
    # Iteriere durch jede Zeile im Batch
    for _, row in batch_trb.iterrows():
        for col in epitope_columns:
            # Extrahiere MHC und Epitope
            match = re.match(r'([A-Z0-9]+)_([A-Z]+)_.*_binder', col)
            if match:
                mhc_raw, epitope = match.groups()
                mhc_formatted = f'HLA-{mhc_raw[0]}*{mhc_raw[1:3]}:{mhc_raw[3:]}'

                # Füge `Epitope` und `MHC` zur Zeile hinzu
                new_row = row.copy()
                new_row['Epitope'] = epitope
                new_row['MHC'] = mhc_formatted

                # Füge neue Zeile zur Batch-Liste hinzu
                expanded_rows.append(new_row)
    
    # Erstelle einen DataFrame aus dem Batch
    batch_df = pd.DataFrame(expanded_rows)
    all_batches.append(batch_df)  # Speichere den Batch in der Liste

# Kombiniere alle Batch-Ergebnisse zu einem DataFrame
expanded_df = pd.concat(all_batches, ignore_index=True)

# Nur die TRB-Chain-Einträge in `all_donors_meta` beibehalten
all_donors_meta_trb = all_donors_meta[all_donors_meta['chain'] == 'TRB']

# Zusammenführen der beiden DataFrames basierend auf der 'barcode' Spalte
merged_df = pd.merge(all_donors_meta_trb, expanded_df[['barcode', 'Epitope', 'MHC']], on='barcode', how='inner')

# Spalten umbenennen und Format anpassen
merged_df = merged_df.rename(columns={
    'barcode': 'TCR_name',
    'v_gene': 'TRBV',
    'j_gene': 'TRBJ',
    'c_gene': 'TRBC',
    'cdr3': 'TRB_CDR3'
})

# Fehlende Spalten auffüllen
desired_columns = ['TCR_name', 'TRBV', 'TRBJ', 'TRB_CDR3', 'TRBC', 'Epitope', 'MHC', 'Binding', 'task']
for col in desired_columns:
    if col not in merged_df.columns:
        merged_df[col] = 'nan' if col == 'task' else '0'

# Nur die gewünschten Spalten beibehalten und Zeilen mit `None` in `TRB_CDR3` entfernen
final_df = merged_df[desired_columns]
final_df = final_df[final_df['TRB_CDR3'] != 'None']

final_df = final_df[final_df['TRB_CDR3'].notna() & (final_df['TRB_CDR3'] != '')]

# Ausgabe des ersten Teils des Ergebnisses zur Überprüfung
print(final_df.head())

# Speichern des kombinierten DataFrames
output_path = f'{pipeline_data_plain}/10x/combined_output_with_epitope_mhc_TRB_only_expanded-all.csv'
final_df.to_csv(output_path, index=False)


Batch Start:  0
Batch Start:  1000
Batch Start:  2000
Batch Start:  3000
Batch Start:  4000
Batch Start:  5000
Batch Start:  6000
Batch Start:  7000
Batch Start:  8000
Batch Start:  9000
Batch Start:  10000
Batch Start:  11000
Batch Start:  12000
Batch Start:  13000
Batch Start:  14000
Batch Start:  15000
Batch Start:  16000
Batch Start:  17000
Batch Start:  18000
Batch Start:  19000
Batch Start:  20000
Batch Start:  21000
Batch Start:  22000
Batch Start:  23000
Batch Start:  24000
Batch Start:  25000
Batch Start:  26000
Batch Start:  27000
Batch Start:  28000
Batch Start:  29000
Batch Start:  30000
Batch Start:  31000
Batch Start:  32000
Batch Start:  33000
Batch Start:  34000
Batch Start:  35000
Batch Start:  36000
Batch Start:  37000
Batch Start:  38000
Batch Start:  39000
Batch Start:  40000
Batch Start:  41000
Batch Start:  42000
Batch Start:  43000
Batch Start:  44000
Batch Start:  45000
Batch Start:  46000
Batch Start:  47000
Batch Start:  48000
Batch Start:  49000
Batch Start: 

In [86]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Daten laden
beta = pd.read_csv(f'{pipeline_data_plain}/10x/combined_output_with_epitope_mhc_TRB_only_expanded-all.csv', sep=',')

# Schritt 1: Aufteilung in Train und Validation (keine Test-Daten mehr)
train_split, validation_split = train_test_split(beta, test_size=0.2, random_state=42)  # 20% für Validation

# Positive Samples laden
train_preneg = pd.read_csv(f'{pipeline_data_splitted}/{precision}/beta/train_prenegsamples.tsv', sep='\t')
validation_preneg = pd.read_csv(f'{pipeline_data_splitted}/{precision}/beta/validation_prenegsamples.tsv', sep='\t')

# Anzahl positiver Samples
num_train_pos = len(train_preneg)
num_validation_pos = len(validation_preneg)

# Zielgrößen für negative Samples
train_neg_needed = num_train_pos
validation_neg_needed = num_validation_pos * 5

# Filter VDJdb TCRs und Epitope aus den Negativdaten
vdjdb_tcrs = set(vdjdb_df['TRB_CDR3'])
vdjdb_epitopes = set(vdjdb_df['Epitope'])

def filter_negatives(df):
    return df[
        ~df['TRB_CDR3'].isin(vdjdb_tcrs) &
        ~df['Epitope'].isin(vdjdb_epitopes)
    ]

# Funktion zur Sicherstellung, dass alle unique Epitope erhalten bleiben
def ensure_unique_epitopes(df, target_count):
    unique_epitopes = df['Epitope'].unique()
    guaranteed_samples = []
    
    # Stelle sicher, dass jedes Epitope mindestens einmal vorkommt
    for epitope in unique_epitopes:
        epitope_group = df[df['Epitope'] == epitope]
        if len(epitope_group) > 0:
            guaranteed_samples.append(epitope_group.sample(1, random_state=42))
    
    # Kombiniere garantierte Samples
    guaranteed_df = pd.concat(guaranteed_samples, ignore_index=True)
    
    # Berechne verbleibende Anzahl an Samples
    remaining_count = target_count - len(guaranteed_df)
    if remaining_count > 0:
        remaining_samples = df.sample(remaining_count, random_state=42, replace=True)
        return pd.concat([guaranteed_df, remaining_samples], ignore_index=True)
    return guaranteed_df

# Filter negative Daten, die nicht in VDJdb vorkommen
train_filtered_negatives = filter_negatives(train_split)
validation_filtered_negatives = filter_negatives(validation_split)

# Balancierung der Splits
train_balanced_negatives = ensure_unique_epitopes(train_filtered_negatives, train_neg_needed)
validation_balanced_negatives = ensure_unique_epitopes(validation_filtered_negatives, validation_neg_needed)

# Positive und negative Samples kombinieren
train_combined = pd.concat([train_preneg, train_balanced_negatives], ignore_index=True)
validation_combined = pd.concat([validation_preneg, validation_balanced_negatives], ignore_index=True)

# Speichern der kombinierten Datensätze
output_dir = f'{pipeline_data_splitted}/{precision}/beta/'
train_combined.to_csv(output_dir + "train.tsv", sep='\t', index=False)
validation_combined.to_csv(output_dir + "validation.tsv", sep='\t', index=False)

# Berechne Unique Werte
unique_tcr_train = train_combined['TRB_CDR3'].nunique()
unique_epitope_train = train_combined['Epitope'].nunique()
unique_tcr_validation = validation_combined['TRB_CDR3'].nunique()
unique_epitope_validation = validation_combined['Epitope'].nunique()

train_binding_counts = train_combined['Binding'].value_counts()
validation_binding_counts = validation_combined['Binding'].value_counts()

# Finale Ausgabe
print("\nAlle Datensätze wurden erfolgreich gespeichert.")
print(f"Train: {len(train_combined)} Einträge")
print(f"- Unique TCRs: {unique_tcr_train}")
print(f"- Unique Epitope: {unique_epitope_train}")
print(f"- Positive (binding == 1): {train_binding_counts.get(1, 0)}")
print(f"- Negative (binding == 0): {train_binding_counts.get(0, 0)}")

print(f"\nValidation: {len(validation_combined)} Einträge")
print(f"- Unique TCRs: {unique_tcr_validation}")
print(f"- Unique Epitope: {unique_epitope_validation}")
print(f"- Positive (binding == 1): {validation_binding_counts.get(1, 0)}")
print(f"- Negative (binding == 0): {validation_binding_counts.get(0, 0)}")

  train_preneg = pd.read_csv(f'{pipeline_data_splitted}/{precision}/beta/train_prenegsamples.tsv', sep='\t')



Alle Datensätze wurden erfolgreich gespeichert.
Train: 280252 Einträge
- Unique TCRs: 177220
- Unique Epitope: 987
- Positive (binding == 1): 140126
- Negative (binding == 0): 140126

Validation: 210192 Einträge
- Unique TCRs: 71316
- Unique Epitope: 1456
- Positive (binding == 1): 35032
- Negative (binding == 0): 175160


### Paired

In [68]:
import re
import pandas as pd

# Annahme: all_donors_consensus und all_donors_meta sind bereits geladen und gefiltert

# Festlegen der Batch-Größe für die Verarbeitung
batch_size = 1000

# Identifizieren von Epitope-Spalten, aber ohne "NR(B0801)_AAKGRGAAL_NC_binder"
epitope_columns = [col for col in all_donors_consensus.columns if '_binder' in col and col != "NR(B0801)_AAKGRGAAL_NC_binder"]

# Ausgabe-Datei für Batch-Ergebnisse
output_batch_file = f'{pipeline_data_plain}/10x/expanded_batches.csv'

# Stelle sicher, dass die Ausgabedatei leer ist
with open(output_batch_file, 'w') as f:
    f.write('')  # Leere Datei erstellen

# Verarbeite `all_donors_consensus` in Batches
for batch_start in range(0, len(all_donors_consensus), batch_size):
    print(f"Batch Start: {batch_start}")
    # Definiere Batch
    batch = all_donors_consensus.iloc[batch_start:batch_start + batch_size]
    
    # Filtern auf Zeilen, die sowohl 'TRA:' als auch 'TRB:' in 'cell_clono_cdr3_aa' enthalten
    batch_paired = batch[
        batch['cell_clono_cdr3_aa'].str.contains("TRA:", na=False) &
        batch['cell_clono_cdr3_aa'].str.contains("TRB:", na=False)
    ]

    # Liste, um Zeilen für diesen Batch zu speichern
    expanded_rows = []
    
    # Iteriere durch jede Zeile im Batch
    for _, row in batch_paired.iterrows():
        for col in epitope_columns:
            # Extrahiere MHC und Epitope
            match = re.match(r'([A-Z0-9]+)_([A-Z]+)_.*_binder', col)
            if match:
                mhc_raw, epitope = match.groups()
                mhc_formatted = f'HLA-{mhc_raw[0]}*{mhc_raw[1:3]}:{mhc_raw[3:]}'

                # Füge `Epitope` und `MHC` zur Zeile hinzu
                new_row = row.copy()
                new_row['Epitope'] = epitope
                new_row['MHC'] = mhc_formatted

                # Neue Zeile zur Batch-Liste hinzufügen
                expanded_rows.append(new_row)

    # Erstelle einen DataFrame aus dem Batch
    batch_df = pd.DataFrame(expanded_rows)

    # Füge den Batch direkt in die Datei ein
    batch_df.to_csv(output_batch_file, mode='a', index=False, header=not batch_start, sep=',')
    print(f"Batch {batch_start} gespeichert.")

# Laden der gespeicherten Batch-Ergebnisse
expanded_df = pd.read_csv(output_batch_file)

# Nur die Paired-Einträge in `all_donors_meta` beibehalten
# Filtern auf Barcodes, die sowohl eine TRA- als auch eine TRB-Kette haben
paired_barcodes = all_donors_meta.groupby('barcode').filter(
    lambda x: set(x['chain']) == {'TRA', 'TRB'}
)['barcode'].unique()
all_donors_meta_paired = all_donors_meta[all_donors_meta['barcode'].isin(paired_barcodes)]

# Split `all_donors_meta_paired` nach `chain` in separate DataFrames für TRA und TRB
alpha_chain = all_donors_meta_paired[all_donors_meta_paired['chain'] == 'TRA'].rename(
    columns={'v_gene': 'TRAV', 'j_gene': 'TRAJ', 'cdr3': 'TRA_CDR3', 'c_gene': 'TRAC'}
)
beta_chain = all_donors_meta_paired[all_donors_meta_paired['chain'] == 'TRB'].rename(
    columns={'v_gene': 'TRBV', 'j_gene': 'TRBJ', 'cdr3': 'TRB_CDR3', 'c_gene': 'TRBC'}
)

# Zusammenführen von alpha_chain und beta_chain anhand der gemeinsamen 'barcode'-Spalte
paired_meta = pd.merge(alpha_chain, beta_chain, on='barcode', suffixes=('_alpha', '_beta'))

# Zusammenführen von `paired_meta` mit `expanded_df` anhand der 'barcode'-Spalte
merged_df = pd.merge(paired_meta, expanded_df[['barcode', 'Epitope', 'MHC']], on='barcode', how='inner')

# Spalten umbenennen und Format anpassen
merged_df = merged_df.rename(columns={'barcode': 'TCR_name'})

# Fehlende Spalten auffüllen
desired_columns = [
    'TCR_name', 'TRAV', 'TRAJ', 'TRA_CDR3', 'TRBV', 'TRBJ', 'TRB_CDR3', 'TRAC', 'TRBC', 
    'Epitope', 'MHC', 'Binding', 'task'
]
for col in desired_columns:
    if col not in merged_df.columns:
        merged_df[col] = 'nan' if col == 'task' else '0'

# Nur die gewünschten Spalten beibehalten und Zeilen mit `None` in `TRB_CDR3` entfernen
final_df = merged_df[desired_columns]
final_df = final_df[final_df['TRB_CDR3'] != 'None']

final_df = final_df[
    final_df['TRB_CDR3'].notna() & (final_df['TRB_CDR3'] != '') &
    final_df['TRA_CDR3'].notna() & (final_df['TRA_CDR3'] != '')
]

# Ausgabe des ersten Teils des Ergebnisses zur Überprüfung
print(final_df.head())

# Optional: Speichern des kombinierten DataFrames
output_path = f'{pipeline_data_plain}/10x/combined_output_with_epitope_mhc_paired_only_expanded-all.csv'
final_df.to_csv(output_path, index=False)
print("Datei erfolgreich gespeichert!")


Batch Start: 0
Batch 0 gespeichert.
Batch Start: 1000
Batch 1000 gespeichert.
Batch Start: 2000
Batch 2000 gespeichert.
Batch Start: 3000
Batch 3000 gespeichert.
Batch Start: 4000
Batch 4000 gespeichert.
Batch Start: 5000
Batch 5000 gespeichert.
Batch Start: 6000
Batch 6000 gespeichert.
Batch Start: 7000
Batch 7000 gespeichert.
Batch Start: 8000
Batch 8000 gespeichert.
Batch Start: 9000
Batch 9000 gespeichert.
Batch Start: 10000
Batch 10000 gespeichert.
Batch Start: 11000
Batch 11000 gespeichert.
Batch Start: 12000
Batch 12000 gespeichert.
Batch Start: 13000
Batch 13000 gespeichert.
Batch Start: 14000
Batch 14000 gespeichert.
Batch Start: 15000
Batch 15000 gespeichert.
Batch Start: 16000
Batch 16000 gespeichert.
Batch Start: 17000
Batch 17000 gespeichert.
Batch Start: 18000
Batch 18000 gespeichert.
Batch Start: 19000
Batch 19000 gespeichert.
Batch Start: 20000
Batch 20000 gespeichert.
Batch Start: 21000
Batch 21000 gespeichert.
Batch Start: 22000
Batch 22000 gespeichert.
Batch Start: 2

In [87]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Daten laden
paired_df = pd.read_csv(f'{pipeline_data_plain}/10x/combined_output_with_epitope_mhc_paired_only_expanded-all.csv', sep=',')

# Schritt 1: Aufteilung in Train und Validation (keine Test-Daten mehr)
train_split, validation_split = train_test_split(paired_df, test_size=0.2, random_state=42)  # 20% für Validation

# Positive Samples laden >> files zuerst umbenennen
train_preneg = pd.read_csv(f'{pipeline_data_splitted}/{precision}/paired/train_prenegsamples.tsv', sep='\t')
validation_preneg = pd.read_csv(f'{pipeline_data_splitted}/{precision}/paired/validation_prenegsamples.tsv', sep='\t')

# Anzahl positiver Samples
num_train_pos = len(train_preneg)
num_validation_pos = len(validation_preneg)

# Zielgrößen für negative Samples
train_neg_needed = num_train_pos
validation_neg_needed = num_validation_pos * 5

# Funktion zur Sicherstellung, dass alle unique Epitope erhalten bleiben
def ensure_unique_epitopes(df, target_count):
    unique_epitopes = df['Epitope'].unique()
    guaranteed_samples = []
    
    # Stelle sicher, dass jedes Epitope mindestens einmal vorkommt
    for epitope in unique_epitopes:
        epitope_group = df[df['Epitope'] == epitope]
        if len(epitope_group) > 0:
            guaranteed_samples.append(epitope_group.sample(1, random_state=42))
    
    # Kombiniere garantierte Samples
    guaranteed_df = pd.concat(guaranteed_samples, ignore_index=True)
    
    # Berechne verbleibende Anzahl an Samples
    remaining_count = target_count - len(guaranteed_df)
    if remaining_count > 0:
        remaining_samples = df.sample(remaining_count, random_state=42, replace=True)
        return pd.concat([guaranteed_df, remaining_samples], ignore_index=True)
    return guaranteed_df

# Balancierung der Splits
train_balanced_negatives = ensure_unique_epitopes(train_split, train_neg_needed)
validation_balanced_negatives = ensure_unique_epitopes(validation_split, validation_neg_needed)

# Positive und negative Samples kombinieren
train_combined = pd.concat([train_preneg, train_balanced_negatives], ignore_index=True)
validation_combined = pd.concat([validation_preneg, validation_balanced_negatives], ignore_index=True)

# Speichern der kombinierten Datensätze
output_dir = f'{pipeline_data_splitted}/{precision}/paired/'
train_combined.to_csv(output_dir + "train.tsv", sep='\t', index=False)
validation_combined.to_csv(output_dir + "validation.tsv", sep='\t', index=False)

# Berechne Unique Werte
unique_tcr_train = train_combined['TRB_CDR3'].nunique()
unique_epitope_train = train_combined['Epitope'].nunique()
unique_tcr_validation = validation_combined['TRB_CDR3'].nunique()
unique_epitope_validation = validation_combined['Epitope'].nunique()

# Finale Ausgabe
print("\nAlle Datensätze wurden erfolgreich gespeichert.")
print(f"Train: {len(train_combined)} Einträge")
print(f"-Unique TCRs: {unique_tcr_train}")
print(f"-Unique Epitope: {unique_epitope_train}")
print(f"Validation: {len(validation_combined)} Einträge")
print(f"-Unique TCRs: {unique_tcr_validation}")
print(f"-Unique Epitope: {unique_epitope_validation}")


Alle Datensätze wurden erfolgreich gespeichert.
Train: 41522 Einträge
-Unique TCRs: 22801
-Unique Epitope: 520
Validation: 31134 Einträge
-Unique TCRs: 15392
-Unique Epitope: 1031


## Task Classification 
The classification in the split notebook correct for positive only data. After adding negative data, some classifications might be wrong.

In [88]:
paired_output_folder = f'{pipeline_data_splitted}/{precision}/paired'
validation_file_name = 'validation.tsv'
train_file_name = 'train.tsv'
beta_output_folder = f'{pipeline_data_splitted}/{precision}/beta'

In [89]:
# do the classification for paired data
paired = True
train_data_path = f'{paired_output_folder}/{train_file_name}'
validation_data_path = f'{paired_output_folder}/{validation_file_name}'

%run data_scripts/data_preparation/classification_notest.ipynb

train data has 41522 entries
validate data has 31134 entries
validate data has 15339 TPP1 tasks (seen tcr & seen epitopes).
validate data has 12190 TPP2 tasks (unseen tcr & seen epitopes).
validate data has 3605 TPP3 tasks (unseen tcr & unseen epitope).
validate data has 0 TPP4 tasks (seen tcr & unseen epitope).


In [90]:
# extended classification for paired data
paired = True
train_path = f'{paired_output_folder}/{train_file_name}'
validation_path = f'{paired_output_folder}/{validation_file_name}'
output_path = f'{paired_output_folder}/validate_reclassified_paired_specific.tsv'
paired_data_path = paired_output_folder
alpha_cdr3_name = 'TRA_CDR3'
beta_cdr3_name = 'TRB_CDR3'
epitope_name = 'Epitope'
task_name = 'task'

%run data_scripts/data_preparation/paired_reclassification_notest.ipynb

allele
train data has 41522 entries
validate data has 31134 entries
validate data has 21747 TPP1 tasks (old value: 15339) (seen tcr & seen epitopes).
validate data has 5782 TPP2 tasks (old value: 12190) (unseen tcr & seen epitopes).
validate data has 3063 TPP3 tasks (old value: 3605) (unseen tcr & unseen epitope).
validate data has 542 TPP4 tasks (old value: 0) (seen tcr & unseen epitope).
the train/validate ratio is 0.5714875578066505/0.4285124421933495
../../data/splitted_datasets/allele/paired/validate_reclassified_paired_specific.tsv
/home/ubuntu/arina/BA-Cancer-Immunotherapy
uploading dataset to dataset-allele


[34m[1mwandb[0m: Adding directory to artifact (./../../data/splitted_datasets/allele/paired)... Done. 0.3s


VBox(children=(Label(value='0.009 MB of 0.009 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [91]:
# do the classification for beta data
paired = False
train_data_path = f'{beta_output_folder}/{train_file_name}'
validation_data_path = f'{beta_output_folder}/{validation_file_name}'

%run data_scripts/data_preparation/classification_notest.ipynb

  df_train = pd.read_csv(train_data_path, sep="\t")
  df_validation = pd.read_csv(validation_data_path, sep="\t")


train data has 280252 entries
validate data has 210192 entries
validate data has 163334 TPP1 tasks (seen tcr & seen epitopes).
validate data has 44432 TPP2 tasks (unseen tcr & seen epitopes).
validate data has 2412 TPP3 tasks (unseen tcr & unseen epitope).
validate data has 14 TPP4 tasks (seen tcr & unseen epitope).


In the next two cells the classification is checked. If the output says "Classification is correct", everything is fine.

In [92]:
# check task classification paired
paired = True
splitted_data_path = paired_output_folder

%run data_scripts/data_preparation/check_task_classification_paired_notest.ipynb

train data has 41522 entries
validate data has 31134 entries
validate data has 15339 TPP1 tasks (seen tcr & seen epitopes).
validate data has 12190 TPP2 tasks (unseen tcr & seen epitopes).
validate data has 3605 TPP3 tasks (unseen tcr & unseen epitope).
validate data has 0 TPP4 tasks (seen tcr & unseen epitope).
the train/validate ratio is 0.5714875578066505/0.4285124421933495
Classification is correct.
Correctness summary:
is_correct
True    31134
Name: count, dtype: int64


In [93]:
# check task classification beta
paired = False
splitted_data_path = beta_output_folder

%run data_scripts/data_preparation/check_task_classification_beta_notest.ipynb

  df_train = pd.read_csv(f"{splitted_data_path}/{train_file_name}", sep="\t")


train data has 280252 entries
validate data has 210192 entries
validate data has 163334 TPP1 tasks (seen tcr & seen epitopes).
validate data has 44432 TPP2 tasks (unseen tcr & seen epitopes).
validate data has 2412 TPP3 tasks (unseen tcr & unseen epitope).
validate data has 14 TPP4 tasks (seen tcr & unseen epitope).
the train/validate ratio is 0.5714250760535351/0.42857492394646485
Classification is correct.
Correctness summary:
is_correct
True    210192
Name: count, dtype: int64


## Upload dataset

In [94]:
import os
print(os.listdir(path_to_data))


['.ipynb_checkpoints', 'validation_prenegsamples.tsv', 'test.tsv', 'train.tsv', 'test_reclassified_paired_specific.tsv', 'validation.tsv', 'train_prenegsamples.tsv', 'validate_reclassified_paired_specific.tsv', 'test_prenegsamples.tsv']


In [95]:
from dotenv import load_dotenv, find_dotenv
load_dotenv()

# upload paired data
path_to_data = f'{pipeline_data_splitted}/{precision}/paired'
dataset_name = f'paired_{precision}'
#main_project_name = os.getenv("MAIN_PROJECT_NAME")
main_project_name = f"dataset-{precision}"

%run data_scripts/upload_datasets.ipynb

uploading dataset to dataset-allele


[34m[1mwandb[0m: Adding directory to artifact (./../../data/splitted_datasets/allele/paired)... Done. 0.2s


In [96]:
# upload beta data
path_to_data = f'{pipeline_data_splitted}/{precision}/beta'
dataset_name = f'beta_{precision}'

%run data_scripts/upload_datasets.ipynb

uploading dataset to dataset-allele


[34m[1mwandb[0m: Adding directory to artifact (./../../data/splitted_datasets/allele/beta)... Done. 1.0s


VBox(children=(Label(value='1.197 MB of 46.627 MB uploaded\r'), FloatProgress(value=0.025671545897824336, max=…

## Create Embeddings >> ProtBert

In [97]:
import torch
print(torch.cuda.is_available())  # Sollte True zurückgeben
print(torch.version.cuda)  # Sollte die richtige CUDA-Version anzeigen

True
12.4


In [None]:
path_paired_validation = f"{pipeline_data_splitted}/{precision}/paired/validation.tsv"
path_paired_train = f"{pipeline_data_splitted}/{precision}/paired/train.tsv"
path_beta_validation = f"{pipeline_data_splitted}/{precision}/beta/validation.tsv"
path_beta_train = f"{pipeline_data_splitted}/{precision}/beta/train.tsv"


path_paired = f"{pipeline_data}/embeddings/temp/{precision}/paired_concatenated.tsv"
create_folders_if_not_exists([os.path.dirname(path_paired)])
df_paired_validation = pd.read_csv(path_paired_validation, sep="\t", index_col=False)
df_paired_train = pd.read_csv(path_paired_train, sep="\t", index_col=False)
df_paired = pd.concat([df_paired_validation, df_paired_train])
df_paired.to_csv(path_paired, sep="\t", index=False)

# paired
%run data_scripts/generateEmbeddingsProtBERT.py paired {path_paired} {pipeline_data}/embeddings/paired/{precision}/TRA_paired_embeddings.npz TRA_CDR3
%run data_scripts/generateEmbeddingsProtBERT.py paired {path_paired} {pipeline_data}/embeddings/paired/{precision}/TRB_paired_embeddings.npz TRB_CDR3
%run data_scripts/generateEmbeddingsProtBERT.py paired {path_paired} {pipeline_data}/embeddings/paired/{precision}/Epitope_paired_embeddings.npz Epitope

path_beta = f"{pipeline_data}/embeddings/temp/{precision}/beta_concatenated.tsv"
create_folders_if_not_exists([os.path.dirname(path_beta)])
df_beta_validation = pd.read_csv(path_beta_validation, sep="\t", index_col=False)
df_beta_train = pd.read_csv(path_beta_train, sep="\t", index_col=False)
df_beta = pd.concat([df_beta_validation, df_beta_train])
df_beta.to_csv(path_beta, sep="\t", index=False)

# beta
%run data_scripts/generateEmbeddingsProtBERT.py beta {path_beta} {pipeline_data}/embeddings/beta/{precision}/TRB_beta_embeddings.npz TRB_CDR3
%run data_scripts/generateEmbeddingsProtBERT.py beta {path_beta} {pipeline_data}/embeddings/beta/{precision}/Epitope_beta_embeddings.npz Epitope

Using GPU: Tesla T4
Loading: Rostlab/prot_bert
Model is on device: cuda:0
Processing Batch:  0 64
Processing Batch:  64 128
Processing Batch:  128 192
Processing Batch:  192 256
Processing Batch:  256 320
Processing Batch:  320 384
Processing Batch:  384 448
Processing Batch:  448 512
Processing Batch:  512 576
Processing Batch:  576 640
Processing Batch:  640 704
Processing Batch:  704 768
Processing Batch:  768 832
Processing Batch:  832 896
Processing Batch:  896 960
Processing Batch:  960 1024
Processing Batch:  1024 1088
Processing Batch:  1088 1152
Processing Batch:  1152 1216
Processing Batch:  1216 1280
Processing Batch:  1280 1344
Processing Batch:  1344 1408
Processing Batch:  1408 1472
Processing Batch:  1472 1536
Processing Batch:  1536 1600
Processing Batch:  1600 1664
Processing Batch:  1664 1728
Processing Batch:  1728 1792
Processing Batch:  1792 1856
Processing Batch:  1856 1920
Processing Batch:  1920 1984
Processing Batch:  1984 2048
Processing Batch:  2048 2112
Proc

In [9]:
import numpy as np

# Funktion, um Embeddings korrekt zu laden
def load_embeddings(file_path):
    npz_data = np.load(file_path)
    all_keys = list(npz_data.keys())

    # Falls Embeddings als einzelne Sequenzen gespeichert sind
    if len(all_keys) > 1:
        all_values = [npz_data[k] for k in all_keys]
        return np.vstack(all_values)  # Alles zusammenfügen
    else:
        return npz_data[all_keys[0]]

# Embeddings für TRA, TRB und Epitope laden
tra_embeddings = load_embeddings(f"{pipeline_data}/embeddings/paired/{precision}/TRA_paired_embeddings.npz")
trb_embeddings = load_embeddings(f"{pipeline_data}/embeddings/paired/{precision}/TRB_paired_embeddings.npz")
epitope_embeddings = load_embeddings(f"{pipeline_data}/embeddings/paired/{precision}/Epitope_paired_embeddings.npz")

# Ausgabe der finalen Shapes
print(f"📌 TRA Embedding Shape: {tra_embeddings.shape}")
print(f"📌 TRB Embedding Shape: {trb_embeddings.shape}")
print(f"📌 Epitope Embedding Shape: {epitope_embeddings.shape}")


📌 TRA Embedding Shape: (596417, 1024)
📌 TRB Embedding Shape: (694427, 1024)
📌 Epitope Embedding Shape: (12870, 1024)


In [11]:
import pandas as pd

# Beispielpfade für Train-, Test-, und Validierungsdatensätze für alle vier Kategorien
base_path = pipeline_data_splitted

# Definierte Pfade für alle vier Kategorien
datasets = {
    "paired_gene": {
        "train": f"{base_path}/gene/paired/train.tsv",
        "test": f"{base_path}/gene/paired/test.tsv",
        "validation": f"{base_path}/gene/paired/validation.tsv"
    },
    "beta_gene": {
        "train": f"{base_path}/gene/beta/train.tsv",
        "test": f"{base_path}/gene/beta/test.tsv",
        "validation": f"{base_path}/gene/beta/validation.tsv"
    }
}

# Berechnung der Anzahl der Zeilen für jedes Set
results = {}
for dataset_name, paths in datasets.items():
    # Daten laden
    train_df = pd.read_csv(paths["train"], sep='\t')
    test_df = pd.read_csv(paths["test"], sep='\t')
    validation_df = pd.read_csv(paths["validation"], sep='\t')
    
    # Anzahl der Zeilen berechnen
    train_length = len(train_df)
    test_length = len(test_df)
    validation_length = len(validation_df)
    total_length = train_length + test_length + validation_length
    
    # Zähle die Anzahl der Bindings 1 und 0 in jedem Datensatz
    train_binding_counts = train_df['Binding'].value_counts()
    test_binding_counts = test_df['Binding'].value_counts()
    validation_binding_counts = validation_df['Binding'].value_counts()
    
    # Zähle die Anzahl der TPP1, TPP2, TPP3 Einträge in jedem Datensatz
    train_task_counts = train_df['task'].value_counts()
    test_task_counts = test_df['task'].value_counts()
    validation_task_counts = validation_df['task'].value_counts()

    # Ergebnisse speichern
    results[dataset_name] = {
        "Train": train_length,
        "Train_Binding_1": train_binding_counts.get(1, 0),
        "Train_Binding_0": train_binding_counts.get(0, 0),
        "Train_TPP1": train_task_counts.get("TPP1", 0),
        "Train_TPP2": train_task_counts.get("TPP2", 0),
        "Train_TPP3": train_task_counts.get("TPP3", 0),
        "Train_TPP4": train_task_counts.get("TPP4", 0),
        "Test": test_length,
        "Test_Binding_1": test_binding_counts.get(1, 0),
        "Test_Binding_0": test_binding_counts.get(0, 0),
        "Test_TPP1": test_task_counts.get("TPP1", 0),
        "Test_TPP2": test_task_counts.get("TPP2", 0),
        "Test_TPP3": test_task_counts.get("TPP3", 0),
        "Test_TPP4": test_task_counts.get("TPP4", 0),
        "Validation": validation_length,
        "Validation_Binding_1": validation_binding_counts.get(1, 0),
        "Validation_Binding_0": validation_binding_counts.get(0, 0),
        "Validation_TPP1": validation_task_counts.get("TPP1", 0),
        "Validation_TPP2": validation_task_counts.get("TPP2", 0),
        "Validation_TPP3": validation_task_counts.get("TPP3", 0),
        "Validation_TPP4": validation_task_counts.get("TPP4", 0),
        "Total": total_length
    }

# Ergebnisse anzeigen
for dataset, lengths in results.items():
    print(f'--- {dataset.replace("_", " ").title()} ---')
    print(f'Anzahl der Zeilen im Trainingsdatensatz: {lengths["Train"]} (Binding=1: {lengths["Train_Binding_1"]}, Binding=0: {lengths["Train_Binding_0"]}, TPP1: {lengths["Train_TPP1"]}, TPP2: {lengths["Train_TPP2"]}, TPP3: {lengths["Train_TPP3"]})')
    print(f'Anzahl der Zeilen im Testdatensatz: {lengths["Test"]} (Binding=1: {lengths["Test_Binding_1"]}, Binding=0: {lengths["Test_Binding_0"]}, TPP1: {lengths["Test_TPP1"]}, TPP2: {lengths["Test_TPP2"]}, TPP3: {lengths["Test_TPP3"]})')
    print(f'Anzahl der Zeilen im Validierungsdatensatz: {lengths["Validation"]} (Binding=1: {lengths["Validation_Binding_1"]}, Binding=0: {lengths["Validation_Binding_0"]}, TPP1: {lengths["Validation_TPP1"]}, TPP2: {lengths["Validation_TPP2"]}, TPP3: {lengths["Validation_TPP3"]})')
    print(f'Gesamtanzahl der Zeilen (Train + Test + Validation): {lengths["Total"]}\n')

# Optional: Ergebnisse in einer Übersichtstabelle darstellen
summary_data = []
for dataset, lengths in results.items():
    summary_data.append({
        "Dataset": dataset.replace("_", " ").title(),
        "Train": lengths["Train"],
        "Train_Binding_1": lengths["Train_Binding_1"],
        "Train_Binding_0": lengths["Train_Binding_0"],
        "Train_TPP1": lengths["Train_TPP1"],
        "Train_TPP2": lengths["Train_TPP2"],
        "Train_TPP3": lengths["Train_TPP3"],
        "Train_TPP4": lengths["Train_TPP4"],
        "Test": lengths["Test"],
        "Test_Binding_1": lengths["Test_Binding_1"],
        "Test_Binding_0": lengths["Test_Binding_0"],
        "Test_TPP1": lengths["Test_TPP1"],
        "Test_TPP2": lengths["Test_TPP2"],
        "Test_TPP3": lengths["Test_TPP3"],
        "Test_TPP4": lengths["Test_TPP4"],
        "Validation": lengths["Validation"],
        "Validation_Binding_1": lengths["Validation_Binding_1"],
        "Validation_Binding_0": lengths["Validation_Binding_0"],
        "Validation_TPP1": lengths["Validation_TPP1"],
        "Validation_TPP2": lengths["Validation_TPP2"],
        "Validation_TPP3": lengths["Validation_TPP3"],
        "Validation_TPP4": lengths["Validation_TPP4"],
        "Total": lengths["Total"]
    })

summary_df = pd.DataFrame(summary_data)
print(summary_df)


  train_df = pd.read_csv(paths["train"], sep='\t')


--- Paired Gene ---
Anzahl der Zeilen im Trainingsdatensatz: 67422 (Binding=1: 33711, Binding=0: 33711, TPP1: 0, TPP2: 0, TPP3: 0)
Anzahl der Zeilen im Testdatensatz: 43356 (Binding=1: 7226, Binding=0: 36130, TPP1: 27972, TPP2: 15095, TPP3: 289)
Anzahl der Zeilen im Validierungsdatensatz: 43344 (Binding=1: 7224, Binding=0: 36120, TPP1: 0, TPP2: 0, TPP3: 0)
Gesamtanzahl der Zeilen (Train + Test + Validation): 154122

--- Beta Gene ---
Anzahl der Zeilen im Trainingsdatensatz: 251750 (Binding=1: 125875, Binding=0: 125875, TPP1: 0, TPP2: 0, TPP3: 0)
Anzahl der Zeilen im Testdatensatz: 161844 (Binding=1: 26974, Binding=0: 134870, TPP1: 140896, TPP2: 20645, TPP3: 299)
Anzahl der Zeilen im Validierungsdatensatz: 161838 (Binding=1: 26973, Binding=0: 134865, TPP1: 0, TPP2: 0, TPP3: 0)
Gesamtanzahl der Zeilen (Train + Test + Validation): 575432

       Dataset   Train  Train_Binding_1  Train_Binding_0  Train_TPP1  \
0  Paired Gene   67422            33711            33711           0   
1    Bet

In [10]:
import pandas as pd

# Beispielpfade für Train-, Test-, und Validierungsdatensätze für alle vier Kategorien
base_path = pipeline_data_splitted

# Definierte Pfade für alle vier Kategorien
datasets = {
    "paired_gene": {
        "train": f"{base_path}/gene/paired/train.tsv",
        "test": f"{base_path}/gene/paired/test.tsv",
        "validation": f"{base_path}/gene/paired/validation.tsv"
    },
    "paired_allele": {
        "train": f"{base_path}/allele/paired/train.tsv",
        "test": f"{base_path}/allele/paired/test.tsv",
        "validation": f"{base_path}/allele/paired/validation.tsv"
    },
    "beta_gene": {
        "train": f"{base_path}/gene/beta/train.tsv",
        "test": f"{base_path}/gene/beta/test.tsv",
        "validation": f"{base_path}/gene/beta/validation.tsv"
    },
    "beta_allele": {
        "train": f"{base_path}/allele/beta/train.tsv",
        "test": f"{base_path}/allele/beta/test.tsv",
        "validation": f"{base_path}/allele/beta/validation.tsv"
    }
}

# Berechnung der Anzahl der Zeilen für jedes Set
results = {}
for dataset_name, paths in datasets.items():
    # Daten laden
    train_df = pd.read_csv(paths["train"], sep='\t')
    test_df = pd.read_csv(paths["test"], sep='\t')
    validation_df = pd.read_csv(paths["validation"], sep='\t')
    
    # Anzahl der Zeilen berechnen
    train_length = len(train_df)
    test_length = len(test_df)
    validation_length = len(validation_df)
    total_length = train_length + test_length + validation_length
    
    # Zähle die Anzahl der Bindings 1 und 0 in jedem Datensatz
    train_binding_counts = train_df['Binding'].value_counts()
    test_binding_counts = test_df['Binding'].value_counts()
    validation_binding_counts = validation_df['Binding'].value_counts()
    
    # Zähle die Anzahl der TPP1, TPP2, TPP3 Einträge in jedem Datensatz
    train_task_counts = train_df['task'].value_counts()
    test_task_counts = test_df['task'].value_counts()
    validation_task_counts = validation_df['task'].value_counts()

    # Ergebnisse speichern
    results[dataset_name] = {
        "Train": train_length,
        "Train_Binding_1": train_binding_counts.get(1, 0),
        "Train_Binding_0": train_binding_counts.get(0, 0),
        "Train_TPP1": train_task_counts.get("TPP1", 0),
        "Train_TPP2": train_task_counts.get("TPP2", 0),
        "Train_TPP3": train_task_counts.get("TPP3", 0),
        "Train_TPP4": train_task_counts.get("TPP4", 0),
        "Test": test_length,
        "Test_Binding_1": test_binding_counts.get(1, 0),
        "Test_Binding_0": test_binding_counts.get(0, 0),
        "Test_TPP1": test_task_counts.get("TPP1", 0),
        "Test_TPP2": test_task_counts.get("TPP2", 0),
        "Test_TPP3": test_task_counts.get("TPP3", 0),
        "Test_TPP4": test_task_counts.get("TPP4", 0),
        "Validation": validation_length,
        "Validation_Binding_1": validation_binding_counts.get(1, 0),
        "Validation_Binding_0": validation_binding_counts.get(0, 0),
        "Validation_TPP1": validation_task_counts.get("TPP1", 0),
        "Validation_TPP2": validation_task_counts.get("TPP2", 0),
        "Validation_TPP3": validation_task_counts.get("TPP3", 0),
        "Validation_TPP4": validation_task_counts.get("TPP4", 0),
        "Total": total_length
    }

# Ergebnisse anzeigen
for dataset, lengths in results.items():
    print(f'--- {dataset.replace("_", " ").title()} ---')
    print(f'Anzahl der Zeilen im Trainingsdatensatz: {lengths["Train"]} (Binding=1: {lengths["Train_Binding_1"]}, Binding=0: {lengths["Train_Binding_0"]}, TPP1: {lengths["Train_TPP1"]}, TPP2: {lengths["Train_TPP2"]}, TPP3: {lengths["Train_TPP3"]})')
    print(f'Anzahl der Zeilen im Testdatensatz: {lengths["Test"]} (Binding=1: {lengths["Test_Binding_1"]}, Binding=0: {lengths["Test_Binding_0"]}, TPP1: {lengths["Test_TPP1"]}, TPP2: {lengths["Test_TPP2"]}, TPP3: {lengths["Test_TPP3"]})')
    print(f'Anzahl der Zeilen im Validierungsdatensatz: {lengths["Validation"]} (Binding=1: {lengths["Validation_Binding_1"]}, Binding=0: {lengths["Validation_Binding_0"]}, TPP1: {lengths["Validation_TPP1"]}, TPP2: {lengths["Validation_TPP2"]}, TPP3: {lengths["Validation_TPP3"]})')
    print(f'Gesamtanzahl der Zeilen (Train + Test + Validation): {lengths["Total"]}\n')

# Optional: Ergebnisse in einer Übersichtstabelle darstellen
summary_data = []
for dataset, lengths in results.items():
    summary_data.append({
        "Dataset": dataset.replace("_", " ").title(),
        "Train": lengths["Train"],
        "Train_Binding_1": lengths["Train_Binding_1"],
        "Train_Binding_0": lengths["Train_Binding_0"],
        "Train_TPP1": lengths["Train_TPP1"],
        "Train_TPP2": lengths["Train_TPP2"],
        "Train_TPP3": lengths["Train_TPP3"],
        "Train_TPP4": lengths["Train_TPP4"],
        "Test": lengths["Test"],
        "Test_Binding_1": lengths["Test_Binding_1"],
        "Test_Binding_0": lengths["Test_Binding_0"],
        "Test_TPP1": lengths["Test_TPP1"],
        "Test_TPP2": lengths["Test_TPP2"],
        "Test_TPP3": lengths["Test_TPP3"],
        "Test_TPP4": lengths["Test_TPP4"],
        "Validation": lengths["Validation"],
        "Validation_Binding_1": lengths["Validation_Binding_1"],
        "Validation_Binding_0": lengths["Validation_Binding_0"],
        "Validation_TPP1": lengths["Validation_TPP1"],
        "Validation_TPP2": lengths["Validation_TPP2"],
        "Validation_TPP3": lengths["Validation_TPP3"],
        "Validation_TPP4": lengths["Validation_TPP4"],
        "Total": lengths["Total"]
    })

summary_df = pd.DataFrame(summary_data)
print(summary_df)


FileNotFoundError: [Errno 2] No such file or directory: '../../data/splitted_datasets/allele/paired/train.tsv'