In [3]:
import os

# List all files and directories in the current directory
# print(os.listdir())

# List files in a specific directory
print(os.listdir("./../../data"))


['embeddings', 'cleaned_datasets', 'concatenated_datasets', 'splitted_datasets', 'temp', 'plain_datasets']


In [None]:
import sys
import os
import pandas as pd
import numpy as np
from tcrpeg.TCRpeg import TCRpeg

# Get the absolute path of 'TCRpeg' and add it to sys.path
tcrpeg_path = os.path.abspath("../../data/embeddings/TCRpeg")
sys.path.append(tcrpeg_path)


In [None]:
# ACHTUNG: for allele and paired the code must be revised/checked. Do the for loops work as intended????

# Define paths
precisions = ['gene']
levels = ['beta']
datasets = ['train', 'validation', 'test']
file_paths = []
embedding_paths = []
for precision in precisions:
    for level in levels:
        for dataset in datasets:
            path_dataset = f'../../data/splitted_datasets/{precision}/{level}/{dataset}.tsv'
            file_paths.append(path_dataset)
            path_embedding_epitope = f'../../data/embeddings/{level}/{precision}/TCRPeg_Epitope_embeddings.npz' 
            path_embedding_tcrs = f'../../data/embeddings/{level}/{precision}/TCRPeg_tcr_embeddings.npz' 

# Dictionaries to store all embeddings
dictionary_tcr = {}
dictionary_epitope = {}

# Process each dataset
for k, file_path in enumerate(file_paths):
    print(f"Processing {file_path}")
    df = pd.read_csv(file_path, sep='\t')
    
    # Drop unnecessary columns
    df.drop(columns=['TCR_name', 'TRBV', 'TRBJ', 'TRBC', 'MHC', 'task'], inplace=True, errors='ignore')
    
    # Rename columns
    df.rename(columns={'TRB_CDR3': 'CDR3.beta', 'Binding': 'Label'}, inplace=True)

    # Remove sequences longer than 36
    df = df[df['CDR3.beta'].apply(len) <= 36]
    df = df[df['Epitope'].apply(len) <= 36]

    # Identify and remove unmapped characters
    unique_chars = set(''.join(df['CDR3.beta']))
    df_train = df['CDR3.beta'].values
    model = TCRpeg(hidden_size=64, num_layers=3, max_length=36, load_data=True, 
                   embedding_path='../../data/embeddings/TCRpeg/tcrpeg/data/embedding_32.txt', path_train=df_train)
    model.create_model()
    
    unmapped_chars = [ch for ch in unique_chars if ch not in model.aa2idx]
    print("Unmapped characters:", unmapped_chars)
    if unmapped_chars:
        pattern = f"[{''.join(unmapped_chars)}]"
        df = df[~df['CDR3.beta'].str.contains(pattern)]

    # Unique sequences for embedding generation
    unique_tcrs = df['CDR3.beta'].unique()
    unique_epitopes = df['Epitope'].unique()

    for seq_type, unique_seqs, model_dict in zip(['TCR', 'Epitope'], [unique_tcrs, unique_epitopes], [dictionary_tcr, dictionary_epitope]):
        print(f"Generating embeddings for {seq_type}s...")
        
        batch_size = 10000
        total_sequences = len(unique_seqs)
        num_batches = (total_sequences + batch_size - 1) // batch_size
        all_embeddings = []
        
        model = TCRpeg(hidden_size=512, num_layers=2, max_length=36, load_data=False, 
                       embedding_path='../../data/embeddings/TCRpeg/tcrpeg/data/embedding_32.txt')
        model.create_model()
        
        for i in range(num_batches):
            start_idx = i * batch_size
            end_idx = min(start_idx + batch_size, total_sequences)
            print(f"Processing batch {i+1}/{num_batches}, sequences {start_idx} to {end_idx}")
            batch_embeddings = model.get_embedding(unique_seqs[start_idx:end_idx])
            all_embeddings.append(batch_embeddings)
        
        final_embeddings = np.concatenate(all_embeddings, axis=0)
        
        for index, element in enumerate(unique_seqs):
            model_dict[element] = final_embeddings[index]

# Save embeddings
np.savez(path_embedding_tcrs, **dictionary_tcr)
np.savez(path_embedding_epitope, **dictionary_epitope)

print(f"Embeddings saved: {path_embedding_tcrs}, {path_embedding_epitope}")


Processing ../../data/splitted_datasets/gene/beta/train.tsv


  df = pd.read_csv(file_path, sep='\t')


Have loaded the data, total training seqs : 250602
Unmapped characters: ['*']
Generating embeddings for TCRs...
Processing batch 1/16, sequences 0 to 10000
Processing batch 2/16, sequences 10000 to 20000
Processing batch 3/16, sequences 20000 to 30000
Processing batch 4/16, sequences 30000 to 40000
Processing batch 5/16, sequences 40000 to 50000
Processing batch 6/16, sequences 50000 to 60000
Processing batch 7/16, sequences 60000 to 70000
Processing batch 8/16, sequences 70000 to 80000
Processing batch 9/16, sequences 80000 to 90000
Processing batch 10/16, sequences 90000 to 100000
Processing batch 11/16, sequences 100000 to 110000
Processing batch 12/16, sequences 110000 to 120000
Processing batch 13/16, sequences 120000 to 130000
Processing batch 14/16, sequences 130000 to 140000
Processing batch 15/16, sequences 140000 to 150000
Processing batch 16/16, sequences 150000 to 155972
Generating embeddings for Epitopes...
Processing batch 1/1, sequences 0 to 1104
Processing ../../data/sp

In [8]:
import random

# Select one random key from the dictionary
random_key = random.choice(list(dictionary_epitope.keys()))

# Get the corresponding value
random_value = dictionary_epitope[random_key]

# Print the result
print(random_key, ":", random_value)


CASSPEAGYSYEQFF : [ 0.06751539  0.03299918  0.03864877 ... -0.03103181  0.02821492
  0.00885988]


In [14]:
np.size(random_value)

1024

In [15]:
print(model.max_length)

35
