In [3]:
import os

# List all files and directories in the current directory
# print(os.listdir())

# List files in a specific directory
print(os.listdir("./../../data"))


['embeddings', 'cleaned_datasets', 'concatenated_datasets', 'splitted_datasets', 'temp', 'plain_datasets']


In [6]:
import sys
import os
import pandas as pd
import numpy as np

# Get the absolute path of 'TCRpeg' and add it to sys.path
tcrpeg_path = os.path.abspath("../../data/embeddings/TCRpeg")
sys.path.append(tcrpeg_path)

from tcrpeg.TCRpeg import TCRpeg

In [8]:
import torch

# Check if GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"GPU is available: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("GPU is not available. Using CPU.")

GPU is available: Tesla T4


In [7]:
# Version for embeddings from 6.3.
# TCRs-Embeddings contains also 'label'

# Define paths
precisions = ['gene']
levels = ['beta']
datasets = ['train', 'validation', 'test']
file_paths = []
embedding_paths = []
for precision in precisions:
    for level in levels:
        for dataset in datasets:
            path_dataset = f'../../data/splitted_datasets/{precision}/{level}/{dataset}.tsv'
            file_paths.append((level, path_dataset))
            path_embedding_epitope = f'../../data/embeddings/{level}/{precision}/TCRPeg_Epitope_embeddings.npz'
            path_embedding_tcrs = f'../../data/embeddings/{level}/{precision}/TCRPeg_tcr_embeddings.npz'

# Dictionaries to store all embeddings
dictionary_tcr = {'embeddings': [], 'labels': []}
dictionary_epitope = {'embeddings': []}

# Process each dataset
for level, file_path in file_paths:
    print(f"Processing {file_path}")
    df = pd.read_csv(file_path, sep='\t')
    
    # Drop unnecessary columns
    df.drop(columns=['TCR_name', 'TRBV', 'TRBJ', 'TRBC', 'MHC', 'task'], inplace=True, errors='ignore')
    
    # Rename columns
    df.rename(columns={'TRB_CDR3': 'CDR3.beta', 'Binding': 'Label'}, inplace=True)
    
    # Remove sequences longer than 36
    df = df[df['CDR3.beta'].apply(len) <= 36]
    df = df[df['Epitope'].apply(len) <= 36]
    
    # Identify and remove unmapped characters
    unique_chars = set(''.join(df['CDR3.beta']))
    df_train = df['CDR3.beta'].values
    model = TCRpeg(hidden_size=64, num_layers=3, max_length=36, load_data=True, 
                   embedding_path='../../data/embeddings/TCRpeg/tcrpeg/data/embedding_32.txt', path_train=df_train)
    model.create_model()
    
    unmapped_chars = [ch for ch in unique_chars if ch not in model.aa2idx]
    print("Unmapped characters:", unmapped_chars)
    if unmapped_chars:
        pattern = f"[{''.join(unmapped_chars)}]"
        df = df[~df['CDR3.beta'].str.contains(pattern)]
    
    model = TCRpeg(hidden_size=512, num_layers=2, max_length=36, load_data=False, 
                   embedding_path='../../data/embeddings/TCRpeg/tcrpeg/data/embedding_32.txt')
    model.create_model()
    
    batch_size = 10000
    total_samples = len(df)
    num_batches = (total_samples + batch_size - 1) // batch_size
    
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, total_samples)
        print(f"Processing batch {i+1}/{num_batches}, samples {start_idx} to {end_idx}")
        
        batch_tcrs = df['CDR3.beta'].iloc[start_idx:end_idx].values
        batch_epitopes = df['Epitope'].iloc[start_idx:end_idx].values
        batch_labels = df['Label'].iloc[start_idx:end_idx].values
        
        batch_tcr_embeddings = model.get_embedding(batch_tcrs)
        batch_epitope_embeddings = model.get_embedding(batch_epitopes)
        
        dictionary_tcr['embeddings'].extend(batch_tcr_embeddings)
        dictionary_tcr['labels'].extend(batch_labels)
        dictionary_epitope['embeddings'].extend(batch_epitope_embeddings)

# Convert lists to numpy arrays
dictionary_tcr['embeddings'] = np.array(dictionary_tcr['embeddings'])
dictionary_tcr['labels'] = np.array(dictionary_tcr['labels'])
dictionary_epitope['embeddings'] = np.array(dictionary_epitope['embeddings'])

# Save embeddings
np.savez(path_embedding_tcrs, **dictionary_tcr)
np.savez(path_embedding_epitope, **dictionary_epitope)

print(f"Embeddings saved: {path_embedding_tcrs}, {path_embedding_epitope}")




Processing ../../data/splitted_datasets/gene/beta/train.tsv


  df = pd.read_csv(file_path, sep='\t')


Have loaded the data, total training seqs : 250602
Unmapped characters: ['*']
Processing batch 1/25, samples 0 to 10000
Processing batch 2/25, samples 10000 to 20000
Processing batch 3/25, samples 20000 to 30000
Processing batch 4/25, samples 30000 to 40000
Processing batch 5/25, samples 40000 to 50000
Processing batch 6/25, samples 50000 to 60000
Processing batch 7/25, samples 60000 to 70000
Processing batch 8/25, samples 70000 to 80000
Processing batch 9/25, samples 80000 to 90000
Processing batch 10/25, samples 90000 to 100000
Processing batch 11/25, samples 100000 to 110000
Processing batch 12/25, samples 110000 to 120000
Processing batch 13/25, samples 120000 to 130000
Processing batch 14/25, samples 130000 to 140000
Processing batch 15/25, samples 140000 to 150000
Processing batch 16/25, samples 150000 to 160000
Processing batch 17/25, samples 160000 to 170000
Processing batch 18/25, samples 170000 to 180000
Processing batch 19/25, samples 180000 to 190000
Processing batch 20/25,

In [None]:
# Version from arround 3.03.
# ACHTUNG: for allele and paired the code must be revised/checked. Do the for loops work as intended????

# Define paths
precisions = ['gene']
levels = ['beta']
datasets = ['train', 'validation', 'test']
file_paths = []
embedding_paths = []
for precision in precisions:
    for level in levels:
        for dataset in datasets:
            path_dataset = f'../../data/splitted_datasets/{precision}/{level}/{dataset}.tsv'
            file_paths.append(path_dataset)
            path_embedding_epitope = f'../../data/embeddings/{level}/{precision}/TCRPeg_Epitope_embeddings.npz' 
            path_embedding_tcrs = f'../../data/embeddings/{level}/{precision}/TCRPeg_tcr_embeddings.npz' 

# Dictionaries to store all embeddings
dictionary_tcr = {}
dictionary_epitope = {}

# Process each dataset
for k, file_path in enumerate(file_paths):
    print(f"Processing {file_path}")
    df = pd.read_csv(file_path, sep='\t')
    
    # Drop unnecessary columns
    df.drop(columns=['TCR_name', 'TRBV', 'TRBJ', 'TRBC', 'MHC', 'task'], inplace=True, errors='ignore')
    
    # Rename columns
    df.rename(columns={'TRB_CDR3': 'CDR3.beta', 'Binding': 'Label'}, inplace=True)

    # Remove sequences longer than 36
    df = df[df['CDR3.beta'].apply(len) <= 36]
    df = df[df['Epitope'].apply(len) <= 36]

    # Identify and remove unmapped characters
    unique_chars = set(''.join(df['CDR3.beta']))
    df_train = df['CDR3.beta'].values
    model = TCRpeg(hidden_size=64, num_layers=3, max_length=36, load_data=True, 
                   embedding_path='../../data/embeddings/TCRpeg/tcrpeg/data/embedding_32.txt', path_train=df_train)
    model.create_model()
    
    unmapped_chars = [ch for ch in unique_chars if ch not in model.aa2idx]
    print("Unmapped characters:", unmapped_chars)
    if unmapped_chars:
        pattern = f"[{''.join(unmapped_chars)}]"
        df = df[~df['CDR3.beta'].str.contains(pattern)]

    # Unique sequences for embedding generation
    unique_tcrs = df['CDR3.beta'].unique()
    unique_epitopes = df['Epitope'].unique()

    for seq_type, unique_seqs, model_dict in zip(['TCR', 'Epitope'], [unique_tcrs, unique_epitopes], [dictionary_tcr, dictionary_epitope]):
        print(f"Generating embeddings for {seq_type}s...")
        
        batch_size = 10000
        total_sequences = len(unique_seqs)
        num_batches = (total_sequences + batch_size - 1) // batch_size
        all_embeddings = []
        
        model = TCRpeg(hidden_size=512, num_layers=2, max_length=36, load_data=False, 
                       embedding_path='../../data/embeddings/TCRpeg/tcrpeg/data/embedding_32.txt')
        model.create_model()
        
        for i in range(num_batches):
            start_idx = i * batch_size
            end_idx = min(start_idx + batch_size, total_sequences)
            print(f"Processing batch {i+1}/{num_batches}, sequences {start_idx} to {end_idx}")
            batch_embeddings = model.get_embedding(unique_seqs[start_idx:end_idx])
            all_embeddings.append(batch_embeddings)
        
        final_embeddings = np.concatenate(all_embeddings, axis=0)
        
        for index, element in enumerate(unique_seqs):
            model_dict[element] = final_embeddings[index]

# Save embeddings
np.savez(path_embedding_tcrs, **dictionary_tcr)
np.savez(path_embedding_epitope, **dictionary_epitope)

print(f"Embeddings saved: {path_embedding_tcrs}, {path_embedding_epitope}")


Processing ../../data/splitted_datasets/gene/beta/train.tsv


  df = pd.read_csv(file_path, sep='\t')


TypeError: 'module' object is not callable

In [8]:
import random

# Select one random key from the dictionary
random_key = random.choice(list(dictionary_epitope.keys()))

# Get the corresponding value
random_value = dictionary_epitope[random_key]

# Print the result
print(random_key, ":", random_value)


CASSPEAGYSYEQFF : [ 0.06751539  0.03299918  0.03864877 ... -0.03103181  0.02821492
  0.00885988]


In [14]:
np.size(random_value)

1024

In [15]:
print(model.max_length)

35


In [None]:
import numpy as np

# epi_embedding_path = '../../data/embeddings/beta/gene/Epitope_beta_embeddings.npz'
# tcr_embedding_path = '../../data/embeddings/beta/gene/TRB_beta_embeddings.npz'

# epi_embedding_path = '../../data/embeddings/paired/gene/Epitope_paired_embeddings.npz'
# tcr_embedding_path = '../../data/embeddings/paired/gene/TRA_paired_embeddings.npz'

# epi_embedding_path = '../../data/embeddings/beta/gene/Epitope_PLE_padded.npz'
# tcr_embedding_path = '../../data/embeddings/beta/gene/TRB_PLE_padded.npz'

# epi_embedding_path = '/home/ubuntu/data/embeddings/beta/gene/TCRPeg_Epitope_embeddings.npz'
# tcr_embedding_path = '/home/ubuntu/data/embeddings/beta/gene/TCRPeg_tcr_embeddings.npz'


# Load the embeddings
epi_embeddings = np.load(epi_embedding_path)
tcr_embeddings = np.load(tcr_embedding_path)

# Assuming the embeddings are stored under specific keys, list them
epi_keys = list(epi_embeddings.keys())
tcr_keys = list(tcr_embeddings.keys())

print(len(epi_keys))
print(len(tcr_keys))
# print(f"Epitope embedding keys: {epi_keys}")
# print(f"TCR embedding keys: {tcr_keys}")

# Extract and compare the arrays (assuming a single key per file)
epi_array = epi_embeddings[epi_keys[0]]  # Modify key if needed
tcr_array = tcr_embeddings[tcr_keys[0]]  # Modify key if needed

print(f"Epitope embedding shape: {epi_array.shape}")
print(f"TCR embedding shape: {tcr_array.shape}")

# Compare the number of embeddings
num_epi_embeddings = epi_array.shape[0]
num_tcr_embeddings = tcr_array.shape[0]

print(f"Number of epitope embeddings: {num_epi_embeddings}")
print(f"Number of TCR embeddings: {num_tcr_embeddings}")

# Compare dimensions
if epi_array.shape[1] == tcr_array.shape[1]:
    print("Both embeddings have the same dimensionality.")
else:
    print("Warning: The embeddings have different dimensionalities!")

# Compare the number of embeddings in each file
if num_epi_embeddings == num_tcr_embeddings:
    print("Both files have the same number of embeddings.")
else:
    print("Warning: The number of embeddings differs between the files!")


1864
199160
Epitope embedding shape: (1024,)
TCR embedding shape: (1024,)
Number of epitope embeddings: 1024
Number of TCR embeddings: 1024


IndexError: tuple index out of range

#### Inspections


In [1]:
import pandas as pd

path = '/home/ubuntu/data/cleaned_datasets/IEDB/IEDB_cleaned_data_paired.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,TRAC,TRBC,TRA_leader,TRB_leader,Linker,Link_order,TRA_5_prime_seq,TRA_3_prime_seq,TRB_5_prime_seq,TRB_3_prime_seq,Epitope,TRAV,TRAJ,TRA_CDR3,TRBV,TRBJ,TRB_CDR3,MHC,MHC class
0,,,,,,,,,,,LLFGYPVYV,,,CAVTTDSWGKLQF,,,CASRPGLAGGRPEQYF,HLA-A*02:01,MHCI
1,,,,,,,,,,,LLFGYPVYV,,,CAVTTDSWGKLQF,,,CASRPGLMSAQPEQYF,HLA-A*02:01,MHCI
2,,,,,,,,,,,SLLMWITQC,TRAV21*01,TRAJ6*01,CAVRPTSGGSYIPTF,TRBV6-5*01,TRBJ2-2*01,CASSYVGNTGELFF,HLA-A*02:01,MHCI
3,,,,,,,,,,,AAGIGILTV,TRAV35*02,TRAJ49*01,CAGGTGNQFYF,TRBV10-3,TRBJ1-5*01,CAISEVGVGQPQHF,HLA-A*02:01,MHCI
4,,,,,,,,,,,ELAGIGILTV,TRAV35*02,TRAJ49*01,CAGGTGNQFYF,TRBV10-3,TRBJ1-5*01,CAISEVGVGQPQHF,HLA-A*02:01,MHCI
