In [8]:
import pandas as pd
import numpy as np
import torch
from transformers import T5Tokenizer, T5EncoderModel
from pathlib import Path

# "Rostlab/prot_t5_xl_half_uniref50-enc" (~1.2B params)
# "Rostlab/prot_t5_base_mt_uniref50" (~220M params) 
MODEL_NAME = "Rostlab/prot_t5_xl_half_uniref50-enc"

DEVICE = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")
NUM_FOLDS = 3

notebook_dir = Path.cwd()
BASE_DIR = notebook_dir.parent

DATA_PATH = BASE_DIR / "data" / "aufgabe3"
DATA_PATH_FOLDS = DATA_PATH / "3-fold"
MODEL_SAVE_PATH_TEMP = str(BASE_DIR / "models" / "6state_t5_lstm_cnn_fold{}.pt")
MODEL_SAVE_PATH = BASE_DIR / "models" / "6state_t5_lstm_cnn" / "6state_t5_lstm_cnn.pt"
TRAIN_VAL_LOSSES_DATA_SAVE_PATH = DATA_PATH / "6state_t5_lstm_cnn" / "outputs"
TEST_CSV = DATA_PATH / "reduced_30_signalP6_test.csv"

(BASE_DIR / "models" / "6state_t5_lstm_cnn").mkdir(parents=True, exist_ok=True)
(DATA_PATH / "6state_t5_lstm_cnn" / "outputs").mkdir(parents=True, exist_ok=True)
print(f"Project base directory set to: {BASE_DIR}")
print(f"Data path set to: {DATA_PATH}")
print(f"Model save path set to: {MODEL_SAVE_PATH}")
print(f"Using model: {MODEL_NAME}")

Using device: cuda
Project base directory set to: /home/jonas/Documents/Projects/sp-prediction
Data path set to: /home/jonas/Documents/Projects/sp-prediction/data/aufgabe3
Model save path set to: /home/jonas/Documents/Projects/sp-prediction/models/6state_t5_lstm_cnn/6state_t5_lstm_cnn.pt
Using model: Rostlab/prot_t5_xl_half_uniref50-enc


In [None]:
def embed_sequence(sequence: str, tokenizer, encoder, device: str = DEVICE, pooling: str = "mean") -> torch.Tensor:

    seq_spaced = " ".join(list(sequence))
    
    # Tokenize
    tokenized = tokenizer(seq_spaced, return_tensors="pt", padding=True, truncation=True, max_length=512)
    input_ids = tokenized['input_ids'].to(device)
    attention_mask = tokenized['attention_mask'].to(device)
    
    # Get embeddings
    with torch.no_grad():
        output = encoder(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = output.last_hidden_state  # (1, seq_len, hidden_dim)
    
    # Remove batch dimension
    embeddings = embeddings.squeeze(0)  # (seq_len, hidden_dim)
    
    # Get sequence length (excluding EOS token)
    seq_len = len(sequence)
    embeddings = embeddings[:seq_len]  # (seq_len, hidden_dim)
    
    if pooling == "mean":
        return embeddings.mean(dim=0).float()  # (hidden_dim,)
    elif pooling == "none":
        return embeddings.float()  # (seq_len, hidden_dim)
    else:
        raise ValueError(f"Unknown pooling method: {pooling}")

In [10]:
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)
encoder = T5EncoderModel.from_pretrained(MODEL_NAME, torch_dtype=torch.float16)
encoder.to(DEVICE)

T5EncoderModel(
  (shared): Embedding(128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=4096, bias=False)
              (k): Linear(in_features=1024, out_features=4096, bias=False)
              (v): Linear(in_features=1024, out_features=4096, bias=False)
              (o): Linear(in_features=4096, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 32)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=16384, bias=False)
              (wo): Linear(in_features=16384, out_features=1024, bias=False)
              (dropout): Dropo

In [None]:
# Generate and save embeddings for each fold's train/val sets
all_embeddings = {} # will contain as a value a dict of "uniprot_id: embedding" for all sequences in that fold split

for fold_idx in range(NUM_FOLDS):

    # load data for according fold
    fold_train = pd.read_csv(DATA_PATH_FOLDS / f"fold_{fold_idx + 1}_train.csv")
    fold_val = pd.read_csv(DATA_PATH_FOLDS / f"fold_{fold_idx + 1}_val.csv")
    
    # iterate over train and val sets
    for df, split in [(fold_train, "train"), (fold_val, "val")]:
        embeddings_dict = {}
        print(f"Embedding sequences for fold {fold_idx + 1} {split} set:")
        
        for idx, row in df.iterrows():
            uniprot_id = row['uniprot_id']
            sequence = row['sequence']
            embedding = embed_sequence(sequence, tokenizer, encoder, pooling="none") # embed per residue
            embeddings_dict[uniprot_id] = embedding.cpu().numpy() # save to dict as numpy array for later use for npz saving
            
            if (len(embeddings_dict)) % 100 == 0:
                print(f"  Processed {len(embeddings_dict)}/{len(df)} sequences")
        
        # store in main dict with local dict as value    
        key = f"fold_{fold_idx + 1}_{split}"
        all_embeddings[key] = embeddings_dict
        
        # save embeddings to npz file
        save_path = DATA_PATH_FOLDS / f"fold_{fold_idx + 1}_{split}_embeddings.npz"
        np.savez(save_path, **embeddings_dict)
        print(f"  Saved {len(embeddings_dict)} embeddings to {save_path.name}")

print(f"\nAll embeddings generated and saved!")
print(f"Embedding keys: {list(all_embeddings.keys())}")

Embedding sequences for fold 1 train set:
  Processed 100/9116 sequences
  Processed 200/9116 sequences
  Processed 300/9116 sequences
  Processed 400/9116 sequences
  Processed 500/9116 sequences
  Processed 600/9116 sequences
  Processed 700/9116 sequences
  Processed 800/9116 sequences
  Processed 900/9116 sequences
  Processed 1000/9116 sequences
  Processed 1100/9116 sequences
  Processed 1200/9116 sequences
  Processed 1300/9116 sequences
  Processed 1400/9116 sequences
  Processed 1500/9116 sequences
  Processed 1600/9116 sequences
  Processed 1700/9116 sequences
  Processed 1800/9116 sequences
  Processed 1900/9116 sequences
  Processed 2000/9116 sequences
  Processed 2100/9116 sequences
  Processed 2200/9116 sequences
  Processed 2300/9116 sequences
  Processed 2400/9116 sequences
  Processed 2500/9116 sequences
  Processed 2600/9116 sequences
  Processed 2700/9116 sequences
  Processed 2800/9116 sequences
  Processed 2900/9116 sequences
  Processed 3000/9116 sequences
  Proce

In [None]:
# Example: Loading embeddings during training
# ==========================================

# Load embeddings file
embeddings_data = np.load(DATA_PATH_FOLDS / "fold_1_train_embeddings.npz")

# Load corresponding CSV with labels
train_df = pd.read_csv(DATA_PATH_FOLDS / "fold_1_train.csv")

# Access embedding by uniprot_id
sample_id = train_df.iloc[0]['uniprot_id']
sample_embedding = embeddings_data[sample_id]
sample_label = train_df.iloc[0]['labels']

print(f"Uniprot ID: {sample_id}")
print(f"Embedding shape: {sample_embedding.shape}")
print(f"Label: {sample_label}")

# For training, iterate over dataframe and fetch embeddings:
# for idx, row in train_df.iterrows():
#     embedding = embeddings_data[row['uniprot_id']]
#     label = row['label']
#     # ... use in training

KeyError: 'label'