# ESM1280


In [None]:
!pip install fair-esm==2.0.0

In [None]:
import esm
import torch
import pandas as pd
import gc
import os

def esm_embeddings(peptide_sequence_list, model_name, batch_size=1, chunk_size=50):
    """Compute mean-pooled ESM-2 embeddings, one row per input sequence.

    The input is processed in chunks of ``chunk_size`` entries; each chunk is
    embedded in batches of ``batch_size`` and written to
    ``ESM1280_chunk_<k>.csv`` in the current working directory so that only one
    chunk of embeddings is held in memory at a time. After the last chunk the
    files are read back and concatenated. The chunk files are NOT deleted here.

    Args:
        peptide_sequence_list: Sequence of entries in the form accepted by the
            alphabet's batch converter (the caller in this file passes
            ``(label, sequence)`` tuples).
        model_name: ``'esm2_t36_3B_UR50D'`` or ``'esm2_t33_650M_UR50D'``.
        batch_size: Number of sequences per forward pass.
        chunk_size: Number of sequences per intermediate CSV file.

    Returns:
        pandas.DataFrame with columns ``ESM1280_0 .. ESM1280_{d-1}``; an empty
        DataFrame when ``peptide_sequence_list`` is empty.

    Raises:
        ValueError: If ``model_name`` is not one of the supported checkpoints.
    """
    # Index of the final layer whose representations are extracted, per checkpoint.
    model_dict = {
        'esm2_t36_3B_UR50D': 36,
        'esm2_t33_650M_UR50D': 33,
    }

    if model_name not in model_dict:
        raise ValueError(f"Invalid model name '{model_name}'. Please choose from {list(model_dict.keys())}.")

    # Nothing to embed: avoid loading the model and reading non-existent chunk files.
    if len(peptide_sequence_list) == 0:
        return pd.DataFrame()

    num_layers = model_dict[model_name]
    # The loader function in esm.pretrained has the same name as the checkpoint.
    model, alphabet = getattr(esm.pretrained, model_name)()

    batch_converter = alphabet.get_batch_converter()
    model.eval()

    device = torch.device('cpu')  # Inference is deliberately pinned to CPU
    model = model.to(device)

    # Paths of the chunk files actually written, in order. Reading back exactly
    # these avoids the off-by-one of deriving the count from len // chunk_size + 1.
    chunk_paths = []

    for chunk_index, start in enumerate(range(0, len(peptide_sequence_list), chunk_size)):
        chunk_sequences = peptide_sequence_list[start:start + chunk_size]
        chunk_embeddings = []

        for i in range(0, len(chunk_sequences), batch_size):
            batch_sequences = chunk_sequences[i:i + batch_size]
            batch_labels, batch_strs, batch_tokens = batch_converter(batch_sequences)
            batch_tokens = batch_tokens.to(device)

            with torch.no_grad():
                results = model(batch_tokens, repr_layers=[num_layers])

            token_representations = results["representations"][num_layers]
            # Count of non-padding tokens per row.
            batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)

            for j, tokens_len in enumerate(batch_lens):
                # Drop the first and last non-padding positions before averaging.
                sequence_representation = token_representations[j, 1 : tokens_len - 1].mean(0)
                chunk_embeddings.append(sequence_representation.cpu().numpy())

            del batch_tokens, results, token_representations, batch_sequences
            gc.collect()

        chunk_df = pd.DataFrame(chunk_embeddings)
        chunk_df.columns = [f"ESM1280_{i}" for i in range(chunk_df.shape[1])]
        chunk_path = f'ESM1280_chunk_{chunk_index}.csv'
        chunk_df.to_csv(chunk_path, index=False)
        chunk_paths.append(chunk_path)

        del chunk_embeddings, chunk_df
        gc.collect()  # Release the chunk before starting the next one

    final_embeddings = pd.concat(
        [pd.read_csv(path) for path in chunk_paths],
        ignore_index=True,
    )
    return final_embeddings

# Load the dataset
dataset = pd.read_csv('clathrin0-7/Clathrin07.csv', na_filter=False)
sequence_list = dataset['seq']

# Prepare sequence_list for ESM processing: each entry is a 2-tuple in which
# the sequence string is used for both elements.
peptide_sequence_list = [(seq, seq) for seq in sequence_list]

# Single source of truth for the chunk size, shared by the embedding call and
# the cleanup below so the two cannot drift apart.
ESM_CHUNK_SIZE = 50

# Generate embeddings and save to CSV
embeddings_results = esm_embeddings(
    peptide_sequence_list,
    'esm2_t33_650M_UR50D',
    batch_size=1,
    chunk_size=ESM_CHUNK_SIZE,
)
embeddings_results.to_csv('ESM1280.csv', index=True)

# Optional: remove intermediate chunk files to free disk space.
# Ceiling division gives the number of chunks that were written; the former
# `len // 50 + 1` counted one file too many when the length was a multiple of 50.
num_chunk_files = -(-len(peptide_sequence_list) // ESM_CHUNK_SIZE)
for i in range(num_chunk_files):
    try:
        os.remove(f'ESM1280_chunk_{i}.csv')
    except FileNotFoundError:
        # Already gone; cleanup is best-effort.
        pass

# ProtT5
### - ProstT5
### - ProtT5_xl_uniref50 
### - ProtT5_xl_bfd

In [None]:
from transformers import T5Tokenizer, T5EncoderModel, BertTokenizer, BertModel
import re
import torch
import pandas as pd
import gc
from torch.cuda.amp import autocast

def generate_protein_embeddings(sequence_list, model_name, batch_size=1, max_length=512):
    """Return one mean-pooled encoder embedding per protein sequence.

    Each sequence has the residues U, Z, O and B replaced by X, is split into
    space-separated residues, tokenized (padded to the longest item in the
    batch and truncated to ``max_length`` tokens) and passed through the T5
    encoder. The hidden states of the residue positions are averaged.

    The tokenizer and model are loaded on every call.

    Args:
        sequence_list: Iterable of amino-acid sequence strings.
        model_name: One of ``'Rostlab/ProstT5'``,
            ``'Rostlab/prot_t5_xl_uniref50'``, ``'Rostlab/prot_t5_xl_bfd'``.
        batch_size: Number of sequences per forward pass.
        max_length: Maximum number of tokens per sequence passed to the
            tokenizer (special tokens included).

    Returns:
        pandas.DataFrame with one row per input sequence and one column per
        hidden dimension; an empty DataFrame when ``sequence_list`` is empty.

    Raises:
        ValueError: If ``model_name`` is not one of the supported checkpoints.
    """
    supported_models = (
        'Rostlab/ProstT5',
        'Rostlab/prot_t5_xl_uniref50',
        'Rostlab/prot_t5_xl_bfd',
    )
    if model_name not in supported_models:
        raise ValueError("Invalid model name. Please choose a valid model.")

    sequence_list = [re.sub(r"[UZOB]", "X", sequence) for sequence in sequence_list]

    # Nothing to embed: skip loading the model entirely.
    if not sequence_list:
        return pd.DataFrame()

    # With truncation the tokenizer keeps at most `max_length` tokens, one of
    # which is the end-of-sequence token, so at most `max_length - 1` positions
    # belong to residues. Clamping here keeps that trailing token out of the
    # mean for truncated sequences.
    max_residues = max(max_length - 1, 0)
    sequence_lengths = [min(len(sequence), max_residues) for sequence in sequence_list]

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # All supported checkpoints are T5 encoders.
    # NOTE(review): the ProstT5 model card describes prepending "<AA2fold>" to
    # amino-acid input; no prefix is added here — confirm this is intended.
    tokenizer = T5Tokenizer.from_pretrained(model_name, do_lower_case=False)
    model = T5EncoderModel.from_pretrained(model_name).to(device)

    # Half precision on GPU to reduce memory; full precision on CPU.
    if device.type == 'cpu':
        model.float()
    else:
        model.half()

    use_amp = device.type == 'cuda'
    embeddings_list = []

    for i in range(0, len(sequence_list), batch_size):
        batch_lengths = sequence_lengths[i:i + batch_size]
        # The tokenizer expects residues separated by spaces.
        batch_sequences = [" ".join(sequence) for sequence in sequence_list[i:i + batch_size]]

        ids = tokenizer(
            batch_sequences,
            add_special_tokens=True,
            padding="longest",
            truncation=True,
            max_length=max_length,
        )
        input_ids = torch.tensor(ids['input_ids']).to(device)
        attention_mask = torch.tensor(ids['attention_mask']).to(device)

        # Mixed precision only when running on CUDA; disabled (no-op) on CPU.
        with torch.no_grad(), torch.autocast(device_type=device.type, enabled=use_amp):
            embedding_repr = model(input_ids=input_ids, attention_mask=attention_mask)

        # Keep only the residue positions of each row, then average over them.
        embeddings = [
            embedding_repr.last_hidden_state[j, :length]
            for j, length in enumerate(batch_lengths)
        ]
        per_protein_embeddings = [emb.mean(dim=0) for emb in embeddings]
        embeddings_list.extend(emb.cpu().numpy() for emb in per_protein_embeddings)

        del input_ids, attention_mask, embedding_repr, embeddings, per_protein_embeddings
        torch.cuda.empty_cache()
        gc.collect()

    embeddings_df = pd.DataFrame(embeddings_list)
    return embeddings_df

# Load the dataset
dataset = pd.read_csv('clathrin0-7/Clathrin07.csv', na_filter=False) 
#column_names = ['Column1', 'seq'] 
#dataset = pd.read_csv('/kaggle/input/clathrin0-7/Clathrin07.csv', header=None, names=column_names, na_filter=False)


sequence_list = dataset['seq']

# Divide the sequences into at most 20 parts.
# Ceiling division (with a floor of 1) keeps the step non-zero for datasets
# with fewer than 20 sequences and prevents a surplus trailing chunk when the
# length is not divisible by 20.
num_parts = 20
chunk_size = max(1, -(-len(sequence_list) // num_parts))
chunks = [
    sequence_list.iloc[i:i + chunk_size]
    for i in range(0, len(sequence_list), chunk_size)
]

# List of models to generate embeddings
model_list = [
    'Rostlab/ProstT5',
    'Rostlab/prot_t5_xl_bfd',
    'Rostlab/prot_t5_xl_uniref50'
]

# Generate embeddings for each chunk and each model
for model_name in model_list:
    # Part after the organisation prefix; used for column names and the file name.
    short_name = model_name.split('/')[-1]

    all_embeddings = []
    for chunk_index, chunk in enumerate(chunks):
        # Report against the real number of chunks, which may be below num_parts.
        print(f"Processing chunk {chunk_index+1}/{len(chunks)} for model {model_name}...")
        embeddings_df = generate_protein_embeddings(chunk, model_name, batch_size=2, max_length=256)  # Adjusted batch size and max length
        all_embeddings.append(embeddings_df)

    # Combine all embeddings and save to CSV (pd.concat rejects an empty list)
    if all_embeddings:
        final_embeddings_df = pd.concat(all_embeddings, ignore_index=True)
    else:
        final_embeddings_df = pd.DataFrame()
    final_embeddings_df.columns = [f"{short_name}_{i}" for i in range(final_embeddings_df.shape[1])]
    final_embeddings_df.to_csv(f'{short_name}.csv', index=True, header=True)

print("Embeddings generated and saved for all models.")
