In [6]:
import torch
from transformers import AutoTokenizer, EsmModel
from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37

from Bio import SeqIO
import pandas as pd
import numpy as np

## Convert FASTA file to pd.DataFrame for easier processing

In [7]:
def fasta_to_dataframe(fasta_file):
    records = []
    for seq_record in SeqIO.parse(fasta_file, 'fasta'):
        # Extract the ID after '>tr'
        id_info = seq_record.id.split('|')[1] # Adjust the index based on your FASTA file format
        sequence = str(seq_record.seq)
        
        # Generate a random index within the range of the sequence length
        random_index = np.random.randint(0, len(sequence))
        # Replace the character at the random index with '<mask>'
        masked_sequence = sequence[:random_index] + '<mask>' + sequence[random_index+1:]
        
        records.append([id_info, sequence, masked_sequence])
    
    # Create a DataFrame from the records
    df = pd.DataFrame(records, columns=['ID', 'Sequence', 'Masked_Sequence'])
    return df

## Create Dataset

In [8]:
fasta_df = fasta_to_dataframe("human_protein_seq/uniprotkb_proteome_UP000005640.fasta")

In [9]:
fasta_df.head()

Unnamed: 0,ID,Sequence,Masked_Sequence
0,A0A075B6G3,MLWWEEVEDCYEREDVQKKTFTKWVNAQFSKFGKQHIENLFSDLQD...,MLWWEEVEDCYEREDVQKKTFTKWVNAQFSKFGKQHIENLFSDLQD...
1,A0A087WV00,MDAAGRGCHLLPLPAARGPARAPAAAAAAAASPPGPCSGAACAPSA...,MDAAGRGCHLLPLPAARGPARAPAAAAAAAASPPGPCSGAACAPSA...
2,A0A087WZT3,MELSAEYLREKLQRDLEAEHVLPSPGGVGQVRGETAASETQLGS,ME<mask>SAEYLREKLQRDLEAEHVLPSPGGVGQVRGETAASETQLGS
3,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...
4,A0A087X296,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPVNPCCYYPCQHQGIC...,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPVNPCCYYPCQHQGIC...


## Preparing your model and tokenizer

Now we load our model and tokenizer. If using GPU, use `model.cuda()` to transfer the model to GPU.

In [10]:
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
model = EsmModel.from_pretrained("facebook/esm2_t6_8M_UR50D")

model = model.cuda()

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
tokenized_fasta_df = pd.DataFrame(columns=['ID', 'Tokenized_Sequence'])

for id, fasta_row in fasta_df.iterrows():
    # Tokenize the sequence
    tokenized_sequence = tokenizer([fasta_row["Masked_Sequence"]], return_tensors="pt", add_special_tokens=False)['input_ids']
    
    # Create a DataFrame for the current protein sequence
    current_df = pd.DataFrame({'ID': [fasta_row["ID"]], 'Tokenized_Sequence': [tokenized_sequence]})
    
    # Concatenate the current DataFrame with the existing tokenized_fasta_df
    tokenized_fasta_df = pd.concat([tokenized_fasta_df, current_df], ignore_index=True)

Token indices sequence length is longer than the specified maximum sequence length for this model (3685 > 1024). Running this sequence through the model will result in indexing errors


In [20]:
tokenized_fasta_df.head()

Unnamed: 0,ID,Tokenized_Sequence
0,A0A075B6G3,"[[tensor(20), tensor(4), tensor(22), tensor(22..."
1,A0A087WV00,"[[tensor(20), tensor(13), tensor(5), tensor(5)..."
2,A0A087WZT3,"[[tensor(20), tensor(9), tensor(32), tensor(8)..."
3,A0A087X1C5,"[[tensor(20), tensor(6), tensor(4), tensor(9),..."
4,A0A087X296,"[[tensor(20), tensor(8), tensor(10), tensor(8)..."


In [33]:
fasta_int = tokenized_fasta_df["Tokenized_Sequence"].tolist()

In [36]:
# Check if CUDA is available
if torch.cuda.is_available():
    device = torch.device('cuda')
    # Move each tensor to the GPU
    fasta_gpu = [tensor.to(device) for tensor in fasta_int]
else:
    print("CUDA is not available. Running on CPU.")

Check if entire DF was moved to GPU

In [40]:
for row in fasta_gpu:
    if not row.is_cuda: # per line should be on GPU
        print("NOT ON GPU")

## Run model per tokenized sequence on GPU from `fasta_gpu`

In [None]:
# convert tokens back to string and compare with original string - add col to original df and set true or false depending on accuracy
with torch.no_grad():
    output = model(fasta_gpu)

## Get model's total accuracy

Get latency, memory, power consumption, energy consumption
- latency is per sequence
- latency graph of sequence length vs latency
- accuracy graph of sequence length vs accuracy