In [1]:
import torch
from transformers import AutoTokenizer, EsmModel
from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37

from Bio import SeqIO
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


## Convert FASTA file to pd.DataFrame for easier processing

In [2]:
def fasta_to_dataframe(fasta_file, seed=None):
    """Read a FASTA file into a DataFrame and mask one random residue per sequence.

    Parameters
    ----------
    fasta_file : str or file handle
        Passed straight to ``SeqIO.parse(fasta_file, 'fasta')``.
    seed : int, optional
        Seed for the mask positions. When ``None`` (default) the global
        ``np.random`` state is used, exactly as before; when given, a private
        ``np.random.RandomState(seed)`` is used so the masking is reproducible.

    Returns
    -------
    pd.DataFrame
        One row per non-empty record with columns ``ID``, ``Sequence`` and
        ``Masked_Sequence`` (the sequence with exactly one residue replaced by
        the literal ``'<mask>'``).
    """
    # Both the module and a RandomState instance expose the same `randint` API.
    rng = np.random if seed is None else np.random.RandomState(seed)

    records = []
    for seq_record in SeqIO.parse(fasta_file, 'fasta'):
        # UniProt-style headers look like 'tr|ACCESSION|NAME'; keep the accession.
        # Headers without '|' fall back to the full record id instead of raising.
        id_fields = seq_record.id.split('|')
        id_info = id_fields[1] if len(id_fields) > 1 else seq_record.id
        sequence = str(seq_record.seq)

        # An empty sequence has no residue to mask (randint(0, 0) would raise).
        if not sequence:
            continue

        # Pick one position uniformly at random and replace it with '<mask>'.
        random_index = rng.randint(0, len(sequence))
        masked_sequence = sequence[:random_index] + '<mask>' + sequence[random_index+1:]

        records.append([id_info, sequence, masked_sequence])

    # Build the DataFrame once from all collected records.
    return pd.DataFrame(records, columns=['ID', 'Sequence', 'Masked_Sequence'])

## Create Dataset

In [3]:
# Parse the proteome FASTA file into a DataFrame; the path is relative to the
# notebook's working directory.
fasta_df = fasta_to_dataframe("human_protein_seq/uniprotkb_proteome_UP000005640.fasta")

In [4]:
# Preview the first rows to sanity-check the IDs and the inserted <mask> token.
fasta_df.head()

Unnamed: 0,ID,Sequence,Masked_Sequence
0,A0A075B6G3,MLWWEEVEDCYEREDVQKKTFTKWVNAQFSKFGKQHIENLFSDLQD...,MLWWEEVEDCYEREDVQKKTFTKWVNAQFSKFGKQHIENLFSDLQD...
1,A0A087WV00,MDAAGRGCHLLPLPAARGPARAPAAAAAAAASPPGPCSGAACAPSA...,MDAAGRGCHLLPLPAARGPARAPAAAAAAAASPPGPCSGAACAPSA...
2,A0A087WZT3,MELSAEYLREKLQRDLEAEHVLPSPGGVGQVRGETAASETQLGS,MELSAEYLREKLQRDLEAEHVLPSPGGVGQVRGETAASETQLG<mask>
3,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,MGLEALVPL<mask>MIVAIFLLLVDLMHRHQRWAARYPPGPLPLP...
4,A0A087X296,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPVNPCCYYPCQHQGIC...,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPVNPCCYYPCQHQGIC...


## Preparing your model and tokenizer

Now we load our model and tokenizer. If using GPU, use `model.cuda()` to transfer the model to GPU.

In [5]:
# Load the pretrained ESM-2 tokenizer and encoder from the same checkpoint.
# NOTE(review): EsmModel is the bare encoder (the load warning below reports a
# freshly initialised pooler); predicting the masked residue presumably needs a
# masked-LM head such as EsmForMaskedLM — confirm before computing accuracy.
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
model = EsmModel.from_pretrained("facebook/esm2_t6_8M_UR50D")

# Use the GPU only when one is present so this cell also runs on CPU-only
# machines (an unconditional `model.cuda()` raises there).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Print the module hierarchy of the loaded model to inspect its architecture.
# NOTE: nothing in this cell converts the model to float16; half precision
# would have to be requested explicitly (e.g. `model.half()`).
print(model)

EsmModel(
  (embeddings): EsmEmbeddings(
    (word_embeddings): Embedding(33, 320, padding_idx=1)
    (dropout): Dropout(p=0.0, inplace=False)
    (position_embeddings): Embedding(1026, 320, padding_idx=1)
  )
  (encoder): EsmEncoder(
    (layer): ModuleList(
      (0-5): 6 x EsmLayer(
        (attention): EsmAttention(
          (self): EsmSelfAttention(
            (query): Linear(in_features=320, out_features=320, bias=True)
            (key): Linear(in_features=320, out_features=320, bias=True)
            (value): Linear(in_features=320, out_features=320, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (rotary_embeddings): RotaryEmbedding()
          )
          (output): EsmSelfOutput(
            (dense): Linear(in_features=320, out_features=320, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (LayerNorm): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
        )
        (intermediate): EsmIntermediate(
    

In [None]:
# Tokenize every masked sequence and collect the results in plain lists.
# The DataFrame is built once at the end: concatenating inside the loop copies
# the whole frame on every iteration, which is quadratic in the number of proteins.
protein_ids = []
token_tensors = []

for protein_id, masked_sequence in zip(fasta_df["ID"], fasta_df["Masked_Sequence"]):
    # One sequence per call, so each entry is a (1, sequence_length) tensor of
    # token ids; no special tokens are added, as in the original cell.
    encoded = tokenizer([masked_sequence], return_tensors="pt", add_special_tokens=False)
    protein_ids.append(protein_id)
    token_tensors.append(encoded['input_ids'])

# One row per protein: its ID and its tensor of token ids.
tokenized_fasta_df = pd.DataFrame({'ID': protein_ids, 'Tokenized_Sequence': token_tensors})

In [None]:
# Preview the first rows of the tokenized table (ID plus its token-id tensor).
tokenized_fasta_df.head()

In [None]:
# Pull the "Tokenized_Sequence" column out of the DataFrame as a plain Python
# list, one entry per row.
fasta_int = tokenized_fasta_df["Tokenized_Sequence"].tolist()

In [None]:
# Select the compute device once, falling back to the CPU when CUDA is missing.
# `device` and `fasta_gpu` are now defined on both paths, so the following
# cells no longer fail with NameError on a CPU-only machine.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type != 'cuda':
    print("CUDA is not available. Running on CPU.")

# Move each token tensor to the selected device.
fasta_gpu = [tensor.to(device) for tensor in fasta_int]

In [None]:
# Show only the first few token tensors; printing the whole list would dump
# one tensor per protein into the notebook output.
fasta_int[:5]

Check that every tokenized tensor in `fasta_gpu` was moved to the GPU

In [None]:
# Report every tensor that did not end up on a CUDA device.
for token_tensor in fasta_gpu:
    if token_tensor.is_cuda:
        continue  # already on the GPU, nothing to report
    print("NOT ON GPU")

## Run model per tokenized sequence on GPU from `fasta_gpu`

In [None]:
# Run the model on every tokenized sequence and collect the raw model outputs
# (one output object per protein, in the same order as `fasta_gpu`).
output = []

# Inference only: disable autograd so no computation graph is kept alive for
# each stored output; otherwise memory grows with every protein processed.
with torch.no_grad():
    for row in fasta_gpu:
        output.append(model(row))

In [None]:
# TODO: for each row of `output`, convert the prediction back to a string and check it against the original sequence

## Get model's total accuracy

In [None]:
# Placeholder list for accuracy values; nothing in the visible notebook
# populates it yet (TODO).
model_acc = []



**TODO:** measure latency, memory usage, power consumption, and energy consumption:
- latency is per sequence
- latency graph of sequence length vs latency
- accuracy graph of sequence length vs accuracy