In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import torch
from transformers import T5EncoderModel, T5Tokenizer

In [2]:
def read_fasta_file(filepath):
    """Parse a FASTA file into a ``{name: sequence}`` dictionary.

    A record starts at a line beginning with ``>``; everything after the
    ``>`` (whitespace-stripped line) is used as the record name. All following
    non-empty lines up to the next header are concatenated into the sequence,
    so both single-line and wrapped (multi-line) sequences are supported.
    Blank lines are ignored.

    Args:
        filepath: Path of the FASTA file to read.

    Returns:
        dict mapping record name -> sequence string, in file order. If a name
        occurs more than once, the last record wins.

    Raises:
        ValueError: If sequence data appears before the first ``>`` header.
    """
    sequences_dict = {}
    current_name = None
    current_chunks = []

    with open(filepath, "r") as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # Tolerate blank separator / trailing lines.
                continue
            if line.startswith(">"):
                # A new header closes the record collected so far.
                if current_name is not None:
                    sequences_dict[current_name] = "".join(current_chunks)
                current_name = line[1:]
                current_chunks = []
            elif current_name is None:
                raise ValueError(
                    f"Malformed FASTA file {filepath!r}: sequence data found before the first '>' header"
                )
            else:
                current_chunks.append(line)

    # Flush the final record (no header follows it).
    if current_name is not None:
        sequences_dict[current_name] = "".join(current_chunks)
    return sequences_dict

In [3]:
# Load the test peptides as a {name: sequence} dict from the FASTA-formatted text file.
test_sequences = read_fasta_file('./inputs/test_sequences.txt')

In [4]:
# Directory holding the pretrained T5 checkpoint (tokenizer files + encoder weights).
# NOTE(review): machine-specific absolute path — must be adjusted to run elsewhere.
model_path = "/home/jiaoshihu/toolkits/PLM/prot_t5_xl_uniref50"

# Tokenizer is loaded from the same checkpoint directory, with lower-casing disabled.
tokenizer = T5Tokenizer.from_pretrained(model_path, do_lower_case=False)

# Load the encoder-only model, switch it to inference mode and move it to the GPU.
model = T5EncoderModel.from_pretrained(model_path).eval().cuda()

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [5]:
def get_plm_representation(sequences_dict):
    """Compute per-token encoder embeddings for every sequence.

    Uses the module-level ``tokenizer`` and ``model``. Each sequence is
    tokenised on its own (batch of one), with its characters separated by
    spaces, and passed through the encoder without gradient tracking.

    Args:
        sequences_dict: Mapping of sequence name -> sequence string.

    Returns:
        dict mapping each name to a 2-D NumPy array taken from the encoder's
        ``last_hidden_state``: one row per attended token, excluding the last
        attended position (presumably the end-of-sequence token added by
        ``add_special_tokens=True`` — confirm against the tokenizer).

    NOTE(review): the ProtT5 model card maps rare residues (U, Z, O, B) to X
    before tokenising; that mapping is not applied here — confirm whether the
    inputs can contain them.
    """
    # Follow the model's actual device instead of assuming 'cuda', so inputs
    # always land where the weights are.
    device = next(model.parameters()).device

    representations = {}
    for name, seq in sequences_dict.items():
        # Characters are space-separated before tokenising.
        spaced_seq = [' '.join(seq)]
        ids = tokenizer(spaced_seq, add_special_tokens=True, padding=True, return_tensors='pt')

        input_ids = ids['input_ids'].to(device)
        attention_mask = ids['attention_mask'].to(device)

        with torch.no_grad():
            embedding_repr = model(input_ids=input_ids, attention_mask=attention_mask)

        embedding = embedding_repr.last_hidden_state.cpu().numpy()

        # Number of attended tokens as a plain int, so the NumPy slice below
        # does not depend on a device tensor.
        seq_len = int((attention_mask == 1).sum().item())

        # Keep the single batch entry and drop the last attended position.
        representations[name] = embedding[0, :seq_len - 1, :]

    return representations

In [6]:
# Embed every test sequence with the loaded encoder; result maps name -> embedding array.
test_representation = get_plm_representation(test_sequences)

In [8]:
def nom_representation(origin_representation, outfile,
                       x_max_path='./inputs/x_max.npy',
                       x_min_path='./inputs/x_min.npy'):
    """Min-max normalise embeddings with stored statistics and pickle the result.

    Every array in ``origin_representation`` is transformed as
    ``(embeddings - x_min) / (x_max - x_min)``, where ``x_max`` and ``x_min``
    are loaded from ``.npy`` files. Entries whose range is zero are divided
    by 1 instead, to avoid division by zero.

    Args:
        origin_representation: Mapping of name -> NumPy array that broadcasts
            against the loaded ``x_min`` / ``x_max`` arrays.
        outfile: Path of the pickle file to write. Missing parent directories
            are created.
        x_max_path: Path of the ``.npy`` file holding the maxima.
        x_min_path: Path of the ``.npy`` file holding the minima.

    Returns:
        None. The normalised ``{name: array}`` dict is written to ``outfile``.
    """
    x_max = np.load(x_max_path)
    x_min = np.load(x_min_path)

    x_range = x_max - x_min
    # Constant features would give a zero denominator; divide by 1 instead.
    x_range[x_range == 0] = 1

    normalized_representations = {
        name: (embeddings - x_min) / x_range
        for name, embeddings in origin_representation.items()
    }

    # Make sure the destination directory exists so open() cannot fail on a
    # fresh checkout without an outputs folder.
    out_dir = os.path.dirname(outfile)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(outfile, 'wb') as file:
        pickle.dump(normalized_representations, file)

In [9]:
# Min-max normalise the test embeddings with the stored x_min/x_max and pickle them.
nom_representation(test_representation, './outputs/test_plm_representation.pkl')