# Embed Sequences

The primary purpose of this notebook is to take the SCOP CSV of sequences and embed them all.

In [29]:
import os
import itertools
from multiprocessing import Pool

from tqdm import tqdm
import numpy as np
import pandas as pd

import torch
import faiss
import esm

from sklearn.metrics.pairwise import cosine_similarity
from Bio import SeqIO

In [2]:
scop_csv_path = '/scratch/gpfs/jr8867/datasets/scop/scop_data.csv'
scop_df = pd.read_csv(scop_csv_path)
scop_df

Unnamed: 0,uid,fa,sf,seq
0,Q03131,4000119,3000038,MSGPRSRTTSRRTPVRIGAVVVASSTSELLDGLAAVADGRPHASVV...
1,P09147,4000088,3000038,MRVLVTGGSGYIGSHTCVQLLQNGHDVIILDNLCNSKRSVLPVIER...
2,P61889,4000045,3000039,MKVAVLGAAGGIGQALALLLKTQLPSGSELSLYDIAPVTPGVAVDL...
3,P00334,4000029,3000038,MSFTLTNKNVIFVAGLGGIGLDTSKELLKRDLKNLVILDRIENPAA...
4,O33830,4000089,3000039,MPSVKIGIIGAGSAVFSLRLVSDLCKTPGLSGSTVTLMDIDEERLD...
...,...,...,...,...
35972,P20585,4004015,3000587,MSRRKPASGGLAASSSAPARQAVLSRFFQSTGSLKSTSSSTGAADQ...
35973,P20585,4004015,3002020,MSRRKPASGGLAASSSAPARQAVLSRFFQSTGSLKSTSSSTGAADQ...
35974,P52701,4004015,3001688,MSRQSTLYSFFPKSPALSDANKASARASREGGRAAAAPGASPSPGG...
35975,P52701,4004015,3000587,MSRQSTLYSFFPKSPALSDANKASARASREGGRAAAAPGASPSPGG...


# Load Embedding Model, Test on Sample Sequence

In [10]:
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
batch_converter = alphabet.get_batch_converter()

In [27]:
model.eval()  # disables dropout for deterministic results

# Prepare data
data = [
    ("protein1", "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG"),
]
batch_labels, batch_strs, batch_tokens = batch_converter(data)
batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)

# Extract per-residue representations (on CPU)
with torch.no_grad():
    results = model(batch_tokens, repr_layers=[33], return_contacts=True)
token_representations = results["representations"][33]

# Generate per-sequence representations via mean pooling
# This performs mean pooling by averaging all amino acid token embeddings for each sequence
sequence_representations = []
for i, tokens_len in enumerate(batch_lens):
    # Extract embeddings for actual amino acids (excluding special tokens)
    # and average them to get a single vector per sequence
    sequence_representations.append(token_representations[i, 1 : tokens_len - 1])

print(len(sequence_representations[0]))  # Dimension of the embedding vector
print(sequence_representations[0])  # The actual embedding values

65
tensor([[ 0.0826, -0.2050, -0.0171,  ...,  0.1044,  0.1343, -0.0945],
        [-0.0528, -0.0635, -0.2515,  ..., -0.0461,  0.2368, -0.1214],
        [-0.1468,  0.0750, -0.2339,  ...,  0.0143,  0.0404, -0.1388],
        ...,
        [ 0.1396,  0.1195,  0.0518,  ..., -0.1268, -0.0926,  0.1290],
        [ 0.2811,  0.0078,  0.0924,  ..., -0.0181,  0.0036,  0.1823],
        [ 0.1374, -0.0813, -0.0655,  ..., -0.1758, -0.0229,  0.0756]])
