The idea is to explore how residues are represented in protein language models (LLMs). Can we draw conclusions about amino acid properties from these representations?

In [1]:
import numpy as np
import torch as tr
import pandas as pd

# Remote location of the course data; the CSV is indexed by sequence name.
data_path = "https://raw.githubusercontent.com/lbugnon/foundation_models_bioinfo/refs/heads/main/data/"
df = pd.read_csv(f"{data_path}/pfam_some_sequences.csv", index_col="sequence_name")

# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
DEVICE = "cuda" if tr.cuda.is_available() else "cpu"
# Longest sequence (in residues) passed to the model; get_embedding crops
# anything longer to a centred window.
MAX_LEN = 1022

# https://github.com/facebookresearch/esm#available-models
model, alphabet = tr.hub.load("facebookresearch/esm:main",
                              "esm2_t33_650M_UR50D")
                              #"esm2_t30_150M_UR50D")
model.eval()  # inference mode: disables dropout
model.to(DEVICE)
batch_converter = alphabet.get_batch_converter()

# Index of the final transformer layer, read from the loaded checkpoint so it
# stays correct when the model above is swapped (33 for esm2_t33, 30 for
# esm2_t30). It was previously hard-coded to 30, which is not the last layer
# of the 33-layer model loaded here.
LAST_LAYER = model.num_layers

def get_embedding(seq, aggregate=False):
    """Embed a protein sequence with the loaded ESM model.

    Sequences longer than ``MAX_LEN`` are cropped to a window centred on the
    middle of the sequence before being passed to the model.

    Args:
        seq: amino-acid sequence as a string.
        aggregate: if True, average the representation over all residues.

    Returns:
        If ``aggregate`` is False, a torch tensor of shape [M, L], where M is
        the embedding size and L the number of residues actually embedded
        (i.e. after cropping); column ``i`` is the vector of residue ``i`` of
        the cropped sequence.
        If ``aggregate`` is True, a numpy array of shape [M].
    """
    # Crop the sequence if it exceeds what the model can take
    center = len(seq) // 2
    start = max(0, center - MAX_LEN // 2)
    end = min(len(seq), center + MAX_LEN // 2)
    seq = seq[start:end]

    # Format required by the tokenizer; several (label, sequence) pairs could
    # be processed as one batch
    x = [(0, seq)]

    with tr.no_grad():
        _, _, tokens = batch_converter(x)
        out = model(tokens.to(DEVICE), repr_layers=[LAST_LAYER])
    rep = out["representations"][LAST_LAYER].detach().cpu()

    # The ESM tokenizer wraps the residues in a beginning-of-sequence and an
    # end-of-sequence token. Drop both so that positions line up with the
    # residues and the special tokens do not leak into the average.
    # Indexing the single batch element explicitly (instead of squeeze())
    # keeps the result 2-D whatever the sizes of the other dimensions.
    emb = rep[0, 1:len(seq) + 1]

    emb = emb.permute(1, 0)  # [L, M] -> [M, L], convenient later on

    if aggregate:
        emb = emb.mean(dim=1).numpy()

    return emb


Downloading: "https://github.com/facebookresearch/esm/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t33_650M_UR50D.pt" to /root/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t33_650M_UR50D-contact-regression.pt" to /root/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D-contact-regression.pt


Sample a number of sequences from the dataset.

In [None]:
# Draw up to 100 sequences. The fixed seed makes "Restart & Run All" pick the
# same sequences every time, and the cap avoids a ValueError when the dataset
# holds fewer than 100 rows.
sequences = df.sample(n=min(100, len(df)), random_state=42).sequence

In [3]:
# Tally how often each residue letter occurs across the sampled sequences.
# All sequences are first joined into one long string.
aminoacids = sequences.str.cat()
unique_aas = set(aminoacids)
print(f"number of AA: {len(unique_aas)}")
print("AA count:")

# One "<letter>: <count>" entry per distinct residue, shown as the cell output.
aa_count = set()
for aa in unique_aas:
    occurrences = aminoacids.count(aa)
    aa_count.add(f"{aa}: {occurrences}")
aa_count

number of AA: 20
AA count:


{'A: 1986',
 'C: 285',
 'D: 1227',
 'E: 1304',
 'F: 1029',
 'G: 1534',
 'H: 444',
 'I: 1356',
 'K: 1136',
 'L: 2376',
 'M: 515',
 'N: 998',
 'P: 1034',
 'Q: 811',
 'R: 1231',
 'S: 1523',
 'T: 1261',
 'V: 1523',
 'W: 317',
 'Y: 870'}

Now we want to get a numeric representation of each residue in the sequences and compute the average representation for each AA.
- Why average?
- Why can't we just compute the embedding of each of the 20 AAs on its own?

In [None]:
# Running sum of residue embeddings, keyed by amino-acid letter. It stays
# empty until the TODO below is implemented, and the later cells need it
# filled.
aa_embeddings = {}

for seq in sequences:
    # Per-position embedding of this sequence (no averaging over positions).
    emb_seq = get_embedding(seq, aggregate=False)

    # TODO get representation for each aminoacid and sum them
    # NOTE(review): get_embedding crops sequences longer than MAX_LEN, and the
    # ESM tokenizer presumably adds special tokens, so column i of emb_seq may
    # not correspond to seq[i] -- confirm the alignment before indexing.


# normalize embedding sum
# Turns each per-letter sum into a mean by dividing by the letter's count in
# the concatenated sample.
# NOTE(review): this count covers the full sequences; if any sequence was
# cropped, it may exceed the number of vectors actually summed -- verify.
for aa in aa_embeddings:
    aa_embeddings[aa] /= aminoacids.count(aa)

In [5]:
# Freeze one ordering of the residue letters and stack their mean vectors row
# by row, so that row k of the matrix belongs to aa_names[k].
# Note: aa_embeddings is rebound from a dict to an array here, so this cell
# must be run exactly once after the accumulation cell.
aa_names = [*aa_embeddings]
aa_embeddings = np.array([aa_embeddings[name] for name in aa_names])

In [None]:
# Reduce the mean residue embeddings to two dimensions for plotting.
# PCA is used here in place of UMAP.
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
emb2D = pca.fit_transform(X=aa_embeddings)

In [None]:
#!pip install matplotlib
import matplotlib.pyplot as plt

# Amino-acid groupings used to style the scatter plot.
negatively_charged = ['D', 'E']
positively_charged = ['R', 'K', 'H']
polar = ['Q', 'N', 'S', 'T']
unique = ['C', 'G', 'P']
aromatic = ['F', 'Y', 'W']
hydrophobic = ['A', 'V', 'L', 'I', 'M']
small = ['A', 'G', 'S', 'T', 'V']
large = ['K', 'R', 'D', 'E', 'Q', 'N']

# Marker/colour for each chemical class, tested in this order (first match
# wins). Classes without an explicit marker use matplotlib's default.
class_styles = [
    (negatively_charged, {"marker": "x", "color": "red"}),
    (positively_charged, {"marker": "s", "color": "red"}),
    (polar, {"color": "blue"}),
    (unique, {"color": "orange"}),
    (hydrophobic, {"marker": "o", "color": "green"}),
    (aromatic, {"marker": "+", "color": "green"}),
]
# Residues outside every class (e.g. non-standard letters such as X, B, U)
# were previously skipped silently; draw them in a neutral style instead.
fallback_style = {"marker": "*", "color": "gray"}

# Plot each residue at its 2-D coordinates: colour/marker encode the chemical
# class, marker area encodes the size class.
plt.figure(figsize=(12, 6))
for k, aa in enumerate(aa_names):
    if aa in small:
        s = 30
    elif aa in large:
        s = 100
    else:
        s = 50

    style = next((kw for members, kw in class_styles if aa in members),
                 fallback_style)
    plt.scatter(emb2D[k, 0], emb2D[k, 1], label=aa, s=s, **style)
    # Several residues share the same marker and colour, so the legend alone
    # cannot tell them apart: write the letter next to each point.
    plt.annotate(aa, (emb2D[k, 0], emb2D[k, 1]),
                 xytext=(4, 4), textcoords="offset points")

plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.title("Mean residue embeddings (2-D projection)")
plt.legend()