## Generamos Embeddings con ESM2

### 1. Configuración del Tokenizador y Modelo 

In [1]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel

In [2]:
#Cargamos modelo y tokenizador
MODEL_NAME = "facebook/esm2_t33_650M_UR50D"

In [3]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [4]:
model = AutoModel.from_pretrained(MODEL_NAME) #Descarga el modelo por primera vez
model.eval()

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


EsmModel(
  (embeddings): EsmEmbeddings(
    (word_embeddings): Embedding(33, 1280, padding_idx=1)
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): EsmEncoder(
    (layer): ModuleList(
      (0-32): 33 x EsmLayer(
        (attention): EsmAttention(
          (self): EsmSelfAttention(
            (query): Linear(in_features=1280, out_features=1280, bias=True)
            (key): Linear(in_features=1280, out_features=1280, bias=True)
            (value): Linear(in_features=1280, out_features=1280, bias=True)
            (rotary_embeddings): RotaryEmbedding()
          )
          (output): EsmSelfOutput(
            (dense): Linear(in_features=1280, out_features=1280, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (LayerNorm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        )
        (intermediate): EsmIntermediate(
          (dense): Linear(in_features=1280, out_features=5120, bias=True)
        )
        (output): EsmOut

### 2. Datos Entrada

In [5]:

from custom_dataloader import CustomProteinDataset
from torch.utils.data import DataLoader, random_split

#Creamos CustomDataset
CSV_PATH = '../alphafold_predictor/data/train.csv'
dataset =CustomProteinDataset(CSV_PATH)

#Train/Val Split
total_size = len(dataset)
train_size = int(total_size * 0.8)  # 80% para entrenamiento
val_size = total_size - train_size  # 20% para validación
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

#Dataloaders con Batch size
BATCH_SIZE = 8
data_loader_train = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
data_loader_val = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
print(f"Registros de entrenamiento: {len(train_dataset)}")
print(f"Registros de validación: {len(val_dataset)}")

Registros de entrenamiento: 672
Registros de validación: 168


### 3. Tokenizador

In [7]:
next(iter(data_loader_train))

[('WETVQKWREYRRQCQRSLTEDWRETVRYRRRQREEER',
  'WETVQKWREYRRQCQRSLTEDWWETVRYLREQRRREE',
  'WIFRLYVSIGWGVPLLFVVPWGIVKYLYVWLPGLILLIVLAFLRLLE',
  'SLWETVQKWREYRRQCQRSLTEDPPPATDLFCNRTFDEYACWPDGEPGSFVNVSCPWYLPWASSVPQGHVYRFCTAEGLWLSAEVREALERALRELLEALA',
  'EQLLFLYIIYTVGYALSFSALVIASAILLGEQLLILLIIVLLLLL',
  'WETVQKWREYRRQCQRSLTEDWWETVRYRREQRRRAR',
  'TRNYIHLNLFASFILRALSVFIKDAALKWMYSTAAQFDELFAAILAALAAA',
  'WETVQKWREYRRQCQRSLTEDWWETVRYRRRQREEER'),
 tensor([[0.5654],
         [0.6265],
         [0.5657],
         [0.4947],
         [0.4927],
         [0.5303],
         [0.5373],
         [0.5771]])]

In [8]:

inputs = tokenizer(
    next(iter(data_loader_train))[0], 
    padding=True, #Padding para el mismo tamaño
    truncation=True, #Recorta en caso de que sean muy largas
    return_tensors="pt" #Regresa tensores de pytorch
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [9]:
inputs['input_ids'].shape

torch.Size([8, 103])

In [10]:
with torch.no_grad():
    # Pasamos los tensores tokenizados al modelo
    outputs = model(**inputs)

In [11]:
#Luego de pasar por la última capa tenemos  [num_secuencias, longitud_max_secuencia, dimension_emb]
outputs.last_hidden_state.shape 

torch.Size([8, 103, 1280])

In [12]:
embedding_por_residuo = outputs.last_hidden_state
embedding_por_secuencia = embedding_por_residuo.mean(dim=1) #Promedio por cada secuencia

In [13]:
embedding_por_secuencia.shape

torch.Size([8, 1280])

In [13]:
#Opcional si queremos usar numpy, sklearn
#embeddings_finales_numpy = embedding_por_secuencia.cpu().numpy()

# Predictor para pLDDT de Alphafold

In [14]:
from mlp import MLP_pLDDT
import torch.nn as nn
import torch.optim as optim

input_size = 1280
hidden_sizes = [1280,512,254]  # 3 capas ocultas
output_size = 1

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model_mlp = MLP_pLDDT(input_size, hidden_sizes, output_size).to(device)
print(model_mlp)

criterion = nn.MSELoss()  # Mean Squared Error para regresión
optimizer = optim.Adam(model_mlp.parameters(), lr=0.001)

MLP_pLDDT(
  (network): Sequential(
    (0): Linear(in_features=1280, out_features=1280, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=1280, out_features=512, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=512, out_features=254, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.2, inplace=False)
    (9): Linear(in_features=254, out_features=1, bias=True)
  )
)


In [15]:
#MODELO DE MULTILAYER PERCEPTRON
EPOCHS = 10
for epoch in range(EPOCHS):
    #TRAINING
    model_mlp.train()
    total_epoch_loss = 0.0
    for i , (sequences_batch, plddts_batch) in enumerate(data_loader_train):
        plddts_batch = plddts_batch.to(device)
        inputs = tokenizer(sequences_batch, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        
        embeddings_batch = outputs.last_hidden_state.mean(dim=1)
        predictions = model_mlp(embeddings_batch)
        loss = criterion(predictions, plddts_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_epoch_loss += loss.item()

    avg_epoch_loss = total_epoch_loss / len(data_loader_train)
    print(f"ÉPOCA {epoch + 1} / {EPOCHS} FINALIZADA")
    print(f"Pérdida Promedio de la Época: {avg_epoch_loss:.4f}")

    #VALIDATION
    model_mlp.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for i , (sequences_batch, plddts_batch) in enumerate(data_loader_val):
            plddts_batch = plddts_batch.to(device)
            inputs = tokenizer(sequences_batch, padding=True, truncation=True, return_tensors="pt").to(device)
            outputs = model(**inputs)
            
            embeddings_batch = outputs.last_hidden_state.mean(dim=1)
            predictions = model_mlp(embeddings_batch)
            loss = criterion(predictions, plddts_batch)

            total_val_loss += loss.item()

        avg_epoch_loss_val = total_val_loss / len(data_loader_val)
        print(f"ÉPOCA {epoch + 1} / {EPOCHS} FINALIZADA VAL")
        print(f"Pérdida Promedio de la Época VAL: {avg_epoch_loss_val:.4f}")




ÉPOCA 1 / 10 FINALIZADA
Pérdida Promedio de la Época: 0.0137
ÉPOCA 1 / 10 FINALIZADA VAL
Pérdida Promedio de la Época VAL: 0.0023
ÉPOCA 2 / 10 FINALIZADA
Pérdida Promedio de la Época: 0.0046
ÉPOCA 2 / 10 FINALIZADA VAL
Pérdida Promedio de la Época VAL: 0.0010
ÉPOCA 3 / 10 FINALIZADA
Pérdida Promedio de la Época: 0.0035
ÉPOCA 3 / 10 FINALIZADA VAL
Pérdida Promedio de la Época VAL: 0.0029
ÉPOCA 4 / 10 FINALIZADA
Pérdida Promedio de la Época: 0.0041
ÉPOCA 4 / 10 FINALIZADA VAL
Pérdida Promedio de la Época VAL: 0.0028
ÉPOCA 5 / 10 FINALIZADA
Pérdida Promedio de la Época: 0.0031
ÉPOCA 5 / 10 FINALIZADA VAL
Pérdida Promedio de la Época VAL: 0.0012
ÉPOCA 6 / 10 FINALIZADA
Pérdida Promedio de la Época: 0.0031
ÉPOCA 6 / 10 FINALIZADA VAL
Pérdida Promedio de la Época VAL: 0.0006
ÉPOCA 7 / 10 FINALIZADA
Pérdida Promedio de la Época: 0.0026
ÉPOCA 7 / 10 FINALIZADA VAL
Pérdida Promedio de la Época VAL: 0.0017
ÉPOCA 8 / 10 FINALIZADA
Pérdida Promedio de la Época: 0.0027
ÉPOCA 8 / 10 FINALIZADA VAL
P

In [17]:
#Guardamos modelo 
torch.save(model_mlp.state_dict(), 'mlp_weights.pth')

In [None]:
#MODELO DE CONVOLUCIÓN 1 DIMENSIÓN
from conv1d_protein import Simple1CNN
model_cnn = Simple1CNN().to(device)
optimizer = optim.Adam(model_cnn.parameters(), lr=0.001)
EPOCHS = 50

for epoch in range(EPOCHS):
    total_epoch_loss = 0.0
    model_cnn.train()
    for i , (sequences_batch, plddts_batch) in enumerate(data_loader):
        plddts_batch = plddts_batch.to(device)
        inputs = tokenizer(sequences_batch, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        
        embeddings_batch = outputs.last_hidden_state #Diferencia con el método anterior
        predictions = model_cnn(embeddings_batch)
        loss = criterion(predictions, plddts_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_epoch_loss += loss.item()

    avg_epoch_loss = total_epoch_loss / len(data_loader)
    print(f"ÉPOCA {epoch + 1} / {EPOCHS} FINALIZADA")
    print(f"Pérdida Promedio de la Época: {avg_epoch_loss:.4f}")



# Predictor para ipTM de Alphafold