In [1]:
# Core
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import math

# Deep Learning
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# GCN
import torch_geometric.nn as geom_nn

# TCN
from pytorch_tcn import TCN

# Métricas y optimización
from sklearn.metrics import classification_report, confusion_matrix
import optuna

# Carga de datos

In [12]:
# Paths and class configuration
DATA_PATH = os.path.join("data")
# Order matters: the position of each action in this array becomes its class id (see label_map).
actions = np.array(['TouchingChest', 'Hit', 'AleatoryMovement', 'Static'])
n_people = 25

# Dataset configuration
no_vids_pp = 15
no_vids = n_people * no_vids_pp  # 375 video indices; each index exists once per action
vid_length = 16  # frames read per video
label_map = {label: idx for idx, label in enumerate(actions)}

# Data split (counts are video indices, not clips; each index yields one clip per action)
# NOTE(review): 255 / 45 / 75 are multiples of no_vids_pp (17 / 3 / 5 people) — presumably
# the split is subject-wise; confirm that video indices are ordered by person.
no_vid_for_training = 255 # 68% of the full set
no_vid_for_val = 45 # 12% of the full set
no_vid_for_test = 75 # 20% of the full set

# Helper that loads one split (train/val/test)
def load_videos(start_idx, end_idx, actions, data_path, vid_length, label_map):
    """Load every clip whose video index lies in ``[start_idx, end_idx)``.

    For each video index and each action, the frames are read from
    ``<data_path>/<action>/<vid>/<frame_num>.npy`` with ``frame_num`` in
    ``range(vid_length)``.

    Args:
        start_idx: First video index to load (inclusive).
        end_idx: Last video index to load (exclusive).
        actions: Iterable of action names; each is a sub-directory of ``data_path``.
        data_path: Root directory of the dataset.
        vid_length: Number of frames read per clip.
        label_map: Mapping from action name to integer class id.

    Returns:
        ``(videos, labels)`` — two parallel lists. ``videos[i]`` is a single
        ndarray with the frames stacked along axis 0; ``labels[i]`` is
        ``label_map[action]`` for that clip.

    Raises:
        Whatever ``np.load`` raises when a frame file is missing or unreadable.
    """
    videos, labels = [], []
    for vid in range(start_idx, end_idx):
        for action in actions:
            clip_dir = os.path.join(data_path, action, str(vid))
            frames = [
                np.load(os.path.join(clip_dir, f"{frame_num}.npy"))
                for frame_num in range(vid_length)
            ]
            # Store one contiguous array per clip instead of a list of arrays:
            # building a tensor from a list of ndarrays is very slow and makes
            # PyTorch emit a UserWarning on every sample access.
            videos.append(np.stack(frames) if frames else np.empty((0,)))
            labels.append(label_map[action])
    return videos, labels

# Load the three splits as contiguous, non-overlapping ranges of video indices:
# [0, 255) train, [255, 300) validation, [300, no_vids) test.
vids_training, labels_training = load_videos(0, no_vid_for_training, actions, DATA_PATH, vid_length, label_map)
vids_val, labels_val = load_videos(no_vid_for_training, no_vid_for_training + no_vid_for_val, actions, DATA_PATH, vid_length, label_map)
vids_test, labels_test = load_videos(no_vid_for_training + no_vid_for_val, no_vids, actions, DATA_PATH, vid_length, label_map)


In [13]:
# Sanity check: number of clips per split (video indices x number of actions).
print(len(vids_training))
print(len(vids_val))
print(len(vids_test))

# Total number of clips across the three splits.
print(len(vids_training) + len(vids_val) + len(vids_test))

1020
180
300
1500


In [14]:
# Sanity check: test labels cycle through the class ids in `actions` order,
# because load_videos iterates over every action for each video index.
print(len(labels_test))
print(labels_test)

300
[0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]


# Modelo 1: Transformer (Baseline)

In [15]:
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding for short sequences (default L=16).

    A fixed table of shape ``(1, max_len, d_model)`` is precomputed and stored
    as the non-trainable buffer ``pe``. Even feature indices hold sines and odd
    feature indices hold cosines.

    Args:
        d_model: Feature dimension of the input; may be even or odd.
        dropout: Dropout probability applied after adding the encoding.
        max_len: Number of positions precomputed in the table.
    """
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 16):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)  # (L, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)  # (L, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32) * (-math.log(10000.0) / d_model))
        angles = position * div_term                   # (L, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)                # even feature indices
        # For odd d_model there is one fewer odd index than even index, so the
        # cosine half must be truncated; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])   # odd feature indices
        self.register_buffer('pe', pe.unsqueeze(0))    # (1, L, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Add the positional table to ``x`` of shape (B, L, d_model) and apply dropout."""
        # Only the first x.size(1) positions of the table are used.
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

In [16]:
class TransformerBaseline(nn.Module):
    """Sequence classifier built on a Transformer encoder.

    Pipeline: linear projection to ``d_model`` -> sinusoidal positional
    encoding -> ``num_layers`` pre-norm encoder layers -> mean over the time
    axis -> LayerNorm + linear classification head.

    With the defaults the input is (B, 16, 258) and the output is logits (B, 4).
    """
    def __init__(
        self,
        in_features: int = 258,
        seq_len: int = 16,
        num_classes: int = 4,
        d_model: int = 128,
        nhead: int = 4,
        num_layers: int = 2,
        dim_feedforward: int = 256,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.seq_len = seq_len

        # Per-frame projection into the model dimension.
        self.input_proj = nn.Linear(in_features, d_model)

        # Positional table sized to the expected sequence length.
        self.pos_enc = PositionalEncoding(d_model, dropout=dropout, max_len=seq_len)

        # Stack of pre-norm encoder layers operating on (B, L, E) tensors.
        layer_kwargs = dict(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_kwargs),
            num_layers=num_layers,
        )

        # Classification head applied to the pooled representation.
        self.head = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, num_classes)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map a batch of sequences (B, L, in_features) to logits (B, num_classes)."""
        tokens = self.pos_enc(self.input_proj(x))   # (B, L, d_model)
        encoded = self.encoder(tokens)              # (B, L, d_model)
        pooled = encoded.mean(dim=1)                # average over time -> (B, d_model)
        return self.head(pooled)                    # (B, num_classes)

In [17]:
# Shape smoke test: push a random batch through an untrained model.
# NOTE: `model` is rebound by later cells; this instance is only used here.
B = 8
dummy = torch.randn(B, 16, 258)  # (batch, seq_len, features)
model = TransformerBaseline()
out = model(dummy)
print(out.shape)  # torch.Size([8, 4])

torch.Size([8, 4])




In [18]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

In [19]:
# --- Dataset ---
class VideoDataset(Dataset):
    """Map-style dataset pairing each clip with its integer class label.

    Args:
        videos: Indexable collection of clips; each clip is an ndarray or a
            sequence of per-frame ndarrays of equal shape.
        labels: Indexable collection of integer class ids, parallel to ``videos``.
    """
    def __init__(self, videos, labels):
        self.videos = videos
        self.labels = labels

    def __len__(self):
        return len(self.videos)

    def __getitem__(self, idx):
        """Return ``(x, y)``: the clip as a float32 tensor and the label as a long tensor."""
        # Collapse the clip into one ndarray first. Passing a list of ndarrays
        # straight to torch.tensor is very slow and emits a UserWarning.
        clip = np.asarray(self.videos[idx], dtype=np.float32)
        x = torch.tensor(clip, dtype=torch.float32)  # (16, 258) for this notebook's data
        y = torch.tensor(self.labels[idx], dtype=torch.long)
        return x, y

In [20]:
# Build the train/validation datasets
train_dataset = VideoDataset(vids_training, labels_training)
val_dataset   = VideoDataset(vids_val, labels_val)

# DataLoaders: only the training set is shuffled.
# NOTE: `train_loader` / `val_loader` are rebound by later cells of this notebook.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [21]:
# --- Model ---
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerBaseline().to(device)

# --- Optimizer + loss ---
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [27]:
# --- Training loop (baseline; 100 epochs, an arbitrary choice) ---
# NOTE(review): this loop uses whichever `model`, `train_loader` and `val_loader`
# are currently bound; several cells rebind those names, so run order matters.
for epoch in range(100):  # arbitrary number of epochs
    model.train()
    running_loss = 0.0
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch losses (each batch counts equally, whatever its size).
    avg_loss = running_loss / len(train_loader)

    # --- Validation ---
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for X, y in val_loader:
            X, y = X.to(device), y.to(device)
            outputs = model(X)
            preds = outputs.argmax(dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)

    val_acc = correct / total
    print(f"Epoch {epoch+1}: Train Loss={avg_loss:.4f}, Val Acc={val_acc:.4f}")

  x = torch.tensor(self.videos[idx], dtype=torch.float32)  # (16, 258)


Epoch 1: Train Loss=1.3877, Val Acc=0.4000
Epoch 2: Train Loss=1.0412, Val Acc=0.7056
Epoch 3: Train Loss=0.8809, Val Acc=0.7778
Epoch 4: Train Loss=0.7484, Val Acc=0.7556
Epoch 5: Train Loss=0.7177, Val Acc=0.8000
Epoch 6: Train Loss=0.6343, Val Acc=0.7889
Epoch 7: Train Loss=0.6424, Val Acc=0.7667
Epoch 8: Train Loss=0.5949, Val Acc=0.8056
Epoch 9: Train Loss=0.5259, Val Acc=0.8333
Epoch 10: Train Loss=0.5428, Val Acc=0.8556
Epoch 11: Train Loss=0.4960, Val Acc=0.8444
Epoch 12: Train Loss=0.4529, Val Acc=0.8278
Epoch 13: Train Loss=0.4187, Val Acc=0.8556
Epoch 14: Train Loss=0.3683, Val Acc=0.8444
Epoch 15: Train Loss=0.3492, Val Acc=0.9000
Epoch 16: Train Loss=0.3780, Val Acc=0.8667
Epoch 17: Train Loss=0.3019, Val Acc=0.8556
Epoch 18: Train Loss=0.2841, Val Acc=0.8778
Epoch 19: Train Loss=0.2780, Val Acc=0.8722
Epoch 20: Train Loss=0.2659, Val Acc=0.8889
Epoch 21: Train Loss=0.2283, Val Acc=0.8556
Epoch 22: Train Loss=0.2396, Val Acc=0.8778
Epoch 23: Train Loss=0.2118, Val Acc=0.83

# Modelo 2: GCN (Baseline)

In [22]:
# --- Imports específicos para GCN (aditivos, sin conflictos) ---
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader as GeoDataLoader  # evita conflicto con torch.utils.data.DataLoader

In [23]:
class GCNBaseline(nn.Module):
    """Minimal two-layer GCN graph classifier.

    Two ``GCNConv`` layers (ReLU after the first only) produce node embeddings,
    which are mean-pooled per graph and mapped to ``num_classes`` logits.
    """
    def __init__(self, in_channels=258, hidden_channels=256, num_classes=4):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.lin = nn.Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index, batch):
        """Return one row of logits per graph in the batch.

        Args:
            x: Node features, (N, in_channels).
            edge_index: Edge list passed to both convolutions.
            batch: Graph id of every node, used by the global pooling.
        """
        hidden = self.conv1(x, edge_index).relu()
        node_emb = self.conv2(hidden, edge_index)
        graph_emb = global_mean_pool(node_emb, batch)  # one vector per graph
        logits = self.lin(graph_emb)
        return logits

In [24]:
from torch_geometric.data import Data, InMemoryDataset

def build_graph(frames, label):
    """Turn one clip into a temporal chain graph.

    - One node per frame, carrying that frame's feature vector
      (16 nodes x 258 features for this notebook's data).
    - Bidirectional edges between consecutive frames, in the order
      (0,1), (1,0), (1,2), (2,1), ...

    Args:
        frames: Clip as an ndarray or a sequence of equally-shaped frame arrays.
        label: Integer class id of the clip.

    Returns:
        A ``Data`` object with ``x`` (float32 node features), ``edge_index``
        (long, shape (2, E)) and ``y`` (long, shape (1,)).
    """
    # Collapse to a single ndarray first: torch.tensor on a list of ndarrays
    # is very slow and emits a UserWarning.
    x = torch.tensor(np.asarray(frames, dtype=np.float32), dtype=torch.float32)
    num_nodes = x.size(0)

    # Sequential connections (temporal chain), both directions.
    edges = []
    for i in range(num_nodes - 1):
        edges.append([i, i + 1])
        edges.append([i + 1, i])
    # reshape(-1, 2) keeps the result (2, E) even when there are no edges
    # (clips with fewer than two frames), instead of a 1-D empty tensor.
    edge_index = torch.tensor(edges, dtype=torch.long).reshape(-1, 2).t().contiguous()  # (2, E)

    y = torch.tensor([label], dtype=torch.long)  # graph-level label
    return Data(x=x, edge_index=edge_index, y=y)

In [25]:
class VideoGraphDataset(InMemoryDataset):
    """In-memory graph dataset: one ``build_graph`` result per (clip, label) pair.

    All graphs are built eagerly in the constructor and kept in ``data_list``;
    ``len`` and ``get`` simply index into that list.
    """
    def __init__(self, videos, labels, transform=None):
        self.videos = videos
        self.labels = labels
        super().__init__('.', transform)

        graphs = []
        for clip, target in zip(videos, labels):
            graphs.append(build_graph(clip, target))
        self.data_list = graphs

    def len(self):
        """Number of graphs held in memory."""
        return len(self.data_list)

    def get(self, idx):
        """Return the pre-built graph at position ``idx``."""
        return self.data_list[idx]

In [26]:
# Build the graph datasets for train/validation
train_graph_dataset = VideoGraphDataset(vids_training, labels_training)
val_graph_dataset   = VideoGraphDataset(vids_val, labels_val)

# PyG DataLoaders (they collate graphs into one batched graph).
# NOTE: this rebinds `train_loader` / `val_loader`, replacing the tensor loaders
# created for the Transformer section.
from torch_geometric.loader import DataLoader as GeoDataLoader
train_loader = GeoDataLoader(train_graph_dataset, batch_size=32, shuffle=True)
val_loader   = GeoDataLoader(val_graph_dataset, batch_size=32, shuffle=False)

  x = torch.tensor(frames, dtype=torch.float32)  # (16, 258)


In [33]:
import torch.optim as optim

# Train the GCN baseline. Expects `train_loader` / `val_loader` to be the PyG
# graph loaders, since each batch is accessed through .x / .edge_index / .batch / .y.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GCNBaseline().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(100):  # baseline: 100 epochs, an arbitrary choice
    # --- TRAIN ---
    model.train()
    train_loss, correct, total = 0.0, 0, 0
    for batch in train_loader:
        batch = batch.to(device)

        optimizer.zero_grad()
        outputs = model(batch.x, batch.edge_index, batch.batch)
        loss = criterion(outputs, batch.y)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        preds = outputs.argmax(dim=1)
        correct += (preds == batch.y).sum().item()
        total += batch.y.size(0)

    # Mean of the per-batch losses; accuracy is computed over all samples.
    train_loss /= len(train_loader)
    train_acc = correct / total

    # --- VALIDATION ---
    model.eval()
    # `correct` / `total` are reset here and reused for the validation pass.
    val_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for batch in val_loader:
            batch = batch.to(device)
            outputs = model(batch.x, batch.edge_index, batch.batch)
            loss = criterion(outputs, batch.y)

            val_loss += loss.item()
            preds = outputs.argmax(dim=1)
            correct += (preds == batch.y).sum().item()
            total += batch.y.size(0)

    val_loss /= len(val_loader)
    val_acc = correct / total

    print(f"Epoch {epoch+1}: "
          f"Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f} | "
          f"Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}")

Epoch 1: Train Loss=1.2613, Train Acc=0.4206 | Val Loss=1.0034, Val Acc=0.6111
Epoch 2: Train Loss=1.0457, Train Acc=0.5402 | Val Loss=0.8558, Val Acc=0.6778
Epoch 3: Train Loss=0.9655, Train Acc=0.6049 | Val Loss=0.8443, Val Acc=0.6556
Epoch 4: Train Loss=0.8831, Train Acc=0.6392 | Val Loss=0.8298, Val Acc=0.6389
Epoch 5: Train Loss=0.9067, Train Acc=0.6284 | Val Loss=1.0534, Val Acc=0.5222
Epoch 6: Train Loss=0.8241, Train Acc=0.6598 | Val Loss=0.6928, Val Acc=0.7833
Epoch 7: Train Loss=0.7367, Train Acc=0.7127 | Val Loss=0.6928, Val Acc=0.7722
Epoch 8: Train Loss=0.6795, Train Acc=0.7559 | Val Loss=0.6175, Val Acc=0.8278
Epoch 9: Train Loss=0.6455, Train Acc=0.7706 | Val Loss=0.5923, Val Acc=0.8111
Epoch 10: Train Loss=0.6340, Train Acc=0.7520 | Val Loss=0.5786, Val Acc=0.8056
Epoch 11: Train Loss=0.5939, Train Acc=0.7716 | Val Loss=0.5935, Val Acc=0.7778
Epoch 12: Train Loss=0.5428, Train Acc=0.8059 | Val Loss=0.5596, Val Acc=0.8389
Epoch 13: Train Loss=0.5632, Train Acc=0.8020 | V

# Modelo 3: Multiscale Temporal Convolution Network (MS-TCN) (Baseline)

In [27]:
class VideoDataset(Dataset):
    """Map-style dataset pairing each clip with its integer class label.

    NOTE(review): this class is declared several times in this notebook with
    the same body; a single definition near the top would be enough.

    Args:
        videos: Indexable collection of clips; each clip is an ndarray or a
            sequence of per-frame ndarrays of equal shape.
        labels: Indexable collection of integer class ids, parallel to ``videos``.
    """
    def __init__(self, videos, labels):
        self.videos = videos
        self.labels = labels

    def __len__(self):
        return len(self.videos)

    def __getitem__(self, idx):
        """Return ``(x, y)``: the clip as a float32 tensor and the label as a long tensor."""
        # Collapse the clip into one ndarray first. Passing a list of ndarrays
        # straight to torch.tensor is very slow and emits a UserWarning.
        clip = np.asarray(self.videos[idx], dtype=np.float32)
        x = torch.tensor(clip, dtype=torch.float32)  # (16, 258) for this notebook's data
        y = torch.tensor(self.labels[idx], dtype=torch.long)
        return x, y

In [28]:
class MSTCNBaseline(nn.Module):
    """Temporal-convolution classifier: a two-level ``TCN`` followed by a linear head.

    The input is expected time-major per sample, (B, L, in_features); it is
    transposed to channels-first before the TCN, the TCN output is averaged
    over its last axis, and the result is mapped to ``num_classes`` logits.
    """
    def __init__(self, in_features=258, num_classes=4):
        super().__init__()
        self.tcn = TCN(num_inputs=in_features, num_channels=[128, 128], kernel_size=3, dropout=0.2)
        # 128 matches the last entry of num_channels above.
        self.head = nn.Linear(128, num_classes)

    def forward(self, x):
        """Map (B, L, in_features) to logits (B, num_classes)."""
        channels_first = x.transpose(1, 2)     # (B, in_features, L)
        features = self.tcn(channels_first)    # presumably (B, 128, L) — per the original comment
        pooled = features.mean(dim=2)          # average over the last axis -> (B, 128)
        return self.head(pooled)

In [29]:
# Tensor datasets/loaders for the TCN baseline.
# NOTE: rebinds `train_loader` / `val_loader` (previously the PyG graph loaders).
train_dataset = VideoDataset(vids_training, labels_training)
val_dataset   = VideoDataset(vids_val, labels_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [30]:
# Train the TCN baseline and report loss/accuracy on train and validation each epoch.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MSTCNBaseline().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(100):  # baseline: 100 epochs, an arbitrary choice
    # Train
    model.train()
    train_loss, correct, total = 0.0, 0, 0
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        preds = outputs.argmax(dim=1)
        correct += (preds == y).sum().item()
        total += y.size(0)

    # Mean of the per-batch losses; accuracy is computed over all samples.
    train_loss /= len(train_loader)
    train_acc = correct / total

    # Val
    model.eval()
    # `correct` / `total` are reset here and reused for the validation pass.
    val_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for X, y in val_loader:
            X, y = X.to(device), y.to(device)
            outputs = model(X)
            loss = criterion(outputs, y)

            val_loss += loss.item()
            preds = outputs.argmax(dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)

    val_loss /= len(val_loader)
    val_acc = correct / total

    print(f"Epoch {epoch+1}: "
          f"Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f} | "
          f"Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}")

Epoch 1: Train Loss=1.3053, Train Acc=0.3990 | Val Loss=1.0964, Val Acc=0.5333
Epoch 2: Train Loss=1.1073, Train Acc=0.5235 | Val Loss=0.8692, Val Acc=0.6889
Epoch 3: Train Loss=0.9552, Train Acc=0.6216 | Val Loss=0.7433, Val Acc=0.7778
Epoch 4: Train Loss=0.8425, Train Acc=0.6735 | Val Loss=0.7079, Val Acc=0.7500
Epoch 5: Train Loss=0.7550, Train Acc=0.7118 | Val Loss=0.5522, Val Acc=0.8500
Epoch 6: Train Loss=0.6992, Train Acc=0.7353 | Val Loss=0.5395, Val Acc=0.8444
Epoch 7: Train Loss=0.6629, Train Acc=0.7382 | Val Loss=0.6286, Val Acc=0.7500
Epoch 8: Train Loss=0.5761, Train Acc=0.7853 | Val Loss=0.4569, Val Acc=0.8833
Epoch 9: Train Loss=0.5308, Train Acc=0.8069 | Val Loss=0.4759, Val Acc=0.8000
Epoch 10: Train Loss=0.5197, Train Acc=0.8020 | Val Loss=0.5810, Val Acc=0.8556
Epoch 11: Train Loss=0.4902, Train Acc=0.8176 | Val Loss=0.5778, Val Acc=0.8389
Epoch 12: Train Loss=0.4566, Train Acc=0.8314 | Val Loss=0.4915, Val Acc=0.8333
Epoch 13: Train Loss=0.4059, Train Acc=0.8549 | V

# Modelo 4: Temporal Gate Unit (TGU) (Baseline)

In [31]:
class VideoDataset(Dataset):
    """Map-style dataset pairing each clip with its integer class label.

    NOTE(review): this class is declared several times in this notebook with
    the same body; a single definition near the top would be enough.

    Args:
        videos: Indexable collection of clips; each clip is an ndarray or a
            sequence of per-frame ndarrays of equal shape.
        labels: Indexable collection of integer class ids, parallel to ``videos``.
    """
    def __init__(self, videos, labels):
        self.videos = videos
        self.labels = labels

    def __len__(self):
        return len(self.videos)

    def __getitem__(self, idx):
        """Return ``(x, y)``: the clip as a float32 tensor and the label as a long tensor."""
        # Collapse the clip into one ndarray first. Passing a list of ndarrays
        # straight to torch.tensor is very slow and emits a UserWarning.
        clip = np.asarray(self.videos[idx], dtype=np.float32)
        x = torch.tensor(clip, dtype=torch.float32)  # (16, 258) for this notebook's data
        y = torch.tensor(self.labels[idx], dtype=torch.long)
        return x, y

In [32]:
class TGUBlock(nn.Module):
    """Temporal block with multiplicative gating.

    Two parallel 1-D convolutions read the same input: one is squashed with
    ``tanh`` (the filter), the other with ``sigmoid`` (the gate). Their
    element-wise product is passed through dropout.

    Args:
        in_channels: Channels of the input (B, in_channels, L).
        out_channels: Channels of the output (B, out_channels, L).
        kernel_size: Temporal width of both convolutions.
        dropout: Dropout probability applied to the gated output.
    """
    def __init__(self, in_channels, out_channels, kernel_size=3, dropout=0.2):
        super().__init__()
        # 'same' padding keeps the temporal length for any kernel_size; with the
        # default kernel_size=3 it is equivalent to the previous padding=1.
        self.conv_filter = nn.Conv1d(in_channels, out_channels, kernel_size, padding="same")
        self.conv_gate   = nn.Conv1d(in_channels, out_channels, kernel_size, padding="same")
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the gated convolution to ``x`` of shape (B, C, L)."""
        f = torch.tanh(self.conv_filter(x))
        g = torch.sigmoid(self.conv_gate(x))
        return self.dropout(f * g)  # multiplicative gating

In [33]:
class TGUBaseline(nn.Module):
    """Classifier made of two stacked ``TGUBlock`` layers, temporal mean pooling and a linear head."""
    def __init__(self, in_features=258, num_classes=4, hidden=128):
        super().__init__()
        self.block1 = TGUBlock(in_features, hidden)
        self.block2 = TGUBlock(hidden, hidden)
        self.head   = nn.Linear(hidden, num_classes)

    def forward(self, x):
        """Map (B, L, in_features) to logits (B, num_classes)."""
        feats = x.transpose(1, 2)              # channels-first: (B, in_features, L)
        for gated_block in (self.block1, self.block2):
            feats = gated_block(feats)         # (B, hidden, L)
        pooled = feats.mean(dim=2)             # average over time -> (B, hidden)
        logits = self.head(pooled)
        return logits

In [34]:
# Tensor datasets/loaders for the TGU baseline (rebinds the shared loader names).
train_dataset = VideoDataset(vids_training, labels_training)
val_dataset   = VideoDataset(vids_val, labels_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [35]:
# Model, loss and optimizer for the TGU baseline (rebinds `model`, `criterion`, `optimizer`).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TGUBaseline().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [43]:
# Trains whichever `model` / `optimizer` are currently bound; rerunning this cell
# without rerunning the setup cell continues training the same weights.
for epoch in range(100):  # baseline: 100 epochs, an arbitrary choice
    # Train
    model.train()
    train_loss, correct, total = 0.0, 0, 0
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        preds = outputs.argmax(dim=1)
        correct += (preds == y).sum().item()
        total += y.size(0)

    # Mean of the per-batch losses; accuracy is computed over all samples.
    train_loss /= len(train_loader)
    train_acc = correct / total

    # Val
    model.eval()
    # `correct` / `total` are reset here and reused for the validation pass.
    val_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for X, y in val_loader:
            X, y = X.to(device), y.to(device)
            outputs = model(X)
            loss = criterion(outputs, y)

            val_loss += loss.item()
            preds = outputs.argmax(dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)

    val_loss /= len(val_loader)
    val_acc = correct / total

    print(f"Epoch {epoch+1}: "
          f"Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f} | "
          f"Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}")

Epoch 1: Train Loss=1.3018, Train Acc=0.3775 | Val Loss=1.0532, Val Acc=0.5167
Epoch 2: Train Loss=1.0619, Train Acc=0.5304 | Val Loss=0.8304, Val Acc=0.6667
Epoch 3: Train Loss=0.9948, Train Acc=0.5745 | Val Loss=0.7928, Val Acc=0.7056
Epoch 4: Train Loss=0.9409, Train Acc=0.6010 | Val Loss=0.7384, Val Acc=0.7222
Epoch 5: Train Loss=0.8510, Train Acc=0.6510 | Val Loss=0.7178, Val Acc=0.7444
Epoch 6: Train Loss=0.7724, Train Acc=0.6902 | Val Loss=0.6937, Val Acc=0.7333
Epoch 7: Train Loss=0.7185, Train Acc=0.7176 | Val Loss=0.5469, Val Acc=0.8333
Epoch 8: Train Loss=0.6494, Train Acc=0.7696 | Val Loss=0.6280, Val Acc=0.7333
Epoch 9: Train Loss=0.6357, Train Acc=0.7559 | Val Loss=0.6409, Val Acc=0.7222
Epoch 10: Train Loss=0.5979, Train Acc=0.7745 | Val Loss=0.6151, Val Acc=0.7611
Epoch 11: Train Loss=0.6382, Train Acc=0.7618 | Val Loss=0.6210, Val Acc=0.7444
Epoch 12: Train Loss=0.5369, Train Acc=0.8108 | Val Loss=0.4738, Val Acc=0.8500
Epoch 13: Train Loss=0.4790, Train Acc=0.8225 | V

# Modelo 5: Self-Attention Network (Baseline)

In [36]:
class VideoDataset(Dataset):
    """Map-style dataset pairing each clip with its integer class label.

    NOTE(review): this class is declared several times in this notebook with
    the same body; a single definition near the top would be enough.

    Args:
        videos: Indexable collection of clips; each clip is an ndarray or a
            sequence of per-frame ndarrays of equal shape.
        labels: Indexable collection of integer class ids, parallel to ``videos``.
    """
    def __init__(self, videos, labels):
        self.videos = videos
        self.labels = labels

    def __len__(self):
        return len(self.videos)

    def __getitem__(self, idx):
        """Return ``(x, y)``: the clip as a float32 tensor and the label as a long tensor."""
        # Collapse the clip into one ndarray first. Passing a list of ndarrays
        # straight to torch.tensor is very slow and emits a UserWarning.
        clip = np.asarray(self.videos[idx], dtype=np.float32)
        x = torch.tensor(clip, dtype=torch.float32)  # (16, 258) for this notebook's data
        y = torch.tensor(self.labels[idx], dtype=torch.long)
        return x, y

In [37]:
class SelfAttentionBlock(nn.Module):
    """Post-norm self-attention block: attention + residual + LayerNorm, then a
    one-layer feed-forward + residual + LayerNorm.

    Args:
        embed_dim: Feature dimension of the input; must be divisible by
            ``num_heads`` (required by ``nn.MultiheadAttention``). The default
            is 128 because the previous default (258) is not divisible by the
            default ``num_heads=4``, so the block could not be built with its
            own defaults.
        num_heads: Number of attention heads.
        dropout: Dropout probability used inside attention and the feed-forward.
    """
    def __init__(self, embed_dim=128, num_heads=4, dropout=0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, embed_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        """Transform ``x`` of shape (B, L, E); the output has the same shape."""
        attn_out, _ = self.attn(x, x, x)     # self-attention: query = key = value
        x = self.norm1(x + attn_out)         # residual + norm
        ff_out = self.ff(x)
        x = self.norm2(x + ff_out)           # residual + norm
        return x

In [38]:
class SANBaseline(nn.Module):
    """Self-attention classifier: linear projection, a stack of
    ``SelfAttentionBlock`` layers, temporal mean pooling and a linear head.

    NOTE(review): no positional information is added before the attention
    layers and the pooling is a mean over time, so frame order does not appear
    to influence the logits — confirm whether that is intended.
    """
    def __init__(self, in_features=258, num_classes=4, num_heads=4, num_layers=2, hidden=128):
        super().__init__()
        self.input_proj = nn.Linear(in_features, hidden)
        blocks = []
        for _ in range(num_layers):
            blocks.append(SelfAttentionBlock(embed_dim=hidden, num_heads=num_heads))
        self.layers = nn.ModuleList(blocks)
        self.head = nn.Linear(hidden, num_classes)

    def forward(self, x):
        """Map (B, L, in_features) to logits (B, num_classes)."""
        hidden_seq = self.input_proj(x)          # (B, L, hidden)
        for attn_block in self.layers:
            hidden_seq = attn_block(hidden_seq)  # (B, L, hidden)
        pooled = hidden_seq.mean(dim=1)          # average over time -> (B, hidden)
        logits = self.head(pooled)
        return logits

In [39]:
# Tensor datasets/loaders for the self-attention baseline (rebinds the shared loader names).
train_dataset = VideoDataset(vids_training, labels_training)
val_dataset   = VideoDataset(vids_val, labels_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [48]:
# Train the self-attention baseline and report loss/accuracy on train and validation each epoch.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SANBaseline().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(100):  # baseline: 100 epochs, an arbitrary choice
    # Train
    model.train()
    train_loss, correct, total = 0.0, 0, 0
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        preds = outputs.argmax(dim=1)
        correct += (preds == y).sum().item()
        total += y.size(0)

    # Mean of the per-batch losses; accuracy is computed over all samples.
    train_loss /= len(train_loader)
    train_acc = correct / total

    # Val
    model.eval()
    # `correct` / `total` are reset here and reused for the validation pass.
    val_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for X, y in val_loader:
            X, y = X.to(device), y.to(device)
            outputs = model(X)
            loss = criterion(outputs, y)

            val_loss += loss.item()
            preds = outputs.argmax(dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)

    val_loss /= len(val_loader)
    val_acc = correct / total

    print(f"Epoch {epoch+1}: "
          f"Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f} | "
          f"Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}")

Epoch 1: Train Loss=1.2901, Train Acc=0.3902 | Val Loss=0.8844, Val Acc=0.6389
Epoch 2: Train Loss=1.0178, Train Acc=0.5745 | Val Loss=0.7282, Val Acc=0.7167
Epoch 3: Train Loss=0.8707, Train Acc=0.6461 | Val Loss=0.5143, Val Acc=0.8333
Epoch 4: Train Loss=0.7400, Train Acc=0.7020 | Val Loss=0.4833, Val Acc=0.8500
Epoch 5: Train Loss=0.6838, Train Acc=0.7480 | Val Loss=0.5095, Val Acc=0.8389
Epoch 6: Train Loss=0.6234, Train Acc=0.7755 | Val Loss=0.4253, Val Acc=0.8222
Epoch 7: Train Loss=0.5020, Train Acc=0.8176 | Val Loss=0.4259, Val Acc=0.8833
Epoch 8: Train Loss=0.4132, Train Acc=0.8588 | Val Loss=0.5220, Val Acc=0.8500
Epoch 9: Train Loss=0.3921, Train Acc=0.8618 | Val Loss=0.2999, Val Acc=0.9278
Epoch 10: Train Loss=0.3566, Train Acc=0.8706 | Val Loss=0.4704, Val Acc=0.8667
Epoch 11: Train Loss=0.3156, Train Acc=0.8833 | Val Loss=0.2862, Val Acc=0.9333
Epoch 12: Train Loss=0.2811, Train Acc=0.9029 | Val Loss=0.2902, Val Acc=0.9111
Epoch 13: Train Loss=0.3234, Train Acc=0.8912 | V

# Modelo 6: Long Short-Term Memory Neural Network (LSTM) (Baseline)

In [40]:
class VideoDataset(Dataset):
    """Map-style dataset pairing each clip with its integer class label.

    NOTE(review): this class is declared several times in this notebook with
    the same body; a single definition near the top would be enough.

    Args:
        videos: Indexable collection of clips; each clip is an ndarray or a
            sequence of per-frame ndarrays of equal shape.
        labels: Indexable collection of integer class ids, parallel to ``videos``.
    """
    def __init__(self, videos, labels):
        self.videos = videos
        self.labels = labels

    def __len__(self):
        return len(self.videos)

    def __getitem__(self, idx):
        """Return ``(x, y)``: the clip as a float32 tensor and the label as a long tensor."""
        # Collapse the clip into one ndarray first. Passing a list of ndarrays
        # straight to torch.tensor is very slow and emits a UserWarning.
        clip = np.asarray(self.videos[idx], dtype=np.float32)
        x = torch.tensor(clip, dtype=torch.float32)  # (16, 258) for this notebook's data
        y = torch.tensor(self.labels[idx], dtype=torch.long)
        return x, y

In [41]:
class LSTMBaseline(nn.Module):
    """Three stacked unidirectional LSTMs followed by a three-layer MLP head.

    Only the output at the last time step of the third LSTM is fed to the head.
    """
    def __init__(self, in_features=258, hidden1=128, hidden2=128, hidden3=64, num_classes=4):
        super().__init__()

        def make_lstm(n_in, n_hidden):
            # Single-layer, batch-first, unidirectional LSTM without dropout.
            return nn.LSTM(input_size=n_in, hidden_size=n_hidden,
                           batch_first=True, dropout=0.0, bidirectional=False)

        self.lstm1 = make_lstm(in_features, hidden1)
        self.lstm2 = make_lstm(hidden1, hidden2)
        self.lstm3 = make_lstm(hidden2, hidden3)
        self.fc1 = nn.Linear(hidden3, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_classes)

    def forward(self, x):
        """Map (B, L, in_features) to logits (B, num_classes)."""
        seq = x
        for recurrent in (self.lstm1, self.lstm2, self.lstm3):
            seq, _ = recurrent(seq)            # (B, L, hidden_k); final states are discarded
        last_step = seq[:, -1, :]              # output at the final time step -> (B, hidden3)
        hidden = self.fc1(last_step).relu()
        hidden = self.fc2(hidden).relu()
        logits = self.fc3(hidden)
        return logits

In [42]:
# Tensor datasets/loaders for the LSTM baseline (rebinds the shared loader names).
train_dataset = VideoDataset(vids_training, labels_training)
val_dataset   = VideoDataset(vids_val, labels_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [43]:
# Model, loss and optimizer for the LSTM baseline (rebinds `model`, `criterion`, `optimizer`).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMBaseline().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [53]:
# Trains whichever `model` / `optimizer` are currently bound; rerunning this cell
# without rerunning the setup cell continues training the same weights.
for epoch in range(100):  # baseline: 100 epochs, an arbitrary choice
    # Train
    model.train()
    train_loss, correct, total = 0.0, 0, 0
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        preds = outputs.argmax(dim=1)
        correct += (preds == y).sum().item()
        total += y.size(0)

    # Mean of the per-batch losses; accuracy is computed over all samples.
    train_loss /= len(train_loader)
    train_acc = correct / total

    # Val
    model.eval()
    # `correct` / `total` are reset here and reused for the validation pass.
    val_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for X, y in val_loader:
            X, y = X.to(device), y.to(device)
            outputs = model(X)
            loss = criterion(outputs, y)

            val_loss += loss.item()
            preds = outputs.argmax(dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)

    val_loss /= len(val_loader)
    val_acc = correct / total

    print(f"Epoch {epoch+1}: "
          f"Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f} | "
          f"Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}")

Epoch 1: Train Loss=1.3518, Train Acc=0.2794 | Val Loss=1.1952, Val Acc=0.4500
Epoch 2: Train Loss=1.1941, Train Acc=0.4765 | Val Loss=1.0593, Val Acc=0.5167
Epoch 3: Train Loss=1.0532, Train Acc=0.5412 | Val Loss=0.7380, Val Acc=0.6889
Epoch 4: Train Loss=0.9130, Train Acc=0.6020 | Val Loss=0.7635, Val Acc=0.7111
Epoch 5: Train Loss=0.8008, Train Acc=0.6618 | Val Loss=0.6037, Val Acc=0.8000
Epoch 6: Train Loss=0.7555, Train Acc=0.7049 | Val Loss=0.6000, Val Acc=0.7667
Epoch 7: Train Loss=0.6988, Train Acc=0.7206 | Val Loss=0.5488, Val Acc=0.8389
Epoch 8: Train Loss=0.5897, Train Acc=0.7804 | Val Loss=0.6156, Val Acc=0.7833
Epoch 9: Train Loss=0.7007, Train Acc=0.7245 | Val Loss=0.5133, Val Acc=0.8000
Epoch 10: Train Loss=0.5512, Train Acc=0.7990 | Val Loss=0.5718, Val Acc=0.8167
Epoch 11: Train Loss=0.4921, Train Acc=0.8216 | Val Loss=0.6406, Val Acc=0.7389
Epoch 12: Train Loss=0.4482, Train Acc=0.8363 | Val Loss=0.5308, Val Acc=0.8167
Epoch 13: Train Loss=0.4703, Train Acc=0.8412 | V

# Notas 3 de octubre
* Hacer el cálculo del costo computacional en cada uno de los modelos.
* Se puede hacer un análisis de accuracy versus costo computacional.

# Optimización de Hiperparámetros

In [53]:
import optuna
import json
from sklearn.metrics import (
    confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, balanced_accuracy_score
)
import numpy as np

In [54]:
# --- Almacenamiento de resultados ---
all_results = []
best_by_model = {}

In [55]:
# Para secuenciales
train_loader_seq = DataLoader(VideoDataset(vids_training, labels_training), batch_size=32, shuffle=True)
val_loader_seq   = DataLoader(VideoDataset(vids_val, labels_val), batch_size=32, shuffle=False)

# Para GCN
train_loader_graph = GeoDataLoader(VideoGraphDataset(vids_training, labels_training), batch_size=32, shuffle=True)
val_loader_graph   = GeoDataLoader(VideoGraphDataset(vids_val, labels_val), batch_size=32, shuffle=False)

In [81]:
def compute_metrics(y_true, y_pred, labels):
    """Summarise multi-class predictions with macro-averaged metrics.

    Args:
        y_true: ground-truth class ids.
        y_pred: predicted class ids, aligned with `y_true`.
        labels: class ids that fix the row/column order of the confusion matrix.

    Returns:
        dict with keys "accuracy", "precision_macro", "recall_macro",
        "specificity_macro", "f1_macro" and "balanced_accuracy".
    """
    conf_mat = confusion_matrix(y_true, y_pred, labels=labels)

    # One-vs-rest counts per class: the diagonal holds the true positives,
    # column sums give everything predicted as the class, row sums everything
    # that truly belongs to it.
    true_pos = np.diag(conf_mat)
    false_pos = conf_mat.sum(axis=0) - true_pos
    false_neg = conf_mat.sum(axis=1) - true_pos
    true_neg = conf_mat.sum() - (false_pos + false_neg + true_pos)

    # eps keeps the ratio finite when a class has no negatives at all.
    eps = 1e-8
    per_class_specificity = true_neg / (true_neg + false_pos + eps)

    macro_opts = {"average": "macro", "zero_division": 0}
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision_macro": precision_score(y_true, y_pred, **macro_opts),
        "recall_macro": recall_score(y_true, y_pred, **macro_opts),
        "specificity_macro": np.mean(per_class_specificity),
        "f1_macro": f1_score(y_true, y_pred, **macro_opts),
        "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
    }

In [57]:
def train_and_validate(model, lr, train_loader, val_loader, device, is_graph=False,
                       epochs=100, n_classes=4):
    """Train `model` with Adam and cross-entropy, then score it once on `val_loader`.

    Args:
        model: network to train; it is updated in place.
        lr: learning rate for Adam.
        train_loader: iterable of training batches.
        val_loader: iterable of validation batches.
        device: torch device the batches are moved to.
        is_graph: when True, batches are graph batches consumed as
            `model(batch.x, batch.edge_index, batch.batch)` with targets in
            `batch.y`; otherwise batches are `(X, y)` pairs consumed as `model(X)`.
        epochs: number of passes over `train_loader` (defaults to the previous
            hard-coded value of 100).
        n_classes: number of classes; the metrics are computed over
            `range(n_classes)` (defaults to the previous hard-coded value of 4).

    Returns:
        The metrics dict produced by `compute_metrics` on the validation set,
        evaluated after the last epoch only.
    """
    def _forward(batch):
        # Move one batch to `device` and return (logits, targets) for either input kind.
        if is_graph:
            batch = batch.to(device)
            return model(batch.x, batch.edge_index, batch.batch), batch.y
        X, y = batch
        X, y = X.to(device), y.to(device)
        return model(X), y

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for _ in range(epochs):
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            outputs, targets = _forward(batch)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

    # --- Validation ---
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in val_loader:
            outputs, targets = _forward(batch)
            y_true.extend(targets.cpu().numpy())
            y_pred.extend(outputs.argmax(dim=1).cpu().numpy())

    return compute_metrics(y_true, y_pred, labels=list(range(n_classes)))

In [58]:
# --- Función objective global ---
def objective(trial):
    """Optuna objective for the joint search over model family and shared hyperparameters.

    Samples a model name, learning rate, hidden size and dropout, trains the
    chosen baseline with `train_and_validate`, and returns its validation accuracy.

    Side effects: appends the trial record to the module-level `all_results`
    list and updates `best_by_model` when this trial is the best so far for its
    model name.

    Note: `dropout` is only passed to the Transformer, and MS-TCN is built with
    its default arguments, so those sampled values have no effect on the other
    models.

    Args:
        trial: the optuna trial being evaluated.

    Returns:
        Validation accuracy of the trained model.

    Raises:
        ValueError: if the sampled model name has no registered builder.
    """
    model_name = trial.suggest_categorical("model", ["Transformer", "GCN", "MS-TCN", "TGU", "SAN", "LSTM"])
    # suggest_loguniform is deprecated; suggest_float(..., log=True) draws from the same log-uniform range.
    lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
    hidden = trial.suggest_categorical("hidden", [64, 128, 256])
    dropout = trial.suggest_float("dropout", 0.1, 0.5)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # model name -> (lazy constructor, whether the model consumes graph batches).
    # Lambdas keep construction lazy so only the selected class is looked up and built.
    builders = {
        "Transformer": (lambda: TransformerBaseline(d_model=hidden, dropout=dropout), False),
        "GCN": (lambda: GCNBaseline(hidden_channels=hidden), True),
        "MS-TCN": (lambda: MSTCNBaseline(), False),
        "TGU": (lambda: TGUBaseline(hidden=hidden), False),
        "SAN": (lambda: SANBaseline(hidden=hidden), False),
        "LSTM": (lambda: LSTMBaseline(hidden1=hidden, hidden2=hidden, hidden3=hidden // 2), False),
    }
    if model_name not in builders:
        raise ValueError(f"Unknown model name: {model_name!r}")

    build_model, is_graph = builders[model_name]
    model = build_model().to(device)
    if is_graph:
        train_loader, val_loader = train_loader_graph, val_loader_graph
    else:
        train_loader, val_loader = train_loader_seq, val_loader_seq

    metrics = train_and_validate(model, lr, train_loader, val_loader, device, is_graph)
    val_acc = metrics["accuracy"]

    # Record the trial
    result = {
        "trial": trial.number,
        "model": model_name,
        "lr": lr,
        "hidden": hidden,
        "dropout": dropout,
        "metrics": metrics
    }
    all_results.append(result)

    # Keep the best trial per model
    if model_name not in best_by_model or val_acc > best_by_model[model_name]["metrics"]["accuracy"]:
        best_by_model[model_name] = result

    return val_acc

In [112]:
# Disabled: the whole cell is wrapped in a string literal, so none of it executes.
"""
# --- Ejecución ---
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

# --- Guardar resultados ---
with open("optuna_all_results.json", "w") as f:
    json.dump(all_results, f, indent=4)

with open("optuna_best_models.json", "w") as f:
    json.dump(best_by_model, f, indent=4)

print("\n✅ Resultados guardados en:")
print("  • optuna_all_results.json")
print("  • optuna_best_models.json")
print("\n🏆 Mejor modelo global:")
print(study.best_params)
print(f"Accuracy de validación: {study.best_value:.4f}")
"""

[I 2025-10-12 19:59:22,484] A new study created in memory with name: no-name-1c6b8ee8-1ef9-4fd3-b982-372dcbab513c
  lr = trial.suggest_loguniform("lr", 1e-5, 1e-2)
[I 2025-10-12 20:02:54,394] Trial 0 finished with value: 0.8777777777777778 and parameters: {'model': 'Transformer', 'lr': 4.46196489907191e-05, 'hidden': 256, 'dropout': 0.37225294478346715}. Best is trial 0 with value: 0.8777777777777778.
  lr = trial.suggest_loguniform("lr", 1e-5, 1e-2)
[I 2025-10-12 20:04:31,470] Trial 1 finished with value: 0.7888888888888889 and parameters: {'model': 'LSTM', 'lr': 0.00044484736104396925, 'hidden': 128, 'dropout': 0.118817167731916}. Best is trial 0 with value: 0.8777777777777778.
  lr = trial.suggest_loguniform("lr", 1e-5, 1e-2)
[I 2025-10-12 20:06:04,917] Trial 2 finished with value: 0.8055555555555556 and parameters: {'model': 'TGU', 'lr': 0.0003699549264336474, 'hidden': 128, 'dropout': 0.3979163716275135}. Best is trial 0 with value: 0.8777777777777778.
  lr = trial.suggest_logunif


✅ Resultados guardados en:
  • optuna_all_results.json
  • optuna_best_models.json

🏆 Mejor modelo global:
{'model': 'Transformer', 'lr': 0.0012043476164750062, 'hidden': 256, 'dropout': 0.17141203879587477}
Accuracy de validación: 0.9222


# Optimización de hiperparámetros por modelo

In [59]:
import json, os, math
import numpy as np
import optuna
from sklearn.metrics import (
    confusion_matrix, precision_score, recall_score, f1_score,
    accuracy_score, balanced_accuracy_score
)
import torch
import torch.nn as nn
import torch.optim as optim

In [60]:
# ========= utilidades métricas =========
def _metrics_from_preds(y_true, y_pred, n_classes=4):
    """Compute macro-averaged classification metrics plus the confusion matrix.

    Args:
        y_true: ground-truth class ids.
        y_pred: predicted class ids, aligned with `y_true`.
        n_classes: number of classes; the confusion matrix covers `range(n_classes)`.

    Returns:
        dict with keys "accuracy", "precision_macro", "recall_macro",
        "specificity_macro", "f1_macro", "balanced_accuracy" and
        "confusion_matrix" (nested lists).
    """
    class_ids = list(range(n_classes))
    conf = confusion_matrix(y_true, y_pred, labels=class_ids)

    # One-vs-rest counts per class, derived from the diagonal and the marginals.
    true_pos = np.diag(conf).astype(float)
    false_pos = conf.sum(axis=0) - true_pos
    false_neg = conf.sum(axis=1) - true_pos
    true_neg = conf.sum() - (true_pos + false_pos + false_neg)

    # eps keeps the ratio finite when a class has no negatives at all.
    eps = 1e-8
    per_class_specificity = true_neg / (true_neg + false_pos + eps)

    macro_opts = {"average": "macro", "zero_division": 0}
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision_macro": precision_score(y_true, y_pred, **macro_opts),
        "recall_macro": recall_score(y_true, y_pred, **macro_opts),
        "specificity_macro": float(np.mean(per_class_specificity)),
        "f1_macro": f1_score(y_true, y_pred, **macro_opts),
        "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
        "confusion_matrix": conf.tolist(),
    }

In [68]:
# ========= TRANSFORMER =========
def optimize_transformer(train_loader_seq, val_loader_seq, n_trials=50, max_epochs=80, save_prefix="hpo_transformer"):
    """Run an Optuna search over a Transformer-encoder classifier.

    Every trial builds a fresh model, trains it with AdamW and cross-entropy for
    up to `max_epochs` epochs and reports validation accuracy after each epoch so
    the median pruner can stop it early. A trial's score is the highest
    validation accuracy seen in any of its epochs.

    Args:
        train_loader_seq: iterable of (X, y) training batches.
        val_loader_seq: iterable of (X, y) validation batches.
        n_trials: number of Optuna trials to run.
        max_epochs: epoch budget per trial.
        save_prefix: prefix of the two JSON files written when the search ends:
            `<prefix>_all.json` (trials that ran all epochs) and
            `<prefix>_best.json` (best parameters and value).

    Returns:
        The optuna study.
    """
    import torch
    import torch.nn as nn

    class PositionalEncoding(nn.Module):
        """Adds a fixed sinusoidal position table to the input, then applies dropout."""

        def __init__(self, d_model, dropout=0.1, max_len=16):
            super().__init__()
            self.dropout = nn.Dropout(dropout)
            positions = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
            freqs = torch.exp(
                torch.arange(0, d_model, 2, dtype=torch.float32) * (-math.log(10000.0) / d_model)
            )
            angles = positions * freqs
            table = torch.zeros(max_len, d_model)
            table[:, 0::2] = torch.sin(angles)
            table[:, 1::2] = torch.cos(angles)
            self.register_buffer("pe", table.unsqueeze(0))

        def forward(self, x):
            seq_len = x.size(1)
            return self.dropout(x + self.pe[:, :seq_len, :])

    class Model(nn.Module):
        """Linear projection -> positional encoding -> encoder stack -> pooled linear head."""

        def __init__(self, in_feat=258, n_classes=4, d_model=128, nhead=4, nl=2, ff=256, dropout=0.1, pool="mean"):
            super().__init__()
            self.proj = nn.Linear(in_feat, d_model)
            self.pe = PositionalEncoding(d_model, dropout, max_len=16)
            encoder_layer = nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=ff,
                dropout=dropout,
                batch_first=True,
                norm_first=True,
            )
            self.enc = nn.TransformerEncoder(encoder_layer, num_layers=nl)
            self.norm = nn.LayerNorm(d_model)
            self.head = nn.Linear(d_model, n_classes)
            self.pool = pool

        def forward(self, x):
            hidden = self.enc(self.pe(self.proj(x)))
            # "mean" averages over dim 1; any other value keeps the last step.
            pooled = hidden.mean(dim=1) if self.pool == "mean" else hidden[:, -1, :]
            return self.head(self.norm(pooled))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner())
    all_trials = []

    def _validation_accuracy(model):
        # Accuracy of `model` over the whole validation loader, in eval mode.
        model.eval()
        targets, guesses = [], []
        with torch.no_grad():
            for X, y in val_loader_seq:
                X, y = X.to(device), y.to(device)
                batch_pred = model(X).argmax(1)
                targets.extend(y.cpu().numpy())
                guesses.extend(batch_pred.cpu().numpy())
        return _metrics_from_preds(targets, guesses, n_classes=4)["accuracy"]

    def objective(trial):
        d_model = trial.suggest_categorical("d_model", [128, 192, 256])
        nhead = trial.suggest_categorical("nhead", [2, 4, 8])
        nlayers = trial.suggest_int("num_layers", 1, 4)
        ff = trial.suggest_categorical("ff", [256, 384, 512, 768])
        dropout = trial.suggest_float("dropout", 0.0, 0.5)
        lr = trial.suggest_float("lr", 5e-5, 5e-3, log=True)
        wd = trial.suggest_float("weight_decay", 1e-10, 1e-3, log=True)
        pool = trial.suggest_categorical("pool", ["mean", "last"])

        model = Model(d_model=d_model, nhead=nhead, nl=nlayers, ff=ff, dropout=dropout, pool=pool).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)

        best_val = 0.0
        for epoch in range(max_epochs):
            model.train()
            for X, y in train_loader_seq:
                X, y = X.to(device), y.to(device)
                optimizer.zero_grad()
                loss = criterion(model(X), y)
                loss.backward()
                optimizer.step()

            val_acc = _validation_accuracy(model)
            trial.report(val_acc, epoch)
            best_val = max(best_val, val_acc)
            if trial.should_prune():
                raise optuna.TrialPruned()

        # Only reached by trials that were not pruned.
        all_trials.append({"trial": trial.number, "params": trial.params, "best_val_acc": best_val})
        return best_val

    study.optimize(objective, n_trials=n_trials)

    # Persist the per-trial log and the winning configuration.
    with open(f"{save_prefix}_all.json", "w") as f:
        json.dump(all_trials, f, indent=2)
    with open(f"{save_prefix}_best.json", "w") as f:
        json.dump({"best_params": study.best_params, "best_value": study.best_value}, f, indent=2)
    return study

In [71]:
# ========= SAN (Self-Attention Network) =========
def optimize_san(train_loader_seq, val_loader_seq, n_trials=50, max_epochs=80, save_prefix="hpo_san"):
    """Run an Optuna search over a stacked self-attention classifier.

    Every trial builds a fresh model, trains it with AdamW and cross-entropy for
    up to `max_epochs` epochs and reports validation accuracy after each epoch so
    the median pruner can stop it early. A trial's score is the highest
    validation accuracy seen in any of its epochs.

    Args:
        train_loader_seq: iterable of (X, y) training batches.
        val_loader_seq: iterable of (X, y) validation batches.
        n_trials: number of Optuna trials to run.
        max_epochs: epoch budget per trial.
        save_prefix: prefix of the two JSON files written when the search ends:
            `<prefix>_all.json` (trials that ran all epochs) and
            `<prefix>_best.json` (best parameters and value).

    Returns:
        The optuna study.
    """
    class Block(nn.Module):
        """Self-attention followed by a one-layer feed-forward, each with residual + LayerNorm."""

        def __init__(self, dim, heads, dropout):
            super().__init__()
            self.attn = nn.MultiheadAttention(dim, heads, dropout=dropout, batch_first=True)
            self.ff = nn.Sequential(nn.Linear(dim, dim), nn.ReLU(), nn.Dropout(dropout))
            self.n1 = nn.LayerNorm(dim)
            self.n2 = nn.LayerNorm(dim)

        def forward(self, x):
            attended, _ = self.attn(x, x, x)
            x = self.n1(x + attended)
            x = self.n2(x + self.ff(x))
            return x

    class Model(nn.Module):
        """Linear projection -> attention blocks -> pooled linear head with 4 outputs."""

        def __init__(self, in_features=258, hidden=128, heads=4, layers=2, dropout=0.1, pool="mean"):
            super().__init__()
            self.proj = nn.Linear(in_features, hidden)
            self.blocks = nn.ModuleList([Block(hidden, heads, dropout) for _ in range(layers)])
            self.head = nn.Linear(hidden, 4)
            self.pool = pool

        def forward(self, x):
            x = self.proj(x)
            for block in self.blocks:
                x = block(x)
            # "mean" averages over dim 1; any other value keeps the last step.
            if self.pool == "mean":
                pooled = x.mean(1)
            else:
                pooled = x[:, -1, :]
            return self.head(pooled)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner())
    all_trials = []

    def _validation_accuracy(model):
        # Accuracy of `model` over the whole validation loader, in eval mode.
        model.eval()
        targets, guesses = [], []
        with torch.no_grad():
            for X, y in val_loader_seq:
                X, y = X.to(device), y.to(device)
                batch_pred = model(X).argmax(1)
                targets.extend(y.cpu().numpy())
                guesses.extend(batch_pred.cpu().numpy())
        return _metrics_from_preds(targets, guesses)["accuracy"]

    def objective(trial):
        hidden = trial.suggest_categorical("hidden", [128, 192, 256])
        heads = trial.suggest_categorical("heads", [2, 4, 8])
        layers = trial.suggest_int("layers", 1, 4)
        dropout = trial.suggest_float("dropout", 0.0, 0.5)
        lr = trial.suggest_float("lr", 5e-5, 5e-3, log=True)
        wd = trial.suggest_float("weight_decay", 1e-10, 1e-3, log=True)
        pool = trial.suggest_categorical("pool", ["mean", "last"])

        model = Model(hidden=hidden, heads=heads, layers=layers, dropout=dropout, pool=pool).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)

        best_val = 0.0
        for epoch in range(max_epochs):
            model.train()
            for X, y in train_loader_seq:
                X, y = X.to(device), y.to(device)
                optimizer.zero_grad()
                loss = criterion(model(X), y)
                loss.backward()
                optimizer.step()

            val_acc = _validation_accuracy(model)
            best_val = max(best_val, val_acc)
            trial.report(val_acc, epoch)
            if trial.should_prune():
                raise optuna.TrialPruned()

        # Only reached by trials that were not pruned.
        all_trials.append({"trial": trial.number, "params": trial.params, "best_val_acc": best_val})
        return best_val

    study.optimize(objective, n_trials=n_trials)

    # Persist the per-trial log and the winning configuration.
    with open(f"{save_prefix}_all.json", "w") as f:
        json.dump(all_trials, f, indent=2)
    with open(f"{save_prefix}_best.json", "w") as f:
        json.dump({"best_params": study.best_params, "best_value": study.best_value}, f, indent=2)
    return study

In [72]:
# ========= MS-TCN =========
def optimize_mstcn(train_loader_seq, val_loader_seq, n_trials=50, max_epochs=80, save_prefix="hpo_mstcn"):
    """Run an Optuna search over a TCN-based classifier.

    Every trial builds a fresh model, trains it with AdamW and cross-entropy for
    up to `max_epochs` epochs and reports validation accuracy after each epoch so
    the median pruner can stop it early. A trial's score is the highest
    validation accuracy seen in any of its epochs.

    Args:
        train_loader_seq: iterable of (X, y) training batches.
        val_loader_seq: iterable of (X, y) validation batches.
        n_trials: number of Optuna trials to run.
        max_epochs: epoch budget per trial.
        save_prefix: prefix of the two JSON files written when the search ends:
            `<prefix>_all.json` (trials that ran all epochs) and
            `<prefix>_best.json` (best parameters and value).

    Returns:
        The optuna study.
    """
    from pytorch_tcn import TCN

    class Model(nn.Module):
        """TCN feature extractor followed by a pooled linear head with 4 outputs."""

        def __init__(self, in_features=258, channels=[128,128], k=3, dropout=0.2, pool="mean"):
            super().__init__()
            self.tcn = TCN(num_inputs=in_features, num_channels=channels, kernel_size=k, dropout=dropout)
            self.head = nn.Linear(channels[-1], 4)
            self.pool = pool

        def forward(self, x):
            # Swap dims 1 and 2 before the TCN (the original notes the layout as (B, C, L)).
            features = self.tcn(x.transpose(1, 2))
            # "mean" averages over the last dim; any other value keeps its final position.
            if self.pool == "mean":
                pooled = features.mean(2)
            else:
                pooled = features[:, :, -1]
            return self.head(pooled)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner())
    all_trials = []

    def _validation_accuracy(model):
        # Accuracy of `model` over the whole validation loader, in eval mode.
        model.eval()
        targets, guesses = [], []
        with torch.no_grad():
            for X, y in val_loader_seq:
                X, y = X.to(device), y.to(device)
                batch_pred = model(X).argmax(1)
                targets.extend(y.cpu().numpy())
                guesses.extend(batch_pred.cpu().numpy())
        return _metrics_from_preds(targets, guesses)["accuracy"]

    def objective(trial):
        depth = trial.suggest_int("depth", 2, 5)
        width = trial.suggest_categorical("width", [64, 96, 128, 160])
        # Same width at every level; depth sets the number of levels.
        channels = [width] * depth
        ksize = trial.suggest_categorical("kernel_size", [2, 3, 5, 7])
        dropout = trial.suggest_float("dropout", 0.0, 0.5)
        lr = trial.suggest_float("lr", 5e-5, 5e-3, log=True)
        wd = trial.suggest_float("weight_decay", 1e-10, 1e-3, log=True)
        pool = trial.suggest_categorical("pool", ["mean", "last"])

        model = Model(channels=channels, k=ksize, dropout=dropout, pool=pool).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)

        best_val = 0.0
        for epoch in range(max_epochs):
            model.train()
            for X, y in train_loader_seq:
                X, y = X.to(device), y.to(device)
                optimizer.zero_grad()
                loss = criterion(model(X), y)
                loss.backward()
                optimizer.step()

            val_acc = _validation_accuracy(model)
            best_val = max(best_val, val_acc)
            trial.report(val_acc, epoch)
            if trial.should_prune():
                raise optuna.TrialPruned()

        # Only reached by trials that were not pruned.
        all_trials.append({"trial": trial.number, "params": trial.params, "best_val_acc": best_val})
        return best_val

    study.optimize(objective, n_trials=n_trials)

    # Persist the per-trial log and the winning configuration.
    with open(f"{save_prefix}_all.json", "w") as f:
        json.dump(all_trials, f, indent=2)
    with open(f"{save_prefix}_best.json", "w") as f:
        json.dump({"best_params": study.best_params, "best_value": study.best_value}, f, indent=2)
    return study

In [73]:
# ========= TGU =========
def optimize_tgu(train_loader_seq, val_loader_seq, n_trials=50, max_epochs=80, save_prefix="hpo_tgu"):
    """Run an Optuna search over a gated dilated-convolution (TGU) classifier.

    Every trial builds a fresh model, trains it with AdamW and cross-entropy for
    up to `max_epochs` epochs and reports validation accuracy after each epoch so
    the median pruner can stop it early. A trial's score is the highest
    validation accuracy seen in any of its epochs.

    Args:
        train_loader_seq: iterable of (X, y) training batches.
        val_loader_seq: iterable of (X, y) validation batches.
        n_trials: number of Optuna trials to run.
        max_epochs: epoch budget per trial.
        save_prefix: prefix of the two JSON files written when the search ends:
            `<prefix>_all.json` (trials that ran all epochs) and
            `<prefix>_best.json` (best parameters and value).

    Returns:
        The optuna study.
    """
    class TGUBlock(nn.Module):
        """Gated unit: tanh(conv_f(x)) * sigmoid(conv_g(x)), followed by dropout."""

        def __init__(self, in_ch, out_ch, k=3, d=1, drop=0.2):
            super().__init__()
            padding = (k - 1) // 2 * d
            self.f = nn.Conv1d(in_ch, out_ch, k, padding=padding, dilation=d)
            self.g = nn.Conv1d(in_ch, out_ch, k, padding=padding, dilation=d)
            self.drop = nn.Dropout(drop)

        def forward(self, x):
            signal = torch.tanh(self.f(x))
            gate = torch.sigmoid(self.g(x))
            return self.drop(signal * gate)

    class Model(nn.Module):
        """Stack of TGU blocks with cycling dilations and a pooled linear head with 4 outputs."""

        def __init__(self, in_features=258, hidden=128, blocks=2, k=3, drop=0.2, pool="mean", dilations=(1,2,4,8)):
            super().__init__()
            stages = []
            for idx in range(blocks):
                # Only the first block sees the raw feature width.
                stage_in = in_features if idx == 0 else hidden
                dilation = dilations[idx % len(dilations)]
                stages.append(TGUBlock(stage_in, hidden, k=k, d=dilation, drop=drop))
            self.net = nn.Sequential(*stages)
            self.head = nn.Linear(hidden, 4)
            self.pool = pool

        def forward(self, x):
            # Swap dims 1 and 2 before the convolutions (the original notes the layout as (B, C, L)).
            features = self.net(x.transpose(1, 2))
            # "mean" averages over the last dim; any other value keeps its final position.
            if self.pool == "mean":
                pooled = features.mean(2)
            else:
                pooled = features[:, :, -1]
            return self.head(pooled)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner())
    all_trials = []

    def _validation_accuracy(model):
        # Accuracy of `model` over the whole validation loader, in eval mode.
        model.eval()
        targets, guesses = [], []
        with torch.no_grad():
            for X, y in val_loader_seq:
                X, y = X.to(device), y.to(device)
                batch_pred = model(X).argmax(1)
                targets.extend(y.cpu().numpy())
                guesses.extend(batch_pred.cpu().numpy())
        return _metrics_from_preds(targets, guesses)["accuracy"]

    def objective(trial):
        hidden = trial.suggest_categorical("hidden", [96, 128, 160, 192])
        blocks = trial.suggest_int("blocks", 2, 5)
        k = trial.suggest_categorical("kernel", [3, 5, 7])
        drop = trial.suggest_float("dropout", 0.0, 0.5)
        pool = trial.suggest_categorical("pool", ["mean", "last"])
        lr = trial.suggest_float("lr", 5e-5, 5e-3, log=True)
        wd = trial.suggest_float("weight_decay", 1e-10, 1e-3, log=True)

        model = Model(hidden=hidden, blocks=blocks, k=k, drop=drop, pool=pool).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)

        best_val = 0.0
        for epoch in range(max_epochs):
            model.train()
            for X, y in train_loader_seq:
                X, y = X.to(device), y.to(device)
                optimizer.zero_grad()
                loss = criterion(model(X), y)
                loss.backward()
                optimizer.step()

            val_acc = _validation_accuracy(model)
            best_val = max(best_val, val_acc)
            trial.report(val_acc, epoch)
            if trial.should_prune():
                raise optuna.TrialPruned()

        # Only reached by trials that were not pruned.
        all_trials.append({"trial": trial.number, "params": trial.params, "best_val_acc": best_val})
        return best_val

    study.optimize(objective, n_trials=n_trials)

    # Persist the per-trial log and the winning configuration.
    with open(f"{save_prefix}_all.json", "w") as f:
        json.dump(all_trials, f, indent=2)
    with open(f"{save_prefix}_best.json", "w") as f:
        json.dump({"best_params": study.best_params, "best_value": study.best_value}, f, indent=2)
    return study

In [74]:
# ========= LSTM =========
def optimize_lstm(train_loader_seq, val_loader_seq, n_trials=50, max_epochs=80, save_prefix="hpo_lstm"):
    """Run an Optuna search over a three-layer LSTM classifier.

    Every trial builds a fresh model, trains it with AdamW and cross-entropy for
    up to `max_epochs` epochs and reports validation accuracy after each epoch so
    the median pruner can stop it early. A trial's score is the highest
    validation accuracy seen in any of its epochs.

    Args:
        train_loader_seq: iterable of (X, y) training batches.
        val_loader_seq: iterable of (X, y) validation batches.
        n_trials: number of Optuna trials to run.
        max_epochs: epoch budget per trial.
        save_prefix: prefix of the two JSON files written when the search ends:
            `<prefix>_all.json` (trials that ran all epochs) and
            `<prefix>_best.json` (best parameters and value).

    Returns:
        The optuna study.
    """
    class Model(nn.Module):
        """Three chained LSTMs, last time step, then a three-layer dense head with 4 outputs."""

        def __init__(self, in_features=258, h1=128, h2=128, h3=64, bidir=False, drop=0.0, dense=64):
            super().__init__()
            self.bidir = bidir
            # A bidirectional LSTM emits twice its hidden size.
            width = 2 if bidir else 1
            rnn_opts = dict(batch_first=True, bidirectional=bidir, dropout=0.0)
            self.l1 = nn.LSTM(in_features, h1, **rnn_opts)
            self.l2 = nn.LSTM(h1 * width, h2, **rnn_opts)
            self.l3 = nn.LSTM(h2 * width, h3, **rnn_opts)
            self.dropout = nn.Dropout(drop)
            self.fc1 = nn.Linear(h3 * width, dense)
            self.fc2 = nn.Linear(dense, 32)
            self.fc3 = nn.Linear(32, 4)

        def forward(self, x):
            for rnn in (self.l1, self.l2, self.l3):
                x, _ = rnn(x)
            last_step = x[:, -1, :]
            hidden = self.dropout(torch.relu(self.fc1(last_step)))
            hidden = torch.relu(self.fc2(hidden))
            return self.fc3(hidden)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner())
    all_trials = []

    def _validation_accuracy(model):
        # Accuracy of `model` over the whole validation loader, in eval mode.
        model.eval()
        targets, guesses = [], []
        with torch.no_grad():
            for X, y in val_loader_seq:
                X, y = X.to(device), y.to(device)
                batch_pred = model(X).argmax(1)
                targets.extend(y.cpu().numpy())
                guesses.extend(batch_pred.cpu().numpy())
        return _metrics_from_preds(targets, guesses)["accuracy"]

    def objective(trial):
        h1 = trial.suggest_categorical("h1", [96, 128, 160, 192])
        h2 = trial.suggest_categorical("h2", [96, 128, 160, 192])
        h3 = trial.suggest_categorical("h3", [48, 64, 96, 128])
        bidir = trial.suggest_categorical("bidirectional", [False, True])
        dense = trial.suggest_categorical("dense", [32, 64, 96, 128])
        drop = trial.suggest_float("dropout", 0.0, 0.5)
        lr = trial.suggest_float("lr", 5e-5, 5e-3, log=True)
        wd = trial.suggest_float("weight_decay", 1e-10, 1e-3, log=True)

        model = Model(h1=h1, h2=h2, h3=h3, bidir=bidir, drop=drop, dense=dense).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)

        best_val = 0.0
        for epoch in range(max_epochs):
            model.train()
            for X, y in train_loader_seq:
                X, y = X.to(device), y.to(device)
                optimizer.zero_grad()
                loss = criterion(model(X), y)
                loss.backward()
                optimizer.step()

            val_acc = _validation_accuracy(model)
            best_val = max(best_val, val_acc)
            trial.report(val_acc, epoch)
            if trial.should_prune():
                raise optuna.TrialPruned()

        # Only reached by trials that were not pruned.
        all_trials.append({"trial": trial.number, "params": trial.params, "best_val_acc": best_val})
        return best_val

    study.optimize(objective, n_trials=n_trials)

    # Persist the per-trial log and the winning configuration.
    with open(f"{save_prefix}_all.json", "w") as f:
        json.dump(all_trials, f, indent=2)
    with open(f"{save_prefix}_best.json", "w") as f:
        json.dump({"best_params": study.best_params, "best_value": study.best_value}, f, indent=2)
    return study

In [75]:
# ========= GCN =========
def optimize_gcn(train_loader_graph, val_loader_graph, n_trials=50, max_epochs=80, save_prefix="hpo_gcn"):
    """Run an Optuna search over a stacked-GCNConv graph classifier.

    Every trial builds a fresh model, trains it with AdamW and cross-entropy for
    up to `max_epochs` epochs and reports validation accuracy after each epoch so
    the median pruner can stop it early. A trial's score is the highest
    validation accuracy seen in any of its epochs.

    Args:
        train_loader_graph: iterable of training graph batches exposing
            `.x`, `.edge_index`, `.batch` and `.y`.
        val_loader_graph: iterable of validation graph batches with the same fields.
        n_trials: number of Optuna trials to run.
        max_epochs: epoch budget per trial.
        save_prefix: prefix of the two JSON files written when the search ends:
            `<prefix>_all.json` (trials that ran all epochs) and
            `<prefix>_best.json` (best parameters and value).

    Returns:
        The optuna study.
    """
    from torch_geometric.nn import GCNConv, global_mean_pool

    class Model(nn.Module):
        """GCNConv stack (ReLU + dropout after each), mean-pooled per graph, linear head with 4 outputs."""

        def __init__(self, in_ch=258, hidden=128, layers=2, dropout=0.1):
            super().__init__()
            self.dropout = nn.Dropout(dropout)
            # The first conv maps the input width to `hidden`; the rest keep `hidden`.
            convs = [GCNConv(in_ch, hidden)]
            convs += [GCNConv(hidden, hidden) for _ in range(layers - 1)]
            self.layers = nn.ModuleList(convs)
            self.lin = nn.Linear(hidden, 4)

        def forward(self, x, edge_index, batch):
            for conv in self.layers:
                x = self.dropout(torch.relu(conv(x, edge_index)))
            graph_repr = global_mean_pool(x, batch)
            return self.lin(graph_repr)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner())
    all_trials = []

    def _validation_accuracy(model):
        # Accuracy of `model` over the whole validation loader, in eval mode.
        model.eval()
        targets, guesses = [], []
        with torch.no_grad():
            for graphs in val_loader_graph:
                graphs = graphs.to(device)
                batch_pred = model(graphs.x, graphs.edge_index, graphs.batch).argmax(1)
                targets.extend(graphs.y.cpu().numpy())
                guesses.extend(batch_pred.cpu().numpy())
        return _metrics_from_preds(targets, guesses)["accuracy"]

    def objective(trial):
        hidden = trial.suggest_categorical("hidden", [96, 128, 160, 192])
        layers = trial.suggest_int("layers", 2, 5)
        drop = trial.suggest_float("dropout", 0.0, 0.5)
        lr = trial.suggest_float("lr", 5e-5, 5e-3, log=True)
        wd = trial.suggest_float("weight_decay", 1e-10, 1e-3, log=True)

        model = Model(hidden=hidden, layers=layers, dropout=drop).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)

        best_val = 0.0
        for epoch in range(max_epochs):
            model.train()
            for graphs in train_loader_graph:
                graphs = graphs.to(device)
                optimizer.zero_grad()
                logits = model(graphs.x, graphs.edge_index, graphs.batch)
                loss = criterion(logits, graphs.y)
                loss.backward()
                optimizer.step()

            val_acc = _validation_accuracy(model)
            best_val = max(best_val, val_acc)
            trial.report(val_acc, epoch)
            if trial.should_prune():
                raise optuna.TrialPruned()

        # Only reached by trials that were not pruned.
        all_trials.append({"trial": trial.number, "params": trial.params, "best_val_acc": best_val})
        return best_val

    study.optimize(objective, n_trials=n_trials)

    # Persist the per-trial log and the winning configuration.
    with open(f"{save_prefix}_all.json", "w") as f:
        json.dump(all_trials, f, indent=2)
    with open(f"{save_prefix}_best.json", "w") as f:
        json.dump({"best_params": study.best_params, "best_value": study.best_value}, f, indent=2)
    return study

In [69]:
# Transformer search: 40 trials of up to 100 epochs each (overrides the defaults of 50 and 80).
study_tf  = optimize_transformer(train_loader_seq, val_loader_seq, n_trials=40, max_epochs=100)

[I 2025-10-13 12:52:30,632] A new study created in memory with name: no-name-d2f5b581-c3f9-4453-acdf-98702efb4a60
[I 2025-10-13 12:54:21,611] Trial 0 finished with value: 0.9277777777777778 and parameters: {'d_model': 128, 'nhead': 8, 'num_layers': 1, 'ff': 384, 'dropout': 0.03435628388273493, 'lr': 0.0009937840388937165, 'weight_decay': 8.122583154571129e-10, 'pool': 'last'}. Best is trial 0 with value: 0.9277777777777778.
[I 2025-10-13 12:59:09,977] Trial 1 finished with value: 0.9 and parameters: {'d_model': 192, 'nhead': 2, 'num_layers': 3, 'ff': 512, 'dropout': 0.33442943209867587, 'lr': 0.00011574651308542461, 'weight_decay': 2.967641556059196e-05, 'pool': 'last'}. Best is trial 0 with value: 0.9277777777777778.
[I 2025-10-13 13:02:44,363] Trial 2 finished with value: 0.9111111111111111 and parameters: {'d_model': 128, 'nhead': 4, 'num_layers': 2, 'ff': 768, 'dropout': 0.07635445365344684, 'lr': 0.00016461025786795473, 'weight_decay': 0.00010527532171827261, 'pool': 'mean'}. Best

In [76]:
# SAN search: 40 trials of up to 100 epochs each (overrides the defaults of 50 and 80).
study_san = optimize_san(train_loader_seq, val_loader_seq, n_trials=40, max_epochs=100)

[I 2025-10-13 13:27:57,387] A new study created in memory with name: no-name-b5c28b08-0136-42c8-81d3-cfa0f3c16238
[I 2025-10-13 13:31:14,386] Trial 0 finished with value: 0.9444444444444444 and parameters: {'hidden': 256, 'heads': 2, 'layers': 3, 'dropout': 0.09228891164213615, 'lr': 0.000223814062092218, 'weight_decay': 5.456686692227437e-06, 'pool': 'mean'}. Best is trial 0 with value: 0.9444444444444444.
[I 2025-10-13 13:33:29,553] Trial 1 finished with value: 0.9111111111111111 and parameters: {'hidden': 192, 'heads': 4, 'layers': 2, 'dropout': 0.24858524705564422, 'lr': 5.044241589123786e-05, 'weight_decay': 0.0001078192872518563, 'pool': 'last'}. Best is trial 0 with value: 0.9444444444444444.
[I 2025-10-13 13:35:52,184] Trial 2 finished with value: 0.9222222222222223 and parameters: {'hidden': 192, 'heads': 8, 'layers': 2, 'dropout': 0.45809627493388444, 'lr': 0.0002666899425838391, 'weight_decay': 5.4398945921073874e-05, 'pool': 'mean'}. Best is trial 0 with value: 0.9444444444

In [77]:
# MS-TCN search: 40 trials of up to 100 epochs each (overrides the defaults of 50 and 80).
study_tcn = optimize_mstcn(train_loader_seq, val_loader_seq, n_trials=40, max_epochs=100)

[I 2025-10-13 14:01:48,504] A new study created in memory with name: no-name-c6080afa-1ec8-4a1d-bad2-6c5ac40a286b
[I 2025-10-13 14:04:45,944] Trial 0 finished with value: 0.9111111111111111 and parameters: {'depth': 5, 'width': 128, 'kernel_size': 2, 'dropout': 0.22436922973443996, 'lr': 0.001325731770375339, 'weight_decay': 0.00025610437619332275, 'pool': 'last'}. Best is trial 0 with value: 0.9111111111111111.
[I 2025-10-13 14:07:16,386] Trial 1 finished with value: 0.9 and parameters: {'depth': 5, 'width': 64, 'kernel_size': 5, 'dropout': 0.41698800945745235, 'lr': 0.00017903798852132868, 'weight_decay': 1.3948167581014725e-05, 'pool': 'mean'}. Best is trial 0 with value: 0.9111111111111111.
[I 2025-10-13 14:10:29,203] Trial 2 finished with value: 0.9333333333333333 and parameters: {'depth': 3, 'width': 128, 'kernel_size': 7, 'dropout': 0.14250338040497074, 'lr': 0.001740373489282973, 'weight_decay': 1.4063216434301287e-10, 'pool': 'mean'}. Best is trial 2 with value: 0.933333333333

In [78]:
# TGU search: 40 trials of up to 100 epochs each (overrides the defaults of 50 and 80).
study_tgu = optimize_tgu(train_loader_seq, val_loader_seq, n_trials=40, max_epochs=100)

[I 2025-10-13 14:25:37,934] A new study created in memory with name: no-name-8e4d05ec-ce8e-46e1-b3a0-db900f712fb4
[I 2025-10-13 14:28:42,706] Trial 0 finished with value: 0.8277777777777777 and parameters: {'hidden': 128, 'blocks': 5, 'kernel': 5, 'dropout': 0.12750410955633024, 'pool': 'last', 'lr': 0.0004582958477769386, 'weight_decay': 0.0008309063574448469}. Best is trial 0 with value: 0.8277777777777777.
[I 2025-10-13 14:31:54,608] Trial 1 finished with value: 0.8444444444444444 and parameters: {'hidden': 160, 'blocks': 3, 'kernel': 7, 'dropout': 0.09139646604833113, 'pool': 'last', 'lr': 9.303623239731108e-05, 'weight_decay': 2.8353456172477696e-05}. Best is trial 1 with value: 0.8444444444444444.
[I 2025-10-13 14:35:53,591] Trial 2 finished with value: 0.8444444444444444 and parameters: {'hidden': 192, 'blocks': 3, 'kernel': 7, 'dropout': 0.4903379591094786, 'pool': 'mean', 'lr': 0.0007408980181069967, 'weight_decay': 6.938805810037711e-07}. Best is trial 1 with value: 0.8444444

In [79]:
# LSTM search: 40 trials of up to 100 epochs each (overrides the defaults of 50 and 80).
study_lstm= optimize_lstm(train_loader_seq, val_loader_seq, n_trials=40, max_epochs=100)

[I 2025-10-13 14:53:17,629] A new study created in memory with name: no-name-6ee8af51-8222-42cc-b343-552b82289b20
[I 2025-10-13 14:55:27,956] Trial 0 finished with value: 0.8944444444444445 and parameters: {'h1': 192, 'h2': 128, 'h3': 96, 'bidirectional': False, 'dense': 96, 'dropout': 0.46570867055380855, 'lr': 0.0001948545988142792, 'weight_decay': 4.57655127220711e-05}. Best is trial 0 with value: 0.8944444444444445.
[I 2025-10-13 14:57:58,761] Trial 1 finished with value: 0.8444444444444444 and parameters: {'h1': 96, 'h2': 128, 'h3': 48, 'bidirectional': True, 'dense': 64, 'dropout': 0.07536366867991445, 'lr': 6.01608705214315e-05, 'weight_decay': 1.0468824926347064e-05}. Best is trial 0 with value: 0.8944444444444445.
[I 2025-10-13 15:00:06,917] Trial 2 finished with value: 0.9111111111111111 and parameters: {'h1': 160, 'h2': 160, 'h3': 96, 'bidirectional': False, 'dense': 64, 'dropout': 0.16126507565338144, 'lr': 6.889203750503244e-05, 'weight_decay': 4.394533926102311e-06}. Best

In [80]:
# GCN search on the graph loaders: 40 trials of up to 100 epochs each (overrides the defaults of 50 and 80).
study_gcn = optimize_gcn(train_loader_graph, val_loader_graph, n_trials=40, max_epochs=100)

[I 2025-10-13 15:19:19,443] A new study created in memory with name: no-name-4ca841aa-a724-46b0-a3c0-5655218f0686
[I 2025-10-13 15:19:54,619] Trial 0 finished with value: 0.8777777777777778 and parameters: {'hidden': 128, 'layers': 3, 'dropout': 0.039493792770526726, 'lr': 0.00014986488393252883, 'weight_decay': 1.7651207864991065e-07}. Best is trial 0 with value: 0.8777777777777778.
[I 2025-10-13 15:20:29,986] Trial 1 finished with value: 0.8944444444444445 and parameters: {'hidden': 192, 'layers': 2, 'dropout': 0.03072330581040994, 'lr': 0.0044483684902061815, 'weight_decay': 9.811324510588843e-07}. Best is trial 1 with value: 0.8944444444444445.
[I 2025-10-13 15:21:26,022] Trial 2 finished with value: 0.85 and parameters: {'hidden': 128, 'layers': 5, 'dropout': 0.30914367360466427, 'lr': 0.0008551047009904848, 'weight_decay': 5.9182554121248435e-05}. Best is trial 1 with value: 0.8944444444444445.
[I 2025-10-13 15:22:11,112] Trial 3 finished with value: 0.8777777777777778 and parame

# Entrenamiento de modelos con hiperparámetros encontrados

In [101]:
import json, math, random, numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from sklearn.metrics import (
    confusion_matrix, precision_score, recall_score, f1_score,
    accuracy_score, balanced_accuracy_score
)

In [103]:
# Global seed shared by every random number generator used in the notebook
# (any integer works).
SEED = 42

# Seed Python, NumPy and PyTorch (CPU, current CUDA device, all CUDA devices).
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(SEED)

# Optional, but recommended for reproducibility: make cuDNN pick deterministic
# kernels and disable its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [104]:
def _compute_metrics(y_true, y_pred, n_classes=4):
    """Compute multi-class classification metrics from true and predicted labels.

    Args:
        y_true: Sequence of ground-truth class indices.
        y_pred: Sequence of predicted class indices (same length as ``y_true``).
        n_classes: Number of classes; the confusion matrix is built over the
            labels ``0 .. n_classes - 1``.

    Returns:
        dict with the keys ``accuracy``, ``precision_macro``, ``recall_macro``,
        ``specificity_macro``, ``f1_macro``, ``balanced_accuracy`` and
        ``confusion_matrix`` (nested list; rows are true labels, columns are
        predicted labels).
    """
    class_ids = list(range(n_classes))
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)

    # One-vs-rest counts per class, read off the confusion matrix.
    true_pos = np.diag(cm).astype(float)
    false_pos = cm.sum(axis=0) - true_pos
    false_neg = cm.sum(axis=1) - true_pos
    true_neg = cm.sum() - (true_pos + false_pos + false_neg)

    # The small constant keeps the division defined when a class has no negatives.
    per_class_specificity = true_neg / (true_neg + false_pos + 1e-8)

    macro = {"average": "macro", "zero_division": 0}
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision_macro": precision_score(y_true, y_pred, **macro),
        "recall_macro": recall_score(y_true, y_pred, **macro),
        "specificity_macro": float(np.mean(per_class_specificity)),
        "f1_macro": f1_score(y_true, y_pred, **macro),
        "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
        "confusion_matrix": cm.tolist()
    }

In [105]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding followed by dropout.

    A table of shape ``(1, max_len, d_model)`` is precomputed and stored as the
    non-trainable buffer ``pe``; even feature columns hold sines and odd
    columns hold cosines of the position-dependent angles.

    Args:
        d_model: Feature dimension of the inputs. Both even and odd values are
            supported.
        dropout: Dropout probability applied after adding the encoding.
        max_len: Number of positions in the precomputed table. Inputs passed to
            ``forward`` must not be longer than this along dimension 1.
    """
    def __init__(self, d_model, dropout=0.1, max_len=16):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32) * (-math.log(10000.0) / d_model))
        angles = pos * div  # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine column than sine
        # columns, so trim the angles to avoid a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer("pe", pe.unsqueeze(0))  # (1, L, d_model)

    def forward(self, x):
        """Add the encoding for the first ``x.size(1)`` positions, then apply dropout.

        Args:
            x: Tensor indexed as ``(batch, position, feature)`` with
                ``x.size(1) <= max_len`` and ``d_model`` features.

        Returns:
            Tensor with the same shape as ``x``.
        """
        return self.dropout(x + self.pe[:, :x.size(1), :])

In [106]:
# === EXACT model used during HPO (not the baseline) ===
class HPOTransformer(nn.Module):
    """
    Transformer-encoder sequence classifier.

    Per the original author's note, this is meant to match the model defined
    in optimize_transformer (not visible here; verify if that code changes):
    - input projection -> positional encoding ->
      TransformerEncoder(num_layers, nhead, dim_feedforward=ff, dropout)
    - pooling over the sequence: 'mean', otherwise the last time step
    - LayerNorm + final Linear layer

    Args:
        in_feat: Number of input features per time step.
        seq_len: Sequence length; used as ``max_len`` of the positional encoding.
        num_classes: Number of output logits.
        d_model: Width of the encoder.
        nhead: Number of attention heads.
        num_layers: Number of stacked encoder layers.
        ff: Hidden size of each layer's feed-forward sub-block.
        dropout: Dropout used by the positional encoding and encoder layers.
        pool: ``"mean"`` averages over time; any other value takes the last step.
    """
    def __init__(self, in_feat=258, seq_len=16, num_classes=4,
                 d_model=128, nhead=4, num_layers=2, ff=256,
                 dropout=0.1, pool="mean"):
        super().__init__()
        self.pool = pool
        # Sub-modules are created in the same order as before so that seeded
        # parameter initialisation is unchanged.
        self.proj = nn.Linear(in_feat, d_model)
        self.pe = PositionalEncoding(d_model, dropout, max_len=seq_len)
        self.enc = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=ff,
                dropout=dropout,
                batch_first=True,
                norm_first=True,
            ),
            num_layers=num_layers,
        )
        self.norm = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """Return class logits of shape (B, num_classes) for x of shape (B, L, in_feat)."""
        encoded = self.enc(self.pe(self.proj(x)))  # (B, L, d)
        # Collapse the time axis: average over steps, or keep only the last one.
        pooled = encoded.mean(dim=1) if self.pool == "mean" else encoded[:, -1, :]
        return self.head(self.norm(pooled))  # (B, num_classes)

In [130]:
def train_eval_save_transformer_hpo_from_json(
    params_path: str,
    train_loader,
    val_loader,
    device: torch.device,
    out_prefix: str = "Transformer_HPO_final",
    default_epochs: int = 90,
    patience: int = 12  # number of epochs without improvement before stopping
):
    """Train an ``HPOTransformer`` from hyperparameters stored in a JSON file.

    The model is trained with AdamW and cross-entropy loss, early-stopped on
    the validation loss, rolled back to the best-epoch weights, evaluated on
    both loaders and persisted together with a loss plot and a JSON report.

    Args:
        params_path: Path to a JSON file. Hyperparameters are read from its
            ``"best_params"`` entry if present, otherwise from the top level.
            Required keys: ``d_model``, ``nhead``, ``num_layers``, ``dropout``
            and one of ``ff`` / ``dim_feedforward``. Optional keys: ``pool``
            (default ``"mean"``), ``lr`` (default ``1e-3``), ``weight_decay``
            (default ``0.0``) and ``epochs`` / ``max_epochs``.
        train_loader: Iterable of ``(X, y)`` batches used for training.
        val_loader: Iterable of ``(X, y)`` batches used for validation and
            early stopping.
        device: Device on which the model and batches are placed.
        out_prefix: Prefix of the three files written:
            ``{out_prefix}_best.pt``, ``{out_prefix}_loss.png`` and
            ``{out_prefix}_metrics.json``.
        default_epochs: Maximum number of epochs when the JSON gives none.
        patience: Number of consecutive epochs without a validation-loss
            improvement after which training stops.

    Returns:
        ``(model, report)``: the model loaded with the best-epoch weights and
        the report dictionary that was written to the metrics JSON.

    Raises:
        ValueError: If a required hyperparameter is missing from the JSON.
    """
    with open(params_path, "r") as f:
        blob = json.load(f)
    hp = blob.get("best_params", blob)

    required = ["d_model", "nhead", "num_layers", "dropout"]
    missing = [k for k in required if k not in hp]
    if missing:
        raise ValueError(f"Faltan hiperparámetros en JSON: {missing}")

    ff = hp.get("ff", hp.get("dim_feedforward", None))
    if ff is None:
        raise ValueError("El JSON no contiene 'ff' ni 'dim_feedforward'.")

    pool = hp.get("pool", "mean")
    lr = hp.get("lr", 1e-3)
    weight_decay = hp.get("weight_decay", 0.0)
    epochs = int(hp.get("epochs", hp.get("max_epochs", default_epochs)))

    model = HPOTransformer(
        in_feat=258, seq_len=16, num_classes=4,
        d_model=hp["d_model"], nhead=hp["nhead"], num_layers=hp["num_layers"],
        ff=ff, dropout=hp["dropout"], pool=pool
    ).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    train_losses, val_losses, val_accuracies = [], [], []
    best_val_loss = float("inf")
    best_epoch, best_state = 0, None
    epochs_no_improve = 0  # early-stopping patience counter

    for epoch in range(epochs):
        # --- Training pass ---
        model.train()
        running_loss_train = 0.0
        for X, y in train_loader:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            outputs = model(X)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
            running_loss_train += loss.item()

        train_loss = running_loss_train / len(train_loader)
        train_losses.append(train_loss)

        # --- Validation pass (single sweep gives both loss and accuracy) ---
        model.eval()
        correct_val, total_val, running_loss_val = 0, 0, 0.0
        with torch.no_grad():
            for X, y in val_loader:
                X, y = X.to(device), y.to(device)
                outputs = model(X)
                running_loss_val += criterion(outputs, y).item()
                total_val += y.size(0)
                correct_val += outputs.argmax(1).eq(y).sum().item()

        val_loss = running_loss_val / len(val_loader)
        val_acc = 100. * correct_val / total_val  # stored as a percentage
        val_losses.append(val_loss)
        val_accuracies.append(val_acc)

        # --- Best-model tracking and early stopping ---
        improved = val_loss < best_val_loss
        if improved:
            best_val_loss = val_loss
            best_epoch = epoch
            # state_dict() returns references to the live parameters, and a
            # shallow dict copy keeps those references. Clone every tensor so
            # the snapshot is not overwritten by later optimizer steps.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            epochs_no_improve = 0  # reset patience
        else:
            epochs_no_improve += 1  # one more epoch without improvement
            if epochs_no_improve >= patience:
                print(f"\n⏹️ Early stopping en epoch {epoch+1} (sin mejora en {patience} epochs).")
                break  # stop training

        marker = "*" if improved else ""
        print(f"[Transformer HPO] Epoch {epoch+1}/{epochs} {marker} | "
              f"TrainLoss={train_loss:.4f} | ValLoss={val_loss:.4f}")

    # No epoch improved on +inf (e.g. zero epochs or NaN losses): keep the
    # current weights.
    if best_state is None:
        best_state = model.state_dict()

    model.load_state_dict(best_state)
    print(f"\n🏆 Mejor epoch: {best_epoch+1} | ValLoss={best_val_loss:.4f}")

    def _metrics_on(loader):
        """Evaluate the restored model on ``loader`` and return its metrics dict."""
        model.eval()
        y_true, y_pred = [], []
        with torch.no_grad():
            for X, y in loader:
                X = X.to(device)
                logits = model(X)
                preds = logits.argmax(1).cpu().numpy()
                # .cpu() is a no-op for CPU labels and avoids a crash when the
                # loader already yields tensors on the GPU.
                y_true.extend(y.cpu().numpy())
                y_pred.extend(preds)
        return _compute_metrics(y_true, y_pred, n_classes=4)

    metrics_train = _metrics_on(train_loader)
    metrics_val = _metrics_on(val_loader)

    # --- Loss curves ---
    plt.figure(figsize=(7,4))
    plt.plot(train_losses, label="Train Loss", color="blue")
    plt.plot(val_losses, label="Val Loss", color="orange")
    plt.axvline(best_epoch, color='red', linestyle='--', label=f"Best Epoch ({best_epoch+1})")
    plt.title("Transformer (HPO) - Train vs Validation Loss")
    plt.xlabel("Epoch"); plt.ylabel("Loss")
    plt.legend(); plt.grid(True); plt.tight_layout()
    plt.savefig(f"{out_prefix}_loss.png")
    plt.close()

    # --- Persist weights and report ---
    torch.save(model.state_dict(), f"{out_prefix}_best.pt")
    report = {
        "hyperparams": hp,
        "optimizer": {"lr": lr, "weight_decay": weight_decay, "epochs": epochs},
        "best_epoch": best_epoch + 1,
        "best_val_loss": best_val_loss,
        "train_metrics": metrics_train,
        "val_metrics": metrics_val,
        "loss_curves": {"train": train_losses, "val": val_losses, "val_acc": val_accuracies},
        "artifacts": {
            "weights_pt": f"{out_prefix}_best.pt",
            "loss_png": f"{out_prefix}_loss.png"
        }
    }
    with open(f"{out_prefix}_metrics.json", "w") as f:
        json.dump(report, f, indent=4)

    print(f"✅ Guardado: {out_prefix}_best.pt | {out_prefix}_loss.png | {out_prefix}_metrics.json")
    print(f"📈 Val Acc (best): {metrics_val['accuracy']:.4f} | Train Acc (final): {metrics_train['accuracy']:.4f}")
    return model, report

In [131]:
# Use the GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Train the final Transformer with the stored best hyperparameters.
model_tf, report_tf = train_eval_save_transformer_hpo_from_json(
    "hpo_transformer_best.json",  # the file saved by optimize_transformer
    train_loader_seq,
    val_loader_seq,
    device,
    out_prefix="Transformer_HPO_best",
)



[Transformer HPO] Epoch 1/90 * | TrainLoss=1.5729 | ValLoss=1.4000
[Transformer HPO] Epoch 2/90 * | TrainLoss=1.3899 | ValLoss=1.3331
[Transformer HPO] Epoch 3/90 * | TrainLoss=1.3109 | ValLoss=1.2243
[Transformer HPO] Epoch 4/90 * | TrainLoss=1.1013 | ValLoss=0.7647
[Transformer HPO] Epoch 5/90 * | TrainLoss=0.8571 | ValLoss=0.6187
[Transformer HPO] Epoch 6/90 * | TrainLoss=0.7537 | ValLoss=0.5559
[Transformer HPO] Epoch 7/90  | TrainLoss=0.6469 | ValLoss=0.5631
[Transformer HPO] Epoch 8/90 * | TrainLoss=0.6248 | ValLoss=0.4622
[Transformer HPO] Epoch 9/90  | TrainLoss=0.6482 | ValLoss=0.5061
[Transformer HPO] Epoch 10/90  | TrainLoss=0.5708 | ValLoss=0.4742
[Transformer HPO] Epoch 11/90  | TrainLoss=0.5746 | ValLoss=0.6043
[Transformer HPO] Epoch 12/90  | TrainLoss=0.5088 | ValLoss=0.5396
[Transformer HPO] Epoch 13/90 * | TrainLoss=0.5021 | ValLoss=0.4095
[Transformer HPO] Epoch 14/90  | TrainLoss=0.4698 | ValLoss=0.4631
[Transformer HPO] Epoch 15/90  | TrainLoss=0.4158 | ValLoss=0.4