In [None]:
# Notebook 6 — ST-GNN (Spatio-Temporal Graph Neural Network)

# Este notebook implementa un modelo espaciotemporal tipo **ST-GNN**, que combina:

# **GCN (Graph Convolutional Network)** para capturar dependencias espaciales entre años a través de sus embeddings.
# **LSTM** para capturar dependencias temporales en la evolución del puntaje global Saber 11.

# El objetivo es predecir los puntajes nacionales para **2023–2026**, usando como entrada los embeddings generados previamente en Notebook 2–3 y los 
# puntajes reales procesados en Notebook 5.

# **Pipeline del modelo ST-GNN**
# 1. Grafo de años (2015–2022) → matriz de adyacencia.
# 2. Embeddings de cada año como señales del grafo.
# 3. Capa GCN para extracción espacial.
# 4. Capa LSTM para modelado temporal.
# 5. Capa FC para predicción de puntaje futuro.

# Este modelo es más robusto que LSTM puro y más simple que EvolveGCN.

In [None]:
# Importaciones + configuración

In [None]:
# Importaciones

In [1]:
import os
import gc
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim

from torch_geometric.nn import GCNConv
from torch_geometric.data import Data

import matplotlib.pyplot as plt
plt.style.use("seaborn-v0_8")

print("Librerías cargadas correctamente.")


Librerías cargadas correctamente.


In [2]:
# Cargar embeddings reales (2015–2022)

In [3]:
# Cargar embeddings (2015–2022)

ruta_emb = r"C:/Users/john/Desktop/Saber_11_2025/resultados/embeddings/embeddings_2015_2022.csv"

df_emb = pd.read_csv(ruta_emb, index_col=0)

print("Embeddings cargados:", df_emb.shape)
display(df_emb.head())


Embeddings cargados: (8, 16)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
anio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2015,-0.013285,-0.09036,0.014918,-0.193424,-0.085967,0.108265,0.36162,0.111973,-0.0943,-0.068606,-0.195047,-0.181644,-0.096827,0.127936,-0.013351,-0.16177
2016,-0.195073,0.281308,-0.024735,0.117337,0.042593,-0.216274,-0.04827,-0.019949,-0.045212,-0.292472,-0.254604,0.290325,-0.053898,-0.112172,0.069332,-0.137538
2017,-0.084518,-0.202669,-0.163925,0.03446,0.00333,0.24771,0.005441,-0.120514,-0.071859,0.015435,0.019399,-0.03772,-0.173882,0.04821,0.088221,0.077796
2018,0.111962,-0.017034,0.506638,-0.220439,0.359079,-0.125214,0.166344,-0.070628,0.164679,0.043943,0.401484,-0.114283,0.120481,0.054075,0.678628,-0.328001
2019,0.095019,-0.421412,-0.07754,0.175998,-0.088899,-0.119443,0.002129,-0.127941,-0.104979,0.110968,-0.003392,0.242536,-0.329259,-0.234814,0.20809,-0.141206


In [4]:
# Cargar puntajes (2015–2022)

In [5]:
ruta_scores = r"C:/Users/john/Desktop/Saber_11_2025/resultados/puntajes/puntajes_agregados_2015_2022.csv"

df_scores = pd.read_csv(ruta_scores)

print("Puntajes cargados:", df_scores.shape)
display(df_scores.head())


Puntajes cargados: (248, 3)


Unnamed: 0,COLE_DEPTO_UBICACION,PUNTAJE_GLOBAL_PROMEDIO,ANIO
0,AMAZONAS,220.590085,2015
1,ANTIOQUIA,247.540718,2015
2,ARAUCA,246.008498,2015
3,ATLANTICO,245.708802,2015
4,BOGOTA,270.212909,2015


In [6]:
# Unir Embeddings + Puntajes (igual que Notebook 5)

In [7]:
# Asegurar que los embeddings tienen columna ANIO
df_emb2 = df_emb.copy()
df_emb2["ANIO"] = df_emb2.index.astype(int)

df_scores_year = (
    df_scores.groupby("ANIO", as_index=False)
             .agg(PUNTAJE=("PUNTAJE_GLOBAL_PROMEDIO", "mean"))
)

df_final = df_emb2.merge(df_scores_year, on="ANIO", how="inner")

print("Dataset final (embeddings + puntaje real):", df_final.shape)
display(df_final)


Dataset final (embeddings + puntaje real): (8, 18)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,ANIO,PUNTAJE
0,-0.013285,-0.09036,0.014918,-0.193424,-0.085967,0.108265,0.36162,0.111973,-0.0943,-0.068606,-0.195047,-0.181644,-0.096827,0.127936,-0.013351,-0.16177,2015,243.005971
1,-0.195073,0.281308,-0.024735,0.117337,0.042593,-0.216274,-0.04827,-0.019949,-0.045212,-0.292472,-0.254604,0.290325,-0.053898,-0.112172,0.069332,-0.137538,2016,250.967215
2,-0.084518,-0.202669,-0.163925,0.03446,0.00333,0.24771,0.005441,-0.120514,-0.071859,0.015435,0.019399,-0.03772,-0.173882,0.04821,0.088221,0.077796,2017,248.747612
3,0.111962,-0.017034,0.506638,-0.220439,0.359079,-0.125214,0.166344,-0.070628,0.164679,0.043943,0.401484,-0.114283,0.120481,0.054075,0.678628,-0.328001,2018,256.157674
4,0.095019,-0.421412,-0.07754,0.175998,-0.088899,-0.119443,0.002129,-0.127941,-0.104979,0.110968,-0.003392,0.242536,-0.329259,-0.234814,0.20809,-0.141206,2019,241.799973
5,-0.105746,0.283967,-0.127792,-0.324531,0.574045,0.039864,-0.261769,-0.301018,0.1663,-0.20102,0.091978,0.212203,0.125527,-0.174737,-0.018771,-0.012808,2020,269.514077
6,-0.187736,-0.292265,0.126767,-0.09137,0.128451,0.413479,0.16718,-0.033451,0.202138,-0.000119,-0.000227,0.077904,-0.023739,0.048156,0.303581,-0.029455,2021,271.848492
7,-0.054985,-0.073847,-0.193893,-0.129026,0.174212,-0.235259,0.114465,0.144495,-0.121797,-0.070777,-0.20215,-0.009115,0.092984,0.23277,0.080997,0.098271,2022,242.407903


In [8]:
# Construcción del grafo de años

In [9]:
# Crear grafo temporal (cadena lineal)

years = df_final["ANIO"].tolist()

# Aristas: año t → año t+1
edge_index = []
for i in range(len(years) - 1):
    edge_index.append([i, i+1])
    edge_index.append([i+1, i])

edge_index = torch.tensor(edge_index, dtype=torch.long).t()

print("edge_index shape:", edge_index.shape)
print(edge_index)


edge_index shape: torch.Size([2, 14])
tensor([[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7],
        [1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6]])


In [10]:
# Preparar tensores para el modelo

In [11]:
X = torch.tensor(df_final.iloc[:, :16].values, dtype=torch.float32)
y = torch.tensor(df_final["PUNTAJE"].values, dtype=torch.float32)

data = Data(x=X, edge_index=edge_index)

print("Data.x shape:", data.x.shape)
print("Data.edge_index:", data.edge_index)


Data.x shape: torch.Size([8, 16])
Data.edge_index: tensor([[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7],
        [1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6]])


In [12]:
# Definir modelo ST-GNN (GCN → LSTM → FC)

In [13]:
# Modelo ST-GNN

class STGNN(nn.Module):
    def __init__(self, in_dim=16, gcn_dim=32, lstm_dim=32, num_layers=1):
        super().__init__()
        
        # Capa espacial (GCN)
        self.gcn = GCNConv(in_dim, gcn_dim)
        
        # Capa temporal (LSTM)
        self.lstm = nn.LSTM(gcn_dim, lstm_dim, num_layers, batch_first=True)
        
        # Predicción
        self.fc = nn.Linear(lstm_dim, 1)

    def forward(self, data):
        # --- GCN ---
        h = self.gcn(data.x, data.edge_index)  # (8, 32)
        h = torch.relu(h)

        # --- LSTM ---
        h = h.unsqueeze(0)  # (1, 8, 32)
        out, _ = self.lstm(h)  # temporal
        out = out[:, -1, :]    # último paso temporal

        # --- Capa final ---
        pred = self.fc(out)
        return pred.squeeze()


In [14]:
# Entrenamiento

In [17]:
model = STGNN()
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.MSELoss()

for epoch in range(1500):
    optimizer.zero_grad()
    y_pred = model(data)
    loss = criterion(y_pred, y.mean())   # supervisión global por año
    loss.backward()
    optimizer.step()

    if epoch % 200 == 0:
        print(f"Epoch {epoch} — Loss {loss.item():.6f}")

print("Entrenamiento completado.")


Epoch 0 — Loss 63992.949219
Epoch 200 — Loss 34644.394531
Epoch 400 — Loss 17575.001953
Epoch 600 — Loss 8002.025391
Epoch 800 — Loss 3196.413574
Epoch 1000 — Loss 1096.245850
Epoch 1200 — Loss 316.343262
Epoch 1400 — Loss 75.375656
Entrenamiento completado.


In [None]:
# Predicciones 2023–2026 usando el modelo entrenado

In [18]:
# Predicciones 2023–2026 usando el modelo entrenado

future_pred = []
years_future = [2023, 2024, 2025, 2026]

with torch.no_grad():
    for year in years_future:
        pred = model(data).item()
        future_pred.append([year, pred])
        #print(f"{year}: {pred:.4f}")

# Guardar resultados en DataFrame
df_pred_stgnn = pd.DataFrame(future_pred, columns=["ANIO", "PUNTAJE_PRED"])
display(df_pred_stgnn)


Unnamed: 0,ANIO,PUNTAJE_PRED
0,2023,247.2211
1,2024,247.2211
2,2025,247.2211
3,2026,247.2211


In [19]:
# TABLA COMPLETA DE RESULTADOS ST-GNN

In [21]:
# TABLA COMPLETA DE RESULTADOS — ST-GNN

df_result = pd.DataFrame({
    "ANIO": list(df_final["ANIO"]) + list(df_pred_stgnn["ANIO"]),
    "REAL": list(df_final["PUNTAJE"]) + [None] * len(df_pred_stgnn),
    "ST-GNN": [None] * len(df_final) + list(df_pred_stgnn["PUNTAJE_PRED"])
})

# Último valor real (2022)
ultimo_real = df_final["PUNTAJE"].iloc[-1]

# Crear columnas de diferencia solo para predicciones
df_result["DIF_ABS"] = df_result["ST-GNN"] - ultimo_real
df_result["DIF_%"]   = (df_result["ST-GNN"] / ultimo_real - 1) * 100

# Mostrar tabla
print("TABLA COMPLETA DE RESULTADOS (ENRIQUECIDA)")
display(df_result)


TABLA COMPLETA DE RESULTADOS (ENRIQUECIDA)


Unnamed: 0,ANIO,REAL,ST-GNN,DIF_ABS,DIF_%
0,2015,243.005971,,,
1,2016,250.967215,,,
2,2017,248.747612,,,
3,2018,256.157674,,,
4,2019,241.799973,,,
5,2020,269.514077,,,
6,2021,271.848492,,,
7,2022,242.407903,,,
8,2023,,247.2211,4.813197,1.985578
9,2024,,247.2211,4.813197,1.985578
