In [None]:
# ===============================================================
# NOTEBOOK 4 — DECODER FINAL: Predicción de Puntajes Saber 11
# ===============================================================
#
# OBJETIVO:
# Este notebook implementa el DECODER que convierte los embeddings
# generados por GraphSAGE (2015–2022) y las proyecciones temporales
# (2023–2026) en puntajes reales del examen Saber 11.
#
# MAPEO QUE REALIZA EL MODELO:
#    Embeddings (16 dimensiones)
#            ↓
#    Puntajes Saber 11 (7 variables):
#       - PUNT_GLOBAL
#       - PUNT_MATEMATICAS
#       - PUNT_LECTURA_CRITICA
#       - PUNT_SOCIALES_CIUDADANAS
#       - PUNT_C_NATURALES
#       - PUNT_INGLES
#       - DESEMP_INGLES
#
# CONTENIDO DEL NOTEBOOK:
# 1. Carga de embeddings reales (2015–2022) y futuros (2023–2026)
# 2. Construcción del dataset supervisado para entrenamiento
# 3. Definición del modelo Decoder (MLP)
# 4. Entrenamiento, validación y métricas del modelo
# 5. Predicción de puntajes futuros: 2023, 2024, 2025, 2026
# 6. Exportación del modelo entrenado y de las predicciones
#
# ESTE NOTEBOOK COMPLETA EL PIPELINE:
#    GraphSAGE → Embeddings → Modelo Temporal → Decoder Final → Puntajes
# ===============================================================


In [None]:
# IMPORTACIONES + RUTAS

In [17]:
import os
import torch
import torch.nn as nn
import pandas as pd
import numpy as np

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

print("Librerías cargadas correctamente.")


Librerías cargadas correctamente.


In [None]:
# Cargar rutas

In [18]:
ruta_embeddings = r"C:/Users/john/Desktop/Saber_11_2025/resultados/embeddings"
ruta_salida = r"C:/Users/john/Desktop/Saber_11_2025/resultados/predicciones"

os.makedirs(ruta_salida, exist_ok=True)

print("Rutas listas.")


Rutas listas.


In [None]:
# Cargar embeddings + puntajes reales

In [20]:
# Embeddings (2015–2026) generados en Notebook 3
df_emb = pd.read_csv(os.path.join(ruta_embeddings, "embeddings_2015_2022.csv"), index_col=0)

print("Embeddings cargados:", df_emb.shape)

# Puntajes reales (agregados por departamento o municipio)
df_scores = pd.read_csv("puntajes_agregados_2015_2022.csv", index_col=0)

print("Puntajes cargados:", df_scores.shape)


Embeddings cargados: (8, 16)


FileNotFoundError: [Errno 2] No such file or directory: 'puntajes_agregados_2015_2022.csv'

In [1]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt


Rutas listas


In [None]:
# Configuración de rutas

In [4]:
ruta_embeddings = r"C:/Users/john/Desktop/Saber_11_2025/resultados/embeddings"
ruta_salida = r"C:/Users/john/Desktop/Saber_11_2025/resultados/predicciones"

os.makedirs(ruta_salida, exist_ok=True)

print("Rutas listas")


Rutas listas


In [None]:
# Carga del archivo de embeddings

In [7]:
df = pd.read_csv(
    os.path.join(ruta_embeddings, "embeddings_2015_2022.csv"),
    index_col=0
)

print("Embeddings cargados:")
df


Embeddings cargados:


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
anio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2015,-0.013285,-0.09036,0.014918,-0.193424,-0.085967,0.108265,0.36162,0.111973,-0.0943,-0.068606,-0.195047,-0.181644,-0.096827,0.127936,-0.013351,-0.16177
2016,-0.195073,0.281308,-0.024735,0.117337,0.042593,-0.216274,-0.04827,-0.019949,-0.045212,-0.292472,-0.254604,0.290325,-0.053898,-0.112172,0.069332,-0.137538
2017,-0.084518,-0.202669,-0.163925,0.03446,0.00333,0.24771,0.005441,-0.120514,-0.071859,0.015435,0.019399,-0.03772,-0.173882,0.04821,0.088221,0.077796
2018,0.111962,-0.017034,0.506638,-0.220439,0.359079,-0.125214,0.166344,-0.070628,0.164679,0.043943,0.401484,-0.114283,0.120481,0.054075,0.678628,-0.328001
2019,0.095019,-0.421412,-0.07754,0.175998,-0.088899,-0.119443,0.002129,-0.127941,-0.104979,0.110968,-0.003392,0.242536,-0.329259,-0.234814,0.20809,-0.141206
2020,-0.105746,0.283967,-0.127792,-0.324531,0.574045,0.039864,-0.261769,-0.301018,0.1663,-0.20102,0.091978,0.212203,0.125527,-0.174737,-0.018771,-0.012808
2021,-0.187736,-0.292265,0.126767,-0.09137,0.128451,0.413479,0.16718,-0.033451,0.202138,-0.000119,-0.000227,0.077904,-0.023739,0.048156,0.303581,-0.029455
2022,-0.054985,-0.073847,-0.193893,-0.129026,0.174212,-0.235259,0.114465,0.144495,-0.121797,-0.070777,-0.20215,-0.009115,0.092984,0.23277,0.080997,0.098271


In [2]:
# PREPARAR INPUT PARA EL DECODER

In [8]:
series = torch.tensor(df.values, dtype=torch.float32)   # shape = (8, 16)

print("Shape serie:", series.shape)


Shape serie: torch.Size([8, 16])


In [None]:
# DEFINIR EL DECODER TEMPORAL

In [12]:
class Decoder(nn.Module):
    def __init__(self, input_dim=16, hidden_dim=16, output_dim=16):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # x: (batch, seq_len, features)
        last_step = x[:, -1, :]           # tomar el último embedding
        hidden = self.relu(self.fc1(last_step))
        out = self.fc2(hidden)
        return out


In [None]:
# Crear modelo

In [13]:
model = Decoder()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

print("Decoder creado.")


Decoder creado.


In [None]:
# ENTRENAMIENTO

In [14]:
model = Decoder()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(800):
    optimizer.zero_grad()
    pred = model(X)
    loss = criterion(pred, y[:, -1, :])
    loss.backward()
    optimizer.step()
    if epoch % 100 == 0:
        print("Epoch", epoch, "Loss", loss.item())


Epoch 0 Loss 0.022474151104688644
Epoch 100 Loss 7.653306255406278e-08
Epoch 200 Loss 3.2514869702932048e-12
Epoch 300 Loss 1.8821749714348357e-16
Epoch 400 Loss 3.903127820947816e-17
Epoch 500 Loss 2.66496893996937e-16
Epoch 600 Loss 3.903127820947816e-17
Epoch 700 Loss 6.353424730765056e-17


In [16]:
future_years = [2023, 2024, 2025, 2026]
preds = []

current = series.clone()

for year in future_years:
    pred = model(current.unsqueeze(0))
    preds.append(pred.detach().numpy().flatten())
    
    # añadir el año predicho para permitir forecasting autoregresivo
    current = torch.cat([current, pred.unsqueeze(0)], dim=0)


RuntimeError: Tensors must have same number of dimensions: got 2 and 3

In [None]:
# Convertir a DataFrame

In [None]:
df_pred = pd.DataFrame(preds, index=future_years, columns=df.columns)
df_pred


In [None]:
# EXPORTAR CSV

In [None]:
out_csv = os.path.join(ruta_salida, "predicciones_2023_2026.csv")
df_pred.to_csv(out_csv)
print("Predicciones guardadas en:", out_csv)


In [None]:
# GRAFICAR RESULTADOS (PC1 o PC2)

In [None]:
plt.figure(figsize=(10,5))
plt.plot(df.index, df["0"], label="Embeddings reales")
plt.plot(df_pred.index, df_pred["0"], label="Embeddings predichos 2023–2026", linestyle="--")
plt.legend()
plt.title("Serie temporal embeddings (PC1)")
plt.show()


In [None]:
# EXPORTAR MODELO

In [None]:
torch.save(model.state_dict(), os.path.join(ruta_salida, "decoder_lstm.pt"))
