# Librerías y configuraciones

In [None]:
import pandas as pd
import torch
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

from TFM_GRAPH_EMBEDDINGS.src.models.LINE import PreproLINE, full_train_line as train_line
from TFM_GRAPH_EMBEDDINGS.src.models.GTMAE import PreproGTMAE, train_edge_node_multitask as train_gtmae
from TFM_GRAPH_EMBEDDINGS.src.models.GTMVAE import train_edge_node_multitask_v as train_gtmvae
from TFM_GRAPH_EMBEDDINGS.src.models.E2A_SAGE_MAE import train_edge_node_multitask_sage
from TFM_GRAPH_EMBEDDINGS.src.models.TGT import PreproTGT, TGTConfig, compute_tgt_embeddings
from TFM_GRAPH_EMBEDDINGS.src.models.HGT_TE import train_hgtte

Se parte de los dataframes generados tras el apartado de preprocesado y EDA.

In [None]:
# Load the four input tables that every model section below consumes
# (they are passed together to each preprocessor / trainer).
# NOTE(review): the `...` arguments are placeholders for the CSV paths and must
# be replaced with real paths before this cell can run.
# NOTE(review): presumably tabular, temporal, directed-edge and undirected-edge
# tables respectively (inferred from the names only) — confirm.
tabu = pd.read_csv(...)
temp = pd.read_csv(...)
mdir = pd.read_csv(...)
mndi = pd.read_csv(...)

Se decide si se quiere entrenar incluyendo:
* `NO_MAD`: La ciudad de Madrid. Si es `True`, se excluye; si es `False`, se incluye.
* `ADD_IDEA_EMB`: El _embedding_ medio de las descripciones de anuncios de viviendas. Si es `True`, se incluye; si es `False`, se excluye.

In [None]:
# True -> the city of Madrid is excluded from training; False -> it is included.
NO_MAD: bool = False
# True -> the mean embedding of the housing-ad descriptions is included;
# False -> it is left out.
ADD_IDEA_EMB: bool = True

# LINE

## Preprocesado

In [None]:
# Configure the LINE preprocessing for 2022 using the notebook-wide flags.
prepro = PreproLINE(year=2022, no_mad=NO_MAD, add_idea_emb=ADD_IDEA_EMB)

# run() returns four values, unpacked here as the MMS/R x mean/sum variants
# (the names are the only description available in this notebook).
(
    df_line_mms_mean,
    df_line_mms_sum,
    df_line_r_mean,
    df_line_r_sum,
) = prepro.run(tabu, temp, mdir, mndi)

## Entrenamiento
Se opta por el modelo **LINE-R-Sum**.

In [None]:
# Train LINE on the R-Sum variant. A copy of the dataframe is handed over, so
# the trainer never receives the df_line_r_sum object itself.
line_emb_df = train_line(
    df=df_line_r_sum.copy(),
    device=device,
    emb_dim=256,
    neg=10,
    batch_size=10_000,
    n_epochs=100,
    lr=0.01,
)

# GTMAE & GTMVAE

## Preprocesado y variables globales

In [None]:
# Build the graph inputs shared by the GTMAE, GTMVAE and E2A-SAGE-MAE cells.
# The notebook-wide flags are passed instead of hard-coded True/False so that
# changing NO_MAD / ADD_IDEA_EMB in the configuration cell also affects this
# preprocessing, exactly as it already does for LINE, TGT and HGT-TE.
prepro = PreproGTMAE(add_idea_emb=ADD_IDEA_EMB, no_mad=NO_MAD)
data, nodes, node2idx, node_feat_names, edge_attr_names = prepro.run(tabu, temp, mdir, mndi)

In [None]:
# Edge attributes the autoencoder has to predict: all of them (same list object).
edge_targets = edge_attr_names

# Node features the autoencoder has to predict: every feature except the
# "idea_emb_*" columns and the two idea-centroid coordinate columns.
node_targets = [
    feat_name
    for feat_name in node_feat_names
    if not (
        feat_name.startswith("idea_emb_")
        or feat_name in {"idea_centroid_lat", "idea_centroid_lon"}
    )
]

## Entrenamiento GTMAE

In [None]:
# Train the GTMAE on the shared graph; the call returns two values, bound to
# `model` and `Z`.
# NOTE(review): the GTMVAE and E2A-SAGE-MAE cells further down rebind `model`
# and `Z`, so persist these results first if they are needed afterwards.
model, Z = train_gtmae(
    data=data,
    device=device,
    # Edge columns to predict
    target_cols=edge_targets,
    # Full list of node columns
    node_feat_names=node_feat_names,
    # Node columns to predict
    node_target_cols=node_targets,
    # Encoder parameters
    hid=128,
    out=64,
    heads=2,
    dropout=0.2,
    lr=1e-3,
    weight_decay=1e-4,
    epochs=150,
    # Early-stopping parameters
    monitor="val_edge_rmse",
    patience=30,
    min_delta=0.0,
    restore_best=True,
    # Split parameters
    val_ratio=0.2,
    test_ratio=0.2,
    seed=33,
    # Decoder and pair-feature parameters
    use_pair_feats=True,
    pair_mode="cosine_l2_absdiff",
    # Loss parameters
    edge_loss_type="huber",
    edge_huber_delta=1.0,
    node_loss_type="huber",
    node_huber_delta=1.0,
    # Weight of the node task
    lambda_node=1.0,
    # Edge-ranking parameters (optional)
    add_ranking=False,
    lambda_rank=0.5,
    margin=0.1,
    # Edge-dropout parameters
    edge_drop_prob=0.2,
    # Masking of node targets
    node_mask_rate=0.15,
    # Other parameters
    dbg_print=True,
    print_every=20,
)

## Entrenamiento GTMVAE

In [None]:
# Train the GTMVAE with the same settings as the GTMAE cell plus the KL terms.
# NOTE(review): this rebinds `model` and `Z`, replacing the GTMAE results.
model, Z = train_gtmvae(
    data=data,
    device=device,
    # Edge / node columns
    target_cols=edge_targets,
    node_feat_names=node_feat_names,
    node_target_cols=node_targets,
    # Encoder parameters
    hid=128, out=64, heads=2, dropout=0.2,
    lr=1e-3, weight_decay=1e-4, epochs=150,
    # Early-stopping parameters
    monitor="val_edge_rmse", patience=30, min_delta=0.0, restore_best=True,
    # Split parameters
    val_ratio=0.2, test_ratio=0.2, seed=33,
    # Decoder and pair-feature parameters
    use_pair_feats=True, pair_mode="cosine_l2_absdiff",
    # Loss parameters
    edge_loss_type="huber", edge_huber_delta=1.0,
    node_loss_type="huber", node_huber_delta=1.0,
    lambda_node=1.0,
    # Edge-ranking parameters (optional)
    add_ranking=False, lambda_rank=0.5, margin=0.1,
    # Edge dropout and node-target masking
    edge_drop_prob=0.2,
    node_mask_rate=0.15,
    # KL parameters (the only arguments that are new with respect to GTMAE)
    beta_kl=1e-3, kl_warmup=10,
    # Other parameters
    dbg_print=True, print_every=20,
)

# E2A-SAGE-MAE

## Preprocesado
Se parte del mismo preprocesado de GTMAE.

## Entrenamiento

In [None]:
# Train E2A-SAGE-MAE on the graph produced by the GTMAE preprocessing.
# NOTE(review): this rebinds `model` and `Z`, replacing the GTMVAE results.
model, Z = train_edge_node_multitask_sage(
    data=data,
    device=device,
    # Edge / node columns
    target_cols=edge_targets,
    node_feat_names=node_feat_names,
    node_target_cols=node_targets,
    # Model and optimiser hyperparameters
    hid=128, out=256, dropout=0.2,
    use_batchnorm=True, l2_norm_layers=False,
    lr=1e-3, weight_decay=1e-4, epochs=250,
    # Early-stopping parameters
    monitor="val_edge_rmse", patience=30, min_delta=0.0,
    # Split parameters
    val_ratio=0.2, test_ratio=0.2, seed=33,
    # Pair-feature parameters
    use_pair_feats=True, pair_mode="cosine_l2_absdiff",
    # Loss parameters
    edge_loss_type="huber", edge_huber_delta=1.0,
    node_loss_type="huber", node_huber_delta=1.0,
    lambda_node=1.0,
    # Edge-ranking parameters (enabled here, unlike in the GTMAE/GTMVAE cells)
    add_ranking=True, lambda_rank=0.3, margin=0.1,
    # Edge dropout and node-target masking
    edge_drop_prob=0.2,
    node_mask_rate=0.2,
    # Other parameters
    dbg_print=False, print_every=25,
)

# TGT

## Preprocesado

In [None]:
# Run the TGT preprocessing with the notebook-wide flags; its result (`prep`)
# is the first argument of compute_tgt_embeddings in the training cell.
p = PreproTGT(
    no_mad=NO_MAD,
    add_idea_emb=ADD_IDEA_EMB,
)
prep = p.run(tabu, temp, mdir, mndi)

## Entrenamiento

In [None]:
# TGT hyperparameters.
# NOTE(review): tf_heads=5 divides neither hidden=96 nor
# hidden + time_enc_dim = 128. If TGT feeds either width to a standard
# multi-head attention layer the head count must divide it — confirm against
# the TGT implementation.
cfg = TGTConfig(
    hidden=96, heads=2, dropout=0.2,
    time_enc_dim=32,
    tf_layers=2, tf_heads=5, tf_ff=256,
)

# Compute the embeddings for target year 2022; the call returns three values.
# NOTE(review): the HGT-TE cell further down rebinds Z_2022.
Z_2022, years, w = compute_tgt_embeddings(
    prep,
    cfg=cfg,
    device=device,
    target_year=2022,
    decay=0.3,
)

# HGT-TE

## Preprocesado
El preprocesado va implícito en la función de entrenamiento.

## Entrenamiento

In [None]:
# Train HGT-TE directly from the raw tables (no separate preprocessing cell).
# NOTE(review): this rebinds Z_2022, replacing the value returned by
# compute_tgt_embeddings in the TGT section.
Z_2022 = train_hgtte(
    # Input tables and device
    tabu=tabu, temp=temp, mdir=mdir, mndi=mndi,
    device=device,
    # Notebook-wide flags
    add_idea_emb=ADD_IDEA_EMB, no_mad=NO_MAD,
    # Column names
    node_id_col="cc", src_col="cc_origen", dst_col="cc_destino", year_col="year",
    # Column selections, all passed as None
    static_cols=None, temp_cols=None,
    edge_attr_cols_dir=None, edge_attr_cols_undir=None,
    # Spatial settings
    spatial_hidden=128, spatial_out=128, heads=2, dropout=0.2,
    # Temporal settings
    temporal_layers=2, temporal_heads=4, temporal_ff=512,
    # Target year and focus weight
    target_year=2022, lambda_focus=0.25,
)