# Librerías y configuraciones

In [None]:
import pandas as pd
import torch
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Local Validation
from TFM_GRAPH_EMBEDDINGS.src.validation.local_validation.LINE import gridsearch_params as gs_line
from TFM_GRAPH_EMBEDDINGS.src.models.GTMAE import PreproGTMAE
from TFM_GRAPH_EMBEDDINGS.src.validation.local_validation.GTMAE import run_gtmae_gridsearch as gs_gtmae
from TFM_GRAPH_EMBEDDINGS.src.validation.local_validation.GTMVAE import run_gtmvae_gridsearch as gs_gtmvae
from TFM_GRAPH_EMBEDDINGS.src.validation.local_validation.E2A_SAGE_GTMAE import run_sage_gridsearch as gs_sage
from TFM_GRAPH_EMBEDDINGS.src.validation.local_validation.TGT import run_tgt_gridsearch as gs_tgt
from TFM_GRAPH_EMBEDDINGS.src.validation.local_validation.HGT_TE import run_hgtte_gridsearch as gs_hgtte

# Global Validation
from TFM_GRAPH_EMBEDDINGS.src.validation.global_validation import global_validation

Se parte de los dataframes generados tras los apartados de preprocesado y EDA.

In [None]:
# Input tables produced by the preprocessing and EDA stages.
# NOTE(review): every `...` below is an elided file path. pd.read_csv does not
# accept Ellipsis, so the real locations must be filled in before running.
# NOTE(review): the names suggest static (tabu), temporal (temp), directed-edge
# (mdir) and undirected-edge (mndi) tables — TODO confirm against the
# preprocessing notebook.
tabu = pd.read_csv(...)
temp = pd.read_csv(...)
mdir = pd.read_csv(...)
mndi = pd.read_csv(...)

# Frame handed to the global validation step as `val_df`.
val_df = pd.read_csv(...)

Se decide si se quiere entrenar incluyendo:
* `NO_MAD`: La ciudad de Madrid. Si es `True`, se excluye; si es `False`, se incluye.
* `ADD_IDEA_EMB`: El _embedding_ medio de las descripciones de anuncios de viviendas. Si es `True`, se incluye; si es `False`, se excluye.

In [None]:
# Feature switch: True adds the mean listing-description embedding, False omits it.
ADD_IDEA_EMB = True
# City filter: True leaves Madrid out of training, False keeps it in.
NO_MAD = False

# Validación local
Búsqueda óptima de hiperparámetros en cada uno de los modelos.

## LINE

In [None]:
# LINE search space: 2 * 1 * 1 * 2 * 3 = 12 value combinations.
params_line = {
    "emb_dim": [128, 256], "n_epochs": [100], "batch_size": [10_000],
    "neg": [5, 10], "lr": [0.005, 0.01, 0.025],
}

# NOTE(review): `df_val` is easy to confuse with `val_df`, the frame loaded
# earlier for the global validation step; they are unrelated objects.
df_val = gs_line(
    device,
    tabu=tabu, temp=temp, mdir=mdir, mndi=mndi,
    gs_dict=params_line,
    validation_cols=[
        "eur_gastos_mean",
        "idea_price_mean",
        "idea_size_mean",
        "n_migr_inter_mean",
    ],
    n_loops=5,
    add_idea_emb=ADD_IDEA_EMB,
    no_mad=NO_MAD,
)
df_val.head()

## GTMAE & GTMVAE

In [None]:
# Build the graph inputs with the GTMAE preprocessing pipeline, honouring the
# notebook-level ADD_IDEA_EMB / NO_MAD switches.
p = PreproGTMAE(add_idea_emb=ADD_IDEA_EMB, no_mad=NO_MAD)
data, nodes, node2idx, node_feat_names, edge_attr_names = p.run(tabu, temp, mdir, mndi)

# Node feature names minus the `idea_emb_*` columns and the two centroid
# coordinates.
# NOTE(review): `node_targets` is not read by any cell visible in this
# notebook — confirm whether it is still needed.
node_targets = [
    feat
    for feat in node_feat_names
    if not (
        feat.startswith("idea_emb_")
        or feat in {"idea_centroid_lat", "idea_centroid_lon"}
    )
]

In [None]:
# Search space shared by the GTMAE and GTMVAE grid searches.
# Only nine keys carry more than one value, giving 2**8 * 3 = 768 value
# combinations if the grid is expanded exhaustively.
# Group headings are inferred from the key names.
gtmae_grid = {
    # architecture
    "hid": [96, 128], "out": [64, 128], "heads": [2], "dropout": [0.1, 0.2],
    # optimiser
    "lr": [1e-3, 5e-4], "weight_decay": [1e-5, 1e-4],
    # edge objective
    "edge_drop_prob": [0.0, 0.2], "edge_loss_type": ["huber"], "edge_huber_delta": [1.0],
    # node objective
    "node_loss_type": ["huber"], "node_huber_delta": [1.0],
    "lambda_node": [0.3, 0.5, 1.0], "node_mask_rate": [0.0, 0.2],
    # ranking term
    "add_ranking": [False, True], "lambda_rank": [0.3], "margin": [0.1],
    # early stopping
    "monitor": ["val_edge_rmse"], "patience": [30], "min_delta": [0.0],
    # data split and reproducibility
    "val_ratio": [0.2], "test_ratio": [0.2], "seed": [33],
    # pair features
    "pair_mode": ["cosine_l2_absdiff"], "use_pair_feats": [True],
    # run length and logging
    "print_every": [25], "epochs": [250],
}

### GTMAE

In [None]:
# GTMAE grid search over `gtmae_grid`.
# The outputs are bound to GTMAE-specific names: the later GTMVAE and E2A-SAGE
# cells unpack the same kind of triple, and sharing generic names would make
# each run silently discard the previous model's results.
# Presumably (results table, best model, best embeddings), judging by the
# original variable names — confirm against the function's documentation.
gtmae_results, gtmae_best_model, gtmae_best_Z = gs_gtmae(
    data=data,
    node_feat_names=node_feat_names,
    edge_attr_names=edge_attr_names,
    device=device,
    param_grid=gtmae_grid,
    sort_weights=(0.5, 0.5),
)
gtmae_results.head()

### GTMVAE

In [None]:
# GTMVAE reuses the GTMAE search space and adds two keys with two values each,
# which multiplies the number of value combinations by 4.
gtmvae_grid = {**gtmae_grid, "beta_kl": [1e-3, 5e-4], "kl_warmup": [10, 25]}

# The outputs are bound to GTMVAE-specific names so that they neither overwrite
# nor get overwritten by the results of the other grid-search cells.
# Presumably (results table, best model, best embeddings), judging by the
# original variable names — confirm against the function's documentation.
gtmvae_results, gtmvae_best_model, gtmvae_best_Z = gs_gtmvae(
    data=data,
    node_feat_names=node_feat_names,
    edge_attr_names=edge_attr_names,
    device=device,
    param_grid=gtmvae_grid,
    sort_weights=(0.5, 0.5),
)
gtmvae_results.head()

## E2A-SAGE-MAE

Se parte del preprocesado de GTMAE.

In [None]:
# Search space for the E2A-SAGE grid search.
# Ten keys carry more than one value, giving 2**9 * 3 = 1536 value
# combinations if the grid is expanded exhaustively.
# Group headings are inferred from the key names.
sage_grid = {
    # architecture
    "hid": [96, 128], "out": [64, 128], "dropout": [0.1, 0.2],
    "use_batchnorm": [True], "l2_norm_layers": [True, False],
    # optimiser
    "lr": [1.3e-3, 6.5e-4], "weight_decay": [1e-5, 1e-4],
    # edge objective
    "edge_drop_prob": [0.0, 0.2], "edge_loss_type": ["huber"], "edge_huber_delta": [1.0],
    # node objective
    "node_loss_type": ["huber"], "node_huber_delta": [1.0],
    "lambda_node": [0.3, 0.5, 1.0], "node_mask_rate": [0.0, 0.2],
    # ranking term
    "add_ranking": [False, True], "lambda_rank": [0.3], "margin": [0.1],
    # early stopping
    "monitor": ["val_edge_rmse"], "patience": [30], "min_delta": [0.0],
    # data split and reproducibility
    "val_ratio": [0.2], "test_ratio": [0.2], "seed": [33],
    # pair features
    "pair_mode": ["cosine_l2_absdiff"], "use_pair_feats": [True],
    # run length and logging
    "print_every": [25], "epochs": [250],
}

In [None]:
# E2A-SAGE grid search over `sage_grid`, run on the graph inputs built by the
# GTMAE preprocessing cell.
# The outputs are bound to SAGE-specific names so that they neither overwrite
# the results of the earlier grid-search cells nor get overwritten by later ones.
# Presumably (results table, best model, best embeddings), judging by the
# original variable names — confirm against the function's documentation.
sage_results, sage_best_model, sage_best_Z = gs_sage(
    data=data,
    node_feat_names=node_feat_names,
    edge_attr_names=edge_attr_names,
    device=device,
    param_grid=sage_grid,
    sort_weights=(0.5, 0.5),
)
sage_results.head()

## TGT

In [None]:
# TGT search space: seven keys with two values each, i.e. 2**7 = 128 value
# combinations if the grid is expanded exhaustively.
tgt_grid = {
    "hidden": [64, 96], "heads": [2, 4],
    "tf_layers": [1, 2], "tf_ff": [128, 256],
    "dropout": [0.1, 0.2],
    "time_enc_dim": [16, 32], "decay": [0.3, 0.5],
}

El preprocesado de los datos va implícito en la función de _GridSearch_.

In [None]:
# TGT grid search; the raw tables are passed in directly.
# NOTE(review): `csv_path` is a placeholder path and is combined with
# resume=True — presumably an existing file there is picked up and continued;
# point it at a real location and confirm the resume semantics before running.
results = gs_tgt(
    tabu=tabu, temp=temp, mdir=mdir, mndi=mndi,
    add_idea_emb=ADD_IDEA_EMB,
    no_mad=NO_MAD,
    target_year=2022,
    target_cols=[
        "eur_renta_b_xhab",
        "n_migr_intra_por_hab",
        "n_paro_por_hab",
    ],
    param_grid=tgt_grid,
    alpha_ridge=1.0,
    k_folds=5,
    device=device,
    verbose=True,
    csv_path="path/to/tgt/grid/results.csv",
    resume=True,
)
results.head()

## HGT-TE

In [None]:
# HGT-TE search space: eight two-valued keys and one three-valued key, i.e.
# 2**8 * 3 = 768 value combinations if the grid is expanded exhaustively.
# NOTE(review): "target_year" is listed here and is also passed explicitly to
# the grid-search call — confirm which of the two the function honours.
hgtte_grid = {
    # spatial encoder (grouping inferred from the key names)
    "spatial_hidden": [64, 128], "spatial_out": [64, 128],
    "heads": [2, 4], "dropout": [0.1, 0.3],
    # temporal encoder
    "temporal_layers": [1, 2], "temporal_heads": [2, 4],
    "temporal_ff": [128, 256], "temporal_pe_dim": [None, 64],
    # remaining settings
    "lambda_focus": [0.1, 0.25, 0.5],
    "target_year": [2022],
}

El preprocesado de los datos va implícito en la función de _GridSearch_.

In [None]:
# HGT-TE grid search; the raw tables are passed in directly.
# The outputs are bound to HGT-TE-specific names so that the embedding result
# does not overwrite the `best_Z` produced by an earlier grid-search cell.
# Presumably (results table, best configuration, its metrics, best embeddings),
# judging by the original variable names — confirm against the function's
# documentation.
# NOTE(review): target_year is passed here and is also a key of `hgtte_grid`.
hgtte_results, hgtte_best_config, hgtte_best_metrics, hgtte_best_Z = gs_hgtte(
    device=device,
    tabu=tabu,
    temp=temp,
    mdir=mdir,
    mndi=mndi,
    add_idea_emb=ADD_IDEA_EMB,
    no_mad=NO_MAD,
    node_id_col="cc",
    src_col="cc_origen",
    dst_col="cc_destino",
    year_col="year",
    static_cols=None,
    temp_cols=None,
    edge_attr_cols_dir=None,
    edge_attr_cols_undir=None,
    target_year=2022,
    static_proxy_cols=["idea_price_mean", "geo_distancia_capital"],
    temp_proxy_cols=["n_ss_general_por_hab", "n_nacimientos_por_hab"],
    drop_proxy_rows_with_na=True,
    param_grid=hgtte_grid,
    gs_mode="linear_probe",
)
print(hgtte_best_config)
hgtte_results.head()

# Validación global
Comparativa entre los _embeddings_ resultantes de entrenar cada modelo con su configuración óptima (las mejores combinaciones de hiperparámetros obtenidas en el apartado de "Validación local").

In [None]:
# Embeddings generated after local validation, one CSV per model.
# NOTE(review): every `...` below is an elided file path and must be filled in
# before running. No cell visible in this notebook writes these files, so they
# are presumably exported in a separate step — confirm.
emb_LINE = pd.read_csv(...)
emb_GTMAE = pd.read_csv(...)
emb_GTMVAE = pd.read_csv(...)
emb_E2ASAGEMAE = pd.read_csv(...)
emb_TGT = pd.read_csv(...)
emb_HGTTE = pd.read_csv(...)

# (label, embedding frame) pairs in the order they are handed to the global
# validation; built from a label -> frame mapping, whose insertion order is kept.
embeddings = list(
    {
        "LINE": emb_LINE,
        "GTMAE": emb_GTMAE,
        "GTMVAE": emb_GTMVAE,
        "E2A-SAGE-MAE": emb_E2ASAGEMAE,
        "TGT": emb_TGT,
        "HGT-TE": emb_HGTTE,
    }.items()
)

In [None]:
# Score every candidate embedding against the shared validation frame.
# NOTE(review): add_idea_emb is hard-coded to False here although the
# notebook-level ADD_IDEA_EMB switch is True — confirm this is intentional.
global_results = global_validation(
    tabu=tabu, temp=temp, mdir=mdir, mndi=mndi,
    no_mad=NO_MAD,
    add_idea_emb=False,
    val_df=val_df,
    val_n_splits=3,
    node_id_col="cc",
    year_col="year",
    emb_year=2022,
    embeddings=embeddings,
    seed=33,
)
global_results.head()