# 1. Ingeniería de Características y Creación de Secuencias LIGHTGBM

## 2.1. Configuración e importación de librerías

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import lightgbm as lgb
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report
from tqdm import tqdm
import dask.dataframe as dd

## 2.2. Carga 

In [None]:
import pandas as pd
import numpy as np
FINAL_DATASET_PATH = './dataset_FINAL_COMPLETO'

# Definición de los horizontes de predicción (en número de registros/horas)
HORIZONTES_PREDICCION = {
    'HOY': 1, 
    'MANANA': 24, 
    '7DIAS': 168
}
ID_COLUMN = 'POLISSA_SUBM'

# Lista de columnas de metadatos (para exclusión)
METADATA_AND_TARGET_COLUMNS = [
    'POLISSA_SUBM', 'NUMEROSERIECONTADOR', 'SECCIO_CENSAL', 'KEY_DISTRITO', 'KEY_SECCION', 
    'FECHA_HORA', 'FECHA', 'HORA', 'DATA_INI_FACT', 'DATA_FIN_FACT', 'FECHA_HORA_CRONO',
    'FUGA_DETECTADA', 'FUGA_REITERADA', 
]

# --- 2. Carga y Preparación de Datos ---
print("--- 2. Carga y Preparación Inicial ---")

try:
    df = pd.read_parquet(FINAL_DATASET_PATH) 
except FileNotFoundError:
    print(f"ERROR: No se encontró el dataset final en {FINAL_DATASET_PATH}.")
    exit()

--- 2. Carga y Preparación Inicial ---


In [2]:
# --- 0. Configuración y Librerías ---
import pandas as pd
import numpy as np
import lightgbm as lgb
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report
from tqdm import tqdm
import dask.dataframe as dd

# --- 1. CONFIGURACIÓN DEL PROYECTO ---
FINAL_DATASET_PATH = './dataset_FINAL_COMPLETO'
ID_COLUMN = 'POLISSA_SUBM'

HORIZONTES_PREDICCION = {'HOY': 1, 'MANANA': 24, '7DIAS': 168}

# Usamos el 10% de los datos para evitar ArrowMemoryError (16GB)
SUBSAMPLE_PERCENT = 0.10 

# Listas de Columnas
METADATA_AND_TARGET_COLUMNS = [
    'POLISSA_SUBM', 'NUMEROSERIECONTADOR', 'SECCIO_CENSAL', 'KEY_DISTRITO', 'KEY_SECCION', 
    'FECHA_HORA', 'FECHA', 'HORA', 'DATA_INI_FACT', 'DATA_FIN_FACT', 'FECHA_HORA_CRONO',
    'FUGA_DETECTADA', 'FUGA_REITERADA', 
]

# Feature base (Numéricas + Categóricas)
FEATURE_COLUMNS_BASE = [
    'CONSUMO_REAL', 'TEMP_MEDIA', 'TEMP_MIN', 'TEMP_MAX', 'PRECIPITACION', 
    'HUMEDAD_RELATIVA_MEDIA', 'FESTIVO', 'Renda_Media_Euros', 'Antig_1901_a_1940', 
    'Antig_1941_a_1950', 'Antig_1951_a_1960', 'Antig_1961_a_1970', 'Antig_1971_a_1980', 
    'Antig_1981_a_1990', 'Antig_1991_a_2000', 'Antig_2001_a_2010', 'Antig_2011_a_2020', 
    'Antig_2021_a_2030', 'Antig_Menor_1901', 'Pob_0_14_anys', 'Pob_15_24_anys', 
    'Pob_25_39_anys', 'Pob_40_64_anys', 'Pob_65_o_mas', 'Pob_Total_Seccio', 
    'Pct_Pob_0_14_anys', 'Pct_Pob_15_24_anys', 'Pct_Pob_25_39_anys', 
    'Pct_Pob_40_64_anys', 'Pct_Pob_65_o_mas', 'Num_Obres_Recents',
    'US_AIGUA_SUBM', 'TIPO_DIA' 
]

# --- 2. Carga y Muestreo del Dataset (DASK) ---
print("--- 2. Carga Distribuida y Muestreo ---")

try:
    df_dask_full = dd.read_parquet(FINAL_DATASET_PATH)
    print(f"Aplicando muestreo del {SUBSAMPLE_PERCENT:.0%}...")
    df_dask_sampled = df_dask_full.sample(frac=SUBSAMPLE_PERCENT, random_state=42)
    df = df_dask_sampled.compute()
    print(f"Dataset cargado en RAM: {df.shape[0]} filas.")
except Exception as e:
    print(f"ERROR CRÍTICO: {e}")
    exit()

# 2.1. Ordenamiento Cronológico
print("Ordenando cronológicamente...")
df['FECHA_HORA_CRONO'] = pd.to_datetime(df['FECHA'].astype(str) + ' ' + df['HORA'].astype(str), errors='coerce')
df = df.sort_values(by=[ID_COLUMN, 'FECHA_HORA_CRONO']).reset_index(drop=True)

# --- 3. Ingeniería de Características ---
print("\n--- 3. Ingeniería de Características ---")

LAG_FEATURES = ['CONSUMO_REAL', 'TEMP_MEDIA', 'PRECIPITACION']
LAG_STEPS = [1, 6, 12, 24, 72] 

for col in tqdm(LAG_FEATURES, desc="Creando Lags"):
    for lag in LAG_STEPS:
        df[f'{col}_LAG_{lag}H'] = df.groupby(ID_COLUMN)[col].shift(lag)

# Rolling
WINDOW_SIZE = 168
df['CONSUMO_ROLLING_MEAN_7D'] = df.groupby(ID_COLUMN)['CONSUMO_REAL'].transform(lambda x: x.rolling(WINDOW_SIZE, min_periods=1).mean())
df['CONSUMO_ROLLING_STD_7D'] = df.groupby(ID_COLUMN)['CONSUMO_REAL'].transform(lambda x: x.rolling(WINDOW_SIZE, min_periods=1).std())

# Targets
TARGET_COLS = ['TARGET_HOY', 'TARGET_MANANA', 'TARGET_7DIAS']
df['TARGET_HOY'] = df['FUGA_DETECTADA'].shift(-1)
df['TARGET_MANANA'] = df['FUGA_DETECTADA'].shift(-24)
df['TARGET_7DIAS'] = df['FUGA_DETECTADA'].shift(-168)

# --- 4. Definición de Features Finales ---
lag_cols = [c for c in df.columns if '_LAG_' in c or '_ROLLING_' in c]
X_COLS = [c for c in FEATURE_COLUMNS_BASE + lag_cols if c in df.columns and c not in METADATA_AND_TARGET_COLUMNS]

print(f"Features Finales ({len(X_COLS)}): {X_COLS[:5]} ...")

# Limpieza de NaNs generados por Lags
df = df.dropna(subset=['TARGET_7DIAS'] + X_COLS)

# Definición de Categóricas
CATEGORICAL_COLS = ['US_AIGUA_SUBM', 'TIPO_DIA']
for col in CATEGORICAL_COLS:
    if col in X_COLS:
        df[col] = df[col].astype('category')

# Time Split
split_index = int(len(df) * 0.80)
df_train = df.iloc[:split_index].copy()
df_test = df.iloc[split_index:].copy()

print(f"Train: {len(df_train)} | Test: {len(df_test)}")

# --- 5. Entrenamiento ---
print("\n--- 5. ENTRENAMIENTO ---")
modelos_finales = {}
output_dir = '../data/modelos_finales/'
os.makedirs(output_dir, exist_ok=True)

for target_col in TARGET_COLS:
    print(f"\n[PROCESANDO] {target_col}...")
    
    # Preparar datos
    X = df_train[X_COLS].copy()
    y = df_train[target_col].astype(int)
    
    # --- LIMPIEZA DE TIPOS ROBUSTA (Anti-ValueError) ---
    num_cols = [c for c in X_COLS if c not in CATEGORICAL_COLS]
    for col in num_cols:
        if X[col].dtype == 'object':
            X[col] = X[col].astype(str).str.replace(',', '.', regex=False)
        X[col] = pd.to_numeric(X[col], errors='coerce').fillna(0.0)

    # División interna para validación (Early Stopping)
    X_fit, X_val, y_fit, y_val = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
    
    # --- CONFIGURACIÓN LIGHTGBM (Con métrica 'auc') ---
    lgbm = lgb.LGBMClassifier(
        objective='binary',
        metric='auc', # <--- CORRECCIÓN CRÍTICA: Usamos 'auc' que es estándar
        is_unbalance=True,
        n_estimators=300,
        learning_rate=0.05,
        n_jobs=-1,
        random_state=42,
        verbose=-1
    )
    
    # --- ENTRENAMIENTO ---
    lgbm.fit(
        X_fit, y_fit,
        categorical_feature=[c for c in X_COLS if c in CATEGORICAL_COLS],
        eval_set=[(X_val, y_val)], # Set de validación explícito
        eval_metric='auc', # <--- CORRECCIÓN CRÍTICA: Debe coincidir
        callbacks=[lgb.early_stopping(50, verbose=True)]
    )
    
    modelos_finales[target_col] = lgbm
    joblib.dump(lgbm, os.path.join(output_dir, f'lgbm_model_{target_col}.joblib'))

# --- 6. Evaluación Final ---
print("\n--- 6. EVALUACIÓN FINAL (TEST SET) ---")

# Preparamos X_test igual que X_train
X_test_final = df_test[X_COLS].copy()
for col in num_cols:
    if X_test_final[col].dtype == 'object':
        X_test_final[col] = X_test_final[col].astype(str).str.replace(',', '.', regex=False)
    X_test_final[col] = pd.to_numeric(X_test_final[col], errors='coerce').fillna(0.0)

for target_col, modelo in modelos_finales.items():
    y_test_final = df_test[target_col].astype(int)
    
    # Predicción
    y_pred_proba = modelo.predict_proba(X_test_final)[:, 1]
    
    # Métricas
    auc_pr = average_precision_score(y_test_final, y_pred_proba)
    roc_auc = roc_auc_score(y_test_final, y_pred_proba)
    
    print(f"\n>>> RESULTADOS {target_col} <<<")
    print(f"AUC-PR: {auc_pr:.4f} | AUC-ROC: {roc_auc:.4f}")
    
    y_pred = (y_pred_proba > 0.5).astype(int)
    print(classification_report(y_test_final, y_pred, target_names=['No Fuga', 'Fuga']))

print("\n✅ ENTRENAMIENTO COMPLETADO.")

--- 2. Carga Distribuida y Muestreo ---
Aplicando muestreo del 10%...
Dataset cargado en RAM: 7501955 filas.
Ordenando cronológicamente...

--- 3. Ingeniería de Características ---


Creando Lags: 100%|██████████| 3/3 [00:05<00:00,  1.67s/it]


Features Finales (50): ['CONSUMO_REAL', 'TEMP_MEDIA', 'TEMP_MIN', 'TEMP_MAX', 'PRECIPITACION'] ...
Train: 5848018 | Test: 1462005

--- 5. ENTRENAMIENTO ---

[PROCESANDO] TARGET_HOY...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[300]	valid_0's auc: 0.98047

[PROCESANDO] TARGET_MANANA...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[300]	valid_0's auc: 0.978312

[PROCESANDO] TARGET_7DIAS...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[300]	valid_0's auc: 0.966331

--- 6. EVALUACIÓN FINAL (TEST SET) ---

>>> RESULTADOS TARGET_HOY <<<
AUC-PR: 0.8334 | AUC-ROC: 0.6690
              precision    recall  f1-score   support

     No Fuga       0.37      0.77      0.50    434170
        Fuga       0.82      0.45      0.59   1027835

    accuracy                           0.55   1462005
   macro avg     

In [3]:
import joblib
import os

# 1. Configurar dónde se guardarán
output_dir = '../data/modelos_finales/'
os.makedirs(output_dir, exist_ok=True)

print("--- Guardando Modelos desde la Memoria ---")

# 2. Verificar si la variable existe en memoria
if 'modelos_finales' in locals() or 'modelos_finales' in globals():
    
    # 3. Recorrer el diccionario y guardar cada modelo
    for target_col, modelo in modelos_finales.items():
        filename = f'lgbm_model_{target_col}.joblib'
        filepath = os.path.join(output_dir, filename)
        
        joblib.dump(modelo, filepath)
        print(f"✅ Modelo guardado exitosamente: {filepath}")
        
    print("\n¡Todos los modelos han sido persistidos en disco!")

else:
    print("❌ ERROR: No encuentro la variable 'modelos_finales' en la memoria.")
    print("Asegúrate de no haber reiniciado el kernel después del entrenamiento.")

--- Guardando Modelos desde la Memoria ---
✅ Modelo guardado exitosamente: ../data/modelos_finales/lgbm_model_TARGET_HOY.joblib
✅ Modelo guardado exitosamente: ../data/modelos_finales/lgbm_model_TARGET_MANANA.joblib
✅ Modelo guardado exitosamente: ../data/modelos_finales/lgbm_model_TARGET_7DIAS.joblib

¡Todos los modelos han sido persistidos en disco!


In [None]:
# --- Bloque para Exportar Resultados a CSV ---
import pandas as pd
import os

print("--- Generando CSV de Resultados ---")

# 1. Verificamos que existan los datos en memoria
if 'df_test' in locals() and 'modelos_finales' in locals():
    
    # 2. Creamos un DataFrame base con la info del cliente
    # (Asegúrate de incluir la fecha para el análisis temporal)
    cols_info = [ID_COLUMN, 'FECHA_HORA_CRONO'] if 'FECHA_HORA_CRONO' in df_test.columns else [ID_COLUMN]
    df_resultados = df_test[cols_info].copy()

    # 3. Iteramos por cada modelo para añadir sus predicciones
    for target_col, modelo in modelos_finales.items():
        print(f"Procesando predicciones para: {target_col}...")
        
        # Preparar X_test (Mismo casting que en el entrenamiento para evitar errores)
        X_test_iter = df_test[X_COLS].copy()
        num_cols = [c for c in X_COLS if c not in CATEGORICAL_COLS]
        for col in num_cols:
            # Limpieza de tipos robusta
            if X_test_iter[col].dtype == 'object':
                X_test_iter[col] = X_test_iter[col].astype(str).str.replace(',', '.', regex=False)
            X_test_iter[col] = pd.to_numeric(X_test_iter[col], errors='coerce').fillna(0.0)
            
        # Predecir Probabilidad (La "Confianza" del modelo)
        probs = modelo.predict_proba(X_test_iter, raw_score=False)[:, 1]
        
        # Guardar en el DataFrame
        df_resultados[f'PROB_{target_col}'] = probs
        df_resultados[f'REAL_{target_col}'] = df_test[target_col].astype(int).values

    # 4. Guardar a CSV
    ruta_csv = '../data/analisis_predicciones.csv'
    df_resultados.to_csv(ruta_csv, index=False, sep=';', decimal=',') 
    # Nota: Uso sep=';' y decimal=',' para que Excel en español lo abra directo
    
    print(f"\n✅ Archivo guardado exitosamente: {ruta_csv}")
    print("Columnas generadas:", df_resultados.columns.tolist())

else:
    print("❌ Error: No se encuentran 'df_test' o 'modelos_finales' en memoria. Ejecuta el entrenamiento primero.")

In [5]:
#load dataset ../data/analisis_predicciones.csv


import pandas as pd
df_analysis = pd.read_csv('../data/analisis_predicciones.csv', sep=';', decimal=',')


In [6]:
df_analysis.head()

Unnamed: 0,POLISSA_SUBM,FECHA_HORA_CRONO,PROB_TARGET_HOY,REAL_TARGET_HOY,PROB_TARGET_MANANA,REAL_TARGET_MANANA,PROB_TARGET_7DIAS,REAL_TARGET_7DIAS
0,TLFF6N34NTT4Y4EL,2024-11-03 13:27:29,0.975845,1,0.959602,1,0.961532,1
1,TLFF6N34NTT4Y4EL,2024-11-03 14:27:28,0.975845,1,0.959602,1,0.961532,1
2,TLFF6N34NTT4Y4EL,2024-11-03 16:27:28,0.975845,1,0.959602,1,0.961532,1
3,TLFF6N34NTT4Y4EL,2024-11-03 16:27:28,0.975845,1,0.959602,1,0.961532,1
4,TLFF6N34NTT4Y4EL,2024-11-03 16:27:28,0.975845,1,0.959602,1,0.961532,1


In [7]:
df.head()

Unnamed: 0,POLISSA_SUBM,DATA_INI_FACT,DATA_FIN_FACT,US_AIGUA_SUBM,SECCIO_CENSAL,NUMEROSERIECONTADOR,CONSUMO_REAL,FECHA_HORA,FUGA_DETECTADA,FUGA_REITERADA,...,PRECIPITACION_LAG_1H,PRECIPITACION_LAG_6H,PRECIPITACION_LAG_12H,PRECIPITACION_LAG_24H,PRECIPITACION_LAG_72H,CONSUMO_ROLLING_MEAN_7D,CONSUMO_ROLLING_STD_7D,TARGET_HOY,TARGET_MANANA,TARGET_7DIAS
72,22KX53JQU5AD26SG,2023-04-25 00:00:00,2023-06-23 00:00:00,DOMÈSTIC,801908047,KKF44CYBRFOXT57S,134.0,2024-01-04 21:38:37,0.0,0.0,...,3,3,3,0,0,92.164384,28.278876,0.0,0.0,0.0
73,22KX53JQU5AD26SG,2023-04-25 00:00:00,2023-06-23 00:00:00,DOMÈSTIC,801908047,KKF44CYBRFOXT57S,123.0,2024-01-04 23:38:38,0.0,0.0,...,3,3,3,0,0,92.581081,28.312351,0.0,0.0,0.0
74,22KX53JQU5AD26SG,2023-08-23 00:00:00,2023-10-24 00:00:00,DOMÈSTIC,801908047,KKF44CYBRFOXT57S,123.0,2024-01-04 23:38:38,0.0,0.0,...,3,3,3,0,0,92.986667,28.33892,0.0,0.0,0.0
75,22KX53JQU5AD26SG,2024-06-26 00:00:00,2024-08-23 00:00:00,DOMÈSTIC,801908047,KKF44CYBRFOXT57S,82.0,2024-01-05 00:38:38,0.0,0.0,...,3,3,3,0,0,92.842105,28.177557,0.0,0.0,0.0
76,22KX53JQU5AD26SG,2023-10-24 00:00:00,2023-12-22 00:00:00,DOMÈSTIC,801908047,KKF44CYBRFOXT57S,82.0,2024-01-05 02:38:38,0.0,0.0,...,84,3,3,0,0,92.701299,28.018821,0.0,0.0,0.0


In [8]:
subset_duplicados = ['POLISSA_SUBM', 'FECHA', 'HORA']
df = df.drop_duplicates(subset=subset_duplicados)

In [None]:
# --- 0. Configuración y Librerías ---
import pandas as pd
import numpy as np
import lightgbm as lgb
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report
from tqdm import tqdm
import dask.dataframe as dd

# --- 1. CONFIGURACIÓN DEL PROYECTO ---
FINAL_DATASET_PATH = '../data/processed/dataset_FINAL_COMPLETO.parquet'
ID_COLUMN = 'POLISSA_SUBM'

HORIZONTES_PREDICCION = {'HOY': 1, 'MANANA': 24, '7DIAS': 168}

# Memoria: Usamos el 10% de los datos para evitar ArrowMemoryError
SUBSAMPLE_PERCENT = 0.10 

# --- 2. DEFINICIÓN DE FEATURES (LIMPIEZA Y ORDENACIÓN) ---

# 2.1. Columnas a EXCLUIR explícitamente (Metadata y Redundantes)
# Eliminamos: IDs, Fechas, Targets, y las redundantes (Temp Min/Max, Población Absoluta)
EXCLUDED_COLS = [
    # Metadata y Claves
    'POLISSA_SUBM', 'NUMEROSERIECONTADOR', 'SECCIO_CENSAL', 'KEY_DISTRITO', 'KEY_SECCION', 
    'FECHA_HORA', 'FECHA', 'HORA', 'FECHA_HORA_CRONO', 
    # Fechas Administrativas (Sin valor predictivo horario)
    'DATA_INI_FACT', 'DATA_FIN_FACT', 
    # Targets
    'FUGA_DETECTADA', 'FUGA_REITERADA', 
    # Redundancias Climáticas (Alta correlación con Media)
    'TEMP_MIN', 'TEMP_MAX', 
    # Redundancias Demográficas (Usamos Porcentajes en su lugar)
    'Pob_0_14_anys', 'Pob_15_24_anys', 'Pob_25_39_anys', 'Pob_40_64_anys', 'Pob_65_o_mas'
]

# 2.2. Lista Ordenada de Features Base (23 Columnas)
FEATURE_COLUMNS_BASE = [
    # Core
    'CONSUMO_REAL', 
    # Contexto
    'FESTIVO', 'TIPO_DIA', 'US_AIGUA_SUBM',
    # Clima
    'TEMP_MEDIA', 'PRECIPITACION', 'HUMEDAD_RELATIVA_MEDIA',
    # Socioeconómico
    'Renda_Media_Euros', 'Num_Obres_Recents', 'Pob_Total_Seccio',
    # Demografía (Perfil)
    'Pct_Pob_0_14_anys', 'Pct_Pob_15_24_anys', 'Pct_Pob_25_39_anys', 
    'Pct_Pob_40_64_anys', 'Pct_Pob_65_o_mas',
    # Infraestructura (Antigüedad)
    'Antig_Menor_1901', 'Antig_1901_a_1940', 'Antig_1941_a_1950', 'Antig_1951_a_1960',
    'Antig_1961_a_1970', 'Antig_1971_a_1980', 'Antig_1981_a_1990', 'Antig_1991_a_2000',
    'Antig_2001_a_2010', 'Antig_2011_a_2020', 'Antig_2021_a_2030'
]

# --- 3. Carga y Muestreo del Dataset (DASK) ---
print("--- 3. Carga Distribuida y Muestreo ---")

try:
    df_dask_full = dd.read_parquet(FINAL_DATASET_PATH)
    print(f"Aplicando muestreo del {SUBSAMPLE_PERCENT:.0%}...")
    df_dask_sampled = df_dask_full.sample(frac=SUBSAMPLE_PERCENT, random_state=42)
    df = df_dask_sampled.compute()
    print(f"Dataset cargado en RAM: {df.shape[0]} filas.")
except Exception as e:
    print(f"ERROR CRÍTICO: {e}")
    exit()

# 3.1. Ordenamiento Cronológico
print("Ordenando cronológicamente...")
df['FECHA_HORA_CRONO'] = pd.to_datetime(df['FECHA'].astype(str) + ' ' + df['HORA'].astype(str), errors='coerce')
df = df.sort_values(by=[ID_COLUMN, 'FECHA_HORA_CRONO']).reset_index(drop=True)

# --- 4. Ingeniería de Características (Feature Engineering) ---
print("\n--- 4. Ingeniería de Características ---")

# Filtramos el DF para quedarnos solo con las columnas base antes de generar lags
# (Esto ahorra memoria antes de duplicar columnas)
cols_to_keep = FEATURE_COLUMNS_BASE + [ID_COLUMN, 'FECHA_HORA_CRONO', 'FUGA_DETECTADA']
# Solo mantenemos las que existen en el DF
cols_to_keep = [c for c in cols_to_keep if c in df.columns]
df = df[cols_to_keep]

# 4.1. Lags (Historia)
LAG_FEATURES = ['CONSUMO_REAL', 'TEMP_MEDIA', 'PRECIPITACION']
LAG_STEPS = [1, 6, 12, 24, 72] 

for col in tqdm(LAG_FEATURES, desc="Creando Lags"):
    for lag in LAG_STEPS:
        if col in df.columns:
            df[f'{col}_LAG_{lag}H'] = df.groupby(ID_COLUMN)[col].shift(lag)

# 4.2. Rolling (Tendencias)
WINDOW_SIZE = 168
if 'CONSUMO_REAL' in df.columns:
    df['CONSUMO_ROLLING_MEAN_7D'] = df.groupby(ID_COLUMN)['CONSUMO_REAL'].transform(lambda x: x.rolling(WINDOW_SIZE, min_periods=1).mean())
    df['CONSUMO_ROLLING_STD_7D'] = df.groupby(ID_COLUMN)['CONSUMO_REAL'].transform(lambda x: x.rolling(WINDOW_SIZE, min_periods=1).std())

# 4.3. Targets
TARGET_COLS = ['TARGET_HOY', 'TARGET_MANANA', 'TARGET_7DIAS']
df['TARGET_HOY'] = df['FUGA_DETECTADA'].shift(-1)
df['TARGET_MANANA'] = df['FUGA_DETECTADA'].shift(-24)
df['TARGET_7DIAS'] = df['FUGA_DETECTADA'].shift(-168)

# --- 5. Definición Final de X e y ---
# Recopilamos todas las features generadas
X_COLS = [c for c in df.columns if c not in EXCLUDED_COLS and c not in TARGET_COLS + ['FUGA_DETECTADA', ID_COLUMN, 'FECHA_HORA_CRONO']]

print(f"Features Finales ({len(X_COLS)}): {X_COLS[:5]} ...")

# Limpieza de NaNs
df = df.dropna(subset=['TARGET_7DIAS'] + X_COLS)

# Definición de Categóricas
CATEGORICAL_COLS = ['US_AIGUA_SUBM', 'TIPO_DIA']
for col in CATEGORICAL_COLS:
    if col in X_COLS:
        df[col] = df[col].astype('category')

# Time Split
split_index = int(len(df) * 0.80)
df_train = df.iloc[:split_index].copy()
df_test = df.iloc[split_index:].copy()

print(f"Train: {len(df_train)} | Test: {len(df_test)}")

# --- 6. Entrenamiento ---
print("\n--- 6. ENTRENAMIENTO ---")
modelos_finales = {}
output_dir = '../data/modelos_finales/'
os.makedirs(output_dir, exist_ok=True)

for target_col in TARGET_COLS:
    print(f"\n[PROCESANDO] {target_col}...")
    
    # Preparar datos
    X = df_train[X_COLS].copy()
    y = df_train[target_col].astype(int)
    
    # Limpieza de tipos (Anti-ValueError)
    num_cols = [c for c in X_COLS if c not in CATEGORICAL_COLS]
    for col in num_cols:
        if X[col].dtype == 'object':
            X[col] = X[col].astype(str).str.replace(',', '.', regex=False)
        X[col] = pd.to_numeric(X[col], errors='coerce').fillna(0.0)

    # División interna
    X_fit, X_val, y_fit, y_val = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
    
    # LightGBM
    lgbm = lgb.LGBMClassifier(
        objective='binary',
        metric='auc', 
        is_unbalance=True,
        n_estimators=300,
        learning_rate=0.05,
        n_jobs=-1,
        random_state=42,
        verbose=-1
    )
    
    lgbm.fit(
        X_fit, y_fit,
        categorical_feature=[c for c in X_COLS if c in CATEGORICAL_COLS],
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(50, verbose=True)]
    )
    
    modelos_finales[target_col] = lgbm
    joblib.dump(lgbm, os.path.join(output_dir, f'lgbm_model_{target_col}.joblib'))

# --- 7. Evaluación Final ---
print("\n--- 7. EVALUACIÓN FINAL (TEST SET) ---")

# Preparamos X_test
X_test_final = df_test[X_COLS].copy()
for col in num_cols:
    if X_test_final[col].dtype == 'object':
        X_test_final[col] = X_test_final[col].astype(str).str.replace(',', '.', regex=False)
    X_test_final[col] = pd.to_numeric(X_test_final[col], errors='coerce').fillna(0.0)

for target_col, modelo in modelos_finales.items():
    y_test_final = df_test[target_col].astype(int)
    y_pred_proba = modelo.predict_proba(X_test_final)[:, 1]
    
    auc_pr = average_precision_score(y_test_final, y_pred_proba)
    roc_auc = roc_auc_score(y_test_final, y_pred_proba)
    
    print(f"\n>>> RESULTADOS {target_col} <<<")
    print(f"AUC-PR: {auc_pr:.4f} | AUC-ROC: {roc_auc:.4f}")
    
    y_pred = (y_pred_proba > 0.5).astype(int)
    print(classification_report(y_test_final, y_pred, target_names=['No Fuga', 'Fuga']))

print("\n✅ ENTRENAMIENTO COMPLETADO.")

--- 3. Carga Distribuida y Muestreo ---
ERROR CRÍTICO: An error occurred while calling the read_parquet method registered to the pandas backend.
Original Message: c:/Users/barco/OneDrive/Documentos/GitHub/GeSAI-AB_Data_Challenge/project-notebooks/../data/processed/dataset_FINAL_COMPLETO.parquet
Ordenando cronológicamente...

--- 4. Ingeniería de Características ---


Creando Lags: 100%|██████████| 3/3 [00:03<00:00,  1.05s/it]


Features Finales (43): ['CONSUMO_REAL', 'FESTIVO', 'TIPO_DIA', 'US_AIGUA_SUBM', 'TEMP_MEDIA'] ...
Train: 3514661 | Test: 878666

--- 6. ENTRENAMIENTO ---

[PROCESANDO] TARGET_HOY...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[300]	valid_0's auc: 0.971684

[PROCESANDO] TARGET_MANANA...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[300]	valid_0's auc: 0.966413

[PROCESANDO] TARGET_7DIAS...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[300]	valid_0's auc: 0.945164

--- 7. EVALUACIÓN FINAL (TEST SET) ---

>>> RESULTADOS TARGET_HOY <<<
AUC-PR: 0.7037 | AUC-ROC: 0.6363
              precision    recall  f1-score   support

     No Fuga       0.51      0.70      0.59    376311
        Fuga       0.69      0.49      0.57    502355

    accuracy                           0.58    878666
   macro avg      

: 