In [1]:
# TFM_07_final_taninig_col_importance_and_export.ipynb

In [2]:
import pandas as pd
import numpy as np
import time
import logging
import joblib
import json
import math
import warnings
from tqdm import tqdm
import gc

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, cross_validate

pd.set_option('display.float_format', lambda x: '{:.12f}'.format(x))
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

print("Importaciones completadas.")

Importaciones completadas.


In [3]:
LOG_FILENAME = 'training_best_gbr.log'
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILENAME, mode='w', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logging.info(f"Iniciando script de entrenamiento para el mejor GBR. Log en: {LOG_FILENAME}")
print("Logging configurado.")

2025-04-06 18:38:54,236 - INFO - [2339540163.py:10] - Iniciando script de entrenamiento para el mejor GBR. Log en: training_best_gbr.log


Logging configurado.


In [4]:
try:
    df_train = pd.read_csv("datos_divididos/datos_financieros_train_completo.csv")
    df_test = pd.read_csv("datos_divididos/datos_financieros_test_completo.csv")
    logging.info("Archivos CSV cargados correctamente.")
    print("Archivos CSV cargados.")
    print(f"Train shape: {df_train.shape}")
    print(f"Test shape: {df_test.shape}")
except FileNotFoundError as e:
    logging.error(f"Error al cargar los archivos CSV: {e}. Asegúrate de que los archivos están en la ruta correcta.")
    print(f"Error carga archivo: {e}")
    raise e
except Exception as e:
    logging.error(f"Ocurrió un error inesperado al cargar los datos: {e}")
    print(f"Error carga datos: {e}")
    raise e

2025-04-06 18:39:32,839 - INFO - [1538478532.py:4] - Archivos CSV cargados correctamente.


Archivos CSV cargados.
Train shape: (85932, 442)
Test shape: (21483, 442)


In [5]:
logging.info("Iniciando limpieza inicial...")
print("Limpieza inicial datos.")

drop_cols_initial = ['timestamp', 'Unnamed: 0']
cols_to_drop_train = [col for col in drop_cols_initial if col in df_train.columns]
cols_to_drop_test = [col for col in drop_cols_initial if col in df_test.columns]

if cols_to_drop_train:
    df_train = df_train.drop(columns=cols_to_drop_train)
    logging.info(f"Columnas eliminadas de train: {cols_to_drop_train}")
    print(f"Cols eliminadas train: {cols_to_drop_train}")
if cols_to_drop_test:
    df_test = df_test.drop(columns=cols_to_drop_test)
    logging.info(f"Columnas eliminadas de test: {cols_to_drop_test}")
    print(f"Cols eliminadas test: {cols_to_drop_test}")

logging.info("Reseteando índices...")
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
logging.info("Índices reseteados.")
print("Índices reseteados.")

logging.info("Comprobando y eliminando duplicados...")
print("Comprobando duplicados.")
duplicados_train = df_train.duplicated().sum()
logging.info(f"Duplicados encontrados en train: {duplicados_train}")
print(f"Duplicados train: {duplicados_train}")
if duplicados_train > 0:
    df_train.drop_duplicates(inplace=True)
    logging.info("Duplicados eliminados en train.")
    print(f"Duplicados eliminados train. Shape: {df_train.shape}")

duplicados_test = df_test.duplicated().sum()
logging.info(f"Duplicados encontrados en test: {duplicados_test}")
print(f"Duplicados test: {duplicados_test}")
if duplicados_test > 0:
    df_test.drop_duplicates(inplace=True)
    logging.info("Duplicados eliminados en test.")
    print(f"Duplicados eliminados test. Shape: {df_test.shape}")

print("Limpieza inicial completada.")
print(f"Train shape final: {df_train.shape}")
print(f"Test shape final: {df_test.shape}")

2025-04-06 18:39:32,912 - INFO - [2472782200.py:1] - Iniciando limpieza inicial...


Limpieza inicial datos.


2025-04-06 18:39:33,316 - INFO - [2472782200.py:10] - Columnas eliminadas de train: ['timestamp']
2025-04-06 18:39:33,493 - INFO - [2472782200.py:14] - Columnas eliminadas de test: ['timestamp']


Cols eliminadas train: ['timestamp']


2025-04-06 18:39:33,503 - INFO - [2472782200.py:17] - Reseteando índices...


Cols eliminadas test: ['timestamp']


2025-04-06 18:39:34,018 - INFO - [2472782200.py:20] - Índices reseteados.
2025-04-06 18:39:34,023 - INFO - [2472782200.py:23] - Comprobando y eliminando duplicados...


Índices reseteados.
Comprobando duplicados.


2025-04-06 18:39:45,405 - INFO - [2472782200.py:26] - Duplicados encontrados en train: 0


Duplicados train: 0


2025-04-06 18:39:47,328 - INFO - [2472782200.py:34] - Duplicados encontrados en test: 0


Duplicados test: 0
Limpieza inicial completada.
Train shape final: (85932, 441)
Test shape final: (21483, 441)


In [None]:
logging.info("Iniciando limpieza inicial...")
print("Limpieza inicial datos.")

# Columnas a eliminar inicialmente
# AÑADIMOS 'PotentialLiquidityGap' A LA LISTA
drop_cols_initial = ['timestamp', 'Unnamed: 0', 'PotentialLiquidityGap']
cols_to_drop_train = [col for col in drop_cols_initial if col in df_train.columns]
cols_to_drop_test = [col for col in drop_cols_initial if col in df_test.columns]

if cols_to_drop_train:
    df_train = df_train.drop(columns=cols_to_drop_train)
    logging.info(f"Columnas eliminadas de train: {cols_to_drop_train}")
    print(f"Cols eliminadas train: {cols_to_drop_train}")
if cols_to_drop_test:
    df_test = df_test.drop(columns=cols_to_drop_test)
    logging.info(f"Columnas eliminadas de test: {cols_to_drop_test}")
    print(f"Cols eliminadas test: {cols_to_drop_test}")

# Resetear índices (sin cambios)
logging.info("Reseteando índices...")
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
logging.info("Índices reseteados.")
print("Índices reseteados.")

# Eliminar duplicados (sin cambios)
logging.info("Comprobando y eliminando duplicados...")
print("Comprobando duplicados.")
duplicados_train = df_train.duplicated().sum()
logging.info(f"Duplicados encontrados en train: {duplicados_train}")
print(f"Duplicados train: {duplicados_train}")
if duplicados_train > 0:
    df_train.drop_duplicates(inplace=True)
    logging.info("Duplicados eliminados en train.")
    print(f"Duplicados eliminados train. Shape: {df_train.shape}")

duplicados_test = df_test.duplicated().sum()
logging.info(f"Duplicados encontrados en test: {duplicados_test}")
print(f"Duplicados test: {duplicados_test}")
if duplicados_test > 0:
    df_test.drop_duplicates(inplace=True)
    logging.info("Duplicados eliminados en test.")
    print(f"Duplicados eliminados test. Shape: {df_test.shape}")

print("Limpieza inicial completada.")
print(f"Train shape final: {df_train.shape}")
print(f"Test shape final: {df_test.shape}")

In [6]:
def normalize_column_within_group(col: pd.Series, margen: float = 0.2):
    """Calcula límites escalador."""
    min_val = col.min()
    max_val = col.max()
    if pd.isna(min_val) or pd.isna(max_val):
        return col, (np.nan, np.nan)

    col_range = max_val - min_val
    if abs(col_range) < 1e-9:
        min_ext = min_val - 0.0001 if min_val != max_val else min_val
        max_ext = max_val + 0.0001 if min_val != max_val else max_val
        denominador = max_ext - min_ext
        if abs(denominador) < 1e-9:
             return col, (np.nan, np.nan)
    else:
        min_ext = min_val - margen * col_range
        max_ext = max_val + margen * col_range
        denominador = max_ext - min_ext
        if abs(denominador) < 1e-9:
             return col, (np.nan, np.nan)

    return col, (min_ext, max_ext)


def normalizar_datos_ml(df_train: pd.DataFrame,
                        df_test: pd.DataFrame,
                        columna_simbolo: str,
                        margen: float = 0.2):
    """Normaliza datos por grupo."""
    logging.info(f"Normalizando datos para ML por grupo '{columna_simbolo}'...")

    if columna_simbolo not in df_train.columns or columna_simbolo not in df_test.columns:
        error_msg = f"La columna '{columna_simbolo}' no existe en ambos DataFrames."
        logging.error(error_msg)
        raise ValueError(error_msg)

    train_original_idx = df_train.index
    test_original_idx = df_test.index

    logging.info("Calculando escaladores en el conjunto de train...")
    escaladores = {}
    columnas_numericas_train = df_train.select_dtypes(include=np.number).columns.tolist()
    objetivos = ['target','future_max_increase_capped']
    cols_to_exclude = [columna_simbolo] + objetivos
    columnas_a_normalizar = [col for col in columnas_numericas_train if col not in cols_to_exclude]

    if not columnas_a_normalizar:
        logging.warning("No hay columnas numéricas en df_train para calcular escaladores (excluyendo símbolo y objetivos).")
        return df_train.copy(), df_test.copy(), {}

    logging.info(f"Columnas a normalizar: {len(columnas_a_normalizar)}")

    grouped_train = df_train.groupby(columna_simbolo)
    for simbolo, grupo_df_train in tqdm(grouped_train, total=len(grouped_train.groups), desc="Calculando escaladores"):
        escaladores_simbolo = {}
        for col_name in columnas_a_normalizar:
            if col_name in grupo_df_train.columns:
                try:
                    _ , escalador_params = normalize_column_within_group(grupo_df_train[col_name], margen)
                    escaladores_simbolo[col_name] = escalador_params
                except Exception as e_calc:
                    logging.error(f"Error calculando escalador para {simbolo}/{col_name}: {e_calc}")
                    escaladores_simbolo[col_name] = (np.nan, np.nan)
            else:
                 escaladores_simbolo[col_name] = (np.nan, np.nan)
        escaladores[simbolo] = escaladores_simbolo
    logging.info("Escaladores calculados.")

    logging.info("Aplicando escaladores...")
    def _aplicar_escaladores_interna(df_input: pd.DataFrame, escaladores_dict: dict, cols_norm: list, desc: str) -> pd.DataFrame:
        df_output = df_input.copy()
        grouped_df = df_output.groupby(columna_simbolo)

        for simbolo, grupo_df in tqdm(grouped_df, total=len(grouped_df.groups), desc=desc):
            if simbolo not in escaladores_dict:
                logging.warning(f"Símbolo '{simbolo}' encontrado en {desc} pero no tiene escaladores. Sus datos numéricos no se normalizarán.")
                continue

            scalers_for_symbol = escaladores_dict[simbolo]

            for col_name in cols_norm:
                if col_name in grupo_df.columns:
                    if col_name in scalers_for_symbol:
                        min_ext, max_ext = scalers_for_symbol[col_name]

                        if pd.isna(min_ext) or pd.isna(max_ext):
                            df_output.loc[grupo_df.index, col_name] = np.nan
                            logging.debug(f"Aplicando NaN a {simbolo}/{col_name} debido a escaladores NaN.")
                            continue

                        denominador = max_ext - min_ext
                        col_original = grupo_df[col_name]

                        if abs(denominador) < 1e-9:
                            scaled_val = 0.5 if min_ext != max_ext else 0.0
                            scaled_col = pd.Series(scaled_val, index=col_original.index, name=col_name)
                            logging.debug(f"Aplicando {scaled_val} a {simbolo}/{col_name} debido a denominador ~0.")
                        else:
                            scaled_col = (col_original - min_ext) / denominador

                        df_output.loc[grupo_df.index, col_name] = scaled_col.where(col_original.notna(), np.nan)
                    else:
                        logging.warning(f"Escalador no encontrado para {simbolo}/{col_name} en diccionario, aunque la columna está en la lista. Se dejará sin normalizar.")

        return df_output

    df_train_normalizado = _aplicar_escaladores_interna(df_train, escaladores, columnas_a_normalizar, "Normalizando Train")
    logging.info("Conjunto de Train normalizado.")

    df_test_normalizado = _aplicar_escaladores_interna(df_test, escaladores, columnas_a_normalizar, "Normalizando Test")
    logging.info("Conjunto de Test normalizado.")

    logging.info("Normalización completada.")
    df_train_normalizado.index = train_original_idx
    df_test_normalizado.index = test_original_idx
    return df_train_normalizado, df_test_normalizado, escaladores


def guardar_escaladores(escaladores, nombre_archivo):
    """Guarda escaladores en joblib."""
    try:
        joblib.dump(escaladores, nombre_archivo)
        logging.info(f"Escaladores guardados en {nombre_archivo}")
    except Exception as e:
        logging.error(f"Error guardando escaladores en {nombre_archivo}: {e}")

def convert_value_for_json(value):
    """Convierte valores a JSON."""
    if isinstance(value, np.ndarray):
         return [convert_value_for_json(v) for v in value]
    if isinstance(value, (np.number, np.bool_)):
        if np.isnan(value): return None
        return value.item()
    if isinstance(value, float) and math.isnan(value):
         return None
    return value

def guardar_escaladores_joblib_json(escaladores, base_nombre_archivo):
    """Guarda escaladores joblib y JSON."""
    joblib_filename = f"{base_nombre_archivo}.joblib"
    guardar_escaladores(escaladores, joblib_filename)

    json_filename = f"{base_nombre_archivo}.json"
    json_compatible_scalers = {}
    try:
        for simbolo, cols_scalers in escaladores.items():
            json_compatible_scalers[str(simbolo)] = {}
            for col, (min_e, max_e) in cols_scalers.items():
                 json_min = convert_value_for_json(min_e)
                 json_max = convert_value_for_json(max_e)
                 json_compatible_scalers[str(simbolo)][str(col)] = [json_min, json_max]

        with open(json_filename, 'w', encoding='utf-8') as f:
            json.dump(json_compatible_scalers, f, indent=4, ensure_ascii=False)
        logging.info(f"Escaladores guardados en formato JSON: {json_filename}")

    except Exception as e_json:
        logging.error(f"Error guardando escaladores en JSON ({json_filename}): {e_json}")

print("Funciones normalización definidas.")

Funciones normalización definidas.


In [7]:
columna_simbolo = 'symbol'
margen_normalizacion = 0.2

logging.info(f"Aplicando normalización con margen={margen_normalizacion}...")
print(f"Aplicando normalización (margen={margen_normalizacion})...")
start_norm = time.time()

df_train_normalizado, df_test_normalizado, escaladores = normalizar_datos_ml(
    df_train.copy(),
    df_test.copy(),
    columna_simbolo,
    margen=margen_normalizacion
)

end_norm = time.time()
logging.info(f"Normalización completada en {end_norm - start_norm:.2f} segundos.")
print(f"Normalización: {end_norm - start_norm:.2f}s.")

logging.info("Guardando escaladores...")
nombre_base_escaladores = 'escaladores_gbr_final'
guardar_escaladores_joblib_json(escaladores, nombre_base_escaladores)
print(f"Escaladores guardados: {nombre_base_escaladores}.[joblib/json]")

print("Muestra train normalizado:")
print(df_train_normalizado.head(3))
print("Muestra test normalizado:")
print(df_test_normalizado.head(3))

nan_cols_train = df_train_normalizado.isna().all()
nan_cols_test = df_test_normalizado.isna().all()
if nan_cols_train.any():
    logging.warning(f"Columnas completamente NaN en train normalizado: {nan_cols_train[nan_cols_train].index.tolist()}")
    print(f"WARN: Cols NaN train: {nan_cols_train[nan_cols_train].index.tolist()}")
if nan_cols_test.any():
     logging.warning(f"Columnas completamente NaN en test normalizado: {nan_cols_test[nan_cols_test].index.tolist()}")
     print(f"WARN: Cols NaN test: {nan_cols_test[nan_cols_test].index.tolist()}")

2025-04-06 18:39:47,537 - INFO - [439320242.py:4] - Aplicando normalización con margen=0.2...


Aplicando normalización (margen=0.2)...


2025-04-06 18:39:47,937 - INFO - [2376216886.py:30] - Normalizando datos para ML por grupo 'symbol'...
2025-04-06 18:39:47,941 - INFO - [2376216886.py:40] - Calculando escaladores en el conjunto de train...
2025-04-06 18:39:48,406 - INFO - [2376216886.py:51] - Columnas a normalizar: 438
Calculando escaladores: 100%|██████████| 77/77 [00:28<00:00,  2.70it/s]
2025-04-06 18:40:16,981 - INFO - [2376216886.py:67] - Escaladores calculados.
2025-04-06 18:40:16,986 - INFO - [2376216886.py:69] - Aplicando escaladores...
Normalizando Train: 100%|██████████| 77/77 [02:00<00:00,  1.57s/it]
2025-04-06 18:42:18,519 - INFO - [2376216886.py:108] - Conjunto de Train normalizado.
Normalizando Test: 100%|██████████| 77/77 [01:42<00:00,  1.33s/it]
2025-04-06 18:44:01,123 - INFO - [2376216886.py:111] - Conjunto de Test normalizado.
2025-04-06 18:44:01,125 - INFO - [2376216886.py:113] - Normalización completada.
2025-04-06 18:44:01,272 - INFO - [439320242.py:16] - Normalización completada en 253.73 segundos

Normalización: 253.73s.


2025-04-06 18:44:04,897 - INFO - [2376216886.py:123] - Escaladores guardados en escaladores_gbr_final.joblib
2025-04-06 18:44:06,329 - INFO - [2376216886.py:155] - Escaladores guardados en formato JSON: escaladores_gbr_final.json


Escaladores guardados: escaladores_gbr_final.[joblib/json]
Muestra train normalizado:
            open           high            low          close         volume  quote_asset_volume  number_of_trades  taker_buy_base_asset_volume  taker_buy_quote_asset_volume symbol    close_lag_1   volume_lag_1     open_lag_1     high_lag_1      low_lag_1  quote_asset_volume_lag_1  number_of_trades_lag_1  taker_buy_base_asset_volume_lag_1  taker_buy_quote_asset_volume_lag_1    close_lag_2   volume_lag_2     open_lag_2     high_lag_2      low_lag_2  quote_asset_volume_lag_2  number_of_trades_lag_2  taker_buy_base_asset_volume_lag_2  taker_buy_quote_asset_volume_lag_2    close_lag_3   volume_lag_3     open_lag_3     high_lag_3      low_lag_3  quote_asset_volume_lag_3  number_of_trades_lag_3  taker_buy_base_asset_volume_lag_3  taker_buy_quote_asset_volume_lag_3    close_lag_4   volume_lag_4     open_lag_4     high_lag_4      low_lag_4  quote_asset_volume_lag_4  number_of_trades_lag_4  taker_buy_base_asse



WARN: Cols NaN train: ['PotentialLiquidityGap']
WARN: Cols NaN test: ['PotentialLiquidityGap']


In [8]:
logging.info("Preparando conjuntos de datos finales para el modelo GBR...")
print("Preparando datos finales X/Z...")

objetivo_clasificacion = 'target'
objetivo_regresion = 'future_max_increase_capped'

cols_to_drop_for_X = [columna_simbolo, objetivo_clasificacion, objetivo_regresion]

X_train_main = df_train_normalizado.drop(columns=[col for col in cols_to_drop_for_X if col in df_train_normalizado.columns], axis=1)
Z_train_main = df_train[objetivo_regresion].copy()

X_test_main = df_test_normalizado.drop(columns=[col for col in cols_to_drop_for_X if col in df_test_normalizado.columns], axis=1)
Z_test_main = df_test[objetivo_regresion].copy()

logging.info(f"Dimensiones preparadas:")
logging.info(f"  X_train_main: {X_train_main.shape}")
logging.info(f"  Z_train_main: {Z_train_main.shape}")
logging.info(f"  X_test_main: {X_test_main.shape}")
logging.info(f"  Z_test_main: {Z_test_main.shape}")
print("Dimensiones datos:")
print(f"Train: X={X_train_main.shape} Z={Z_train_main.shape}")
print(f"Test: X={X_test_main.shape} Z={Z_test_main.shape}")

try:
    assert X_train_main.shape[0] == Z_train_main.shape[0], "Discrepancia en filas de entrenamiento"
    assert X_test_main.shape[0] == Z_test_main.shape[0], "Discrepancia en filas de test"
    assert X_train_main.shape[1] == X_test_main.shape[1], "Discrepancia en número de características entre train y test"
    logging.info("Dimensiones de X/Z verificadas correctamente.")
    print("Consistencia dimensiones ok.")
except AssertionError as e:
    logging.error(f"Error de aserción en dimensiones: {e}")
    print(f"Error dimensiones: {e}")

logging.info("Liberando memoria de DataFrames intermedios...")
del df_train_normalizado, df_test_normalizado, df_train, df_test, escaladores
gc.collect()
logging.info("DataFrames intermedios eliminados de memoria.")
print("Memoria intermedia liberada.")

print(f"NaNs X_train: {X_train_main.isna().sum().sum()}")
print(f"NaNs Z_train: {Z_train_main.isna().sum()}")
print(f"NaNs X_test: {X_test_main.isna().sum().sum()}")
print(f"NaNs Z_test: {Z_test_main.isna().sum()}")
if Z_train_main.isna().any() or Z_test_main.isna().any():
     print("WARN: NaNs en Z_train/Z_test.")

2025-04-06 18:44:07,170 - INFO - [4266143611.py:1] - Preparando conjuntos de datos finales para el modelo GBR...


Preparando datos finales X/Z...


2025-04-06 18:44:07,551 - INFO - [4266143611.py:15] - Dimensiones preparadas:
2025-04-06 18:44:07,554 - INFO - [4266143611.py:16] -   X_train_main: (85932, 438)
2025-04-06 18:44:07,653 - INFO - [4266143611.py:17] -   Z_train_main: (85932,)
2025-04-06 18:44:07,657 - INFO - [4266143611.py:18] -   X_test_main: (21483, 438)
2025-04-06 18:44:07,662 - INFO - [4266143611.py:19] -   Z_test_main: (21483,)
2025-04-06 18:44:07,665 - INFO - [4266143611.py:28] - Dimensiones de X/Z verificadas correctamente.
2025-04-06 18:44:07,669 - INFO - [4266143611.py:34] - Liberando memoria de DataFrames intermedios...


Dimensiones datos:
Train: X=(85932, 438) Z=(85932,)
Test: X=(21483, 438) Z=(21483,)
Consistencia dimensiones ok.


2025-04-06 18:44:08,425 - INFO - [4266143611.py:37] - DataFrames intermedios eliminados de memoria.


Memoria intermedia liberada.
NaNs X_train: 137251
NaNs Z_train: 0
NaNs X_test: 34334
NaNs Z_test: 0


In [9]:
best_gbr_params = {
    'n_estimators': 3000,
    'max_depth': 9,
    'learning_rate': 0.005,
    'subsample': 0.5,
    'random_state': 42,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'loss': 'squared_error'
}

logging.info(f"Hiperparámetros seleccionados para GradientBoostingRegressor: {best_gbr_params}")
print("Hiperparámetros GBR:")
for param, value in best_gbr_params.items():
    print(f"  {param}: {value}")

2025-04-06 18:44:09,941 - INFO - [564147948.py:12] - Hiperparámetros seleccionados para GradientBoostingRegressor: {'n_estimators': 3000, 'max_depth': 9, 'learning_rate': 0.005, 'subsample': 0.5, 'random_state': 42, 'min_samples_split': 2, 'min_samples_leaf': 1, 'loss': 'squared_error'}


Hiperparámetros GBR:
  n_estimators: 3000
  max_depth: 9
  learning_rate: 0.005
  subsample: 0.5
  random_state: 42
  min_samples_split: 2
  min_samples_leaf: 1
  loss: squared_error


In [10]:
logging.info("Instanciando el modelo GradientBoostingRegressor...")
try:
    gbr_model = GradientBoostingRegressor(**best_gbr_params)
    print(f"GBR instanciado ({best_gbr_params['n_estimators']} est).")
    logging.info("Modelo instanciado correctamente.")
except TypeError as e:
    logging.error(f"Error al instanciar GBR. Verifica los nombres de los parámetros: {e}")
    print(f"Error instanciación GBR: {e}")
    raise e

logging.info("Iniciando entrenamiento del modelo GBR...")
print("Entrenando GBR...")
start_train_time = time.time()

try:
    gbr_model.fit(X_train_main, Z_train_main)
    end_train_time = time.time()
    training_duration = end_train_time - start_train_time
    logging.info(f"Modelo GBR entrenado exitosamente en {training_duration:.2f} segundos.")
    print(f"Entrenamiento: {training_duration:.2f}s.")
except ValueError as e:
     logging.error(f"Error de valor durante el entrenamiento: {e}")
     print(f"Error entrenamiento: {e}")
     if "Input contains NaN" in str(e):
         print("Posible causa: NaNs en input.")
     raise e
except Exception as e:
    logging.error(f"Error inesperado durante el entrenamiento del modelo GBR: {e}")
    print(f"Error inesperado entrenamiento: {e}")
    raise e

try:
    _ = gbr_model.feature_importances_
    logging.info("Verificación post-entrenamiento: Atributo 'feature_importances_' encontrado.")
except AttributeError:
    logging.error("Error post-entrenamiento: El modelo no parece haber sido entrenado correctamente (falta 'feature_importances_').")
    print("Error: Modelo no parece entrenado.")

2025-04-06 18:44:10,095 - INFO - [3285868171.py:1] - Instanciando el modelo GradientBoostingRegressor...
2025-04-06 18:44:10,100 - INFO - [3285868171.py:5] - Modelo instanciado correctamente.
2025-04-06 18:44:10,103 - INFO - [3285868171.py:11] - Iniciando entrenamiento del modelo GBR...


GBR instanciado (3000 est).
Entrenando GBR...


2025-04-06 18:44:11,126 - ERROR - [3285868171.py:22] - Error de valor durante el entrenamiento: Input X contains NaN.
GradientBoostingRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


Error entrenamiento: Input X contains NaN.
GradientBoostingRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


ValueError: Input X contains NaN.
GradientBoostingRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
logging.info("Evaluando el modelo GBR en el conjunto de test...")
print("Evaluando en test...")

if not hasattr(gbr_model, 'feature_importances_'):
     print("Evaluación saltada: modelo no entrenado.")
     logging.error("Evaluación saltada porque el modelo no parece entrenado.")
else:
    try:
        start_pred_time = time.time()
        Z_pred = gbr_model.predict(X_test_main)
        end_pred_time = time.time()
        prediction_duration = end_pred_time - start_pred_time
        logging.info(f"Predicción en test completada en {prediction_duration:.2f} segundos.")
        print(f"Predicción: {prediction_duration:.2f}s.")

        mse = mean_squared_error(Z_test_main, Z_pred)
        mae = mean_absolute_error(Z_test_main, Z_pred)
        rmse = math.sqrt(mse)
        r2 = r2_score(Z_test_main, Z_pred)

        logging.info(f"Métricas de evaluación en Test:")
        logging.info(f"  MSE: {mse:.8f}")
        logging.info(f"  MAE: {mae:.8f}")
        logging.info(f"  RMSE: {rmse:.8f}")
        logging.info(f"  R2 Score: {r2:.8f}")

        print("Métricas Test:")
        print(f"  MSE: {mse:.8f}")
        print(f"  MAE: {mae:.8f}")
        print(f"  RMSE:{rmse:.8f}")
        print(f"  R2: {r2:.8f}")

        print("Comparación Log Test (n_splits=3):")
        print(f"  MSE : Log=0.01291 | Calc={mse:.5f}")
        print(f"  MAE : Log=0.08891 | Calc={mae:.5f}")
        print(f"  RMSE: Log=0.11362 | Calc={rmse:.5f}")
        print(f"  R2  : Log=0.96424 | Calc={r2:.5f}")

    except Exception as e:
        logging.error(f"Error durante la evaluación del modelo GBR: {e}")
        print(f"Error evaluación: {e}")
        raise e

In [None]:
logging.info("Calculando y mostrando la importancia de las características...")
print("Importancia Características:")

if not hasattr(gbr_model, 'feature_importances_'):
     print("Importancia no calculada: modelo no entrenado.")
     logging.error("Cálculo de importancia saltado porque el modelo no parece entrenado.")
else:
    try:
        importances = gbr_model.feature_importances_
        feature_names = X_train_main.columns

        feature_importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': importances
        })

        feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)

        logging.info(f"Importancia de características calculada para {len(feature_names)} características.")

        n_top_features = 30
        print(f"Top {n_top_features} Características:")
        print(feature_importance_df.head(n_top_features).to_string())

        csv_filename_importance = 'gbr_feature_importances.csv'
        try:
            feature_importance_df.to_csv(csv_filename_importance, index=False)
            logging.info(f"Importancia de todas las características guardada en '{csv_filename_importance}'")
            print(f"Importancias guardadas: '{csv_filename_importance}'")
        except Exception as e_csv:
            logging.error(f"No se pudo guardar el CSV de importancias: {e_csv}")
            print(f"Error guardando CSV importancias: {e_csv}")

    except AttributeError:
        msg = "El modelo entrenado no tiene el atributo 'feature_importances_'. Esto es inesperado."
        logging.error(msg)
        print(msg)
    except Exception as e:
        logging.error(f"Error inesperado al obtener la importancia de las características: {e}")
        print(f"Error inesperado importancia: {e}")

In [None]:
model_filename = 'best_gradient_boosting_regressor_3000est.joblib'
logging.info(f"Intentando guardar el modelo entrenado en '{model_filename}'...")
print(f"Guardando modelo: '{model_filename}'...")

if 'gbr_model' in locals() and hasattr(gbr_model, 'predict'):
    try:
        joblib.dump(gbr_model, model_filename)
        logging.info("Modelo guardado exitosamente.")
        print("Modelo guardado.")
    except Exception as e:
        logging.error(f"Error al guardar el modelo en '{model_filename}': {e}")
        print(f"Error guardando modelo: {e}")
else:
     msg = "El objeto 'gbr_model' no existe o no está entrenado. No se puede guardar."
     logging.error(msg)
     print(f"Error: {msg}")

In [None]:
run_optional_cv = False

if run_optional_cv:
    logging.info("Iniciando re-evaluación opcional con Cross-Validation (n_splits=3)...")
    print("Ejecutando CV opcional (n_splits=3)...")

    n_splits_cv = 3
    cv_splitter = TimeSeriesSplit(n_splits=n_splits_cv)

    gbr_cv_model = GradientBoostingRegressor(**best_gbr_params)
    logging.info("Instancia de GBR para CV creada.")

    scoring_reg = {
        'mse': make_scorer(mean_squared_error, greater_is_better=False),
        'mae': make_scorer(mean_absolute_error, greater_is_better=False),
        'rmse': make_scorer(lambda y, p: np.sqrt(mean_squared_error(y, p)), greater_is_better=False),
        'r2': 'r2'
    }
    logging.info(f"Scoring para CV definido: {list(scoring_reg.keys())}")

    try:
        start_cv_time = time.time()
        cv_results = cross_validate(gbr_cv_model, X_train_main, Z_train_main,
                                    cv=cv_splitter, scoring=scoring_reg,
                                    n_jobs=-1, return_train_score=False,
                                    error_score='raise')
        end_cv_time = time.time()
        cv_duration = end_cv_time - start_cv_time
        logging.info(f"Cross-Validation completada en {cv_duration:.2f} segundos.")
        print(f"CV completada: {cv_duration:.2f}s.")

        print(f"Resultados CV Mean (n_splits={n_splits_cv}):")
        cv_scores_summary = {}
        log_cv_comparison = {}

        for metric_name in scoring_reg.keys():
            cv_key = f'test_{metric_name}'
            if cv_key in cv_results:
                scores = cv_results[cv_key]
                if metric_name in ['mse', 'mae', 'rmse']:
                    scores = -scores

                mean_score = np.nanmean(scores) if np.any(~np.isnan(scores)) else np.nan
                std_score = np.nanstd(scores) if np.any(~np.isnan(scores)) else np.nan

                cv_scores_summary[metric_name] = mean_score
                cv_scores_summary[metric_name + '_std'] = std_score
                print(f"  {metric_name.upper()}: {mean_score:.8f} (std: {std_score:.8f})")
                logging.info(f"  CV {metric_name}: {mean_score:.8f} +/- {std_score:.8f}")
            else:
                 logging.warning(f"Métrica CV '{cv_key}' no encontrada en los resultados.")
                 print(f"  {metric_name.upper()}: No encontrada")

        log_cv_values = {'mse': 0.01619, 'mae': 0.10118, 'rmse': 0.12632, 'r2': 0.95523}
        print("Comparación Log CV (n_splits=3):")
        for metric, log_val in log_cv_values.items():
             calc_val = cv_scores_summary.get(metric, np.nan)
             print(f"  {metric.upper()}: Log={log_val:.5f} | Calc={calc_val:.5f}")

    except Exception as e:
        logging.error(f"Error durante el Cross-Validation opcional: {e}")
        print(f"Error CV opcional: {e}")

else:
    logging.info("Saltando re-evaluación opcional con Cross-Validation.")
    print("CV opcional desactivada.")

In [None]:
logging.info("Script finalizado.")
print("===========================")
print("Proceso completado.")
print("===========================")
print(f"Modelo: {model_filename if 'model_filename' in locals() else 'No guardado'}")
print(f"Log: {LOG_FILENAME}")