In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from statsmodels.tsa.seasonal import STL
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.feature_selection import RFECV

In [2]:
df = pd.read_csv('imputadoS2O.csv', index_col=0, parse_dates=True)

In [3]:

class DateTimeFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, time_col='ds', country_holidays='CO', drop_time_col=False, tz=None):
        self.time_col = time_col
        self.country_holidays = country_holidays
        self.drop_time_col = drop_time_col
        self.tz = tz

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X[self.time_col] = pd.to_datetime(X[self.time_col])
        if self.tz:
            X[self.time_col] = X[self.time_col].dt.tz_localize(self.tz, nonexistent='shift_forward', ambiguous='NaT').dt.tz_convert(self.tz)

        dt = X[self.time_col]
        X['year'] = dt.dt.year
        X['month'] = dt.dt.month
        X['day'] = dt.dt.day
        X['hour'] = dt.dt.hour
        X['dow'] = dt.dt.weekday      # 0=lunes
        X['weekofyear'] = dt.dt.isocalendar().week.astype(int)
        X['is_weekend'] = (X['dow'] >= 5).astype(int)
        X['is_month_start'] = dt.dt.is_month_start.astype(int)
        X['is_month_end'] = dt.dt.is_month_end.astype(int)
        
        try:
            import holidays
            hol = holidays.CountryHoliday(self.country_holidays)
            X['is_holiday'] = dt.dt.date.astype('datetime64').isin(hol).astype(int)
        except Exception:
            X['is_holiday'] = 0

        if self.drop_time_col:
            X = X.drop(columns=[self.time_col])
        return X

class LagFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, target_col='y', lags=range(1, 73)):
        self.target_col = target_col
        self.lags = list(lags)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for L in self.lags:
            X[f'{self.target_col}_lag{L}'] = X[self.target_col].shift(L)
        return X


class RollingStats(BaseEstimator, TransformerMixin):
    def __init__(self, target_col='y', windows=(3, 6, 12, 24, 48, 72), stats=('mean','std','min','max')):
        self.target_col = target_col
        self.windows = windows
        self.stats = stats

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for w in self.windows:
            roll = X[self.target_col].shift(1).rolling(w)
            if 'mean' in self.stats: X[f'{self.target_col}_roll{w}_mean'] = roll.mean()
            if 'std'  in self.stats: X[f'{self.target_col}_roll{w}_std']  = roll.std(ddof=0)
            if 'min'  in self.stats: X[f'{self.target_col}_roll{w}_min']  = roll.min()
            if 'max'  in self.stats: X[f'{self.target_col}_roll{w}_max']  = roll.max()
        return X


class STLFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, target_col='y', period=24, robust=True, enabled=True):
        self.target_col = target_col
        self.period = period
        self.robust = robust
        self.enabled = enabled

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        if not self.enabled:
            return X
        series = X[self.target_col].astype(float)
        series_filled = series.interpolate(limit_direction='both')
        res = STL(series_filled, period=self.period, robust=self.robust).fit()
        X[f'{self.target_col}_stl_trend'] = res.trend
        X[f'{self.target_col}_stl_season'] = res.seasonal
        X[f'{self.target_col}_stl_resid'] = res.resid
        return X


def build_features_from_df(df, time_col_name='datetime', target_col_name='SO2',
                           stl_period=24, use_stl=True):
    
    if isinstance(df.index, pd.DatetimeIndex):
        df_ = df.copy()
        df_[time_col_name] = df_.index
    else:
        df_ = df.copy()

    pipe = Pipeline(steps=[
        ("dt",   DateTimeFeatures(time_col=time_col_name, country_holidays="CO", drop_time_col=False)),
        ("lags", LagFeatures(target_col=target_col_name, lags=range(1, 73))),            # 72 lags
        ("roll", RollingStats(target_col=target_col_name, windows=(3,6,12,24,48,72))),   # ajustable
        ("stl",  STLFeatures(target_col=target_col_name, period=stl_period, enabled=use_stl))
    ])

    feat = pipe.fit_transform(df_)
    
    feat = feat.dropna().reset_index(drop=True)
    return feat


In [4]:
df_feat = build_features_from_df(df, time_col_name='datetime', target_col_name='SO2', stl_period=168, use_stl=True)

In [9]:
df_feat

Unnamed: 0,SO2,datetime,year,month,day,hour,dow,weekofyear,is_weekend,is_month_start,...,SO2_roll48_std,SO2_roll48_min,SO2_roll48_max,SO2_roll72_mean,SO2_roll72_std,SO2_roll72_min,SO2_roll72_max,SO2_stl_trend,SO2_stl_season,SO2_stl_resid
0,2.975828,2021-03-07 00:00:00,2021,3,7,0,6,9,1,0,...,5.298079,0.000000,21.480362,9.784296,8.038740,0.000000,33.491123,8.504659,-4.747428,-0.781403
1,3.380588,2021-03-07 01:00:00,2021,3,7,1,6,9,1,0,...,5.301265,0.000000,21.480362,9.510410,7.929365,0.000000,33.491123,8.487105,-4.610264,-0.496253
2,2.768564,2021-03-07 02:00:00,2021,3,7,2,6,9,1,0,...,5.299370,0.000000,21.480362,9.229658,7.782209,0.000000,33.491123,8.469542,-5.302609,-0.398370
3,2.323257,2021-03-07 03:00:00,2021,3,7,3,6,9,1,0,...,5.291489,0.000000,21.480362,9.045462,7.776003,0.000000,33.491123,8.451971,-4.885748,-1.242966
4,1.877950,2021-03-07 04:00:00,2021,3,7,4,6,9,1,0,...,5.270885,0.000000,21.480362,8.874323,7.786465,0.000000,33.491123,8.434392,6.560885,-13.117327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30331,4.475329,2024-08-31 20:00:00,2024,8,31,20,5,35,1,0,...,4.379549,0.478351,20.544553,6.880155,4.120033,0.478351,20.544553,6.958418,-2.551977,0.068889
30332,3.652872,2024-08-31 21:00:00,2024,8,31,21,5,35,1,0,...,4.378487,0.478351,20.544553,6.872065,4.124174,0.478351,20.544553,6.969541,-2.228821,-1.087848
30333,3.288098,2024-08-31 22:00:00,2024,8,31,22,5,35,1,0,...,4.412777,0.478351,20.544553,6.833891,4.141040,0.478351,20.544553,6.980676,-1.988110,-1.704468
30334,4.535767,2024-08-31 23:00:00,2024,8,31,23,5,35,1,0,...,4.453768,0.478351,20.544553,6.826490,4.146890,0.478351,20.544553,6.991823,-1.045224,-1.410832


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 30409 entries, 2021-03-03 23:00:00 to 2024-09-01 00:00:00
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   SO2     30408 non-null  float64
dtypes: float64(1)
memory usage: 475.1 KB


In [17]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import make_scorer, mean_absolute_error


TARGET = 'SO2'
TIME_COL = 'datetime'

y = df_feat[TARGET].astype(float)
# columnas a excluir del modelo: target y la columna de tiempo explícita
exclude_cols = {TARGET, TIME_COL}
X = df_feat.drop(columns=[c for c in df_feat.columns if c in exclude_cols])

# Si por algún motivo hay NaNs residuales, imputa simple:
if X.isna().any().any():
    # imputación super simple: mediana por columna
    med = X.median(numeric_only=True)
    X = X.fillna(med).fillna(-1)

# ------------------------------------------------------------------------------------
# 3) Definir el estimador base y el esquema de CV temporal
# ------------------------------------------------------------------------------------
rf = RandomForestRegressor(
    n_estimators=10,
    max_depth=5,
    n_jobs=-1,
    random_state=15926
)

# CV respetando el orden temporal (p.ej., 3 splits)
tscv = TimeSeriesSplit(n_splits=3)

# Función MAPE segura (evita división por cero si y_true tiene ceros)
# ------------------------------------------------------------------------------------
def mape_safe(y_true, y_pred, eps: float = 1e-8):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.maximum(np.abs(y_true), eps)
    return np.mean(np.abs((y_true - y_pred) / denom))


mape_scorer = make_scorer(mape_safe, greater_is_better=False)

# Métrica: MAE (minimizar) -> usar scorer negativo para sklearn
#mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)



# ------------------------------------------------------------------------------------
# 4) RFECV para seleccionar variables (sobre TU feature matrix ya expandida)
# ------------------------------------------------------------------------------------
selector = RFECV(
    estimator=rf,
    step=2,                 
    cv=tscv,
    scoring=mape_scorer,
    n_jobs=-1,
)

selector.fit(X, y)

selected_mask = selector.support_
selected_cols = X.columns[selected_mask].tolist()
rankings = pd.Series(selector.ranking_, index=X.columns).sort_values()
print(f"Features seleccionadas ({len(selected_cols)}):")
print(selected_cols)

Features seleccionadas (7):
['hour', 'SO2_lag1', 'SO2_lag13', 'SO2_roll48_min', 'SO2_stl_trend', 'SO2_stl_season', 'SO2_stl_resid']


In [15]:
with open("selected_cols.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(map(str, selected_cols)))

In [None]:

# ------------------------------------------------------------------------------------
# 5) Entrenar modelo final solo con las features seleccionadas
# ------------------------------------------------------------------------------------
rf_final = RandomForestRegressor(
    n_estimators=800,
    max_depth=None,
    n_jobs=-1,
    random_state=15926
)
rf_final.fit(X[selected_cols], y)

# Importancias de variables (ordenadas)
importances = pd.Series(rf_final.feature_importances_, index=selected_cols)\
              .sort_values(ascending=False)
print("\nTop-20 importancias:")
print(importances.head(20))

# ------------------------------------------------------------------------------------
# 6) (Opcional) Hold-out final por fecha para evaluar fuera de RFECV
# ------------------------------------------------------------------------------------
split_idx = int(len(X) * 0.8)
X_tr, y_tr = X.iloc[:split_idx][selected_cols], y.iloc[:split_idx]
X_te, y_te = X.iloc[split_idx:][selected_cols], y.iloc[split_idx:]

rf_holdout = RandomForestRegressor(
    n_estimators=800,
    max_depth=None,
    n_jobs=-1,
    random_state=15926
).fit(X_tr, y_tr)

y_pred = rf_holdout.predict(X_te)
mae = mean_absolute_error(y_te, y_pred)
print(f"\nMAE hold-out (último 20%): {mae:,.4f}")



Top-20 importancias:
SO2_stl_resid      0.604983
SO2_lag1           0.238025
SO2_stl_season     0.110954
SO2_stl_trend      0.040280
SO2_roll48_min     0.001054
SO2_roll72_mean    0.000927
SO2_roll24_mean    0.000802
SO2_roll24_min     0.000780
SO2_lag54          0.000750
SO2_lag71          0.000745
SO2_roll48_mean    0.000700
dtype: float64

MAE hold-out (último 20%): 0.2650


In [None]:
# modelo lasso paar FSelection

In [None]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LassoCV, Lasso
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# ------------------------------------------------------------------------------------
# 2) Separar target y predictores (SIN pipelines)
# ------------------------------------------------------------------------------------
TARGET   = 'SO2'
TIME_COL = 'datetime'

y = df_feat[TARGET].astype(float)
exclude = [c for c in [TARGET, TIME_COL] if c in df_feat.columns]
X = df_feat.drop(columns=exclude)

# Imputación simple por si quedó algún NaN
med = X.median(numeric_only=True)
X   = X.fillna(med).fillna(0)

# ------------------------------------------------------------------------------------
# 3) Estandarizar
# ------------------------------------------------------------------------------------
scaler   = StandardScaler().fit(X)
X_scaled = pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns)

# ------------------------------------------------------------------------------------
# 4) LassoCV con TimeSeriesSplit
# ------------------------------------------------------------------------------------
tscv   = TimeSeriesSplit(n_splits=3)
alphas = np.logspace(-4, 1, 60) 

lasso_cv = LassoCV(
    alphas=alphas,
    cv=tscv,              
    max_iter=1000,
    n_jobs=-1,
    random_state=15926
)
lasso_cv.fit(X_scaled, y)

best_alpha = float(lasso_cv.alpha_)
coefs      = pd.Series(lasso_cv.coef_, index=X.columns)
selected_cols = coefs[coefs.abs() > 1e-8].index.tolist()

print(f"Best alpha (LassoCV): {best_alpha:.6f}")
print(f"Features seleccionadas: {len(selected_cols)}")
print(selected_cols[:25])  # primeras 25 para ver


Best alpha (LassoCV): 0.000100
Features seleccionadas: 95
['year', 'month', 'day', 'dow', 'weekofyear', 'SO2_lag1', 'SO2_lag2', 'SO2_lag3', 'SO2_lag6', 'SO2_lag7', 'SO2_lag8', 'SO2_lag9', 'SO2_lag10', 'SO2_lag11', 'SO2_lag12', 'SO2_lag13', 'SO2_lag14', 'SO2_lag15', 'SO2_lag16', 'SO2_lag17', 'SO2_lag18', 'SO2_lag19', 'SO2_lag20', 'SO2_lag21', 'SO2_lag22']


In [16]:
with open("selected_cols.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(map(str, selected_cols)))

In [None]:
# === Imports ===
import numpy as np
import pandas as pd
from pathlib import Path

from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error


def mape_safe(y_true, y_pred, eps: float = 1e-8):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.maximum(np.abs(y_true), eps)
    return np.mean(np.abs((y_true - y_pred) / denom))

def cargar_selected_cols(path="selected_cols.txt"):
    p = Path(path)
    if not p.exists():
        # fallback: tomar el último selected_cols_*.txt si existe
        cands = sorted(Path(".").glob("selected_cols_*.txt"))
        if not cands:
            raise FileNotFoundError(
                "No se encontró 'selected_cols.txt' ni archivos 'selected_cols_*.txt'."
            )
        p = cands[-1]
        print(f"Usando: {p.name}")
    with p.open("r", encoding="utf-8") as f:
        cols = [ln.strip() for ln in f if ln.strip()]
    return cols

# ---------------------------
# 1) Configuración
# ---------------------------
TARGET   = "SO2"
TIME_COL = "datetime"

# df_feat: tu DataFrame ya construido con build_features_from_df(...)
# df_feat = ...

# ---------------------------
# 2) Leer columnas seleccionadas de TXT
# ---------------------------
selected_cols = cargar_selected_cols("selected_cols.txt")

# Asegurar intersección con df_feat y excluir target/time si aparecen
selected_cols = [
    c for c in selected_cols
    if (c in df_feat.columns) and (c not in {TARGET, TIME_COL})
]
if not selected_cols:
    raise ValueError("La lista de columnas seleccionadas quedó vacía tras intersectar con df_feat.")

# ---------------------------
# 3) Armar X, y y preprocesado mínimo
# ---------------------------
y = df_feat[TARGET].astype(float)
X = df_feat[selected_cols].copy()

# Imputación súper simple (mediana); evita NaNs
if X.isna().any().any():
    med = X.median(numeric_only=True)
    X = X.fillna(med).fillna(-1)

# Marcar categóricas (si hay object/category)
for c in X.columns:
    if X[c].dtype == "O":
        X[c] = X[c].astype("category")

cat_cols = [c for c in X.columns if str(X[c].dtype) == "category"]

# ---------------------------
# 4) Split temporal 80/20
# ---------------------------
n = len(X)
cut = int(n * 0.8)
X_train, y_train = X.iloc[:cut], y.iloc[:cut]
X_test,  y_test  = X.iloc[cut:],  y.iloc[cut:]

# ---------------------------
# 5) LGBM sencillo y entrenamiento
# ---------------------------
lgbm = LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=15926,
)

# Si tienes categóricas, pásalas por nombre; si no, LightGBM igual funciona
fit_kwargs = {}
if cat_cols:
    fit_kwargs["categorical_feature"] = cat_cols

lgbm.fit(X_train, y_train, **fit_kwargs)

# ---------------------------
# 6) Métricas y top importancias
# ---------------------------
y_pred = lgbm.predict(X_test)

mae  = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = mape_safe(y_test, y_pred) * 100

print(f"MAE : {mae:,.4f}")
print(f"RMSE: {rmse:,.4f}")
print(f"MAPE: {mape:,.2f}%")

# Importancias
imp = pd.Series(lgbm.feature_importances_, index=X.columns).sort_values(ascending=False)
print("\nTop-20 importancias:")
print(imp.head(20))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007003 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23036
[LightGBM] [Info] Number of data points in the train set: 24268, number of used features: 95
[LightGBM] [Info] Start training from score 10.230850
MAE : 0.2491
RMSE: 0.4444
MAPE: 23,865,503.07%

Top-20 importancias:
SO2_stl_resid      2141
SO2_stl_season     1909
SO2_stl_trend      1717
SO2_lag1           1189
SO2_roll72_mean     153
SO2_roll3_min       144
SO2_roll3_mean      116
SO2_lag2            103
SO2_roll3_max        94
SO2_roll72_min       46
SO2_roll24_min       41
SO2_lag22            40
SO2_lag3             39
SO2_roll48_min       35
SO2_lag21            33
SO2_lag44            32
SO2_lag72            31
SO2_lag45            27
SO2_lag23            27
SO2_lag11            26
dtype: int32


In [12]:
# === Imports ===
import numpy as np
import pandas as pd
from pathlib import Path

from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer

# ---------------------------
# 0) Utilidades
# ---------------------------
def mape_safe(y_true, y_pred, eps: float = 1e-8):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.maximum(np.abs(y_true), eps)
    return np.mean(np.abs((y_true - y_pred) / denom))

def wmape(y_true, y_pred, eps=1e-8):
    """Weighted MAPE: sum(|err|)/sum(|y|). Robusta a ceros individuales."""
    y_true = np.asarray(y_true, float); y_pred = np.asarray(y_pred, float)
    return np.sum(np.abs(y_true - y_pred)) / (np.sum(np.abs(y_true)) + eps)

def smape(y_true, y_pred, eps=1e-8):
    """sMAPE: 2|err|/(|y|+|y_hat|). También evita explosiones en ceros."""
    y_true = np.asarray(y_true, float); y_pred = np.asarray(y_pred, float)
    return np.mean(2.0 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + eps))

def mape_clip(y_true, y_pred, min_denom):
    """
    MAPE 'clipeado': denom = max(|y|, min_denom). Elige min_denom en unidades del problema
    (p.ej. resolución del sensor, o percentil 5 de |y|).
    """
    y_true = np.asarray(y_true, float); y_pred = np.asarray(y_pred, float)
    denom = np.maximum(np.abs(y_true), float(min_denom))
    return np.mean(np.abs((y_true - y_pred) / denom))


def cargar_selected_cols(path="selected_cols.txt"):
    p = Path(path)
    if not p.exists():
        # fallback: tomar el último selected_cols_*.txt si existe
        cands = sorted(Path(".").glob("selected_cols_*.txt"))
        if not cands:
            raise FileNotFoundError("No se encontró 'selected_cols.txt' ni archivos 'selected_cols_*.txt'.")
        p = cands[-1]
        print(f"Usando: {p.name}")
    with p.open("r", encoding="utf-8") as f:
        cols = [ln.strip() for ln in f if ln.strip()]
    return cols

# ---------------------------
# 1) Configuración
# ---------------------------
TARGET   = "SO2"
TIME_COL = "datetime"

# df_feat: tu DataFrame ya construido con build_features_from_df(...)
# df_feat = ...

# ---------------------------
# 2) Leer columnas seleccionadas de TXT
# ---------------------------
selected_cols = cargar_selected_cols("selected_cols.txt")
selected_cols = [c for c in selected_cols if (c in df_feat.columns) and (c not in {TARGET, TIME_COL})]
if not selected_cols:
    raise ValueError("La lista de columnas seleccionadas quedó vacía tras intersectar con df_feat.")

# ---------------------------
# 3) Armar X, y y preprocesado mínimo
# ---------------------------
y = df_feat[TARGET].astype(float)
X = df_feat[selected_cols].copy()

# Imputación súper simple (mediana); evita NaNs
if X.isna().any().any():
    med = X.median(numeric_only=True)
    X = X.fillna(med).fillna(-1)

# Detectar y marcar categóricas (si hay object/category)
for c in X.columns:
    if X[c].dtype == "O":
        X[c] = X[c].astype("category")
cat_cols = [c for c in X.columns if str(X[c].dtype) == "category"]

# ---------------------------
# 4) Split temporal 80/20 (hold-out para métricas finales)
# ---------------------------
n = len(X)
cut = int(n * 0.8)
X_train, y_train = X.iloc[:cut].copy(), y.iloc[:cut].copy()
X_test,  y_test  = X.iloc[cut:].copy(),  y.iloc[cut:].copy()

# Alinear categorías entre train/test (evita categorías desconocidas)
for c in cat_cols:
    X_train[c] = X_train[c].astype("category")
    X_test[c]  = X_test[c].astype("category")
    cats = list(set(X_train[c].cat.categories) | set(X_test[c].cat.categories))
    X_train[c] = X_train[c].cat.set_categories(cats)
    X_test[c]  = X_test[c].cat.set_categories(cats)

# ---------------------------
# 5) Búsqueda de hiperparámetros con TSCV + MAPE
# ---------------------------
base = LGBMRegressor(random_state=15926)

param_dist = {
    "n_estimators":       [300, 500, 800, 1200],
    "learning_rate":      [0.03, 0.05, 0.08, 0.1],
    "num_leaves":         [31, 63, 127, 255],
    "max_depth":          [-1, 6, 8, 10],
    "min_child_samples":  [10, 20, 40, 80],
    "subsample":          [0.7, 0.8, 0.9, 1.0],       # alias bagging_fraction
    "colsample_bytree":   [0.7, 0.8, 0.9, 1.0],       # alias feature_fraction
    "reg_alpha":          [0.0, 0.1, 0.3, 1.0],
    "reg_lambda":         [0.0, 1.0, 3.0, 10.0],
}

tscv = TimeSeriesSplit(n_splits=3)
wmape_scorer = make_scorer(wmape, greater_is_better=False)

search = RandomizedSearchCV(
    estimator=base,
    param_distributions=param_dist,
    n_iter=100,
    scoring=wmape_scorer,         
    cv=tscv,
    n_jobs=-1,
    random_state=15926,
    verbose=0,
)

search.fit(X_train, y_train, **fit_kwargs)
best_params = search.best_params_

# 2) entrena final y reporta métricas robustas además del MAE/RMSE
lgbm = LGBMRegressor(random_state=15926, **best_params).fit(X_train, y_train, **fit_kwargs)
y_pred = lgbm.predict(X_test)

mae  = mean_absolute_error(y_test, y_pred)
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))

# wMAPE y sMAPE
wmape_val = wmape(y_test, y_pred) * 100
smape_val = smape(y_test, y_pred) * 100

# (opcional) MAPE clippeado: usa, por ejemplo, el percentil 5 de |y_train| como denominador mínimo
min_denom = np.percentile(np.abs(y_train), 5)  # ajusta al dominio (o pon 1.0 si tiene sentido físico)
mape_c = mape_clip(y_test, y_pred, min_denom) * 100

print("\nMétricas hold-out (20% final):")
print(f"MAE   : {mae:,.4f}")
print(f"RMSE  : {rmse:,.4f}")
print(f"wMAPE : {wmape_val:,.2f}%")
print(f"sMAPE : {smape_val:,.2f}%")
print(f"MAPE† : {mape_c:,.2f}%   †(clip con min_denom={min_denom:,.4f})")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007907 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23036
[LightGBM] [Info] Number of data points in the train set: 24268, number of used features: 95
[LightGBM] [Info] Start training from score 10.230850
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007563 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23036
[LightGBM] [Info] Number of data points in the train set: 24268, number of used features: 95
[LightGBM] [Info] Start training from score 10.230850

Métricas hold-out (20% final):
MAE   : 0.2017
RMSE  : 0.4178
wMAPE : 2.77%
sMAPE : 6.00%
MAPE† : 4.05%   †(clip con min_denom=1.1706)
