In [1]:
# === catboost_train_score_numeric.py ===
import numpy as np
import pandas as pd
from typing import Dict, Any, Optional, List
from catboost import CatBoostClassifier, Pool

# Column names excluded from the feature set when the caller does not supply drop_cols.
IGNORED_COLS_DEFAULT = ["filepath", "path"]

def _auto_class_weights(y: pd.Series) -> Optional[List[float]]:
    """
    Si hay desbalance, calcula pesos inversamente proporcionales a la frecuencia.
    Si está balanceado, regresa None.
    """
    vc = y.value_counts(normalize=True)
    if vc.min() < 0.4:  # umbral simple de desbalance
        # Importante: mantener el orden de clases como strings/nums ordenados por valor
        # CatBoost mapeará internamente; usaremos model.classes_ luego.
        return (1.0 / vc).tolist()
    return None

def _select_numeric_features(df: pd.DataFrame, drop_cols: List[str], target_col: Optional[str]) -> List[str]:
    cols = df.columns.tolist()
    cols = [c for c in cols if c not in (drop_cols or [])]
    if target_col and target_col in cols:
        cols.remove(target_col)
    num_cols = df[cols].select_dtypes(include=[np.number]).columns.tolist()
    if not num_cols:
        raise ValueError("No se encontraron columnas numéricas después de excluir filepath/path/target.")
    return num_cols

def train_and_score_catboost_numeric(
    train_df: pd.DataFrame,
    score_df: pd.DataFrame,
    target_col: str = "clase",
    drop_cols: Optional[List[str]] = None,
    params: Optional[Dict[str, Any]] = None,
    model_path: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Train a CatBoostClassifier on numeric features only and predict on ``score_df``.

    - Fits on the whole of ``train_df`` (no early stopping, no holdout).
    - Columns listed in ``drop_cols`` are never used as features; those present
      in ``score_df`` are copied into the prediction frame.
    - Features are the numeric columns shared by both frames, in ``train_df`` order.
    - Inverse-frequency class weights are added for imbalanced targets unless
      ``params`` already configures class weighting.

    Parameters
    ----------
    train_df : pd.DataFrame
        Training data; must contain ``target_col``.
    score_df : pd.DataFrame
        Data to predict on. ``target_col`` is dropped from it if present.
    target_col : str
        Name of the label column in ``train_df``.
    drop_cols : Optional[List[str]]
        Columns to exclude from the features. ``None`` uses
        ``IGNORED_COLS_DEFAULT``; an explicit empty list excludes nothing.
    params : Optional[Dict[str, Any]]
        CatBoost parameters that override the defaults set here.
    model_path : Optional[str]
        When given, the fitted model is saved to this path.

    Returns
    -------
    Dict[str, Any]
        ``"model"`` (fitted classifier), ``"used_numeric_features"`` (feature
        names) and ``"preds_score"`` (DataFrame indexed like ``score_df`` with
        the kept ``drop_cols`` columns, ``y_pred`` and one ``p_<class>`` column
        per class when probabilities are available).

    Raises
    ------
    ValueError
        If ``target_col`` is missing from ``train_df``, if either frame has no
        numeric column after the exclusions, or if the frames share no numeric
        column.
    """
    import warnings

    # `is None` rather than `or`: an explicit empty list must mean "exclude nothing".
    if drop_cols is None:
        drop_cols = IGNORED_COLS_DEFAULT

    # Basic validation
    if target_col not in train_df.columns:
        raise ValueError(f"'{target_col}' no está en train_df.")
    if target_col in score_df.columns:
        # Avoid leakage if the label was left in; drop() already returns a new frame.
        score_df = score_df.drop(columns=[target_col])

    # 1) Numeric columns of each frame (after excluding drop_cols and the target)
    train_num_cols = _select_numeric_features(train_df, drop_cols, target_col)
    score_num_cols = _select_numeric_features(score_df, drop_cols, target_col=None)

    # 2) Shared columns, kept in train order (set lookup keeps this linear)
    score_num_set = set(score_num_cols)
    shared_num_cols = [c for c in train_num_cols if c in score_num_set]
    if not shared_num_cols:
        raise ValueError("Train y Score no comparten columnas numéricas después de los filtros.")

    X_train = train_df[shared_num_cols]
    y_train = train_df[target_col]
    X_score = score_df[shared_num_cols]

    # 3) Parameters (no early stopping; train on ALL of train)
    is_multiclass = y_train.nunique() > 2
    default_params = dict(
        loss_function="MultiClass" if is_multiclass else "Logloss",
        eval_metric="MultiClass" if is_multiclass else "AUC",
        learning_rate=0.05,
        depth=6,
        l2_leaf_reg=10.0,
        iterations=2000,
        random_seed=42,
        verbose=False,
        use_best_model=False,     # no holdout set, so there is no "best" iteration
        class_weights=None        # filled in below when applicable
    )

    user_params = dict(params) if params else {}

    # CatBoost documents class_weights, auto_class_weights and scale_pos_weight
    # as options that must not be combined, so only inject automatic weights
    # when the caller has not chosen a weighting scheme.
    weighting_keys = ("class_weights", "auto_class_weights", "scale_pos_weight")
    caller_sets_weighting = any(user_params.get(k) is not None for k in weighting_keys)
    if not caller_sets_weighting:
        cw = _auto_class_weights(y_train)
        if cw is not None:
            default_params["class_weights"] = cw

    default_params.update(user_params)

    # 4) Train
    train_pool = Pool(X_train, y_train)
    model = CatBoostClassifier(**default_params)
    model.fit(train_pool)

    # 5) Predict on score; predict() may return an (n, 1) array, so flatten it
    score_pool = Pool(X_score)
    y_pred = np.asarray(model.predict(score_pool)).ravel()

    # Per-class probabilities are best-effort: a failure is reported, not hidden.
    prob_cols: List[str] = []
    try:
        proba = model.predict_proba(score_pool)
    except Exception as exc:
        warnings.warn(
            f"predict_proba failed; probability columns are omitted: {exc!r}",
            RuntimeWarning,
        )
        proba = None

    if proba is not None:
        classes_ = getattr(model, "classes_", None)
        if classes_ is None:
            classes_ = np.unique(y_train)
        # One p_<class> column per class, in the order of classes_
        prob_cols = [f"p_{str(c)}" for c in classes_]

    # 6) Build the prediction frame: kept drop_cols columns first, then outputs
    preds_df = pd.DataFrame(index=score_df.index)
    for c in drop_cols:
        if c in score_df.columns:
            preds_df[c] = score_df[c]

    preds_df["y_pred"] = y_pred

    if proba is not None:
        for j, cname in enumerate(prob_cols):
            preds_df[cname] = proba[:, j]

    # 7) Save the model if requested
    if model_path:
        model.save_model(model_path)

    return {
        "model": model,
        "used_numeric_features": shared_num_cols,
        "preds_score": preds_df
    }



In [13]:
# Example call: load the train/score feature tables and run the pipeline.
train_df = pd.read_parquet(r'model/audio_features_train.parquet')  # has 'clase'
score_df = pd.read_parquet(r'model/audio_features_test.parquet')  # does NOT have 'clase'

out = train_and_score_catboost_numeric(
    train_df=train_df,
    score_df=score_df,
    target_col="clase",
    drop_cols=["nombre_archivo", "path"],
    params={"iterations": 1500, "depth": 5, "l2_leaf_reg": 12.0},
    model_path="modelo_catboost.cbm",
)
print("Features usadas:", len(out["used_numeric_features"]))
print(out["preds_score"])



Features usadas: 116
   nombre_archivo  y_pred       p_1       p_2       p_3       p_4       p_5
0           1.wav       4  0.002017  0.004600  0.003525  0.943621  0.046237
1          10.wav       2  0.013846  0.850051  0.018623  0.028880  0.088599
2          11.wav       4  0.003369  0.007971  0.006404  0.568029  0.414227
3          12.wav       2  0.008453  0.906503  0.010593  0.013710  0.060741
4          13.wav       2  0.015309  0.794087  0.020001  0.036925  0.133677
5          14.wav       5  0.000912  0.002033  0.001690  0.005830  0.989535
6          15.wav       5  0.001714  0.003352  0.002903  0.011637  0.980393
7          16.wav       2  0.017224  0.829541  0.021198  0.038044  0.093993
8          17.wav       5  0.014785  0.028155  0.035442  0.150090  0.771528
9          18.wav       4  0.008132  0.015700  0.012873  0.798402  0.164892
10         19.wav       5  0.281788  0.064656  0.088371  0.065364  0.499819
11          2.wav       5  0.009001  0.016599  0.021234  0.043517  

In [14]:
# Keep only the 'nombre_archivo' and 'y_pred' columns of the prediction frame.
preds = out["preds_score"]
solo_cols = preds.loc[:, ["nombre_archivo", "y_pred"]]


In [15]:
# Show the two-column selection (nombre_archivo, y_pred) as the cell output.
solo_cols

Unnamed: 0,nombre_archivo,y_pred
0,1.wav,4
1,10.wav,2
2,11.wav,4
3,12.wav,2
4,13.wav,2
5,14.wav,5
6,15.wav,5
7,16.wav,2
8,17.wav,5
9,18.wav,4
