In [None]:
# === catboost_clf_numeric.py ===
import numpy as np
import pandas as pd
from typing import Dict, Any, Optional
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score,
    classification_report, confusion_matrix
)

def _detect_problem_type(y: pd.Series) -> str:
    """Devuelve 'Binary' o 'Multiclass' según el número de clases."""
    classes = pd.Series(y).dropna().unique()
    return "Binary" if len(classes) == 2 else "Multiclass"

def _auto_class_weights(y: pd.Series) -> Optional[list]:
    """
    Para 2 clases: devuelve [w_neg, w_pos] inversamente proporcionales a la frecuencia.
    Para multiclase: lista de pesos por clase.
    Si clases balanceadas, regresa None.
    """
    vc = y.value_counts(normalize=True)
    if vc.min() < 0.4:  # umbral sencillo de desbalance
        weights = (1.0 / vc).reindex(sorted(vc.index)).tolist()
        return weights
    return None

def train_catboost_classifier_numeric(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    target_col: str,
    id_col: Optional[str] = None,
    params: Optional[Dict[str, Any]] = None,
    model_path: Optional[str] = None,
) -> Dict[str, Any]:
    """Train and evaluate a CatBoostClassifier on the numeric columns of a frame.

    Features are the numeric columns of ``train_df`` other than ``target_col``
    and ``id_col``; the same columns are read from ``test_df``.

    NOTE: ``test_df`` is passed as CatBoost's ``eval_set`` (used by the
    overfitting detector) and is also the data the metrics are computed on,
    so the reported metrics may be optimistic.

    Args:
        train_df: Training data; must contain ``target_col``.
        test_df: Evaluation data; must contain ``target_col`` and every
            feature column selected from ``train_df``.
        target_col: Name of the label column.
        id_col: Optional identifier column. It is never used as a feature and,
            when present in ``test_df``, becomes the first column of
            ``preds_df``.
        params: Optional CatBoost hyperparameters that override the defaults.
            If it carries ``class_weights``, ``auto_class_weights`` or
            ``scale_pos_weight``, the automatically computed class weights are
            not applied.
        model_path: Optional path; when given the fitted model is saved there.

    Returns:
        A dict with keys ``model``, ``problem_type``, ``metrics``,
        ``report_txt``, ``confusion_matrix``, ``preds_df`` and
        ``feature_importances``.

    Raises:
        ValueError: If ``target_col`` is missing from either frame, or if
            ``train_df`` has no numeric feature columns.
    """
    import warnings  # local import keeps this block self-contained

    # Explicit raises instead of `assert`: asserts vanish under `python -O`.
    if target_col not in train_df.columns:
        raise ValueError("target_col no está en train_df")
    if target_col not in test_df.columns:
        raise ValueError("target_col no está en test_df")

    # 1) Split X / y (numeric columns only). The identifier column is
    #    excluded so a numeric id cannot leak into the feature set.
    non_feature_cols = [target_col]
    if id_col and id_col != target_col and id_col in train_df.columns:
        non_feature_cols.append(id_col)
    num_cols = (
        train_df.drop(columns=non_feature_cols)
        .select_dtypes(include=[np.number])
        .columns.tolist()
    )
    if not num_cols:
        raise ValueError("No hay columnas numéricas en train_df (revisa tipos).")

    X_train = train_df[num_cols]
    y_train = train_df[target_col]
    X_test = test_df[num_cols]
    y_test = test_df[target_col]

    # 2) Problem type drives the loss function and the evaluation metric.
    problem_type = _detect_problem_type(y_train)
    is_binary = problem_type == "Binary"
    loss_function = "Logloss" if is_binary else "MultiClass"
    eval_metric = "AUC" if is_binary else "MultiClass"

    # 3) Class weights (None when the target is balanced).
    class_weights = _auto_class_weights(y_train)

    # 4) Default hyperparameters; caller-supplied `params` take precedence.
    default_params = dict(
        loss_function=loss_function,
        eval_metric=eval_metric,
        learning_rate=0.05,
        depth=6,
        l2_leaf_reg=6.0,
        iterations=3000,
        random_seed=42,
        od_type="Iter",   # overfitting detector (early stopping)
        od_wait=200,      # patience, in iterations
        verbose=False,
    )
    user_params = dict(params) if params else {}
    # CatBoost does not allow class_weights together with auto_class_weights
    # or scale_pos_weight, so the automatic weights are only applied when the
    # caller has not chosen a weighting scheme.
    weighting_keys = ("class_weights", "auto_class_weights", "scale_pos_weight")
    caller_sets_weighting = any(key in user_params for key in weighting_keys)
    if class_weights is not None and not caller_sets_weighting:
        default_params["class_weights"] = class_weights
    default_params.update(user_params)

    # 5) Pools wrap the feature frames and labels for CatBoost.
    train_pool = Pool(X_train, y_train)
    test_pool = Pool(X_test, y_test)

    # 6) Fit, using the test set as eval_set for the overfitting detector.
    model = CatBoostClassifier(**default_params)
    model.fit(train_pool, eval_set=test_pool)

    # 7) Predictions. For multiclass models CatBoost returns labels with shape
    #    (n_samples, 1); flatten so they can be used as a DataFrame column.
    y_pred = np.asarray(model.predict(test_pool)).ravel()

    # Probabilities are best-effort: a failure is reported, not fatal.
    y_proba = None
    try:
        y_proba = model.predict_proba(test_pool)
    except Exception as exc:
        warnings.warn(
            f"predict_proba failed; probabilities are omitted: {exc}",
            RuntimeWarning,
        )

    # 8) Metrics
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "f1_macro": f1_score(y_test, y_pred, average="macro"),
    }

    has_binary_proba = (
        is_binary
        and y_proba is not None
        and y_proba.ndim == 2
        and y_proba.shape[1] == 2
    )
    if has_binary_proba:
        try:
            metrics["roc_auc"] = roc_auc_score(y_test, y_proba[:, 1])
        except Exception as exc:
            # e.g. y_test holding a single class; keep the other metrics.
            warnings.warn(
                f"roc_auc could not be computed and is omitted: {exc}",
                RuntimeWarning,
            )

    # Text report and confusion matrix, for manual inspection.
    report = classification_report(y_test, y_pred, output_dict=False)
    cm = confusion_matrix(y_test, y_pred)

    # 9) Feature importances, most important first.
    fi = (
        pd.Series(
            model.get_feature_importance(train_pool),
            index=num_cols,
            name="importance",
        )
        .sort_values(ascending=False)
        .reset_index()
        .rename(columns={"index": "feature"})
    )

    # 10) Predictions frame
    preds_df = pd.DataFrame({
        "y_true": y_test.values,
        "y_pred": y_pred,
    })
    if y_proba is not None:
        if is_binary:
            # Second column of predict_proba.
            preds_df["p_pos"] = y_proba[:, 1]
        else:
            # Probability assigned to the predicted (most likely) class.
            preds_df["p_pred"] = y_proba.max(axis=1)
    if id_col and id_col in test_df.columns:
        preds_df.insert(0, id_col, test_df[id_col].values)

    # 11) Persist the model (optional)
    if model_path:
        model.save_model(model_path)

    return {
        "model": model,
        "problem_type": problem_type,
        "metrics": metrics,
        "report_txt": report,
        "confusion_matrix": cm,
        "preds_df": preds_df,
        "feature_importances": fi,
    }


In [None]:
from sklearn.datasets import make_classification
# Synthetic 3-class dataset: 30 numeric features, 10 informative, 5 redundant.
X, y = make_classification(
    n_samples=3000,
    n_features=30,
    n_informative=10,
    n_redundant=5,
    n_classes=3,
    weights=None,
    random_state=42,
)
df = pd.DataFrame(X, columns=[f"x{i}" for i in range(30)]).assign(target=y)

# 80/20 split: sample the training rows, keep the remainder as the test set,
# then give both frames a fresh 0..n-1 index.
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(index=train_df.index).reset_index(drop=True)
train_df = train_df.reset_index(drop=True)

out = train_catboost_classifier_numeric(
    train_df=train_df,
    test_df=test_df,
    target_col="target",
    id_col=None,                   # or an id column if you have one, e.g. "SamplingOperations_code"
    params={"iterations": 1500},   # any hyperparameter can be overridden here
    model_path=None,               # optional, e.g. "cat_model.cbm"
)

# Summary of the run: problem type, metrics, report, top features, sample preds.
print("== Tipo de problema:", out["problem_type"])
print("== Métricas:", out["metrics"])
print("== Reporte ===", out["report_txt"], sep="\n")
print("== Top 10 features ===", out["feature_importances"].head(10), sep="\n")
print("== Preds sample ===", out["preds_df"].head(), sep="\n")