# 3. Model Training
**Objetivo**: Entrenar y evaluar los modelos (`rf`, `logreg`, `lgbm`) usando validación cruzada multilabel, medir ROC-AUC por clase y guardar artefactos. 

In [2]:
import sys, os
from pathlib import Path
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

In [3]:
# Put the parent of the current working directory at the front of sys.path
# (presumably the project root, so that the `src` package can be imported).
root = str(Path.cwd().parent)
if root not in sys.path:
    sys.path.insert(0, root)
print(root)

C:\Users\ignag\OneDrive\Documentos\tfg‑steel‑plate‑defects


In [4]:
# Import project modules (the parent directory was added to sys.path in the previous cell)
from src import ROOT_DIR
from src.pipeline.preprocessing import build_preprocessing_pipeline
from src.models.training import (
    load_data, make_holdout_split, make_folds,
    compute_pos_weights, build_estimator, roc_auc_by_class
)

# Parameters
# Model names this notebook knows how to train.
VALID_MODELS = ("rf", "logreg", "lgbm")
# Inside a Jupyter kernel sys.argv holds the kernel launcher's own arguments
# (e.g. "-f" and the connection file), so sys.argv[1] is only trusted when it
# actually names a known model; anything else falls back to the default.
_cli_model = sys.argv[1] if len(sys.argv) > 1 else None
MODEL_NAME = _cli_model if _cli_model in VALID_MODELS else "lgbm"
N_SPLITS = 5      # number of cross-validation folds
RANDOM_STATE = 2  # seed constant (not referenced by the cells visible here)
DATA_RAW = ROOT_DIR / "data/raw/playground-series-s4e3"

In [6]:
# Load the feature matrix X and the target y through the project helper.
X, y = load_data()

# Multilabel-stratified hold-out: 15% of the rows are set aside.
# NOTE(review): no seed is passed here although RANDOM_STATE is defined in the
# parameters cell — confirm that make_holdout_split seeds itself.
train_idx, hold_idx = make_holdout_split(X, y, test_size=0.15)
print(f"Train: {len(train_idx)} filas, Hold-out: {len(hold_idx)} filas")

# Multilabel folds built over the training indices; later cells iterate over
# `folds` and use each element as the validation indices of one fold.
folds = make_folds(train_idx, y, n_splits=N_SPLITS)
print("Número de folds:", len(folds))

Train: 16336 filas, Hold-out: 2883 filas
Número de folds: 5


In [7]:
import warnings
# Oculta TODOS los FutureWarning cuyo módulo empiece por "sklearn."
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    module=r"sklearn\..*"
)

## 3. Pipeline y estimador

In [9]:
from src.models.training import run_cv

MODEL_NAME = "lgbm"   # one of "lgbm", "rf", "logreg"
# Cross-validation through the project helper. Uses the shared N_SPLITS
# constant (instead of a literal 5) so it cannot drift from the fold count
# used to build `folds` with make_folds.
cv_df = run_cv(model_name=MODEL_NAME, n_splits=N_SPLITS, do_oversample=True)

# Per-class weights computed from the training labels only
pos_weights = compute_pos_weights(y[train_idx])

# Build the preprocessing pipeline and the model
preprocessor = build_preprocessing_pipeline()

# NOTE(review): later cells handle `estimator` being either a list of
# per-label classifiers or a single multi-output estimator.
estimator = build_estimator(MODEL_NAME, pos_weights)
print("Modelo:", MODEL_NAME)

AUC media por fold:
    fold  mean_auc
0     0  0.882675
1     1  0.876933
2     2  0.881184
3     3  0.877993
4     4  0.875472
AUC global: 0.8788513693325003
Modelo: lgbm


## 4. Validación cruzada multilabel

In [21]:
# Manual multilabel cross-validation: for each fold, fit the preprocessor and
# the estimator on the training part and score ROC-AUC on the validation part.
cv_results = []
for i, val_idx in enumerate(folds):
    # Training rows of this fold: every training index not used for validation
    tr_idx = np.setdiff1d(train_idx, val_idx)
    X_tr_raw, y_tr = X.iloc[tr_idx], y[tr_idx]
    X_va_raw, y_va = X.iloc[val_idx], y[val_idx]

    # Preprocessing is fitted on the training part only, then applied to validation
    X_tr = preprocessor.fit_transform(X_tr_raw)
    X_va = preprocessor.transform(X_va_raw)

    # Fit and predict positive-class probabilities, one column per label
    if isinstance(estimator, list):
        # One binary classifier per label column of y
        y_pred_cols = []
        for k, clf in enumerate(estimator):
            clf.fit(X_tr, y_tr[:, k])
            y_pred_cols.append(clf.predict_proba(X_va)[:,1])
        y_pred = np.vstack(y_pred_cols).T
    else:
        # Single estimator; predict_proba is expected to yield one array per
        # label, of which column 1 (positive class) is kept
        estimator.fit(X_tr, y_tr)
        proba = estimator.predict_proba(X_va)
        y_pred = np.vstack([p[:,1] for p in proba]).T

    # Metrics for this fold, tagged with the fold number
    metrics = roc_auc_by_class(y_va, y_pred)
    metrics['fold'] = i
    cv_results.append(metrics)

# Results table, one row per fold
df_cv = pd.DataFrame(cv_results)
# Create the output directory first so to_csv does not fail on a fresh checkout
tables_dir = Path(ROOT_DIR) / 'reports/tables'
tables_dir.mkdir(parents=True, exist_ok=True)
df_cv.to_csv(tables_dir / f'{MODEL_NAME}_cv_metrics.csv', index=False)



## 5. Resultados y visualización

In [None]:
# Show the per-fold metrics table
display(df_cv)

# Overall mean of the per-fold mean AUC
mean_auc = df_cv['mean_auc'].mean()
print(f"Media global ROC-AUC: {mean_auc:.4f}")

# Box plot of ROC-AUC per class: every column except the bookkeeping ones
classes = [col for col in df_cv.columns if col not in ('fold', 'mean_auc')]
fig, ax = plt.subplots(figsize=(8, 5))
df_cv[classes].boxplot(rot=45, ax=ax)
ax.set_title(f"Distribución de ROC-AUC por clase ({MODEL_NAME})")
ax.set_ylabel('ROC-AUC')
fig.tight_layout()
plt.show()

## 6. Entrenamiento final y guardado de modelos

In [None]:
# Refit the preprocessor on all remaining training rows (everything except the hold-out)
X_rem = X.iloc[train_idx]
y_rem = y[train_idx]
X_rem_p = preprocessor.fit_transform(X_rem)

# Create the artefact directory first so joblib.dump does not fail on a fresh checkout
models_dir = Path(ROOT_DIR) / 'models'
models_dir.mkdir(parents=True, exist_ok=True)

# Fit the final estimator(s) and persist them
if isinstance(estimator, list):
    # One classifier per label column, each saved to its own file
    for k, clf in enumerate(estimator):
        clf.fit(X_rem_p, y_rem[:,k])
        joblib.dump(clf, models_dir / f'{MODEL_NAME}_label{k}_final.pkl')
else:
    estimator.fit(X_rem_p, y_rem)
    joblib.dump(estimator, models_dir / f'{MODEL_NAME}_final.pkl')

# Persist the fitted preprocessor (same file name regardless of MODEL_NAME)
joblib.dump(preprocessor, models_dir / 'preprocessor_final.pkl')

print("Modelos finales guardados en 'models/'")