# 03 - Model Training

**Purpose**: Train models, compare, save best one.

**Inputs**: `data/processed/features_v1_*.csv`

**Outputs**: `models/model_v1.joblib`

‚ö†Ô∏è **MEDICAL**: We prioritize RECALL - missing a recurrence is FATAL.

In [7]:
"""
03_model_training.ipynb
Entrenament de models per predicci√≥ de recidiva en c√†ncer d'endometri.
Focus: Maximitzar SENSIBILITAT (Recall) - Un Fals Negatiu √©s fatal!
"""

import pandas as pd
import numpy as np
import joblib
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    recall_score, precision_score, f1_score, make_scorer
)

# XGBoost
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    print("‚ö†Ô∏è XGBoost no instal¬∑lat. Executa: pip install xgboost")
    XGBOOST_AVAILABLE = False

# SMOTE per balanceig
try:
    from imblearn.over_sampling import SMOTE
    from imblearn.pipeline import Pipeline as ImbPipeline
    SMOTE_AVAILABLE = True
except ImportError:
    print("‚ö†Ô∏è imbalanced-learn no instal¬∑lat. Executa: pip install imbalanced-learn")
    SMOTE_AVAILABLE = False

# Constants
RANDOM_STATE = 42
PROCESSED_DATA_PATH = '../data/processed/dataset_procesado.csv'
MODELS_PATH = '../models/'

‚ö†Ô∏è imbalanced-learn no instal¬∑lat. Executa: pip install imbalanced-learn


In [6]:
# C√†rrega del dataset processat
df = pd.read_csv(PROCESSED_DATA_PATH)

# Separar X i y
X = df.drop(columns=['recidiva'])
y = df['recidiva']

print(f"üìä Dataset shape: {X.shape}")
print(f"\nüéØ Distribuci√≥ del target:")
print(y.value_counts())
print(f"\n‚ö†Ô∏è Ratio desbalanceig: {y.value_counts()[0] / y.value_counts()[1]:.2f}:1")

KeyError: "['recidiva'] not found in axis"

In [None]:
# Split estratificat per mantenir proporci√≥ del target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=RANDOM_STATE,
    stratify=y  # Cr√≠tic per datasets desbalancejats
)

print(f"Train: {X_train.shape[0]} mostres")
print(f"Test: {X_test.shape[0]} mostres")
print(f"\nDistribuci√≥ train: {y_train.value_counts().to_dict()}")
print(f"Distribuci√≥ test: {y_test.value_counts().to_dict()}")

In [None]:
# Escalar features (important per Logistic Regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Guardar scaler
import os
os.makedirs(MODELS_PATH, exist_ok=True)
joblib.dump(scaler, f'{MODELS_PATH}scaler_v1.joblib')
print(f"‚úÖ Scaler guardat a: {MODELS_PATH}scaler_v1.joblib")

In [None]:
def evaluate_model(model, X_test, y_test, model_name):
    """
    Avaluaci√≥ amb focus m√®dic: prioritzem SENSIBILITAT (Recall).
    Un Fals Negatiu (no detectar recidiva) √©s pitjor que un Fals Positiu.
    """
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None
    
    print(f"\n{'='*60}")
    print(f"üìã RESULTATS: {model_name}")
    print(f"{'='*60}")
    
    # M√®triques
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print(f"\nüéØ SENSIBILITAT (Recall): {recall:.3f}  ‚Üê PRIORITAT M√ÄXIMA")
    print(f"üìä Precisi√≥: {precision:.3f}")
    print(f"‚öñÔ∏è F1-Score: {f1:.3f}")
    
    if y_proba is not None:
        auc = roc_auc_score(y_test, y_proba)
        print(f"üìà ROC-AUC: {auc:.3f}")
    
    # Matriu de confusi√≥
    cm = confusion_matrix(y_test, y_pred)
    print(f"\nüìä Matriu de Confusi√≥:")
    print(f"   TN={cm[0,0]}  FP={cm[0,1]}")
    print(f"   FN={cm[1,0]}  TP={cm[1,1]}")
    
    # Interpretaci√≥ m√®dica
    fn = cm[1,0]
    if fn > 0:
        print(f"\n‚ö†Ô∏è ALERTA: {fn} Falsos Negatius (pacients amb recidiva no detectada)")
    else:
        print(f"\n‚úÖ Cap Fals Negatiu!")
    
    print(f"\n{classification_report(y_test, y_pred, target_names=['No Recidiva', 'Recidiva'])}")
    
    return {'recall': recall, 'precision': precision, 'f1': f1, 'model': model}

In [None]:
# Logistic Regression amb class_weight per compensar desbalanceig
log_reg = LogisticRegression(
    class_weight='balanced',  # Ajusta pesos autom√†ticament
    max_iter=1000,
    random_state=RANDOM_STATE,
    solver='lbfgs'
)

log_reg.fit(X_train_scaled, y_train)
results_lr = evaluate_model(log_reg, X_test_scaled, y_test, "Logistic Regression")

In [None]:
# Random Forest amb class_weight
rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',  # Important!
    max_depth=5,              # Evitar overfitting (poques mostres)
    min_samples_leaf=5,       # Regularitzaci√≥
    random_state=RANDOM_STATE,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)  # RF no necessita escalat
results_rf = evaluate_model(rf_model, X_test, y_test, "Random Forest")

In [None]:
if XGBOOST_AVAILABLE:
    # Calcular scale_pos_weight per desbalanceig
    scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]
    
    xgb_model = xgb.XGBClassifier(
        scale_pos_weight=scale_pos_weight,  # Compensa desbalanceig
        max_depth=3,                         # Evitar overfitting
        learning_rate=0.1,
        n_estimators=100,
        min_child_weight=5,                  # Regularitzaci√≥
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=RANDOM_STATE,
        eval_metric='logloss',
        use_label_encoder=False
    )
    
    xgb_model.fit(X_train, y_train)
    results_xgb = evaluate_model(xgb_model, X_test, y_test, "XGBoost")
else:
    print("‚ö†Ô∏è XGBoost no disponible")
    results_xgb = None

In [None]:
# Comparar models per Recall (prioritat m√®dica)
print("\n" + "="*60)
print("üèÜ COMPARACI√ì FINAL (Ordenat per RECALL)")
print("="*60)

results = [
    ("Logistic Regression", results_lr),
    ("Random Forest", results_rf),
]
if results_xgb:
    results.append(("XGBoost", results_xgb))

# Ordenar per recall (descendent)
results_sorted = sorted(results, key=lambda x: x[1]['recall'], reverse=True)

for i, (name, res) in enumerate(results_sorted):
    emoji = "ü•á" if i == 0 else "ü•à" if i == 1 else "ü•â"
    print(f"{emoji} {name}: Recall={res['recall']:.3f}, F1={res['f1']:.3f}")

# Seleccionar millor model
best_name, best_result = results_sorted[0]
best_model = best_result['model']
print(f"\n‚úÖ Millor model seleccionat: {best_name}")

In [None]:
# Guardar el millor model
model_filename = f'{MODELS_PATH}model_v1.joblib'
joblib.dump(best_model, model_filename)
print(f"‚úÖ Model guardat a: {model_filename}")

# Guardar feature names per l'app
feature_names = list(X.columns)
joblib.dump(feature_names, f'{MODELS_PATH}feature_names_v1.joblib')
print(f"‚úÖ Feature names guardats")

print(f"\nüìã Features utilitzades:")
for i, feat in enumerate(feature_names, 1):
    print(f"   {i}. {feat}")

In [None]:
import matplotlib.pyplot as plt

# Feature importance del Random Forest
if hasattr(best_model, 'feature_importances_'):
    importances = best_model.feature_importances_
elif hasattr(best_model, 'coef_'):
    importances = np.abs(best_model.coef_[0])
else:
    importances = None

if importances is not None:
    feat_imp = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=True)
    
    plt.figure(figsize=(10, 6))
    plt.barh(feat_imp['feature'], feat_imp['importance'], color='steelblue')
    plt.xlabel('Import√†ncia')
    plt.title('Import√†ncia de les Features')
    plt.tight_layout()
    plt.show()

## Next Step
‚Üí Go to `04_model_evaluation.ipynb`