In [1]:
#jupyter nbconvert --to script Model.ipynb


#TODO: Add the final metrics I am going to use
#TODO: Platt scaling
#TODO: Hyperband
#TODO: Corregir los logs
#TODO: Conformal prediction

# Optional: with Shapley values (e.g. SHAP) or something similar?
#TODO: Study the relevance of the features for each model and each feature group.

In [2]:
# Remove log files left over from previous runs
import os
import glob
import logging

# Flush and close every logging handler first, so that no log file is still
# held open when we try to delete it (avoids permission problems).
logging.shutdown()

# Delete every *.log file in the current working directory.
for log_file in glob.glob("*.log"):
    try:
        os.remove(log_file)
    except OSError as exc:
        # A locked or already-removed file must not abort the notebook:
        # report it and carry on with the remaining files.
        print(f"Could not remove log file '{log_file}': {exc}")



# Star-Galaxy Classification using ALHAMBRA Photometry

This notebook implements and evaluates several machine learning models for classifying astronomical objects as stars or galaxies based on multi-band photometric data from the ALHAMBRA survey, using labels derived from higher-resolution COSMOS2020 data.

**Target Variable:** `acs_mu_class` (from COSMOS2020)
 - Encoded as 1 for Galaxy, 2 for Star and 3 for fake detections. Fake detections are dropped, and the remaining labels are remapped to 0 (Galaxy, majority class) and 1 (Star, minority class).

**Features:** Selected columns from the ALHAMBRA survey data.

**Models:**
1. Support Vector Machine (SVM)
2. Decision Tree (CART)
3. Random Forest
4. XGBoost
5. LightGBM

## 0. Setup and Configuration

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import math
import logging
from tqdm.notebook import tqdm # Use notebook tqdm for better integration in Jupyter
import time
from datetime import datetime
import joblib # For saving/loading models efficiently
import glob

# Scikit-learn imports
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV,  ParameterSampler
from sklearn.linear_model import LogisticRegression
from scipy.stats import loguniform # For hyperparameter distributions
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, roc_auc_score,
    brier_score_loss, precision_recall_curve, auc
)   
import seaborn as sns # For confusion matrix heatmap

# Boosting models
import xgboost as xgb
import lightgbm as lgb


# Logging setup: write INFO-and-above records to a timestamped file only
logging.shutdown()

_log_filename = 'models_{}.log'.format(datetime.now().strftime("%d_%H-%M-%S"))
logging.basicConfig(
    force=True,  # replace whatever handlers an earlier run left on the root logger
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename=_log_filename,
)

# Keep only file handlers on the root logger so records are never echoed to the console
_root_logger = logging.getLogger()
_root_logger.handlers = [
    handler for handler in _root_logger.handlers
    if isinstance(handler, logging.FileHandler)
]

## 1. Loading Dataset & Feature Selection

**Interesting Feature Combinations for Modeling:**
 
 The feature groups are defined as follows:
 - Group 1: Morphology features and their uncertainties
 - Group 2: Photometry magnitudes
 - Group 3: Photometry magnitudes and their errors
 - Group 4: Redshift features and their uncertainties
 - Group 5: Combination of photometry magnitudes and errors with morphology features (including uncertainties)
 - Group 6: Combination of photometry magnitudes and errors, morphology features (including uncertainties), and redshift features (including uncertainties)
 - Group 7: Everything in Group 6 plus the per-band quality/auxiliary features (`nfobs`, `irms_*`)




In [4]:
# Load the ALHAMBRA x COSMOS2020 cross-matched catalogue
df = pd.read_csv('data/match_alhambra_cosmos2020_ACS_class_0.8arcsec.csv')
logging.info(f"DataFrame created with shape: {df.shape}")

# ACS classification codes: 1 = Galaxy (majority), 2 = Star (minority), 3 = fake detection
logging.info("Original class counts:")
logging.info(df['acs_mu_class'].value_counts().to_string())

# Drop fake detections (class 3).
# .copy() detaches the filtered frame from the raw one, so the column assignment
# below operates on an independent DataFrame instead of a slice (this is what
# triggers pandas' SettingWithCopyWarning).
n_fakes = (df['acs_mu_class'] == 3).sum()
logging.info(f"Number of fake detections (class 3): {n_fakes}")
df = df[df['acs_mu_class'] != 3].copy()

# Remap labels: 1 (Galaxy) -> 0, 2 (Star) -> 1.
# Series.map() turns every value that is not a key of the dict into NaN.
df['acs_mu_class'] = df['acs_mu_class'].map({1: 0, 2: 1})

# Make unexpected/missing labels visible instead of letting them vanish silently
# when rows with NaNs are dropped later on.
n_unmapped = int(df['acs_mu_class'].isna().sum())
if n_unmapped:
    logging.warning(f"{n_unmapped} rows have a missing or unexpected acs_mu_class and were mapped to NaN.")

logging.info("After dropping fakes and mapping classes (0: Galaxy, 1: Star):")
logging.info(df['acs_mu_class'].value_counts().to_string())

In [5]:
# Input features

# --- Feature categories, spelled with the exact ALHAMBRA column names ---

# 1. ALHAMBRA morphology features (SExtractor-based) and the matching error-type column
morphology_features = 'area fwhm stell ell a b theta rk rf'.split()
morphology_err = ['s2n']
morphology_mags_errors = [*morphology_features, *morphology_err]

# 2. ALHAMBRA photometry magnitudes.
# The 20 optical band names follow the pattern F<number>W, with the number
# running from 365 to 954 in steps of 31.
OPTICAL_MAG_COLS = [f'F{band_number}W' for band_number in range(365, 955, 31)]
_EXTRA_BANDS = ['J', 'H', 'KS', 'F814W']  # remaining magnitude columns
photometry_magnitudes = OPTICAL_MAG_COLS + _EXTRA_BANDS

# 3. ALHAMBRA photometry uncertainties: same band names with a 'd' prefix
OPTICAL_ERR_COLS = ['d' + band for band in OPTICAL_MAG_COLS]
photometry_uncertainties = OPTICAL_ERR_COLS + ['d' + band for band in _EXTRA_BANDS]

photometry_mags_errors = photometry_magnitudes + photometry_uncertainties

# 4. ALHAMBRA photometric redshift & derived features (BPZ-based)
redshift_features = [
    'zb_1',
    'zb_Min_1',
    'zb_Max_1',
    'Tb_1',
    'z_ml',
    't_ml',
    'Stell_Mass_1',
    'M_Abs_1',
    'MagPrior',
]
redshift_uncertainties = ['Odds_1', 'Chi2']
redshift_mags_errors = redshift_features + redshift_uncertainties

# 5. ALHAMBRA quality/auxiliary features: 'nfobs' plus one 'irms_' column per band
OPTICAL_IRMS_COLS = ['irms_' + band for band in OPTICAL_MAG_COLS]
quality_aux_features = (
    ['nfobs']
    + OPTICAL_IRMS_COLS
    + ['irms_' + band for band in _EXTRA_BANDS]
)

# --- Columns that are NOT used as model inputs ---

# ALHAMBRA ID, COSMOS ID
non_modeling_identifiers = ['ID_1', 'id_2']

# ALHAMBRA astrometry, COSMOS astrometry, and the cross-match separation
non_modeling_astrometry = ['RA_1', 'Dec_1', 'x', 'y', 'ra_2', 'dec_2', 'Separation']

# ALHAMBRA object/photometry flags and overall quality flags
non_modeling_flags = [
    'photoflag', 'xray', 'PercW', 'Satur_Flag',
    'irms_OPT_Flag', 'irms_NIR_Flag',
]

# ALHAMBRA's own classification
alhambra_prediction = ['Stellar_Flag']

# Specific aperture magnitudes (total magnitudes are used instead)
non_modeling_aperture_mags = ['F814W_3arcs', 'dF814W_3arcs', 'F814W_3arcs_corr']

# Measurements/flags derived from COSMOS data (HST, HSC, VISTA...)
non_modeling_cosmos_features = (
    ['model_flag']
    + ['flag_hsc', 'flag_supcam', 'flag_udeep', 'flag_uvista']
    + ['hsc_r_mag', 'hsc_r_magerr', 'hsc_r_valid']
    + ['hsc_i_mag', 'hsc_i_magerr', 'hsc_i_valid']
    + ['uvista_j_mag', 'uvista_j_magerr', 'uvista_j_valid']
    + ['uvista_ks_mag', 'uvista_ks_magerr', 'uvista_ks_valid']
    + ['acs_f814w_mag', 'acs_f814w_magerr']
    + ['acs_fwhm_world', 'acs_mu_max']
    + ['solution_model']  # categorical, but still COSMOS-derived information
)

# The COSMOS classification label to predict
target_variable = ['acs_mu_class']

##########################################################################################
#! --- Single dictionary giving access to every named column list ---
##########################################################################################

feature_sets = dict(
    # --- Potential input feature sets ---
    morphology_only=morphology_mags_errors,
    photometry_magnitudes_only=photometry_magnitudes,
    photometry_mags_errors=photometry_mags_errors,
    photometry_plus_morphology=photometry_mags_errors + morphology_mags_errors,
    photometry_no_redshift=photometry_mags_errors + morphology_mags_errors + quality_aux_features,
    redshift_only=redshift_mags_errors,
    full_alhambra_all=(
        morphology_mags_errors
        + photometry_mags_errors
        + redshift_mags_errors
        + quality_aux_features
    ),
    # --- Excluded feature sets ---
    non_modeling_identifiers=non_modeling_identifiers,
    non_modeling_astrometry=non_modeling_astrometry,
    non_modeling_flags=non_modeling_flags,
    non_modeling_aperture_mags=non_modeling_aperture_mags,
    non_modeling_cosmos_features=non_modeling_cosmos_features,
    alhambra_prediction=alhambra_prediction,
    target_variable=target_variable,
)

#! Only group_7 (built from 'full_alhambra_all') contains the quality/auxiliary columns.
# Each group is the concatenation of the listed feature sets, in order,
# followed by the target column.
_group_sources = {
    'group_1': ['morphology_only'],
    'group_2': ['photometry_magnitudes_only'],
    'group_3': ['photometry_mags_errors'],
    'group_4': ['redshift_only'],
    'group_5': ['photometry_plus_morphology'],
    'group_6': ['photometry_mags_errors', 'morphology_only', 'redshift_only'],
    'group_7': ['full_alhambra_all'],
}
groups = {
    group_name: [col for set_key in set_keys for col in feature_sets[set_key]]
                + list(feature_sets['target_variable'])
    for group_name, set_keys in _group_sources.items()
}

# --- Function to get a specific feature set (Unchanged from before) ---

def get_feature_set(df, set_name, groups = groups):
    """
    Restrict a DataFrame to the columns of one named feature group.

    Args:
        df (pd.DataFrame): The input DataFrame.
        set_name (str): Key of the desired group in ``groups``
                        (for the default mapping: 'group_1', 'group_2', ...).
        groups (dict): Mapping from group name to its list of column names.
                       Defaults to the module-level ``groups`` dictionary that
                       existed when this function was defined.

    Returns:
        pd.DataFrame: ``df`` restricted to those columns of the group that are
                      present in it, in the order of the group definition.
                      An empty DataFrame if none of the group's columns exist.

    Raises:
        ValueError: If ``set_name`` is not a key of ``groups``.
    """
    if set_name not in groups:
        raise ValueError(f"Feature set group '{set_name}' not defined. "
                         f"Available groups: {list(groups.keys())}")

    # Partition the group's columns by whether the DataFrame actually has them
    present_cols, absent_cols = [], []
    for column in groups[set_name]:
        bucket = present_cols if column in df.columns else absent_cols
        bucket.append(column)

    # Report columns the group definition asks for but the DataFrame lacks
    if absent_cols:
        print(f"Warning: The following columns defined for feature set group '{set_name}'"
              f" were not found in the DataFrame and will be excluded: {absent_cols}")

    # Nothing usable: hand back an empty frame
    if not present_cols:
        print(f"Warning: No columns for feature set group '{set_name}' found in the DataFrame.")
        return pd.DataFrame()

    print(f"Selecting feature set group '{set_name}' with {len(present_cols)} columns.")
    return df[present_cols]


In [6]:
# Quality check: (1) is every df column assigned to some entry of feature_sets,
# and (2) for each modelling group, which df columns does it keep or leave out.
all_feature_cols = set()
for cols in feature_sets.values():
    all_feature_cols.update(cols)

df_cols_set = set(df.columns)
not_in_feature_sets = df_cols_set - all_feature_cols

if not_in_feature_sets:
    print(f"Columns in df not included in any feature_sets: {sorted(not_in_feature_sets)}")
else:
    print("All df columns are included in feature_sets.")


# Walk over the groups actually defined in `groups` (instead of a hard-coded
# list of names) so that a newly added group is checked automatically.
for group_name, feature_set in groups.items():
    print(f"\n=== {group_name} ===")

    # Columns of the group that really exist in the data
    group_df = get_feature_set(df, group_name)
    available_cols = list(group_df.columns)

    # df columns that are NOT part of this group's definition, i.e. the columns
    # the group leaves out. NOTE: the "Features missing" label printed below
    # refers to these excluded columns, not to columns absent from the data.
    group_members = set(feature_set)
    missing_cols = [col for col in df.columns if col not in group_members]

    print(f"\nFeatures present ({len(available_cols)} columns):")
    print(list(sorted(available_cols)))

    print(f"\nFeatures missing ({len(missing_cols)} columns):")
    print(list(sorted(missing_cols)))




All df columns are included in feature_sets.

=== group_1 ===
Selecting feature set group 'group_1' with 11 columns.

Features present (11 columns):
['a', 'acs_mu_class', 'area', 'b', 'ell', 'fwhm', 'rf', 'rk', 's2n', 'stell', 'theta']

Features missing (125 columns):
['Chi2', 'Dec_1', 'F365W', 'F396W', 'F427W', 'F458W', 'F489W', 'F520W', 'F551W', 'F582W', 'F613W', 'F644W', 'F675W', 'F706W', 'F737W', 'F768W', 'F799W', 'F814W', 'F814W_3arcs', 'F814W_3arcs_corr', 'F830W', 'F861W', 'F892W', 'F923W', 'F954W', 'H', 'ID_1', 'J', 'KS', 'M_Abs_1', 'MagPrior', 'Odds_1', 'PercW', 'RA_1', 'Satur_Flag', 'Separation', 'Stell_Mass_1', 'Stellar_Flag', 'Tb_1', 'acs_f814w_mag', 'acs_f814w_magerr', 'acs_fwhm_world', 'acs_mu_max', 'dF365W', 'dF396W', 'dF427W', 'dF458W', 'dF489W', 'dF520W', 'dF551W', 'dF582W', 'dF613W', 'dF644W', 'dF675W', 'dF706W', 'dF737W', 'dF768W', 'dF799W', 'dF814W', 'dF814W_3arcs', 'dF830W', 'dF861W', 'dF892W', 'dF923W', 'dF954W', 'dH', 'dJ', 'dKS', 'dec_2', 'flag_hsc', 'flag_supcam

## 2. Data Preprocessing and Splitting

In [7]:
# --- Split proportions: test / validation / calibration ---
TEST_SIZE, VAL_SIZE, CAL_SIZE = 0.20, 0.10, 0.10
# The training share is whatever remains: 1 - (TEST_SIZE + VAL_SIZE + CAL_SIZE)

# How to split: 'stratified' or 'random' ('stratified' is recommended for imbalanced datasets)
SPLIT_STRATEGY = 'stratified'

# Seed for reproducibility
RANDOM_SEED = 33

# Name of the label column, taken from the feature-set definitions
TARGET_COLUMN = feature_sets.get('target_variable', [])[0]

# Directory where trained models are saved (created if it does not exist yet)
MODEL_DIR = "trained_models"
os.makedirs(MODEL_DIR, exist_ok=True)


In [8]:
# --- Data Cleaning ---
logging.info(f"Original dataset size: {df.shape}")

# Feature group used for modelling (any key of `groups`, e.g. 'group_1', 'group_2', ...)
FEATURE_GROUP = 'group_7'  # Change this to select a different group

# Keep only the columns of the chosen group (features + target), then discard
# every row containing a NaN; .copy() makes the result an independent frame.
_group_frame = get_feature_set(df, FEATURE_GROUP)
df_clean = _group_frame.dropna().copy()
logging.info(f"Dataset size after dropping NaNs: {df_clean.shape}")

# The target must have survived the column selection
if TARGET_COLUMN not in df_clean.columns:
    raise KeyError(f"Target column '{TARGET_COLUMN}' not found in the cleaned DataFrame columns: {df_clean.columns.tolist()}")

# Class balance after cleaning
_n_stars = (df_clean[TARGET_COLUMN] == 1).sum()
_n_galaxies = (df_clean[TARGET_COLUMN] == 0).sum()
logging.info(f"Value counts for target:\n1 (Star): {_n_stars}\n0 (Galaxy): {_n_galaxies}")

# Labels (y) and feature matrix (X) of the cleaned frame
y = df_clean[TARGET_COLUMN]
X = df_clean.drop(columns=[TARGET_COLUMN])

Selecting feature set group 'group_7' with 95 columns.


In [9]:
# --- Data Splitting ---
import numpy as np # Ensure numpy is imported
from sklearn.model_selection import train_test_split # Ensure train_test_split is imported

logging.info(f"Splitting data using '{SPLIT_STRATEGY}' strategy...")

# --- Validate Proportions ---
if not (0 <= TEST_SIZE <= 1 and 0 <= VAL_SIZE <= 1 and 0 <= CAL_SIZE <= 1):
     raise ValueError("Split proportions (TEST_SIZE, VAL_SIZE, CAL_SIZE) must be between 0 and 1.")

TRAIN_SIZE = 1.0 - TEST_SIZE - VAL_SIZE - CAL_SIZE
# Floating-point subtraction can leave a tiny residue: 1.0 - 0.3 - 0.6 - 0.1
# evaluates to about -2.8e-17, which would fail the range check below even
# though the proportions are valid. Snap anything numerically equal to zero
# to exactly 0.0 first (the later split logic already treats such values as
# "no training set" via np.isclose).
if np.isclose(TRAIN_SIZE, 0):
    TRAIN_SIZE = 0.0

if not (0 <= TRAIN_SIZE <= 1):
     raise ValueError(f"Calculated TRAIN_SIZE ({TRAIN_SIZE:.3f}) is invalid. Sum of TEST_SIZE, VAL_SIZE, and CAL_SIZE must be between 0 and 1.")

# TRAIN_SIZE is derived from the other three proportions, so the four always
# sum to 1 and no separate sum check is needed.

if TRAIN_SIZE == 0:
     # An empty training set is allowed but almost never intended: warn, do not fail.
     # If TRAIN_SIZE must be > 0, replace the warning with the raise below.
     logging.warning("TRAIN_SIZE is zero or near zero. Ensure this is intended.")
     # raise ValueError("TRAIN_SIZE must be greater than 0 for typical model training.")


logging.info(f"Target split ratios: Train={TRAIN_SIZE:.2f}, Val={VAL_SIZE:.2f}, Test={TEST_SIZE:.2f}, Cal={CAL_SIZE:.2f}")

# --- Initialize Splits ---
# iloc[0:0] yields empty DataFrames/Series that keep the columns/dtype of X and y,
# so a split that ends up unused still has a well-defined (empty) value.
empty_X = X.iloc[0:0]
empty_y = y.iloc[0:0]
X_train, y_train = empty_X.copy(), empty_y.copy()
X_val, y_val = empty_X.copy(), empty_y.copy()
X_test, y_test = empty_X.copy(), empty_y.copy()
X_cal, y_cal = empty_X.copy(), empty_y.copy()

# Pool of not-yet-assigned rows for the sequential splitting
X_remaining, y_remaining = X.copy(), y.copy() # Use copies to avoid modifying original X, y

# --- Stratification Option ---
# Define stratify_func only once
def get_stratify_array(y_arr):
    """Return the labels to stratify on, or None when no stratification applies.

    ``y_arr`` is returned unchanged only if SPLIT_STRATEGY is 'stratified'
    and ``y_arr`` is not empty; in every other case the result is None.
    """
    if SPLIT_STRATEGY != 'stratified' or y_arr.empty:
        return None
    return y_arr

# The data is carved up in three sequential steps:
#   1) Train            vs. the rest (Val + Test + Cal)
#   2) Val              vs. the rest (Test + Cal)
#   3) Test             vs. Cal
# Each step works on the pool left over by the previous one, so the test_size
# passed to train_test_split is always expressed relative to the CURRENT pool,
# not to the full dataset. Every step uses the same RANDOM_SEED and the
# stratification choice made by get_stratify_array.

# --- First Split: Train vs. Remainder (Val + Test + Cal) ---
val_test_cal_size = VAL_SIZE + TEST_SIZE + CAL_SIZE

if np.isclose(val_test_cal_size, 0): # Only Train set needed
    X_train, y_train = X_remaining, y_remaining
    logging.info("All data assigned to Train set (Val, Test, Cal sizes are 0).")
    X_remaining, y_remaining = empty_X.copy(), empty_y.copy() # No remainder
elif np.isclose(TRAIN_SIZE, 0): # No Train set needed
    logging.info("Train set is empty (TRAIN_SIZE=0). Remainder passed to next splits.")
    # X_remaining, y_remaining already hold all data
else: # Split Train vs Remainder
    # The pool is still the full dataset here, so the absolute proportion can be used directly
    split_test_size = val_test_cal_size # Proportion of remainder relative to total (1.0)
    X_train, X_remaining, y_train, y_remaining = train_test_split(
        X_remaining, y_remaining,
        test_size=split_test_size,
        random_state=RANDOM_SEED,
        stratify=get_stratify_array(y_remaining)
    )
logging.info(f"Train set shape: {X_train.shape}")


# --- Second Split: Val vs. Remainder (Test + Cal) ---
# X_temp2 / y_temp2 receive whatever is left for the Test/Cal step.
if not X_remaining.empty:
    test_cal_size = TEST_SIZE + CAL_SIZE
    # Denominator for relative size calculation: size of the current remaining pool
    current_remaining_size_frac = VAL_SIZE + test_cal_size # = val_test_cal_size

    if np.isclose(VAL_SIZE, 0): # No Val set, pass remainder to next stage
        X_temp2, y_temp2 = X_remaining, y_remaining # Remainder is Test + Cal
        logging.info("Validation set is empty (VAL_SIZE=0).")
    elif np.isclose(test_cal_size, 0): # Only Val set left in remainder
        X_val, y_val = X_remaining, y_remaining
        X_temp2, y_temp2 = empty_X.copy(), empty_y.copy() # No data left for Test/Cal
        logging.info(f"Validation set shape: {X_val.shape}")
    else: # Split Val vs (Test + Cal)
        # Proportion of (Test + Cal) relative to (Val + Test + Cal)
        split_test_size = test_cal_size / current_remaining_size_frac
        X_val, X_temp2, y_val, y_temp2 = train_test_split(
            X_remaining, y_remaining,
            test_size=split_test_size,
            random_state=RANDOM_SEED,
            stratify=get_stratify_array(y_remaining)
        )
        logging.info(f"Validation set shape: {X_val.shape}")
else: # No data remaining after train split
    X_temp2, y_temp2 = empty_X.copy(), empty_y.copy()
    if not np.isclose(VAL_SIZE, 0): # Log only if Val set was expected
       logging.info("Validation set is empty (no data remaining after train split).")


# --- Third Split: Test vs. Cal ---
if not X_temp2.empty:
    # Denominator for relative size calculation: size of the current remaining pool
    current_remaining_size_frac = TEST_SIZE + CAL_SIZE # = test_cal_size

    if np.isclose(CAL_SIZE, 0): # No Cal set, remainder is Test
        X_test, y_test = X_temp2, y_temp2
        logging.info("Calibration set is empty (CAL_SIZE=0).")
    elif np.isclose(TEST_SIZE, 0): # Only Cal set left in remainder
        X_cal, y_cal = X_temp2, y_temp2
        logging.info("Test set is empty (TEST_SIZE=0).")
    else: # Split Test vs Cal
        # Proportion of Cal relative to (Test + Cal); Cal takes the "test" side of this split
        split_test_size = CAL_SIZE / current_remaining_size_frac
        X_test, X_cal, y_test, y_cal = train_test_split(
            X_temp2, y_temp2,
            test_size=split_test_size,
            random_state=RANDOM_SEED,
            stratify=get_stratify_array(y_temp2)
        )
        # Logging shapes done after the if/else block
else: # No data remaining for Test/Cal split
    if not (np.isclose(TEST_SIZE, 0) and np.isclose(CAL_SIZE, 0)): # Log only if Test or Cal were expected
        logging.info("Test and Calibration sets are empty (no data remaining for final split).")

# Log final shapes for Test and Cal (both stay empty frames if they were never assigned)
logging.info(f"Test set shape: {X_test.shape}")
logging.info(f"Calibration set shape: {X_cal.shape}")


# --- Verification and Final Logging ---
# The four splits together must account for every row of X.
total_len = sum(len(part) for part in (X_train, X_val, X_test, X_cal))
original_len = len(X)

if total_len == original_len:
    logging.info("Split lengths verification successful.")
else:
    # Realised share of each split, guarding against an empty dataset
    def _share(part):
        return len(part) / original_len if original_len > 0 else 0

    actual_train, actual_val, actual_test, actual_cal = (
        _share(part) for part in (X_train, X_val, X_test, X_cal)
    )
    logging.warning(f"Total split length ({total_len}) does not exactly match original length ({original_len}). "
                    f"This can happen with stratification or rounding. "
                    f"Target proportions: Train={TRAIN_SIZE:.3f}, Val={VAL_SIZE:.3f}, Test={TEST_SIZE:.3f}, Cal={CAL_SIZE:.3f}. "
                    f"Actual proportions: Train={actual_train:.3f}, Val={actual_val:.3f}, Test={actual_test:.3f}, Cal={actual_cal:.3f}")

logging.info("Data splitting complete.")

# Log distributions, handling empty sets
def log_distribution(name, y_set):
    """Log the class distribution of one data split.

    Args:
        name (str): Label of the split, used as a prefix in the log messages.
        y_set (pd.Series): Target values of the split.

    For an empty split a single "Set is empty" message is logged. Otherwise
    the relative frequencies and the absolute counts of each class are logged.
    Any exception raised while computing them is logged as an error, followed
    by an attempt to log the raw value counts.
    """
    if y_set.empty:
        logging.info(f"{name} target distribution: Set is empty.")
        return

    try:
        counts = y_set.value_counts()
        total = counts.sum()
        # Relative frequencies; fall back to the raw counts if the total is zero
        dist = counts / total if total > 0 else counts
        logging.info(f"{name} target distribution:\n{dist}")
        # Absolute counts as well, for clarity
        logging.info(f"{name} target counts:\n{counts}")
    except Exception as e:
        logging.error(f"Could not calculate distribution for {name}: {e}")
        # Last resort: try to log the raw counts on their own
        try:
            logging.info(f"{name} raw value counts:\n{y_set.value_counts()}")
        except Exception as e_raw:
            logging.error(f"Could not get raw value counts for {name}: {e_raw}")


# Report the class balance of every split, in the same order as before
for _split_name, _split_labels in (
    ("Train", y_train),
    ("Validation", y_val),
    ("Test", y_test),
    ("Calibration", y_cal),
):
    log_distribution(_split_name, _split_labels)

### Hyperparameter Optimization via Hyperband

In [None]:
from sklearn.model_selection import ParameterSampler, train_test_split
from sklearn.metrics import f1_score # Default scorer
from sklearn.base import clone



# --- Internal Helper ---
def _train_and_eval(model_class, params,
                    X_train, y_train, X_val, y_val,
                    resource, resource_type,
                    scoring_func, random_state, fit_params):
    """Internal helper function to train and evaluate a single configuration."""
    try:
        model = model_class(**params)

        if resource_type == 'data_fraction':
            # Create a stratified subset of the training data
            if resource < 1.0:
                 # Ensure resource is not too small to stratify
                min_samples_per_class = min(y_train.value_counts()) if isinstance(y_train, pd.Series) else 0
                # Estimate required samples, handle cases where min_samples_per_class is 0 or very small
                if min_samples_per_class > 0:
                    required_samples = max(2, math.ceil(resource * min_samples_per_class)) # Need >= 2 per class for stratification
                else:
                    required_samples = 2 # Default if counts are weird

                if required_samples > min_samples_per_class and min_samples_per_class > 0: # Avoid warning if min_samples is 0
                     logging.warning(f"Resource fraction {resource:.2f} too small for stratification with min class size {min_samples_per_class}. Using full data.")
                     X_subset, y_subset = X_train, y_train
                elif len(y_train) * resource < 2:
                     logging.warning(f"Resource fraction {resource:.2f} results in < 2 samples ({len(y_train)*resource:.1f}). Using full data.")
                     X_subset, y_subset = X_train, y_train
                else:
                    # Use train_test_split to get a stratified subset. We only need the 'train' part.
                    try:
                        X_subset, _, y_subset, _ = train_test_split(
                            X_train, y_train,
                            train_size=resource,
                            stratify=y_train,
                            random_state=random_state
                        )
                        if len(X_subset) < 2: # Safety check after split
                            logging.warning(f"Subset resulted in < 2 samples after split. Using full data.")
                            X_subset, y_subset = X_train, y_train
                    except ValueError as e:
                        # Handle cases where stratification might fail (e.g., too few samples in a class for the split)
                        logging.warning(f"Stratified split failed for resource {resource:.2f} (Error: {e}). Using full data.")
                        X_subset, y_subset = X_train, y_train

            else:
                 X_subset, y_subset = X_train, y_train # Use full data if resource is 1.0

            # Train on the subset
            start_fit = time.time()
            model.fit(X_subset, y_subset) # No fit_params here usually
            fit_duration = time.time() - start_fit

        elif resource_type == 'iterations':
            # Resource represents n_estimators or similar iteration parameter
            # Assumes the parameter name is 'n_estimators' - adjust if needed for other models
            params_iter = params.copy() # Avoid modifying original params dict
            iter_param_name = 'n_estimators' # Common case
            # Handle potential different names if necessary (e.g., 'max_iter' for some models)
            # if 'n_estimators' not in model.get_params(): iter_param_name = 'SOME_OTHER_NAME'
            params_iter[iter_param_name] = int(resource)
            model = model_class(**params_iter) # Re-instantiate with correct n_estimators

            # Prepare fit_params for early stopping if applicable
            current_fit_params = {}
            # Check if 'eval_set' is needed and provided in fit_params keys
            if 'eval_set' in fit_params:
                 # Ensure X_val, y_val are correctly formatted
                 # XGBoost/LightGBM usually want list of tuples [(X, y)]
                 # Make sure the passed X_val, y_val are the correct ones (scaled/unscaled)
                 current_fit_params['eval_set'] = [(X_val, y_val)] # Use the validation set passed to HPO
                 # Copy other relevant early stopping params
                 for key in ['early_stopping_rounds', 'verbose', 'callbacks']: # Add other potential keys like 'callbacks' for LightGBM
                     if key in fit_params:
                         current_fit_params[key] = fit_params[key]

            start_fit = time.time()
            # Use try-except as fit might fail
            try:
                 model.fit(X_train, y_train, **current_fit_params)
            except Exception as fit_error:
                 logging.error(f"Fit failed for config {params_iter} with resource {resource}: {fit_error}")
                 return -1.0 # Indicate failure
            fit_duration = time.time() - start_fit

        else:
            raise ValueError("Invalid resource_type. Choose 'data_fraction' or 'iterations'.")

        # Evaluate on the full validation set
        start_eval = time.time()
        # Use try-except as predict might fail (e.g., if fit failed silently or model is unusable)
        try:
             y_pred_val = model.predict(X_val)
             score = scoring_func(y_val, y_pred_val)
        except Exception as eval_error:
             logging.error(f"Predict/Score failed for config {params} with resource {resource}: {eval_error}")
             score = -1.0 # Indicate failure
        eval_duration = time.time() - start_eval

        logging.debug(f"Evaluated config: {params} | Resource: {resource:.2f} | Score: {score:.4f} | Fit: {fit_duration:.2f}s | Eval: {eval_duration:.2f}s")
        return score

    except Exception as e:
        logging.error(f"Error training/evaluating config {params} with resource {resource}: {e}", exc_info=False) # Set exc_info=True for traceback
        return -1.0 # Return a clearly bad score
    # --- End of _train_and_eval code ---


def hyperband_hpo(model_class, param_space,
                  X_train, y_train, X_val, y_val,
                  max_resource, eta=3, resource_type='iterations',
                  min_resource=1, # Min iterations or min data fraction
                  scoring_func=f1_score, # Function accepting (y_true, y_pred)
                  random_state=None,
                  fit_params=None): # For early stopping etc. passed to .fit()
    """
    Performs Hyperband Hyperparameter Optimization.

    Hyperband runs several "brackets" of successive halving. Each bracket samples
    a batch of configurations, evaluates them with a small resource, keeps the
    best 1/eta of them, multiplies the resource by eta, and repeats until the
    maximum resource is reached.

    Args:
        model_class: The model class (e.g., SVC, RandomForestClassifier).
        param_space (dict): Dictionary defining the hyperparameter search space
                           compatible with ParameterSampler.
        X_train, y_train: Training data and labels.
        X_val, y_val: Validation data and labels for evaluation.
        max_resource (float/int): Maximum resource allocation
                                 (e.g., max n_estimators or 1.0 for data fraction).
        eta (int): Reduction factor for successive halving (>= 2).
        resource_type (str): How resource is allocated:
                             'iterations' -> resource sets n_estimators (or similar).
                             'data_fraction' -> resource is fraction of training data used (stratified).
        min_resource (float/int): Minimum resource for the first iteration.
                                 Must be >= 1 for 'iterations', > 0 for 'data_fraction'.
        scoring_func (callable): Function to evaluate performance (e.g., f1_score).
                                Higher score is assumed better.
        random_state (int): Seed for reproducibility of parameter sampling and data subsetting.
        fit_params (dict, optional): Additional parameters passed to the model's .fit()
                                      method (e.g., for early stopping: {'eval_set':..., 'early_stopping_rounds':...}).

    Returns:
        tuple: (best_params, best_score)
               best_params (dict): The hyperparameters of the best performing configuration,
                                   or None if no configuration scored above -1.0 at max resource.
               best_score (float): The score achieved by the best configuration on the validation set
                                  using the maximum resource (-1.0 if none was found).
    """
    if fit_params is None:
        fit_params = {}

    if max_resource > min_resource and min_resource > 0:
        log_max_r = math.log(max_resource / min_resource, eta)
    else:
        log_max_r = 0
    # math.log can land just below an exact integer (math.log(1000, 10) is
    # 2.9999999999999996); a small epsilon keeps int() from dropping a bracket.
    s_max = int(log_max_r + 1e-9)
    B = (s_max + 1) * max_resource # Approximate total resource budget

    # Callables such as functools.partial have no __name__; fall back to repr.
    score_name = getattr(scoring_func, '__name__', repr(scoring_func))

    logging.info("--- Starting Hyperband HPO ---")
    logging.info(f"Model: {model_class.__name__}")
    logging.info(f"Resource Type: {resource_type}")
    logging.info(f"Resource Range: [{min_resource}, {max_resource}]")
    logging.info(f"Eta: {eta}")
    logging.info(f"Max Brackets (s_max): {s_max}")
    logging.info(f"Approx. Budget (B): {B:.2f}")
    logging.info(f"Scoring: {score_name}")

    # --- Fixed parameters that do not depend on the bracket: compute once ---
    is_boosting = model_class in [xgb.XGBClassifier, lgb.LGBMClassifier]
    uses_class_weight = model_class in [SVC, RandomForestClassifier, DecisionTreeClassifier]

    # Class-imbalance weight for the boosting models (negatives / positives).
    scale_pos_weight_val = None
    if is_boosting:
        neg_count = (y_train == 0).sum()
        pos_count = (y_train == 1).sum()
        if pos_count > 0:
            scale_pos_weight_val = neg_count / pos_count

    # Probe once whether the model takes a random_state argument; a model that
    # rejects the keyword simply does not get one injected.
    try:
        supports_random_state = hasattr(model_class(random_state=1), 'random_state')
    except TypeError:
        supports_random_state = False

    best_params = None
    best_score = -1.0
    total_configs_evaluated = 0
    outer_tqdm = tqdm(range(s_max, -1, -1), desc="Hyperband Brackets (s)")

    # Outer loop: Iterate through brackets (s values)
    for s in outer_tqdm:
        # Number of configs in this bracket. Integer division on (s_max + 1)
        # avoids the float round-trip through B / max_resource.
        n_configs = int(math.ceil(((s_max + 1) // (s + 1)) * eta**s))
        # Initial resource for this bracket, never below min_resource
        r_initial = max(max_resource * eta**(-s), min_resource)

        outer_tqdm.set_description(f"Bracket s={s} (n={n_configs}, r0={r_initial:.2f})")
        logging.info(f"\n>> Bracket s={s}: n_configs={n_configs}, r_initial={r_initial:.2f}")

        # Sample configurations for this bracket (a different seed per bracket)
        bracket_seed = random_state + s if random_state is not None else None
        param_list = list(ParameterSampler(param_space, n_iter=n_configs, random_state=bracket_seed))

        # --- Add common fixed parameters unless they were sampled ---
        for p in param_list:
            if 'random_state' not in p and supports_random_state:
                p['random_state'] = random_state
            if uses_class_weight and 'class_weight' not in p:
                p['class_weight'] = 'balanced'
            if is_boosting and 'scale_pos_weight' not in p and scale_pos_weight_val is not None:
                p['scale_pos_weight'] = scale_pos_weight_val
            if model_class is lgb.LGBMClassifier and 'objective' not in p:
                p['objective'] = 'binary'

        # Inner loop: Successive halving rounds
        inner_tqdm = tqdm(range(s + 1), desc=f"SH Round (s={s})", leave=False)
        for i in inner_tqdm:
            # Clamp so floating point/rounding never pushes past max_resource
            current_resource = min(r_initial * eta**i, max_resource)
            at_max_resource = abs(current_resource - max_resource) < 1e-6

            n_configs_in_round = len(param_list)
            inner_tqdm.set_description(f"SH Round i={i} (n={n_configs_in_round}, r={current_resource:.2f})")
            logging.info(f"  -- Round i={i}: Evaluating {n_configs_in_round} configs with resource={current_resource:.2f} --")

            round_scores = []
            eval_tqdm = tqdm(param_list, desc=f"Evaluating Configs (i={i})", leave=False)
            for params in eval_tqdm:
                score = _train_and_eval(model_class, params, X_train, y_train, X_val, y_val,
                                        current_resource, resource_type, scoring_func,
                                        random_state, fit_params)
                round_scores.append((score, params))
                total_configs_evaluated += 1 # One per train/eval call

            # Sort by score (descending, higher is better)
            round_scores.sort(key=lambda x: x[0], reverse=True)

            # Only scores obtained at max resource are comparable across brackets,
            # so the global best is updated only in such rounds.
            if at_max_resource and round_scores and round_scores[0][0] > best_score:
                best_score = round_scores[0][0]
                best_params = round_scores[0][1]
                logging.info(f"  ** New Best Found (Score: {best_score:.4f}) at max resource ** Params: {best_params}")
                # Label the progress bar with the metric actually being optimised
                outer_tqdm.set_postfix_str(f"Best {score_name}: {best_score:.4f}", refresh=True)

            # --- Halving Step ---
            n_keep = int(n_configs_in_round / eta)
            logging.info(f"  -- Round i={i}: Completed {len(round_scores)} evaluations. Keeping top {n_keep} configs. --")

            if n_keep < 1 or i == s: # Nothing left to halve, or last round of the bracket
                if at_max_resource and round_scores:
                    logging.info(f"  Bracket s={s} final best score: {round_scores[0][0]:.4f}")
                break # Exit inner loop

            # Prepare parameter list for the next round
            param_list = [params for score, params in round_scores[:n_keep]]
            if not param_list: # Safety break if list becomes empty unexpectedly
                logging.warning(f"  Param list empty after halving round i={i}. Stopping bracket.")
                break

    logging.info("\n--- Hyperband HPO Finished ---")
    logging.info(f"Total configurations evaluated (approx): {total_configs_evaluated}") # Includes failed evaluations
    if best_params is not None:
        logging.info(f"Best Overall Score ({score_name}): {best_score:.4f}")
        logging.info(f"Best Params: {best_params}")
    else:
        logging.warning("No best parameters found. Check logs for errors or increase resources/configs.")

    return best_params, best_score

### Platt Scaling

In [11]:
def train_platt_scaler(base_estimator_class, best_params, X_train, y_train,
                       score_method='decision_function',
                       n_splits=5, random_state=None):
    """
    Trains a base estimator and calibrates its outputs using Platt scaling
    with k-fold cross-validation to obtain out-of-fold scores.

    The base estimator is fitted once on the full training set (this is the
    model that is returned). Separate clones are fitted on each CV training
    fold and scored on the held-out fold; a logistic regression is then fitted
    on those out-of-fold scores.

    Args:
        base_estimator_class: The class of the base estimator (e.g., SVC, RandomForestClassifier).
        best_params (dict): Dictionary of best hyperparameters for the base estimator.
        X_train (pd.DataFrame or np.ndarray): Training features.
        y_train (pd.Series, np.ndarray or sequence): Training labels.
        score_method (str): Method to get scores from the base estimator during CV.
                            Options: 'decision_function', 'predict_proba',
                                     'raw_margin_xgb', 'raw_score_lgbm'.
        n_splits (int): Number of folds for cross-validation.
        random_state (int): Random state for reproducibility.

    Returns:
        tuple: (fitted_base_estimator, fitted_platt_scaler)
               Returns (None, None) if an error occurs (the error is logged).
    """
    logging.info(f"--- Starting Platt Scaling Training ({score_method}) ---")
    try:
        # Fail fast: reject an unknown score_method before any (expensive) fit.
        supported_methods = ('decision_function', 'predict_proba',
                             'raw_margin_xgb', 'raw_score_lgbm')
        if score_method not in supported_methods:
            raise ValueError(f"Unsupported score_method: {score_method}")

        # Labels as a numpy array so positional fold indices work for any input type.
        y_train_np = np.asarray(y_train)

        # 1. Train the final base model on the entire training set
        logging.info("Training final base model on full training data...")
        final_base_estimator = base_estimator_class(**best_params)
        final_base_estimator.fit(X_train, y_train_np)
        logging.info("Final base model trained.")

        # 2. Get out-of-fold scores using k-fold CV
        logging.info(f"Performing {n_splits}-fold CV to get out-of-fold scores ({score_method})...")
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

        # DataFrames need positional .iloc indexing; arrays are indexed directly.
        is_pandas_X = isinstance(X_train, pd.DataFrame)

        oof_scores = np.zeros(len(y_train_np), dtype=float)
        oof_true_labels = np.zeros(len(y_train_np), dtype=int)

        for fold, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train_np)):
            logging.info(f"Processing Fold {fold+1}/{n_splits}...")

            if is_pandas_X:
                X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
            else: # Assume numpy array
                X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]

            # clone() gives an unfitted estimator with the same hyperparameters,
            # so the held-out fold never influences the model that scores it.
            estimator_fold = clone(final_base_estimator)
            estimator_fold.fit(X_train_fold, y_train_np[train_idx])

            # Get scores based on the specified method
            if score_method == 'decision_function':
                if not hasattr(estimator_fold, 'decision_function'):
                    raise AttributeError(f"{base_estimator_class.__name__} does not have 'decision_function' method.")
                scores_fold = estimator_fold.decision_function(X_val_fold)
            elif score_method == 'predict_proba':
                if not hasattr(estimator_fold, 'predict_proba'):
                    raise AttributeError(f"{base_estimator_class.__name__} does not have 'predict_proba' method.")
                # Use probability of the positive class (class 1)
                scores_fold = estimator_fold.predict_proba(X_val_fold)[:, 1]
            elif score_method == 'raw_margin_xgb':
                # Assumes XGBoost model
                scores_fold = estimator_fold.predict(X_val_fold, output_margin=True)
            else: # 'raw_score_lgbm' (validated above); assumes LightGBM model
                scores_fold = estimator_fold.predict(X_val_fold, raw_score=True)

            # Store results at the positions of the held-out samples
            oof_scores[val_idx] = scores_fold
            oof_true_labels[val_idx] = y_train_np[val_idx]

        logging.info("Out-of-fold scores collected.")

        # Logistic Regression expects a 2-D feature matrix
        oof_scores_reshaped = oof_scores.reshape(-1, 1)

        # 3. Train the Logistic Regression scaler
        logging.info("Training Logistic Regression (Platt) scaler...")
        # Use high C to approximate original Platt scaling (low regularization)
        platt_scaler = LogisticRegression(C=1e10, solver='liblinear', random_state=random_state)
        platt_scaler.fit(oof_scores_reshaped, oof_true_labels)
        logging.info("Platt scaler trained.")

        logging.info(f"--- Platt Scaling Training ({score_method}) Complete ---")
        return final_base_estimator, platt_scaler

    except Exception as e:
        # Deliberate best-effort: callers check for (None, None) and skip later steps.
        logging.error(f"Error during Platt scaling ({score_method}): {e}", exc_info=True)
        return None, None

### Conformal Prediction

In [12]:
def calculate_ncm_scores(calibrated_probs, true_labels):
    """Calculates non-conformity scores (1 - probability of true class).

    Args:
        calibrated_probs (array-like): 2-D array of shape (n_samples, n_classes)
            holding one probability per class.
        true_labels (array-like): 1-D array of class indices, one per sample.
            Integral floats/bools (e.g. 0.0, 1.0) are accepted and cast to int.

    Returns:
        np.ndarray: 1-D array of length n_samples (also for a single sample).

    Raises:
        ValueError: If shapes are inconsistent, labels are not integral or are
            negative, or a label indexes past the last probability column.
    """
    probs = np.asarray(calibrated_probs, dtype=float)
    labels = np.asarray(true_labels)

    if probs.ndim != 2:
        raise ValueError("Probs must be a 2-D array of shape (n_samples, n_classes).")
    if labels.ndim != 1:
        raise ValueError("Labels must be a 1-D array of class indices.")
    if probs.shape[0] != labels.shape[0]:
        raise ValueError("Probs and labels must have the same number of samples.")
    if labels.size == 0:
        # Nothing to score; np.max/min below would fail on an empty array.
        return np.empty(0, dtype=float)

    # take_along_axis requires integer indices; accept integral floats/bools.
    if labels.dtype.kind not in 'iu':
        labels_as_int = labels.astype(int)
        if not np.array_equal(labels_as_int, labels):
            raise ValueError("Labels must be integer class indices.")
        labels = labels_as_int

    # Negative indices would silently wrap around to the last columns.
    if labels.min() < 0:
        raise ValueError("Labels must be non-negative class indices.")
    if probs.shape[1] < labels.max() + 1:
        raise ValueError("Probs array has fewer columns than needed for max label index.")

    # Pick, per row, the probability of that row's true class. Index column 0
    # explicitly (not .squeeze()) so one sample still yields shape (1,).
    true_class_probs = np.take_along_axis(probs, labels[:, np.newaxis], axis=1)[:, 0]

    return 1.0 - true_class_probs


def calibrate_conformal_threshold(ncm_scores, alpha):
    """Calculates the quantile threshold q for ICP.

    Args:
        ncm_scores (array-like): Non-conformity scores of the calibration set.
        alpha (float): Significance level; the target coverage is 1 - alpha.

    Returns:
        float: The calibration score at level ceil((n + 1) * (1 - alpha)) / n,
            taken with method='higher'. If that level exceeds 1 (calibration
            set too small for the requested alpha) the threshold is +inf, so
            every class passes a `score <= threshold` test.

    Raises:
        ValueError: If no calibration scores are given.
    """
    scores = np.asarray(ncm_scores, dtype=float).ravel()
    n = scores.size
    if n == 0:
        raise ValueError("Cannot calibrate a conformal threshold without calibration scores.")

    # Finite-sample corrected rank of the calibration score to use.
    rank = np.ceil((n + 1) * (1 - alpha))
    q_level = rank / n

    if rank > n:
        # np.quantile rejects levels above 1. No finite calibration score can
        # serve as the threshold here, so use +inf rather than clipping to the max.
        logging.warning(
            f"Calibration set too small for alpha={alpha} (n={n}): "
            f"quantile level {q_level:.4f} > 1, using an infinite threshold."
        )
        q_threshold = float('inf')
    else:
        q_threshold = np.quantile(scores, q_level, method='higher') # Use 'higher' to ensure coverage

    logging.info(f"Calibrated CP: n={n}, alpha={alpha}, q_level={q_level:.4f}, q_threshold={q_threshold:.6f}")
    return q_threshold

def predict_conformal_sets(calibrated_probs, q_threshold):
    """Generates prediction sets based on calibrated probabilities and threshold.

    Args:
        calibrated_probs (array-like): 2-D array of shape (n_samples, n_classes)
            with one probability per class.
        q_threshold (float): Non-conformity threshold; class c is included for a
            sample when 1 - p(c) <= q_threshold.

    Returns:
        list[set[int]]: One set of class indices per sample. A sample for which
            no class passes the threshold falls back to its most likely class
            (a warning is logged), so no set is ever empty.

    Raises:
        ValueError: If a non-empty `calibrated_probs` is not 2-D.
    """
    probs = np.asarray(calibrated_probs, dtype=float)
    if probs.size == 0:
        return []
    if probs.ndim != 2:
        raise ValueError("Probs must be a 2-D array of shape (n_samples, n_classes).")

    ncm_scores_per_class = 1.0 - probs # NCM score for *each* potential class

    prediction_sets = []
    for scores in ncm_scores_per_class:
        # Plain Python ints keep the sets uniform regardless of which path built them.
        set_for_sample = {int(idx) for idx in np.flatnonzero(scores <= q_threshold)}
        if not set_for_sample:
            # Fallback: lowest NCM score == highest probability.
            set_for_sample = {int(np.argmin(scores))}
            logging.warning(f"Empty prediction set generated, falling back to most likely class: {sorted(set_for_sample)}")
        prediction_sets.append(set_for_sample)
    return prediction_sets

def evaluate_conformal_prediction(y_true, prediction_sets, alpha, model_name="Model"):
    """Evaluates the performance of conformal prediction sets.

    Args:
        y_true (pd.Series, np.ndarray or sequence): True class labels.
        prediction_sets (sequence of sets): One prediction set per sample, in
            the same order as `y_true`.
        alpha (float): Significance level used for the sets (logged only; the
            target coverage reported is 1 - alpha).
        model_name (str): Name used as a prefix in log messages.

    Returns:
        tuple: (empirical_coverage, average_set_size), or (None, None) when
            `y_true` is empty.

    Raises:
        ValueError: If `y_true` and `prediction_sets` differ in length.
    """
    # Positional numpy view: works for Series (any index), arrays and lists.
    y_true_arr = np.asarray(y_true).ravel()

    n_samples = len(y_true_arr)
    if n_samples == 0:
        logging.warning(f"[{model_name} CP Eval] Empty y_true provided.")
        return None, None
    if len(prediction_sets) != n_samples:
        raise ValueError(
            f"y_true has {n_samples} samples but {len(prediction_sets)} prediction sets were given."
        )

    # Empirical Coverage: fraction of samples whose true label is in its set
    coverage_count = sum(
        label in pred_set
        for label, pred_set in zip(y_true_arr.tolist(), prediction_sets)
    )
    empirical_coverage = coverage_count / n_samples

    # Average Set Size
    average_set_size = np.mean([len(s) for s in prediction_sets])

    logging.info(f"--- {model_name} Conformal Prediction Evaluation (alpha={alpha}) ---")
    logging.info(f"Target Coverage: {1 - alpha:.2f}")
    logging.info(f"Empirical Coverage: {empirical_coverage:.4f} ({coverage_count}/{n_samples})")
    logging.info(f"Average Prediction Set Size: {average_set_size:.4f}")

    return empirical_coverage, average_set_size

### Metrics

In [13]:
# --- Define Comprehensive Metrics ---

def calculate_metrics(y_true, y_pred, y_proba, model_name="Model"):
    """
    Calculates a comprehensive set of classification metrics.

    Besides returning the metrics, the function logs them and saves a
    confusion-matrix heatmap as a PNG inside MODEL_DIR.

    Args:
        y_true (array-like): Ground truth labels (0 = Galaxy, 1 = Star).
        y_pred (array-like): Predicted labels.
        y_proba (array-like): Predicted probabilities for the positive class (class 1).
        model_name (str): Name of the model for logging and for the plot file name.

    Returns:
        dict: A dictionary containing calculated metrics.
              Returns None if input arrays are empty or of mismatched length.
    """
    if len(y_true) == 0 or len(y_pred) == 0 or len(y_proba) == 0:
        logging.error(f"[{model_name}] Empty input arrays provided for metric calculation.")
        return None
    if len(y_true) != len(y_pred) or len(y_true) != len(y_proba):
        logging.error(f"[{model_name}] Mismatched lengths in input arrays for metric calculation.")
        return None

    metrics = {}

    # --- Threshold-based Metrics (using y_pred) ---
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary', zero_division=0)
    # labels=[0, 1] forces a 2x2 matrix even when only one class is present;
    # otherwise ravel() yields a single value and the unpacking fails.
    tn, fp, fn, tp = (int(v) for v in confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel())

    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['precision'] = precision
    metrics['recall_tpr'] = recall # True Positive Rate (Sensitivity)
    metrics['f1_score'] = f1

    # Specificity (True Negative Rate)
    metrics['specificity_tnr'] = tn / (tn + fp) if (tn + fp) > 0 else 0.0

    # Geometric Mean of sensitivity and specificity
    metrics['g_mean'] = np.sqrt(metrics['recall_tpr'] * metrics['specificity_tnr'])

    # Confusion Matrix (plain Python ints)
    metrics['confusion_matrix'] = {'tn': tn, 'fp': fp, 'fn': fn, 'tp': tp}

    # --- Ranking/Probabilistic Metrics (using y_proba) ---
    try:
        metrics['roc_auc'] = roc_auc_score(y_true, y_proba)
    except ValueError as e:
        logging.warning(f"[{model_name}] Could not calculate ROC AUC: {e}. Setting to 0.0.")
        metrics['roc_auc'] = 0.0 # Handle cases with only one class present

    # PR AUC
    pr_curve_precision, pr_curve_recall, _ = precision_recall_curve(y_true, y_proba)
    metrics['pr_auc'] = auc(pr_curve_recall, pr_curve_precision) # Note order: recall is x, precision is y

    # Brier Score
    metrics['brier_score'] = brier_score_loss(y_true, y_proba)

    logging.info(f"--- {model_name} Metrics ---")
    logging.info(f"Accuracy: {metrics['accuracy']:.4f}")
    logging.info(f"Precision: {metrics['precision']:.4f}")
    logging.info(f"Recall (TPR): {metrics['recall_tpr']:.4f}")
    logging.info(f"Specificity (TNR): {metrics['specificity_tnr']:.4f}")
    logging.info(f"F1-Score: {metrics['f1_score']:.4f}")
    logging.info(f"G-Mean: {metrics['g_mean']:.4f}")
    logging.info(f"ROC AUC: {metrics['roc_auc']:.4f}")
    logging.info(f"PR AUC: {metrics['pr_auc']:.4f}")
    logging.info(f"Brier Score: {metrics['brier_score']:.4f}")
    logging.info(f"Confusion Matrix (TN, FP, FN, TP): ({tn}, {fp}, {fn}, {tp})")

    # Plot the confusion matrix on an explicit figure and close it afterwards
    # so repeated calls do not accumulate open figures.
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap([[tn, fp], [fn, tp]], annot=True, fmt='d', cmap='Blues',
                xticklabels=['Predicted Galaxy (0)', 'Predicted Star (1)'],
                yticklabels=['Actual Galaxy (0)', 'Actual Star (1)'],
                ax=ax)
    ax.set_title(f'{model_name} Confusion Matrix')
    ax.set_ylabel('Actual Label')
    ax.set_xlabel('Predicted Label')
    cm_filename = os.path.join(MODEL_DIR, f"{model_name}_confusion_matrix_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png")
    fig.savefig(cm_filename)
    plt.close(fig)
    logging.info(f"Confusion matrix plot saved to {cm_filename}")

    return metrics

### Feature Scaling

In [10]:
# --- Feature Scaling ---
# Important for SVM, not used for the other models.
# Fit scaler ONLY on training data, then transform all sets.

# Start from None on every run so a scaler left over from an earlier execution
# of this cell is never reused or saved by accident.
scaler = None

if len(X_train) > 0 and TRAIN_SIZE > 0:
    logging.info("Applying StandardScaler to features...")
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
else:
    logging.info("Empty training set, NOT able to apply StandardScaler!")
    logging.warning("No scaler was fitted; validation/test/calibration sets are left unscaled.")
    X_train_scaled = X_train

# The remaining splits are only transformed when a scaler was actually fitted;
# without one they fall back to the unscaled data instead of raising NameError.
if scaler is not None and len(X_val) > 0 and VAL_SIZE > 0:
    X_val_scaled = scaler.transform(X_val)
else:
    X_val_scaled = X_val

if scaler is not None and len(X_test) > 0 and TEST_SIZE > 0:
    X_test_scaled = scaler.transform(X_test)
else:
    X_test_scaled = X_test

if scaler is not None and len(X_cal) > 0 and CAL_SIZE > 0:
    X_cal_scaled = scaler.transform(X_cal)
else:
    X_cal_scaled = X_cal

# Save the scaler if it was fitted
if scaler is not None:
    scaler_filename = os.path.join(MODEL_DIR, f"scaler_{datetime.now().strftime('%Y%m%d_%H%M%S')}.joblib")
    joblib.dump(scaler, scaler_filename)
    logging.info(f"Scaler saved to {scaler_filename}")

# Use scaled data for models sensitive to scale (like SVM)
logging.info("Feature scaling complete.")

## 3. Models

In [None]:
# Results registry: each model's workflow cell stores its entry under the model name.
all_results = {} # Dictionary to store metrics for each model

# Significance level passed to the conformal-prediction helpers
# (the target coverage they report is 1 - ALPHA).
ALPHA = 0.1

### 3.1 SVM

In [None]:
# HPO Settings for SVM (using data fraction)
# NOTE: MAX_RESOURCE_SVM, MIN_RESOURCE_SVM and ETA_SVM are reassigned by the
# quick-test cell that follows; the values below apply only if that cell is skipped.
MAX_RESOURCE_SVM = 1.0  # Max data fraction
MIN_RESOURCE_SVM = 0.1  # Min data fraction (adjust based on minority class size)
ETA_SVM = 3  # Passed to hyperband_hpo as eta (reduction factor for successive halving)
RESOURCE_TYPE_SVM = 'data_fraction'  # Passed to hyperband_hpo as resource_type
model_name_svm = "SVM"  # Label used in log messages and as the key in all_results

In [None]:
# MODIFIED FOR A QUICK TEST RUN! These values override the SVM HPO settings
# assigned in the previous cell.
MAX_RESOURCE_SVM = 1.0  # Kept at 1.0 so that all the data is used at the end
MIN_RESOURCE_SVM = 0.5  # Raised to reduce s_max (fewer brackets/configs)
ETA_SVM = 4             # Raised to discard configurations faster

In [None]:
# SVM workflow, run top to bottom in this cell:
#   1.1 define the search space      1.2 Hyperband HPO on the scaled train/val sets
#   1.3 final fit + Platt scaler     1.4 conformal threshold on the calibration set
#   1.5 metrics + conformal evaluation on the test set, stored in all_results
# Each later step is skipped (with a log message) when an earlier one produced nothing.

# Both names are also imported in the setup cell at the top of the notebook.
from sklearn.svm import SVC
from scipy.stats import loguniform

logging.info(f"\n\n===== Starting Workflow for {model_name_svm} =====")
timestamp_svm = datetime.now().strftime("%Y%m%d_%H%M%S")  # Not used again within this cell
hpo_start_time_svm = time.time()

# --- 1.1 SVM: Define Search Space and HPO Params ---
param_space_svm = {
    'C': loguniform(1e-2, 1e3),
    'gamma': loguniform(1e-4, 1e1),
    'kernel': ['rbf'], # Example: Fixed RBF kernel
    # 'kernel': ['rbf', 'linear'], # Example: If you want to search kernels
    # class_weight is added automatically inside hyperband_hpo
    # random_state is added automatically inside hyperband_hpo
}



# --- 1.2 SVM: Run Hyperband HPO ---
logging.info(f"--- [{model_name_svm}] Running Hyperband HPO ---")
best_params_svm, best_score_hpo_svm = hyperband_hpo(
    model_class=SVC,
    param_space=param_space_svm,
    X_train=X_train_scaled, # USE SCALED DATA
    y_train=y_train,
    X_val=X_val_scaled,     # USE SCALED DATA
    y_val=y_val,
    max_resource=MAX_RESOURCE_SVM,
    eta=ETA_SVM,
    resource_type=RESOURCE_TYPE_SVM,
    min_resource=MIN_RESOURCE_SVM,
    scoring_func=f1_score,
    random_state=RANDOM_SEED,
    fit_params={} # No specific fit_params for SVC
)
hpo_duration_svm = time.time() - hpo_start_time_svm
logging.info(f"--- [{model_name_svm}] HPO finished in {hpo_duration_svm:.2f} seconds ---")

# --- 1.3 SVM: Train Final Model & Platt Scaler (using Full Training Set) ---
# Both stay None unless training succeeds; the later steps test them to decide whether to run.
fitted_svm_base = None
platt_scaler_svm = None
if best_params_svm:
    logging.info(f"--- [{model_name_svm}] Training final model and Platt scaler ---")
    platt_start_time_svm = time.time()
    # Ensure necessary fixed parameters are present for the final fit
    # (this edits the dict returned by hyperband_hpo in place)
    best_params_svm['random_state'] = RANDOM_SEED
    if 'class_weight' not in best_params_svm: best_params_svm['class_weight'] = 'balanced'
    if 'probability' in best_params_svm: del best_params_svm['probability'] # Use decision_function

    # score_method is left at its default ('decision_function')
    fitted_svm_base, platt_scaler_svm = train_platt_scaler(
        base_estimator_class=SVC, # Pass the class
        best_params=best_params_svm,
        X_train=X_train_scaled, # Use scaled training data
        y_train=y_train,        # Use original y_train for CV indexing
        n_splits=5,             # Folds for Platt CV
        random_state=RANDOM_SEED
    )
    platt_duration_svm = time.time() - platt_start_time_svm
    if fitted_svm_base and platt_scaler_svm:
        logging.info(f"--- [{model_name_svm}] Platt scaling finished in {platt_duration_svm:.2f} seconds ---")
        # Optional: Save models
        # joblib.dump(...)
    else:
        logging.error(f"[{model_name_svm}] Failed to train base model or Platt scaler.")
else:
    logging.warning(f"[{model_name_svm}] HPO did not find best parameters. Skipping subsequent steps.")

# --- 1.4 SVM: Calibrate ICP Threshold (using Calibration Set) ---
q_threshold_svm = None
if fitted_svm_base and platt_scaler_svm:
    logging.info(f"--- [{model_name_svm}] Calibrating Conformal Prediction (alpha={ALPHA}) ---")
    icp_cal_start_time_svm = time.time()
    # Raw SVM margins -> Platt-calibrated class probabilities -> non-conformity scores
    decision_scores_cal_svm = fitted_svm_base.decision_function(X_cal_scaled) # Use scaled cal data
    calibrated_probs_cal_svm = platt_scaler_svm.predict_proba(decision_scores_cal_svm.reshape(-1, 1))
    ncm_scores_cal_svm = calculate_ncm_scores(calibrated_probs_cal_svm, y_cal.values) # Use .values
    q_threshold_svm = calibrate_conformal_threshold(ncm_scores_cal_svm, ALPHA)
    icp_cal_duration_svm = time.time() - icp_cal_start_time_svm
    logging.info(f"--- [{model_name_svm}] ICP calibration finished in {icp_cal_duration_svm:.2f} seconds. Threshold={q_threshold_svm:.6f} ---")
    # Optional: Save threshold
    # joblib.dump(...)
else:
    logging.warning(f"[{model_name_svm}] Skipping ICP calibration.")


# --- 1.5 SVM: Final Evaluation (using Test Set) ---
if fitted_svm_base and platt_scaler_svm:
    logging.info(f"--- [{model_name_svm}] Final Evaluation on Test Set ---")
    eval_start_time_svm = time.time()
    decision_scores_test_svm = fitted_svm_base.decision_function(X_test_scaled) # Use scaled test data
    calibrated_probs_test_svm = platt_scaler_svm.predict_proba(decision_scores_test_svm.reshape(-1, 1))
    # Column 1 is the calibrated probability of class 1; predict class 1 at >= 0.5
    y_proba_test_svm = calibrated_probs_test_svm[:, 1]
    y_pred_test_svm = (y_proba_test_svm >= 0.5).astype(int)

    metrics_svm = calculate_metrics(y_test, y_pred_test_svm, y_proba_test_svm, model_name=model_name_svm)

    cp_coverage_svm, cp_avg_set_size_svm = None, None
    if q_threshold_svm is not None:
        prediction_sets_test_svm = predict_conformal_sets(calibrated_probs_test_svm, q_threshold_svm)
        cp_coverage_svm, cp_avg_set_size_svm = evaluate_conformal_prediction(
            y_test, prediction_sets_test_svm, ALPHA, model_name=model_name_svm
        )
    else:
         logging.warning(f"[{model_name_svm}] No CP threshold, skipping CP evaluation.")

    eval_duration_svm = time.time() - eval_start_time_svm
    logging.info(f"--- [{model_name_svm}] Evaluation finished in {eval_duration_svm:.2f} seconds ---")

    # Store results
    all_results[model_name_svm] = {
        'metrics': metrics_svm,
        'cp_coverage': cp_coverage_svm,
        'cp_avg_set_size': cp_avg_set_size_svm,
        'best_hpo_params': best_params_svm,
        'hpo_f1_score': best_score_hpo_svm,
        'hpo_duration_s': hpo_duration_svm,
        'q_threshold': q_threshold_svm
    }
else:
    logging.warning(f"[{model_name_svm}] Skipping final evaluation.")

logging.info(f"===== Finished Workflow for {model_name_svm} =====")

### 3.2 CART

In [None]:
# HPO Settings for CART (using data fraction)
# NOTE: MAX_RESOURCE_CART, MIN_RESOURCE_CART and ETA_CART are reassigned by the
# quick-test cell that follows; the values below apply only if that cell is skipped.
MAX_RESOURCE_CART = 1.0  # Max data fraction
MIN_RESOURCE_CART = 0.1 # Can start with smaller fraction for trees
ETA_CART = 3  # Passed to hyperband_hpo as eta (reduction factor for successive halving)
RESOURCE_TYPE_CART = 'data_fraction'  # Passed to hyperband_hpo as resource_type
model_name_cart = "CART"  # Label used in the CART workflow's log messages

In [None]:
# MODIFIED FOR A QUICK TEST RUN! These values override the CART HPO settings
# assigned in the previous cell.
MAX_RESOURCE_CART = 1.0
MIN_RESOURCE_CART = 0.5 # Raised to reduce s_max
ETA_CART = 4            # Raised to discard configurations faster

In [None]:
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint, uniform


logging.info(f"\n\n===== Starting Workflow for {model_name_cart} =====")
timestamp_cart = datetime.now().strftime("%Y%m%d_%H%M%S")
hpo_start_time_cart = time.time()

# --- CART step 1: hyperparameter search space ---
# Lists are categorical choices; randint(a, b) draws integers from [a, b).
param_space_cart = {
    'criterion': ['gini', 'entropy'],
    'max_depth': randint(3, 50),
    'min_samples_split': randint(2, 100),
    'min_samples_leaf': randint(1, 50),
    # class_weight added automatically
    # random_state added automatically
}

# --- CART step 2: Hyperband HPO (trees do not need scaled features) ---
logging.info(f"--- [{model_name_cart}] Running Hyperband HPO ---")
best_params_cart, best_score_hpo_cart = hyperband_hpo(
    model_class=DecisionTreeClassifier,
    param_space=param_space_cart,
    X_train=X_train,  # unscaled
    y_train=y_train,
    X_val=X_val,      # unscaled
    y_val=y_val,
    max_resource=MAX_RESOURCE_CART,
    eta=ETA_CART,
    resource_type=RESOURCE_TYPE_CART,
    min_resource=MIN_RESOURCE_CART,
    scoring_func=f1_score,
    random_state=RANDOM_SEED,
    fit_params={}
)
hpo_duration_cart = time.time() - hpo_start_time_cart
logging.info(f"--- [{model_name_cart}] HPO finished in {hpo_duration_cart:.2f} seconds ---")

# --- CART step 3: final model + Platt scaler ---
fitted_cart_base = None
platt_scaler_cart = None
if best_params_cart:
    logging.info(f"--- [{model_name_cart}] Training final model and Platt scaler ---")
    platt_start_time_cart = time.time()
    # Pin the seed and make sure class weighting is set for the final fit.
    best_params_cart['random_state'] = RANDOM_SEED
    best_params_cart.setdefault('class_weight', 'balanced')

    # The Platt scaler is fitted on the tree's predict_proba output.
    fitted_cart_base, platt_scaler_cart = train_platt_scaler(
        base_estimator_class=DecisionTreeClassifier,
        best_params=best_params_cart,
        X_train=X_train,  # unscaled
        y_train=y_train,
        score_method='predict_proba',
        n_splits=5,
        random_state=RANDOM_SEED
    )
    platt_duration_cart = time.time() - platt_start_time_cart
    # Test for None explicitly instead of relying on estimator truthiness.
    if fitted_cart_base is not None and platt_scaler_cart is not None:
        logging.info(f"--- [{model_name_cart}] Platt scaling finished in {platt_duration_cart:.2f} seconds ---")
        # Optional: Save models
        # joblib.dump(...)
    else:
        logging.error(f"[{model_name_cart}] Failed to train base model or Platt scaler.")
else:
    logging.warning(f"[{model_name_cart}] HPO did not find best parameters. Skipping subsequent steps.")

# --- CART step 4: calibrate the conformal threshold on the calibration set ---
q_threshold_cart = None
if fitted_cart_base is not None and platt_scaler_cart is not None:
    logging.info(f"--- [{model_name_cart}] Calibrating Conformal Prediction (alpha={ALPHA}) ---")
    icp_cal_start_time_cart = time.time()
    # Class-1 probability of the base tree, as a column vector for the scaler.
    base_probs_cal_cart = fitted_cart_base.predict_proba(X_cal)[:, 1].reshape(-1, 1)
    # Platt-calibrated probabilities for both classes.
    calibrated_probs_cal_cart = platt_scaler_cart.predict_proba(base_probs_cal_cart)
    ncm_scores_cal_cart = calculate_ncm_scores(calibrated_probs_cal_cart, y_cal.values)
    q_threshold_cart = calibrate_conformal_threshold(ncm_scores_cal_cart, ALPHA)
    icp_cal_duration_cart = time.time() - icp_cal_start_time_cart
    # The evaluation step allows for a missing threshold, so format it
    # defensively: applying ':.6f' to None would raise TypeError.
    threshold_text_cart = "None" if q_threshold_cart is None else f"{q_threshold_cart:.6f}"
    logging.info(f"--- [{model_name_cart}] ICP calibration finished in {icp_cal_duration_cart:.2f} seconds. Threshold={threshold_text_cart} ---")
    # Optional: Save threshold
    # joblib.dump(...)
else:
    logging.warning(f"[{model_name_cart}] Skipping ICP calibration.")


# --- CART step 5: final evaluation on the test set ---
if fitted_cart_base is not None and platt_scaler_cart is not None:
    logging.info(f"--- [{model_name_cart}] Final Evaluation on Test Set ---")
    eval_start_time_cart = time.time()
    base_probs_test_cart = fitted_cart_base.predict_proba(X_test)[:, 1].reshape(-1, 1)
    calibrated_probs_test_cart = platt_scaler_cart.predict_proba(base_probs_test_cart)
    y_proba_test_cart = calibrated_probs_test_cart[:, 1]       # calibrated P(class 1)
    y_pred_test_cart = (y_proba_test_cart >= 0.5).astype(int)  # hard label at 0.5

    metrics_cart = calculate_metrics(y_test, y_pred_test_cart, y_proba_test_cart, model_name=model_name_cart)

    cp_coverage_cart, cp_avg_set_size_cart = None, None
    if q_threshold_cart is not None:
        # Prediction sets are built from the calibrated probabilities.
        prediction_sets_test_cart = predict_conformal_sets(calibrated_probs_test_cart, q_threshold_cart)
        cp_coverage_cart, cp_avg_set_size_cart = evaluate_conformal_prediction(
            y_test, prediction_sets_test_cart, ALPHA, model_name=model_name_cart
        )
    else:
        logging.warning(f"[{model_name_cart}] No CP threshold, skipping CP evaluation.")

    eval_duration_cart = time.time() - eval_start_time_cart
    logging.info(f"--- [{model_name_cart}] Evaluation finished in {eval_duration_cart:.2f} seconds ---")

    # Store results for the comparison table.
    all_results[model_name_cart] = {
        'metrics': metrics_cart,
        'cp_coverage': cp_coverage_cart,
        'cp_avg_set_size': cp_avg_set_size_cart,
        'best_hpo_params': best_params_cart,
        'hpo_f1_score': best_score_hpo_cart,
        'hpo_duration_s': hpo_duration_cart,
        'q_threshold': q_threshold_cart
    }
else:
    logging.warning(f"[{model_name_cart}] Skipping final evaluation.")

logging.info(f"===== Finished Workflow for {model_name_cart} =====")

### 3.3 Random Forest

In [None]:
# Hyperband configuration for the random forest.
# The budget handed to each configuration is a number of trees.
model_name_rf = "Random_Forest"
RESOURCE_TYPE_RF = 'iterations'
ETA_RF = 3
MIN_RESOURCE_RF = 20   # smallest n_estimators
MAX_RESOURCE_RF = 300  # largest n_estimators

In [None]:
# Quick-test override for the random forest.
# These values replace the settings from the previous cell so the notebook
# runs quickly while debugging. To keep the full settings, define
# QUICK_TEST = False in an earlier cell; when the flag has not been defined
# the override stays active (the previous behaviour of this cell).
QUICK_TEST = globals().get("QUICK_TEST", True)
if QUICK_TEST:
    MAX_RESOURCE_RF = 20  # drastically reduced (was 300)
    MIN_RESOURCE_RF = 5   # low, but close to the maximum so there are few brackets
    ETA_RF = 4            # raised

In [None]:
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint


logging.info(f"\n\n===== Starting Workflow for {model_name_rf} =====")
timestamp_rf = datetime.now().strftime("%Y%m%d_%H%M%S")
hpo_start_time_rf = time.time()

# --- RF step 1: hyperparameter search space ---
# Lists are categorical choices; randint(a, b) draws integers from [a, b).
param_space_rf = {
    # n_estimators is controlled by resource_type='iterations'
    'max_depth': randint(5, 50),
    'min_samples_split': randint(2, 50),
    'min_samples_leaf': randint(1, 25),
    'max_features': ['sqrt', 'log2', None],  # None means max_features=n_features
    'criterion': ['gini', 'entropy'],
    # class_weight added automatically
    # random_state added automatically
}

# --- RF step 2: Hyperband HPO (forests do not need scaled features) ---
logging.info(f"--- [{model_name_rf}] Running Hyperband HPO ---")
best_params_rf, best_score_hpo_rf = hyperband_hpo(
    model_class=RandomForestClassifier,
    param_space=param_space_rf,
    X_train=X_train,  # unscaled
    y_train=y_train,
    X_val=X_val,      # unscaled
    y_val=y_val,
    max_resource=MAX_RESOURCE_RF,
    eta=ETA_RF,
    resource_type=RESOURCE_TYPE_RF,
    min_resource=MIN_RESOURCE_RF,
    scoring_func=f1_score,
    random_state=RANDOM_SEED,
    fit_params={}  # no special fit parameters needed
)
hpo_duration_rf = time.time() - hpo_start_time_rf
logging.info(f"--- [{model_name_rf}] HPO finished in {hpo_duration_rf:.2f} seconds ---")


# --- RF step 3: final model + Platt scaler ---
fitted_rf_base = None
platt_scaler_rf = None
if best_params_rf:
    logging.info(f"--- [{model_name_rf}] Training final model and Platt scaler ---")
    platt_start_time_rf = time.time()
    # Pin the seed, make sure class weighting is set, and use all cores.
    best_params_rf['random_state'] = RANDOM_SEED
    best_params_rf.setdefault('class_weight', 'balanced')
    best_params_rf['n_jobs'] = -1

    # The Platt scaler is fitted on the forest's predict_proba output.
    fitted_rf_base, platt_scaler_rf = train_platt_scaler(
        base_estimator_class=RandomForestClassifier,
        best_params=best_params_rf,
        X_train=X_train,  # unscaled
        y_train=y_train,
        score_method='predict_proba',
        n_splits=5,
        random_state=RANDOM_SEED
    )
    platt_duration_rf = time.time() - platt_start_time_rf
    # Compare against None explicitly: the truth value of a scikit-learn
    # ensemble is derived from len(estimators_), not from "was it returned".
    if fitted_rf_base is not None and platt_scaler_rf is not None:
        logging.info(f"--- [{model_name_rf}] Platt scaling finished in {platt_duration_rf:.2f} seconds ---")
        # Optional: Save models
        # joblib.dump(...)
    else:
        logging.error(f"[{model_name_rf}] Failed to train base model or Platt scaler.")
else:
    logging.warning(f"[{model_name_rf}] HPO did not find best parameters. Skipping subsequent steps.")

# --- RF step 4: calibrate the conformal threshold on the calibration set ---
q_threshold_rf = None
if fitted_rf_base is not None and platt_scaler_rf is not None:
    logging.info(f"--- [{model_name_rf}] Calibrating Conformal Prediction (alpha={ALPHA}) ---")
    icp_cal_start_time_rf = time.time()
    # Class-1 probability of the base forest, as a column vector for the scaler.
    base_probs_cal_rf = fitted_rf_base.predict_proba(X_cal)[:, 1].reshape(-1, 1)
    # Platt-calibrated probabilities for both classes.
    calibrated_probs_cal_rf = platt_scaler_rf.predict_proba(base_probs_cal_rf)
    ncm_scores_cal_rf = calculate_ncm_scores(calibrated_probs_cal_rf, y_cal.values)
    q_threshold_rf = calibrate_conformal_threshold(ncm_scores_cal_rf, ALPHA)
    icp_cal_duration_rf = time.time() - icp_cal_start_time_rf
    # The evaluation step allows for a missing threshold, so format it
    # defensively: applying ':.6f' to None would raise TypeError.
    threshold_text_rf = "None" if q_threshold_rf is None else f"{q_threshold_rf:.6f}"
    logging.info(f"--- [{model_name_rf}] ICP calibration finished in {icp_cal_duration_rf:.2f} seconds. Threshold={threshold_text_rf} ---")
else:
    logging.warning(f"[{model_name_rf}] Skipping ICP calibration.")

# --- RF step 5: final evaluation on the test set ---
if fitted_rf_base is not None and platt_scaler_rf is not None:
    logging.info(f"--- [{model_name_rf}] Final Evaluation on Test Set ---")
    eval_start_time_rf = time.time()
    base_probs_test_rf = fitted_rf_base.predict_proba(X_test)[:, 1].reshape(-1, 1)
    calibrated_probs_test_rf = platt_scaler_rf.predict_proba(base_probs_test_rf)
    y_proba_test_rf = calibrated_probs_test_rf[:, 1]       # calibrated P(class 1)
    y_pred_test_rf = (y_proba_test_rf >= 0.5).astype(int)  # hard label at 0.5

    metrics_rf = calculate_metrics(y_test, y_pred_test_rf, y_proba_test_rf, model_name=model_name_rf)

    cp_coverage_rf, cp_avg_set_size_rf = None, None
    if q_threshold_rf is not None:
        # Prediction sets are built from the calibrated probabilities.
        prediction_sets_test_rf = predict_conformal_sets(calibrated_probs_test_rf, q_threshold_rf)
        cp_coverage_rf, cp_avg_set_size_rf = evaluate_conformal_prediction(
            y_test, prediction_sets_test_rf, ALPHA, model_name=model_name_rf
        )
    else:
        logging.warning(f"[{model_name_rf}] No CP threshold, skipping CP evaluation.")

    eval_duration_rf = time.time() - eval_start_time_rf
    logging.info(f"--- [{model_name_rf}] Evaluation finished in {eval_duration_rf:.2f} seconds ---")

    # Store results for the comparison table.
    all_results[model_name_rf] = {
        'metrics': metrics_rf,
        'cp_coverage': cp_coverage_rf,
        'cp_avg_set_size': cp_avg_set_size_rf,
        'best_hpo_params': best_params_rf,
        'hpo_f1_score': best_score_hpo_rf,
        'hpo_duration_s': hpo_duration_rf,
        'q_threshold': q_threshold_rf
    }
else:
    logging.warning(f"[{model_name_rf}] Skipping final evaluation.")

logging.info(f"===== Finished Workflow for {model_name_rf} =====")

### 3.4 XGBoost

In [None]:
# Hyperband configuration for XGBoost.
# The budget handed to each configuration is a number of boosting rounds.
model_name_xgb = "XGBoost"
RESOURCE_TYPE_XGB = 'iterations'
ETA_XGB = 3
MIN_RESOURCE_XGB = 30   # smallest n_estimators
MAX_RESOURCE_XGB = 500  # largest n_estimators

# Keyword arguments forwarded to fit() during HPO (passed as `fit_params`).
# 'eval_set' is not listed here: it will be set inside _train_and_eval using
# X_val, y_val.
fit_params_xgb_hpo = dict(
    early_stopping_rounds=5,  # the original note next to this value reads "15"
    verbose=False,            # keep fitting quiet during HPO
)

In [None]:
# Quick-test override for XGBoost.
# These values replace the settings from the previous cell so the notebook
# runs quickly while debugging. To keep the full settings, define
# QUICK_TEST = False in an earlier cell; when the flag has not been defined
# the override stays active (the previous behaviour of this cell).
QUICK_TEST = globals().get("QUICK_TEST", True)
if QUICK_TEST:
    MAX_RESOURCE_XGB = 30  # drastically reduced (was 500)
    MIN_RESOURCE_XGB = 10  # low, but close to the maximum
    ETA_XGB = 4            # raised

In [None]:
import xgboost as xgb
from scipy.stats import randint, uniform


logging.info(f"\n\n===== Starting Workflow for {model_name_xgb} =====")
timestamp_xgb = datetime.now().strftime("%Y%m%d_%H%M%S")
hpo_start_time_xgb = time.time()

# --- XGB step 1: hyperparameter search space ---
# Single-element lists are fixed values; uniform(loc, scale) spans
# [loc, loc + scale].
param_space_xgb = {
    # n_estimators controlled by resource
    'learning_rate': loguniform(0.01, 0.3),
    'max_depth': randint(3, 10),
    'subsample': uniform(0.6, 0.4),         # range [0.6, 1.0]
    'colsample_bytree': uniform(0.6, 0.4),  # range [0.6, 1.0]
    'gamma': loguniform(1e-2, 1.0),         # min loss reduction
    'reg_alpha': loguniform(1e-3, 1.0),     # L1 regularisation
    'reg_lambda': loguniform(1e-3, 1.0),    # L2 regularisation
    # scale_pos_weight added automatically
    # random_state added automatically
    'objective': ['binary:logistic'],  # fixed objective
    'eval_metric': ['logloss'],        # fixed eval metric for early stopping
    'use_label_encoder': [False]       # deprecated, set to False
}



# --- XGB step 2: Hyperband HPO on unscaled data ---
logging.info(f"--- [{model_name_xgb}] Running Hyperband HPO ---")
best_params_xgb, best_score_hpo_xgb = hyperband_hpo(
    model_class=xgb.XGBClassifier,
    param_space=param_space_xgb,
    X_train=X_train,  # unscaled
    y_train=y_train,
    X_val=X_val,      # unscaled
    y_val=y_val,
    max_resource=MAX_RESOURCE_XGB,
    eta=ETA_XGB,
    resource_type=RESOURCE_TYPE_XGB,
    min_resource=MIN_RESOURCE_XGB,
    scoring_func=f1_score,
    random_state=RANDOM_SEED,
    fit_params=fit_params_xgb_hpo
)
hpo_duration_xgb = time.time() - hpo_start_time_xgb
logging.info(f"--- [{model_name_xgb}] HPO finished in {hpo_duration_xgb:.2f} seconds ---")


# --- XGB step 3: pick the number of trees, then final model + Platt scaler ---
fitted_xgb_base = None
platt_scaler_xgb = None
final_best_params_xgb = None

if best_params_xgb:
    logging.info(f"--- [{model_name_xgb}] Determining best iteration and training Platt scaler ---")
    platt_start_time_xgb = time.time()

    # 3a. Work on a copy so the HPO result stays untouched, and fill in any
    #     fixed parameter the HPO result does not already carry.
    temp_best_params_xgb = best_params_xgb.copy()
    temp_best_params_xgb['random_state'] = RANDOM_SEED
    temp_best_params_xgb.setdefault('objective', 'binary:logistic')
    temp_best_params_xgb.setdefault('eval_metric', 'logloss')
    temp_best_params_xgb.setdefault('use_label_encoder', False)
    temp_best_params_xgb.setdefault('n_jobs', -1)
    if 'scale_pos_weight' not in temp_best_params_xgb:
        # Weight the positive class by the negative/positive ratio.
        neg_count = (y_train == 0).sum()
        pos_count = (y_train == 1).sum()
        if pos_count > 0:
            temp_best_params_xgb['scale_pos_weight'] = neg_count / pos_count

    logging.info("Training temporary XGBoost with early stopping to find best iteration...")
    temp_xgb_model = xgb.XGBClassifier(**temp_best_params_xgb)
    eval_set_final = [(X_val, y_val)]  # unscaled validation data
    # NOTE(review): passing early_stopping_rounds to fit() is deprecated since
    # XGBoost 1.6 in favour of the constructor argument — confirm against the
    # installed version.
    temp_xgb_model.fit(X_train, y_train,
                       early_stopping_rounds=20,
                       eval_set=eval_set_final,
                       verbose=False)
    # Some XGBoost versions raise AttributeError when no best iteration was recorded.
    best_iteration = getattr(temp_xgb_model, 'best_iteration', None)
    logging.info(f"Best iteration found: {best_iteration}")

    # 3b. best_iteration is a 0-based round index, so the matching number of
    #     trees is best_iteration + 1 (index 0 means one tree, not "unknown").
    final_best_params_xgb = temp_best_params_xgb.copy()
    if best_iteration is not None and best_iteration >= 0:
        final_best_params_xgb['n_estimators'] = int(best_iteration) + 1
    else:
        final_best_params_xgb['n_estimators'] = MAX_RESOURCE_XGB

    # 3c. Final model and Platt scaler; the scaler is fitted on raw margins.
    logging.info(f"--- [{model_name_xgb}] Training final model ({final_best_params_xgb['n_estimators']} est.) and Platt scaler ---")
    fitted_xgb_base, platt_scaler_xgb = train_platt_scaler(
        base_estimator_class=xgb.XGBClassifier,
        best_params=final_best_params_xgb,
        X_train=X_train,  # unscaled
        y_train=y_train,
        score_method='raw_margin_xgb',
        n_splits=5,
        random_state=RANDOM_SEED
    )
    platt_duration_xgb = time.time() - platt_start_time_xgb
    # Test for None explicitly instead of relying on estimator truthiness.
    if fitted_xgb_base is not None and platt_scaler_xgb is not None:
        logging.info(f"--- [{model_name_xgb}] Platt scaling finished in {platt_duration_xgb:.2f} seconds ---")
        # Optional: Save models
        # joblib.dump(...)
    else:
        logging.error(f"[{model_name_xgb}] Failed to train base model or Platt scaler.")
else:
    logging.warning(f"[{model_name_xgb}] HPO did not find best parameters. Skipping subsequent steps.")


# --- XGB step 4: calibrate the conformal threshold on the calibration set ---
q_threshold_xgb = None
if fitted_xgb_base is not None and platt_scaler_xgb is not None:
    logging.info(f"--- [{model_name_xgb}] Calibrating Conformal Prediction (alpha={ALPHA}) ---")
    icp_cal_start_time_xgb = time.time()
    # Raw margins of the base model, as a column vector for the scaler.
    base_raw_cal_xgb = fitted_xgb_base.predict(X_cal, output_margin=True).reshape(-1, 1)
    calibrated_probs_cal_xgb = platt_scaler_xgb.predict_proba(base_raw_cal_xgb)
    ncm_scores_cal_xgb = calculate_ncm_scores(calibrated_probs_cal_xgb, y_cal.values)
    q_threshold_xgb = calibrate_conformal_threshold(ncm_scores_cal_xgb, ALPHA)
    icp_cal_duration_xgb = time.time() - icp_cal_start_time_xgb
    # The evaluation step allows for a missing threshold, so format it
    # defensively: applying ':.6f' to None would raise TypeError.
    threshold_text_xgb = "None" if q_threshold_xgb is None else f"{q_threshold_xgb:.6f}"
    logging.info(f"--- [{model_name_xgb}] ICP calibration finished in {icp_cal_duration_xgb:.2f} seconds. Threshold={threshold_text_xgb} ---")
else:
    logging.warning(f"[{model_name_xgb}] Skipping ICP calibration.")


# --- XGB step 5: final evaluation on the test set ---
if fitted_xgb_base is not None and platt_scaler_xgb is not None:
    logging.info(f"--- [{model_name_xgb}] Final Evaluation on Test Set ---")
    eval_start_time_xgb = time.time()
    base_raw_test_xgb = fitted_xgb_base.predict(X_test, output_margin=True).reshape(-1, 1)
    calibrated_probs_test_xgb = platt_scaler_xgb.predict_proba(base_raw_test_xgb)
    y_proba_test_xgb = calibrated_probs_test_xgb[:, 1]       # calibrated P(class 1)
    y_pred_test_xgb = (y_proba_test_xgb >= 0.5).astype(int)  # hard label at 0.5

    metrics_xgb = calculate_metrics(y_test, y_pred_test_xgb, y_proba_test_xgb, model_name=model_name_xgb)

    cp_coverage_xgb, cp_avg_set_size_xgb = None, None
    if q_threshold_xgb is not None:
        # Prediction sets are built from the calibrated probabilities.
        prediction_sets_test_xgb = predict_conformal_sets(calibrated_probs_test_xgb, q_threshold_xgb)
        cp_coverage_xgb, cp_avg_set_size_xgb = evaluate_conformal_prediction(
            y_test, prediction_sets_test_xgb, ALPHA, model_name=model_name_xgb
        )
    else:
        logging.warning(f"[{model_name_xgb}] No CP threshold, skipping CP evaluation.")

    eval_duration_xgb = time.time() - eval_start_time_xgb
    logging.info(f"--- [{model_name_xgb}] Evaluation finished in {eval_duration_xgb:.2f} seconds ---")

    # Store results for the comparison table.
    all_results[model_name_xgb] = {
        'metrics': metrics_xgb,
        'cp_coverage': cp_coverage_xgb,
        'cp_avg_set_size': cp_avg_set_size_xgb,
        'best_hpo_params': best_params_xgb,  # original HPO params
        # Number of trees actually used by the final model, if available.
        'final_n_estimators': final_best_params_xgb.get('n_estimators', None) if final_best_params_xgb else None,
        'hpo_f1_score': best_score_hpo_xgb,
        'hpo_duration_s': hpo_duration_xgb,
        'q_threshold': q_threshold_xgb
    }
else:
    logging.warning(f"[{model_name_xgb}] Skipping final evaluation.")

logging.info(f"===== Finished Workflow for {model_name_xgb} =====")

### 3.5 LightGBM

In [None]:
# Hyperband configuration for LightGBM.
# The budget handed to each configuration is a number of boosting rounds.
model_name_lgbm = "LightGBM"
RESOURCE_TYPE_LGBM = 'iterations'
ETA_LGBM = 3
MIN_RESOURCE_LGBM = 30   # smallest n_estimators
MAX_RESOURCE_LGBM = 500  # largest n_estimators

In [None]:
# This cell builds an early-stopping callback, so it needs `early_stopping`
# in scope. The only visible import of that name is in the *next* cell;
# importing it here keeps the cell runnable top-to-bottom on a fresh kernel.
from lightgbm import early_stopping

# Quick-test override for LightGBM.
# These values replace the settings from the previous cell so the notebook
# runs quickly while debugging. To keep the full settings, define
# QUICK_TEST = False in an earlier cell; when the flag has not been defined
# the override stays active (the previous behaviour of this cell).
QUICK_TEST = globals().get("QUICK_TEST", True)
if QUICK_TEST:
    MAX_RESOURCE_LGBM = 30  # drastically reduced (was 500)
    MIN_RESOURCE_LGBM = 10  # low, but close to the maximum
    ETA_LGBM = 4            # raised
RESOURCE_TYPE_LGBM = 'iterations'

# Keyword arguments forwarded to fit() during HPO (passed as `fit_params`).
# Early stopping goes through a callback; 'eval_set' is not listed here: it
# will be set inside _train_and_eval.
fit_params_lgbm_hpo = {
    'callbacks': [early_stopping(stopping_rounds=5,  # the original note next to this value reads "15"
                                 verbose=False
                                 )]
}

In [None]:
import lightgbm as lgb
# Note: LightGBM early stopping might need callbacks depending on version
# We'll try passing via fit_params first, but might need adjustment
from lightgbm import early_stopping # For newer versions


logging.info(f"\n\n===== Starting Workflow for {model_name_lgbm} =====")
timestamp_lgbm = datetime.now().strftime("%Y%m%d_%H%M%S")
hpo_start_time_lgbm = time.time()

# --- LGBM step 1: hyperparameter search space ---
# Single-element lists are fixed values; uniform(loc, scale) spans
# [loc, loc + scale].
param_space_lgbm = {
    # n_estimators controlled by resource
    'learning_rate': loguniform(0.01, 0.3),
    'num_leaves': randint(20, 100),
    'max_depth': randint(3, 15),
    'subsample': uniform(0.6, 0.4),         # alias of bagging_fraction
    # LightGBM only applies bagging_fraction when bagging_freq is non-zero
    # (the wrapper default is 0), so without this the sampled `subsample`
    # values would have no effect.
    'subsample_freq': [1],
    'colsample_bytree': uniform(0.6, 0.4),  # alias of feature_fraction
    'reg_alpha': loguniform(1e-3, 1.0),     # L1 regularisation
    'reg_lambda': loguniform(1e-3, 1.0),    # L2 regularisation
    # scale_pos_weight or is_unbalance=True added automatically
    # random_state added automatically
    'objective': ['binary'],       # fixed objective
    # Documented LightGBM name of the binary log-loss metric, used for early
    # stopping ('logloss' is not among the documented names/aliases).
    'metric': ['binary_logloss'],
}


# --- LGBM step 2: Hyperband HPO on unscaled data ---
logging.info(f"--- [{model_name_lgbm}] Running Hyperband HPO ---")
best_params_lgbm, best_score_hpo_lgbm = hyperband_hpo(
    model_class=lgb.LGBMClassifier,
    param_space=param_space_lgbm,
    X_train=X_train,  # unscaled
    y_train=y_train,
    X_val=X_val,      # unscaled
    y_val=y_val,
    max_resource=MAX_RESOURCE_LGBM,
    eta=ETA_LGBM,
    resource_type=RESOURCE_TYPE_LGBM,
    min_resource=MIN_RESOURCE_LGBM,
    scoring_func=f1_score,
    random_state=RANDOM_SEED,
    fit_params=fit_params_lgbm_hpo
)
hpo_duration_lgbm = time.time() - hpo_start_time_lgbm
logging.info(f"--- [{model_name_lgbm}] HPO finished in {hpo_duration_lgbm:.2f} seconds ---")


# --- LGBM step 3: pick the number of trees, then final model + Platt scaler ---
fitted_lgbm_base = None
platt_scaler_lgbm = None
final_best_params_lgbm = None

if best_params_lgbm:
    logging.info(f"--- [{model_name_lgbm}] Determining best iteration and training Platt scaler ---")
    platt_start_time_lgbm = time.time()

    # 3a. Work on a copy so the HPO result stays untouched, and fill in any
    #     fixed parameter the HPO result does not already carry.
    temp_best_params_lgbm = best_params_lgbm.copy()
    temp_best_params_lgbm['random_state'] = RANDOM_SEED
    temp_best_params_lgbm.setdefault('objective', 'binary')
    temp_best_params_lgbm.setdefault('metric', 'binary_logloss')
    temp_best_params_lgbm.setdefault('n_jobs', -1)
    if 'scale_pos_weight' not in temp_best_params_lgbm:
        neg_count = (y_train == 0).sum()
        pos_count = (y_train == 1).sum()
        if pos_count > 0:
            # Prefer an explicit weight and drop is_unbalance so that only
            # one of the two imbalance options is passed.
            temp_best_params_lgbm['scale_pos_weight'] = neg_count / pos_count
            temp_best_params_lgbm.pop('is_unbalance', None)
        elif 'is_unbalance' not in temp_best_params_lgbm:
            temp_best_params_lgbm['is_unbalance'] = True

    logging.info("Training temporary LightGBM with early stopping to find best iteration...")
    temp_lgbm_model = lgb.LGBMClassifier(**temp_best_params_lgbm)
    eval_set_final_lgbm = [(X_val, y_val)]
    callbacks_final = [early_stopping(stopping_rounds=20, verbose=False)]

    temp_lgbm_model.fit(X_train, y_train,
                        eval_set=eval_set_final_lgbm,
                        callbacks=callbacks_final)

    best_iteration_lgbm = temp_lgbm_model.best_iteration_
    logging.info(f"Best iteration found: {best_iteration_lgbm}")

    # 3b. Use the best iteration as the tree count; fall back to the maximum
    #     budget when no usable value was recorded.
    final_best_params_lgbm = temp_best_params_lgbm.copy()
    if best_iteration_lgbm is not None and best_iteration_lgbm > 0:
        final_best_params_lgbm['n_estimators'] = best_iteration_lgbm
    else:
        final_best_params_lgbm['n_estimators'] = MAX_RESOURCE_LGBM

    # 3c. Final model and Platt scaler; the scaler is fitted on raw scores.
    logging.info(f"--- [{model_name_lgbm}] Training final model ({final_best_params_lgbm['n_estimators']} est.) and Platt scaler ---")
    fitted_lgbm_base, platt_scaler_lgbm = train_platt_scaler(
        base_estimator_class=lgb.LGBMClassifier,
        best_params=final_best_params_lgbm,
        X_train=X_train,  # unscaled
        y_train=y_train,
        score_method='raw_score_lgbm',
        n_splits=5,
        random_state=RANDOM_SEED
    )
    platt_duration_lgbm = time.time() - platt_start_time_lgbm
    # Test for None explicitly instead of relying on estimator truthiness.
    if fitted_lgbm_base is not None and platt_scaler_lgbm is not None:
        logging.info(f"--- [{model_name_lgbm}] Platt scaling finished in {platt_duration_lgbm:.2f} seconds ---")
        # Optional: Save models
        # joblib.dump(...)
    else:
        logging.error(f"[{model_name_lgbm}] Failed to train base model or Platt scaler.")
else:
    logging.warning(f"[{model_name_lgbm}] HPO did not find best parameters. Skipping subsequent steps.")


# --- LGBM step 4: calibrate the conformal threshold on the calibration set ---
q_threshold_lgbm = None
if fitted_lgbm_base is not None and platt_scaler_lgbm is not None:
    logging.info(f"--- [{model_name_lgbm}] Calibrating Conformal Prediction (alpha={ALPHA}) ---")
    icp_cal_start_time_lgbm = time.time()
    # Raw scores of the base model, as a column vector for the scaler.
    base_raw_cal_lgbm = fitted_lgbm_base.predict(X_cal, raw_score=True).reshape(-1, 1)
    calibrated_probs_cal_lgbm = platt_scaler_lgbm.predict_proba(base_raw_cal_lgbm)
    ncm_scores_cal_lgbm = calculate_ncm_scores(calibrated_probs_cal_lgbm, y_cal.values)
    q_threshold_lgbm = calibrate_conformal_threshold(ncm_scores_cal_lgbm, ALPHA)
    icp_cal_duration_lgbm = time.time() - icp_cal_start_time_lgbm
    # The evaluation step allows for a missing threshold, so format it
    # defensively: applying ':.6f' to None would raise TypeError.
    threshold_text_lgbm = "None" if q_threshold_lgbm is None else f"{q_threshold_lgbm:.6f}"
    logging.info(f"--- [{model_name_lgbm}] ICP calibration finished in {icp_cal_duration_lgbm:.2f} seconds. Threshold={threshold_text_lgbm} ---")
else:
    logging.warning(f"[{model_name_lgbm}] Skipping ICP calibration.")


# --- LGBM step 5: final evaluation on the test set ---
if fitted_lgbm_base is not None and platt_scaler_lgbm is not None:
    logging.info(f"--- [{model_name_lgbm}] Final Evaluation on Test Set ---")
    eval_start_time_lgbm = time.time()
    base_raw_test_lgbm = fitted_lgbm_base.predict(X_test, raw_score=True).reshape(-1, 1)
    calibrated_probs_test_lgbm = platt_scaler_lgbm.predict_proba(base_raw_test_lgbm)
    y_proba_test_lgbm = calibrated_probs_test_lgbm[:, 1]       # calibrated P(class 1)
    y_pred_test_lgbm = (y_proba_test_lgbm >= 0.5).astype(int)  # hard label at 0.5

    metrics_lgbm = calculate_metrics(y_test, y_pred_test_lgbm, y_proba_test_lgbm, model_name=model_name_lgbm)

    cp_coverage_lgbm, cp_avg_set_size_lgbm = None, None
    if q_threshold_lgbm is not None:
        # Prediction sets are built from the calibrated probabilities.
        prediction_sets_test_lgbm = predict_conformal_sets(calibrated_probs_test_lgbm, q_threshold_lgbm)
        cp_coverage_lgbm, cp_avg_set_size_lgbm = evaluate_conformal_prediction(
            y_test, prediction_sets_test_lgbm, ALPHA, model_name=model_name_lgbm
        )
    else:
        logging.warning(f"[{model_name_lgbm}] No CP threshold, skipping CP evaluation.")

    eval_duration_lgbm = time.time() - eval_start_time_lgbm
    logging.info(f"--- [{model_name_lgbm}] Evaluation finished in {eval_duration_lgbm:.2f} seconds ---")

    # Store results for the comparison table.
    all_results[model_name_lgbm] = {
        'metrics': metrics_lgbm,
        'cp_coverage': cp_coverage_lgbm,
        'cp_avg_set_size': cp_avg_set_size_lgbm,
        'best_hpo_params': best_params_lgbm,  # original HPO params
        # Number of trees actually used by the final model, if available.
        'final_n_estimators': final_best_params_lgbm.get('n_estimators', None) if final_best_params_lgbm else None,
        'hpo_f1_score': best_score_hpo_lgbm,
        'hpo_duration_s': hpo_duration_lgbm,
        'q_threshold': q_threshold_lgbm
    }
else:
    logging.warning(f"[{model_name_lgbm}] Skipping final evaluation.")

logging.info(f"===== Finished Workflow for {model_name_lgbm} =====")

## 4. Results


In [None]:
# Build one flat row per model from `all_results` for a side-by-side table.
results_summary = []
for model_name, results_data in all_results.items():
    summary = {'Model': model_name}
    # Read 'metrics' with .get like every other field, so an entry without
    # metrics yields a partially filled row instead of a KeyError.
    model_metrics = results_data.get('metrics')
    if model_metrics:
        summary.update(model_metrics)
        # Replace the nested confusion-matrix dict with four scalar columns.
        cm = summary.pop('confusion_matrix', None)
        if cm:
            summary['TN'] = cm['tn']
            summary['FP'] = cm['fp']
            summary['FN'] = cm['fn']
            summary['TP'] = cm['tp']
    summary['CP Coverage'] = results_data.get('cp_coverage', None)
    summary['CP Avg Set Size'] = results_data.get('cp_avg_set_size', None)
    summary['HPO F1'] = results_data.get('hpo_f1_score', None)
    summary['HPO Duration (s)'] = results_data.get('hpo_duration_s', None)
    # Only the XGBoost/LightGBM entries store this key; others show 'N/A'.
    summary['Final Estimators'] = results_data.get('final_n_estimators', 'N/A')
    results_summary.append(summary)

results_df = pd.DataFrame(results_summary)

# Show floats with four decimals. The formatter must always return a string.
pd.set_option('display.float_format', '{:.4f}'.format)

print("\n===== Performance Metrics Summary =====")
display(results_df)

# Persist the table next to the models; make sure the directory exists first.
os.makedirs(MODEL_DIR, exist_ok=True)
summary_csv_path = os.path.join(
    MODEL_DIR,
    f"model_comparison_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
)
results_df.to_csv(summary_csv_path, index=False)
logging.info("Results summary DataFrame saved.")

# You can also access detailed results for a specific model:
# print("\nDetailed SVM Results:")
# print(all_results.get('SVM_Hyperband'))