In [1]:
#jupyter nbconvert --to script Model.ipynb

#TODO: from sklearn.pipeline import Pipeline to make them all together and cleaner. 
# Include the SVM with the standard scaler in the same pipeline.
# Maybe also the CVAP or the MICP?
#TODO: feature names
#c:\Users\javym\miniconda3\envs\Lab\lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but DecisionTreeClassifier was fitted without feature names
#  warnings.warn(
#c:\Users\javym\miniconda3\envs\Lab\lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but RandomForestClassifier was fitted without feature names
#TODO: Thresholding
# CART is outputting allways the 0 class due to the 0.5 threshold, the max prob from cvap in CART is 0.06. 
#TODO: Saving models and best parameters for each group
#TODO: Tabla
# Quiero la tabla del latex con los resultados y ejecutar por grupos
#TODO: Implement loop for all the groups
# Maybe wrapping each model into a function of the dataset??
#TODO: Estudiar relevancia de las features para cada modelo y cada grupo.
# Shapely is too computationally expensive
# Use conformasight
#TODO: Check MICP is correctly implemented
# It doesn't seem clear whether it is using the true labels in the calibration split for anything
# Maybe try different bins (using the predicted label rather than the true? Changes the condition in the probability guarantees)
# Maybe implement ICP (non mondrian) option to see the differences
# Maybe use better ncm with the interval probs of the Venn Abers prediction. How wide it is and how far from the true label?


In [2]:
# Remove any existing log files
import os
import glob
import logging

# Reset logger to avoid any issues with permissions
logging.shutdown()
# Remove loggers
for log_file in glob.glob("*.log"):
    os.remove(log_file)



# Star-Galaxy Classification using ALHAMBRA Photometry

This notebook implements and evaluates several machine learning models for classifying astronomical objects as stars or galaxies based on multi-band photometric data from the ALHAMBRA survey, using labels derived from higher-resolution COSMOS2020 data.

**Target Variable:** `acs_mu_class` (from COSMOS2020)
 - Which is 1 for Galaxy and 2 for Star. We will remap this to 0 (Galaxy, majority class) and 1 (Star, minority class).

**Features:** Selected columns from the ALHAMBRA survey data.

**Models:**
1. Support Vector Machine (SVM)
2. Decision Tree (CART)
3. Random Forest
4. XGBoost
5. LightGBM

## 0. Setup and Configuration

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns # For confusion matrix heatmap
from scipy.stats import randint, uniform, loguniform # Ensure loguniform is imported if used
import os
import json
import time
from datetime import datetime
import math
import logging
from tqdm import tqdm  
#from tqdm.notebook import tqdm # Needs pip install ipywidgets
#from tqdm.auto import tqdm
import joblib # For saving/loading models efficiently

# Scikit-learn imports
from sklearn.model_selection import train_test_split, StratifiedKFold, ParameterSampler
from sklearn.linear_model import LogisticRegression
from sklearn.isotonic import IsotonicRegression
from sklearn.calibration import CalibratedClassifierCV
from scipy.stats import loguniform # For hyperparameter distributions
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone, BaseEstimator, ClassifierMixin
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, roc_auc_score,
    brier_score_loss, precision_recall_curve, auc, f1_score
)   
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.exceptions import NotFittedError

# Boosting models
import xgboost as xgb
from xgboost.callback import EarlyStopping
import lightgbm as lgb
from lightgbm import early_stopping

# Mondrian ICP
from crepes import ConformalClassifier

# Configure logging
logging.shutdown()
logging.basicConfig(
    filename=f'models_{datetime.now().strftime("%d_%H-%M-%S")}.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True
)
# Prevent logs from being printed to console
logging.getLogger().handlers = [h for h in logging.getLogger().handlers if isinstance(h, logging.FileHandler)]

## 1. Loading Dataset & Feature Selection

**Interesting Feature Combinations for Modeling:**
 
 The feature groups are defined as follows:
 - Group 1: Morphology features and their uncertainties
 - Group 2: Photometry magnitudes
 - Group 3: Photometry magnitude and errors
 - Group 4: Redshift features and their uncertainties
 - Group 5: Combination of photometry magnitude errors and morphology features (including uncertainties)
 - Group 6: Combination of photometry magnitude errors, morphology features (including uncertainties), and redshift features (including uncertainties)




In [4]:
# Read the df
df = pd.read_csv('data/match_alhambra_cosmos2020_ACS_class_0.8arcsec.csv')
logging.info(f"DataFrame created with shape: {df.shape}")
# Map ACS classification: 1 (Galaxy, Majority) -> 0, 2 (Star, minority) -> 1, 3 (Fake) -> drop
logging.info("Original class counts:")
logging.info(df['acs_mu_class'].value_counts().to_string())

# Drop fake detections (class 3)
# Drop fake detections
n_fakes = (df['acs_mu_class'] == 3).sum()
logging.info(f"Number of fake detections (class 3): {n_fakes}")
df = df[df['acs_mu_class'] != 3]

# Map classifications
df['acs_mu_class'] = df['acs_mu_class'].map({1: 0, 2: 1})

logging.info("After dropping fakes and mapping classes (0: Galaxy, 1: Star):")
logging.info(df['acs_mu_class'].value_counts().to_string())

In [5]:
# Input features

# --- Define feature categories based on ALHAMBRA data using exact names ---

# 1. ALHAMBRA Morphology Features (SExtractor-based)
morphology_features = [
    'area', 'fwhm', 'stell', 'ell', 'a', 'b', 'theta', 'rk', 'rf'
]

morphology_err = [
    's2n'
]

morphology_mags_errors = morphology_features + morphology_err

# 2. ALHAMBRA Photometry Magnitudes (Optical + NIR + Synthetic)
OPTICAL_MAG_COLS = [
    'F365W', 'F396W', 'F427W', 'F458W', 'F489W', 'F520W', 'F551W',
    'F582W', 'F613W', 'F644W', 'F675W', 'F706W', 'F737W', 'F768W',
    'F799W', 'F830W', 'F861W', 'F892W', 'F923W', 'F954W'
]
photometry_magnitudes = (
    OPTICAL_MAG_COLS +
    ['J', 'H', 'KS', 'F814W']
)

# 3. ALHAMBRA Photometry Uncertainties
OPTICAL_ERR_COLS = [
    'dF365W', 'dF396W', 'dF427W', 'dF458W', 'dF489W', 'dF520W', 'dF551W',
    'dF582W', 'dF613W', 'dF644W', 'dF675W', 'dF706W', 'dF737W', 'dF768W',
    'dF799W', 'dF830W', 'dF861W', 'dF892W', 'dF923W', 'dF954W'
]
photometry_uncertainties = (
    OPTICAL_ERR_COLS +
    ['dJ', 'dH', 'dKS', 'dF814W']
)

photometry_mags_errors = photometry_magnitudes + photometry_uncertainties

# 4. ALHAMBRA Photometric Redshift & Derived Features (BPZ-based)
redshift_features = [
    'zb_1', 'zb_Min_1', 'zb_Max_1', 'Tb_1',
    'z_ml', 't_ml',
    'Stell_Mass_1', 'M_Abs_1', 'MagPrior'
]

redshift_uncertainties = [
    'Odds_1', 'Chi2'
]


redshift_mags_errors = redshift_features + redshift_uncertainties

# 5. ALHAMBRA Quality/Auxiliary Features (per-band quality etc.)
OPTICAL_IRMS_COLS = [
    'irms_F365W', 'irms_F396W', 'irms_F427W', 'irms_F458W', 'irms_F489W',
    'irms_F520W', 'irms_F551W', 'irms_F582W', 'irms_F613W', 'irms_F644W',
    'irms_F675W', 'irms_F706W', 'irms_F737W', 'irms_F768W', 'irms_F799W',
    'irms_F830W', 'irms_F861W', 'irms_F892W', 'irms_F923W', 'irms_F954W'
]
quality_aux_features = (
    ['nfobs'] +
    OPTICAL_IRMS_COLS +
    ['irms_J', 'irms_H', 'irms_KS', 'irms_F814W']
)

# --- Define lists of features NOT used for modeling ---

non_modeling_identifiers = ['ID_1', 'id_2'] # ALHAMBRA ID, COSMOS ID

non_modeling_astrometry = [
    'RA_1', 'Dec_1', 'x', 'y', # ALHAMBRA Astrometry
    'ra_2', 'dec_2',          # COSMOS Astrometry
    'Separation'              # Matching Quality
]

non_modeling_flags = [
    'photoflag', 'xray', 'PercW', 'Satur_Flag', # ALHAMBRA Object/Photometry Flags
    'irms_OPT_Flag', 'irms_NIR_Flag'           # ALHAMBRA Overall Quality Flags
]

alhambra_prediction = ['Stellar_Flag'] # ALHAMBRA's own classification

non_modeling_aperture_mags = [ # Specific aperture mags, usually use total mags
    'F814W_3arcs', 'dF814W_3arcs', 'F814W_3arcs_corr'
]

non_modeling_cosmos_features = [ # Measurements/flags derived from COSMOS data (HST, HSC, VISTA...)
    'model_flag',
    'flag_hsc', 'flag_supcam', 'flag_udeep', 'flag_uvista',
    'hsc_r_mag', 'hsc_r_magerr', 'hsc_r_valid',
    'hsc_i_mag', 'hsc_i_magerr', 'hsc_i_valid',
    'uvista_j_mag', 'uvista_j_magerr', 'uvista_j_valid',
    'uvista_ks_mag', 'uvista_ks_magerr', 'uvista_ks_valid',
    'acs_f814w_mag', 'acs_f814w_magerr',
    'acs_fwhm_world', 'acs_mu_max',
    'solution_model' # This is categorical, but still COSMOS-derived info
]

target_variable = ['acs_mu_class'] # The COSMOS classification label to predict

##########################################################################################
#! --- Consolidate into the main dictionary for easy access ---
##########################################################################################

feature_sets = {
        # --- Potential Input Feature Sets ---
        'morphology_only': morphology_mags_errors,
        'photometry_magnitudes_only': photometry_magnitudes,
        'photometry_mags_errors': photometry_mags_errors,
        'photometry_plus_morphology': photometry_mags_errors + morphology_mags_errors,
        'photometry_no_redshift': photometry_mags_errors + morphology_mags_errors + quality_aux_features,
        'redshift_only': redshift_mags_errors,
        'full_alhambra_all': (morphology_mags_errors +
                            photometry_mags_errors +
                            redshift_mags_errors + 
                            quality_aux_features),

        # --- Excluded Feature Sets ---
        'non_modeling_identifiers': non_modeling_identifiers,
        'non_modeling_astrometry': non_modeling_astrometry,
        'non_modeling_flags': non_modeling_flags,
        'non_modeling_aperture_mags': non_modeling_aperture_mags,
        'non_modeling_cosmos_features': non_modeling_cosmos_features,
        'alhambra_prediction': alhambra_prediction,
        'target_variable': target_variable
    }

#! This is excluding the quality aux.
# Include target_variable in each group by appending it to the feature list
groups = {
        'group_1': feature_sets.get('morphology_only', []) + feature_sets.get('target_variable', []),
        'group_2': feature_sets.get('photometry_magnitudes_only', []) + feature_sets.get('target_variable', []),
        'group_3': feature_sets.get('photometry_mags_errors', []) + feature_sets.get('target_variable', []),
        'group_4': feature_sets.get('redshift_only', []) + feature_sets.get('target_variable', []),
        'group_5': feature_sets.get('photometry_plus_morphology', []) + feature_sets.get('target_variable', []),
        'group_6': (feature_sets.get('photometry_mags_errors', []) +
                   feature_sets.get('morphology_only', []) +
                   feature_sets.get('redshift_only', []) +
                   feature_sets.get('target_variable', [])),
        'group_7': feature_sets.get('full_alhambra_all', []) + feature_sets.get('target_variable', [])
    }

# --- Function to get a specific feature set (Unchanged from before) ---

def get_feature_set(df, set_name, groups = groups):
    """
    Selects columns from a DataFrame based on a predefined feature set name,
    including six specific groups defined by combinations of morphology,
    photometry magnitudes, uncertainties, and redshift features.

    Args:
        df (pd.DataFrame): The input DataFrame.
        set_name (str): The name of the desired feature set group:
                        'group_1' to 'group_6' as defined below.

    Returns:
        pd.DataFrame: A DataFrame containing only the columns
                      belonging to the specified feature set group.
                      Returns an empty DataFrame if no columns are found.
    """

    if set_name not in groups:
        raise ValueError(f"Feature set group '{set_name}' not defined. "
                         f"Available groups: {list(groups.keys())}")

    required_cols_in_set = groups[set_name]

    # Find which of these columns actually exist in the DataFrame
    available_cols = [col for col in required_cols_in_set if col in df.columns]

    # Warn if some columns from the set definition are missing
    missing_cols = [col for col in required_cols_in_set if col not in available_cols]
    if missing_cols:
        print(f"Warning: The following columns defined for feature set group '{set_name}'"
              f" were not found in the DataFrame and will be excluded: {missing_cols}")

    if not available_cols:
        print(f"Warning: No columns for feature set group '{set_name}' found in the DataFrame.")
        return pd.DataFrame()  # Return empty DataFrame

    print(f"Selecting feature set group '{set_name}' with {len(available_cols)} columns.")
    return df[available_cols]


In [6]:
# Quality check to see which cols are excluded and contained in each group
all_feature_cols = set()
for cols in feature_sets.values():
    all_feature_cols.update(cols)

df_cols_set = set(df.columns)
not_in_feature_sets = df_cols_set - all_feature_cols

if not_in_feature_sets:
    print(f"Columns in df not included in any feature_sets: {sorted(not_in_feature_sets)}")
else:
    print("All df columns are included in feature_sets.")


# Check which columns are in each feature group
for group_name in ['group_1', 'group_2', 'group_3', 'group_4', 'group_5', 'group_6', 'group_7']:
    print(f"\n=== {group_name} ===")
    
    # Get the feature set definition
    feature_set = groups[group_name]
    
    # Get the actual columns that exist in the data
    group_df = get_feature_set(df, group_name)
    

    available_cols = list(group_df.columns)
    
    # Find columns that are defined but not in the data
    missing_cols = [col for col in list(df.columns) if col not in feature_set]
    
    print(f"\nFeatures present ({len(available_cols)} columns):")
    print(list(sorted(available_cols)))
    
    print(f"\nFeatures missing ({len(missing_cols)} columns):")
    print(list(sorted(missing_cols)))




All df columns are included in feature_sets.

=== group_1 ===
Selecting feature set group 'group_1' with 11 columns.

Features present (11 columns):
['a', 'acs_mu_class', 'area', 'b', 'ell', 'fwhm', 'rf', 'rk', 's2n', 'stell', 'theta']

Features missing (125 columns):
['Chi2', 'Dec_1', 'F365W', 'F396W', 'F427W', 'F458W', 'F489W', 'F520W', 'F551W', 'F582W', 'F613W', 'F644W', 'F675W', 'F706W', 'F737W', 'F768W', 'F799W', 'F814W', 'F814W_3arcs', 'F814W_3arcs_corr', 'F830W', 'F861W', 'F892W', 'F923W', 'F954W', 'H', 'ID_1', 'J', 'KS', 'M_Abs_1', 'MagPrior', 'Odds_1', 'PercW', 'RA_1', 'Satur_Flag', 'Separation', 'Stell_Mass_1', 'Stellar_Flag', 'Tb_1', 'acs_f814w_mag', 'acs_f814w_magerr', 'acs_fwhm_world', 'acs_mu_max', 'dF365W', 'dF396W', 'dF427W', 'dF458W', 'dF489W', 'dF520W', 'dF551W', 'dF582W', 'dF613W', 'dF644W', 'dF675W', 'dF706W', 'dF737W', 'dF768W', 'dF799W', 'dF814W', 'dF814W_3arcs', 'dF830W', 'dF861W', 'dF892W', 'dF923W', 'dF954W', 'dH', 'dJ', 'dKS', 'dec_2', 'flag_hsc', 'flag_supcam

## 2. Data Preprocessing and Splitting

In [7]:
# Data splitting parameters
TEST_SIZE = 0.20 # Test set proportion
VAL_SIZE = 0.10 # Validation set proportion
CAL_SIZE = 0.10 # Calibration set proportion
# Train size will be 1 - (TEST_SIZE + VAL_SIZE + CAL_SIZE)

TARGET_COLUMN = feature_sets.get('target_variable', [])[0]
RANDOM_SEED = 33 # For reproducibility

# Model saving directory
MODEL_DIR = "trained_models"
os.makedirs(MODEL_DIR, exist_ok=True)

# Data splitting strategy ('stratified' or 'random')
SPLIT_STRATEGY = 'stratified' # Recommended for imbalanced datasets


In [8]:
# --- Data Cleaning ---
def clean_data(df, feature_group, target_column, logger=logging):
    """
    Cleans the input DataFrame by selecting features for the given group,
    dropping NaNs, and separating features and target.

    Args:
        df (pd.DataFrame): The input DataFrame.
        feature_group (str): The feature group to use (e.g., 'group_1', 'group_2', etc.).
        target_column (str): The name of the target column.
        logger (logging.Logger): Logger for info and error messages.

    Returns:
        X (pd.DataFrame): Cleaned feature DataFrame.
        y (pd.Series): Target variable.
        df_clean (pd.DataFrame): The cleaned DataFrame (features + target).
    """
    logger.info(f"Original dataset size: {df.shape}")

    # Get the feature columns for the selected group using get_feature_set
    df_clean = get_feature_set(df, feature_group).dropna().copy()
    logger.info(f"Dataset size after dropping NaNs: {df_clean.shape}")

    # Ensure target_column is defined correctly
    if target_column not in df_clean.columns:
        raise KeyError(f"Target column '{target_column}' not found in the cleaned DataFrame columns: {df_clean.columns.tolist()}")

    # Log value counts for target
    logger.info(f"Value counts for target:\n1 (Star): {(df_clean[target_column] == 1).sum()}\n0 (Galaxy): {(df_clean[target_column] == 0).sum()}")

    # Separate features (X) and target (y) for the cleaned DataFrame
    X = df_clean.drop(columns=[target_column])
    y = df_clean[target_column]
    return X, y, df_clean

# Example usage:
# X, y, df_clean = clean_data(df, feature_group='group_7', target_column=TARGET_COLUMN, logger=logging)

In [9]:
# --- Data Splitting ---
def split_data(X, y):
    """
    Splits the data into train, validation, test, and calibration sets according to the global
    split proportions and strategy. Uses global variables:
        - TEST_SIZE, VAL_SIZE, CAL_SIZE, SPLIT_STRATEGY, RANDOM_SEED

    The logic and split order is identical to the original code.

    Args:
        X (pd.DataFrame): Feature DataFrame.
        y (pd.Series): Target variable.

    Returns:
        (X_train, y_train, X_val, y_val, X_test, y_test, X_cal, y_cal): tuple of splits.
    """
    logging.info(f"Splitting data using '{SPLIT_STRATEGY}' strategy...")

    # --- Validate Proportions ---
    if not (0 <= TEST_SIZE <= 1 and 0 <= VAL_SIZE <= 1 and 0 <= CAL_SIZE <= 1):
        raise ValueError("Split proportions (TEST_SIZE, VAL_SIZE, CAL_SIZE) must be between 0 and 1.")

    TRAIN_SIZE = 1.0 - TEST_SIZE - VAL_SIZE - CAL_SIZE
    if not (0 <= TRAIN_SIZE <= 1):
        raise ValueError(f"Calculated TRAIN_SIZE ({TRAIN_SIZE:.3f}) is invalid. Sum of TEST_SIZE, VAL_SIZE, and CAL_SIZE must be between 0 and 1.")

    if not np.isclose(TRAIN_SIZE + TEST_SIZE + VAL_SIZE + CAL_SIZE, 1.0):
        # This check might be redundant given the calculation of TRAIN_SIZE, but good for safety.
        raise ValueError("Sum of split proportions must be equal to 1.")

    if np.isclose(TRAIN_SIZE, 0) and (np.isclose(VAL_SIZE, 0) or np.isclose(TEST_SIZE, 0) or np.isclose(CAL_SIZE, 0)):
        # Avoid scenarios where train is 0 but other splits are also 0, leading to ambiguity.
        # If only train is 0, it might be valid in some rare cases, but usually requires at least one other non-zero split.
        # Let's enforce Train > 0 for typical ML workflows.
        # If you need zero training data, adjust this check.
        logging.warning("TRAIN_SIZE is zero or near zero. Ensure this is intended.")
        if TRAIN_SIZE < 0: # Definitely an error
            raise ValueError("TRAIN_SIZE cannot be negative.")
        # Allow TRAIN_SIZE = 0 only if explicitly handled later, otherwise raise error?
        # For now, let's proceed but log a warning. If TRAIN_SIZE must be > 0, uncomment the raise below.
        # raise ValueError("TRAIN_SIZE must be greater than 0 for typical model training.")

    logging.info(f"Target split ratios: Train={TRAIN_SIZE:.2f}, Val={VAL_SIZE:.2f}, Test={TEST_SIZE:.2f}, Cal={CAL_SIZE:.2f}")

    # --- Initialize Splits ---
    # Use iloc[0:0] to create empty DataFrames/Series with the same columns/dtype
    empty_X = X.iloc[0:0]
    empty_y = y.iloc[0:0]
    X_train, y_train = empty_X.copy(), empty_y.copy()
    X_val, y_val = empty_X.copy(), empty_y.copy()
    X_test, y_test = empty_X.copy(), empty_y.copy()
    X_cal, y_cal = empty_X.copy(), empty_y.copy()

    # Temporary variables for sequential splitting
    X_remaining, y_remaining = X.copy(), y.copy() # Use copies to avoid modifying original X, y

    # --- Stratification Option ---
    # Define stratify_func only once
    def get_stratify_array(y_arr):
        return y_arr if SPLIT_STRATEGY == 'stratified' and not y_arr.empty else None

    # --- First Split: Train vs. Remainder (Val + Test + Cal) ---
    val_test_cal_size = VAL_SIZE + TEST_SIZE + CAL_SIZE

    if np.isclose(val_test_cal_size, 0): # Only Train set needed
        X_train, y_train = X_remaining, y_remaining
        logging.info("All data assigned to Train set (Val, Test, Cal sizes are 0).")
        X_remaining, y_remaining = empty_X.copy(), empty_y.copy() # No remainder
    elif np.isclose(TRAIN_SIZE, 0): # No Train set needed
        logging.info("Train set is empty (TRAIN_SIZE=0). Remainder passed to next splits.")
        # X_remaining, y_remaining already hold all data
    else: # Split Train vs Remainder
        split_test_size = val_test_cal_size # Proportion of remainder relative to total (1.0)
        X_train, X_remaining, y_train, y_remaining = train_test_split(
            X_remaining, y_remaining,
            test_size=split_test_size,
            random_state=RANDOM_SEED,
            stratify=get_stratify_array(y_remaining)
        )
    logging.info(f"Train set shape: {X_train.shape}")

    # --- Second Split: Val vs. Remainder (Test + Cal) ---
    if not X_remaining.empty:
        test_cal_size = TEST_SIZE + CAL_SIZE
        # Denominator for relative size calculation: size of the current remaining pool
        current_remaining_size_frac = VAL_SIZE + test_cal_size # = val_test_cal_size

        if np.isclose(VAL_SIZE, 0): # No Val set, pass remainder to next stage
            X_temp2, y_temp2 = X_remaining, y_remaining # Remainder is Test + Cal
            logging.info("Validation set is empty (VAL_SIZE=0).")
        elif np.isclose(test_cal_size, 0): # Only Val set left in remainder
            X_val, y_val = X_remaining, y_remaining
            X_temp2, y_temp2 = empty_X.copy(), empty_y.copy() # No data left for Test/Cal
            logging.info(f"Validation set shape: {X_val.shape}")
        else: # Split Val vs (Test + Cal)
            # Proportion of (Test + Cal) relative to (Val + Test + Cal)
            split_test_size = test_cal_size / current_remaining_size_frac
            X_val, X_temp2, y_val, y_temp2 = train_test_split(
                X_remaining, y_remaining,
                test_size=split_test_size,
                random_state=RANDOM_SEED,
                stratify=get_stratify_array(y_remaining)
            )
            logging.info(f"Validation set shape: {X_val.shape}")
    else: # No data remaining after train split
        X_temp2, y_temp2 = empty_X.copy(), empty_y.copy()
        if not np.isclose(VAL_SIZE, 0): # Log only if Val set was expected
            logging.info("Validation set is empty (no data remaining after train split).")

    # --- Third Split: Test vs. Cal ---
    if not X_temp2.empty:
        # Denominator for relative size calculation: size of the current remaining pool
        current_remaining_size_frac = TEST_SIZE + CAL_SIZE # = test_cal_size

        if np.isclose(CAL_SIZE, 0): # No Cal set, remainder is Test
            X_test, y_test = X_temp2, y_temp2
            logging.info("Calibration set is empty (CAL_SIZE=0).")
        elif np.isclose(TEST_SIZE, 0): # Only Cal set left in remainder
            X_cal, y_cal = X_temp2, y_temp2
            logging.info("Test set is empty (TEST_SIZE=0).")
        else: # Split Test vs Cal
            # Proportion of Cal relative to (Test + Cal)
            split_test_size = CAL_SIZE / current_remaining_size_frac
            X_test, X_cal, y_test, y_cal = train_test_split(
                X_temp2, y_temp2,
                test_size=split_test_size,
                random_state=RANDOM_SEED,
                stratify=get_stratify_array(y_temp2)
            )
            # Logging shapes done after the if/else block
    else: # No data remaining for Test/Cal split
        if not (np.isclose(TEST_SIZE, 0) and np.isclose(CAL_SIZE, 0)): # Log only if Test or Cal were expected
            logging.info("Test and Calibration sets are empty (no data remaining for final split).")

    # Log final shapes for Test and Cal
    logging.info(f"Test set shape: {X_test.shape}")
    logging.info(f"Calibration set shape: {X_cal.shape}")

    # --- Verification and Final Logging ---
    total_len = len(X_train) + len(X_val) + len(X_test) + len(X_cal)
    original_len = len(X)

    if total_len != original_len:
        # Calculate actual proportions based on lengths
        actual_train = len(X_train) / original_len if original_len > 0 else 0
        actual_val = len(X_val) / original_len if original_len > 0 else 0
        actual_test = len(X_test) / original_len if original_len > 0 else 0
        actual_cal = len(X_cal) / original_len if original_len > 0 else 0
        logging.warning(f"Total split length ({total_len}) does not exactly match original length ({original_len}). "
                        f"This can happen with stratification or rounding. "
                        f"Target proportions: Train={TRAIN_SIZE:.3f}, Val={VAL_SIZE:.3f}, Test={TEST_SIZE:.3f}, Cal={CAL_SIZE:.3f}. "
                        f"Actual proportions: Train={actual_train:.3f}, Val={actual_val:.3f}, Test={actual_test:.3f}, Cal={actual_cal:.3f}")
    else:
        logging.info("Split lengths verification successful.")

    logging.info("Data splitting complete.")

    # Log distributions, handling empty sets
    def log_distribution(name, y_set):
        if y_set.empty:
            logging.info(f"{name} target distribution: Set is empty.")
        else:
            try:
                # Use normalize=True, handle potential division by zero if counts are zero (though unlikely if not empty)
                counts = y_set.value_counts()
                dist = counts / counts.sum() if counts.sum() > 0 else counts
                logging.info(f"{name} target distribution:\n{dist}")
                # Log absolute counts as well for clarity
                logging.info(f"{name} target counts:\n{counts}")
            except Exception as e:
                logging.error(f"Could not calculate distribution for {name}: {e}")
                # Attempt to log raw value counts even if normalization fails
                try:
                    logging.info(f"{name} raw value counts:\n{y_set.value_counts()}")
                except Exception as e_raw:
                    logging.error(f"Could not get raw value counts for {name}: {e_raw}")

    log_distribution("Train", y_train)
    log_distribution("Validation", y_val)
    log_distribution("Test", y_test)
    log_distribution("Calibration", y_cal)

    return X_train, y_train, X_val, y_val, X_test, y_test, X_cal, y_cal

### Hyperparameter Optimization via Hyperband

In [10]:
# --- Internal Helper ---
def _train_and_eval(model_class, params,
                    X_train, y_train, X_val, y_val,
                    resource, resource_type,
                    scoring_func, random_state):
    """Internal helper function to train and evaluate a single configuration."""
    try:
        # Instantiate the base model without iteration-specific params first
        # Iteration param (e.g., n_estimators) will be handled later if needed
        model = model_class(**params)

        fit_duration = 0.0
        eval_duration = 0.0
        start_fit = time.time() # Start timing fit process

        if resource_type == 'data_fraction':
            # --- FIX 1: Implement data subsetting ---
            if resource < 1.0:
                # Use train_test_split to get a stratified fraction
                # We only need the 'train' part of this split for the subset
                try:
                    X_subset, _, y_subset, _ = train_test_split(
                        X_train, y_train,
                        train_size=resource,
                        random_state=random_state, # Use provided random state
                        stratify=y_train # Stratify based on original train labels
                    )
                except ValueError as e:
                    # Handle cases where stratification is not possible (e.g., too few samples)
                    logging.warning(f"Stratification failed for resource {resource:.2f}: {e}. Falling back to non-stratified split.")
                    X_subset, _, y_subset, _ = train_test_split(
                        X_train, y_train,
                        train_size=resource,
                        random_state=random_state
                    )
            else:
                # Use the full training data if resource is 1.0
                X_subset, y_subset = X_train, y_train

            # Ensure y_subset is numpy for fitting if needed by model
            y_subset_np = y_subset.values if isinstance(y_subset, pd.Series) else y_subset

            # Fit the model 
            model.fit(X_subset, y_subset_np)
            fit_duration = time.time() - start_fit
            # -----------------------------------------

        elif resource_type == 'iterations':
            # Resource represents n_estimators or similar iteration parameter
            params_iter = params.copy() # Avoid modifying original params dict
            iter_param_name = 'n_estimators' # Common case for RF, XGB, LGBM

            # Ensure resource is an integer for iterations
            params_iter[iter_param_name] = int(max(1, resource)) # Ensure at least 1 iteration
            model = model_class(**params_iter) # Re-instantiate with correct n_estimators

            # --- FIX 2 & 3: Conditional Fit Parameters ---
            current_fit_args = {} # Dictionary for specific fit arguments
            eval_set_for_fit = [(X_val, y_val)] # Common eval set

            if model_class is xgb.XGBClassifier:
                current_fit_args['eval_set'] = eval_set_for_fit
                current_fit_args['verbose'] = False

            elif model_class is lgb.LGBMClassifier:
                current_fit_args['eval_set'] = eval_set_for_fit
                if 'metric' in params_iter: # Get metric from HPO params
                     current_fit_args['eval_metric'] = params_iter['metric']
                elif isinstance(model.metric, str): # Get metric from model instance if set
                     current_fit_args['eval_metric'] = model.metric
                else: # Default if not found (might cause issues if early stopping expects it)
                     logging.warning(f"LGBM eval_metric not found in HPO params or model instance for config {params_iter}. Early stopping might fail.")
                     # You might need to add a default like 'logloss' or raise an error
                     # current_fit_args['eval_metric'] = 'logloss' # Example default

            # For models like RandomForest or DecisionTree, current_fit_args remains empty {}
            # as they don't use eval_set or callbacks in their standard fit method

            # Fit the model with appropriate arguments
            # Ensure y_train is numpy if needed
            y_train_np = y_train.values if isinstance(y_train, pd.Series) else y_train
            try:
                model.fit(X_train, y_train_np, **current_fit_args)
            except Exception as fit_error:
                 logging.error(f"Fit failed for config {params_iter} with resource {resource}: {fit_error}")
                 # logging.exception("Fit Traceback:") # Uncomment for full traceback
                 return -1.0 # Indicate failure
            fit_duration = time.time() - start_fit
            # -----------------------------------------

        else:
            raise ValueError("Invalid resource_type. Choose 'data_fraction' or 'iterations'.")

        # Evaluate on the full validation set (common part)
        start_eval = time.time()
        try:
             y_pred_val = model.predict(X_val)
             # Ensure y_val is numpy if needed by scoring_func
             y_val_np = y_val.values if isinstance(y_val, pd.Series) else y_val
             score = scoring_func(y_val_np, y_pred_val)
        except Exception as eval_error:
             logging.error(f"Predict/Score failed for config {params} with resource {resource}: {eval_error}")
             score = -1.0 # Indicate failure
        eval_duration = time.time() - start_eval

        logging.debug(f"Evaluated config: {params} | Resource: {resource:.2f} | Score: {score:.4f} | Fit: {fit_duration:.2f}s | Eval: {eval_duration:.2f}s")
        return score

    except Exception as e:
        # Log the configuration that caused the error
        logging.error(f"Error training/evaluating config {params} with resource {resource}: {e}", exc_info=False) # Set exc_info=True for traceback if needed
        return -1.0 # Return a clearly bad score


def hyperband_hpo(model_class, param_space,
                  X_train, y_train, X_val, y_val,
                  max_resource, eta=3, resource_type='iterations',
                  min_resource=1, # Min iterations or min data fraction
                  scoring_func=f1_score, # Function accepting (y_true, y_pred)
                  random_state=None): # For early stopping etc. passed to .fit()
    """
    Performs Hyperband Hyperparameter Optimization.

    Args:
        model_class: The model class (e.g., SVC, RandomForestClassifier).
        param_space (dict): Dictionary defining the hyperparameter search space
                           compatible with ParameterSampler.
        X_train, y_train: Training data and labels.
        X_val, y_val: Validation data and labels for evaluation.
        max_resource (float/int): Maximum resource allocation
                                 (e.g., max n_estimators or 1.0 for data fraction).
        eta (int): Reduction factor for successive halving (>= 2).
        resource_type (str): How resource is allocated:
                             'iterations' -> resource sets n_estimators (or similar).
                             'data_fraction' -> resource is fraction of training data used (stratified).
        min_resource (float/int): Minimum resource for the first iteration.
                                 Must be >= 1 for 'iterations', > 0 for 'data_fraction'.
        scoring_func (callable): Function to evaluate performance (e.g., f1_score).
                                Higher score is assumed better.
        random_state (int): Seed for reproducibility of parameter sampling and data subsetting.

    Returns:
        tuple: (best_params, best_score)
               best_params (dict): The hyperparameters of the best performing configuration.
               best_score (float): The score achieved by the best configuration on the validation set
                                  using the maximum resource.
    """

    log_max_r = math.log(max_resource / min_resource, eta) if max_resource > min_resource and min_resource > 0 else 0
    s_max = int(log_max_r)
    B = (s_max + 1) * max_resource # Approximate total resource budget

    logging.info(f"--- Starting Hyperband HPO ---")
    logging.info(f"Model: {model_class.__name__}")
    logging.info(f"Resource Type: {resource_type}")
    logging.info(f"Resource Range: [{min_resource}, {max_resource}]")
    logging.info(f"Eta: {eta}")
    logging.info(f"Max Brackets (s_max): {s_max}")
    logging.info(f"Approx. Budget (B): {B:.2f}")
    logging.info(f"Scoring: {scoring_func.__name__}")

    best_params = None
    best_score = -1.0
    total_configs_evaluated = 0
    outer_tqdm = tqdm(range(s_max, -1, -1), desc="Hyperband Brackets (s)")

    # Outer loop: Iterate through brackets (s values)
    for s in outer_tqdm:
        n_configs = int(math.ceil(int(B / max_resource / (s + 1)) * eta**s)) # Number of configs in this bracket
        r_initial = max_resource * eta**(-s) # Initial resource for this bracket
        # Ensure initial resource is not less than min_resource
        r_initial = max(r_initial, min_resource)

        outer_tqdm.set_description(f"Bracket s={s} (n={n_configs}, r0={r_initial:.2f})")
        logging.info(f"\n>> Bracket s={s}: n_configs={n_configs}, r_initial={r_initial:.2f}")

        # Sample configurations for this bracket
        param_list = list(ParameterSampler(param_space, n_iter=n_configs, random_state=random_state + s if random_state is not None else None))
        
        # --- Add common fixed parameters ---
        # Calculate scale_pos_weight once if needed
        scale_pos_weight_val = None
        if model_class in [xgb.XGBClassifier, lgb.LGBMClassifier]:
             neg_count = (y_train == 0).sum()
             pos_count = (y_train == 1).sum()
             if pos_count > 0:
                 scale_pos_weight_val = neg_count / pos_count

        for p in param_list:
             # Add random_state if model supports it and it's not sampled
             if 'random_state' not in p and hasattr(model_class(random_state=1), 'random_state'): # Check if attr exists
                 p['random_state'] = random_state
             # Add class_weight='balanced' for relevant sklearn models if not sampled
             if model_class in [SVC, RandomForestClassifier, DecisionTreeClassifier] and 'class_weight' not in p:
                 p['class_weight'] = 'balanced'
             # Add scale_pos_weight for boosting models if not sampled and calculated
             if model_class in [xgb.XGBClassifier, lgb.LGBMClassifier] and 'scale_pos_weight' not in p and scale_pos_weight_val is not None:
                  p['scale_pos_weight'] = scale_pos_weight_val
             # For LightGBM, also consider adding 'objective': 'binary' if not sampled
             if model_class is lgb.LGBMClassifier and 'objective' not in p:
                  p['objective'] = 'binary'
        # -----------------------------------

        # Inner loop: Successive halving rounds
        inner_tqdm = tqdm(range(s + 1), desc=f"SH Round (s={s})", leave=False)
        for i in inner_tqdm:
            current_resource = r_initial * eta**i
            # Ensure resource doesn't exceed max_resource due to floating point/rounding
            current_resource = min(current_resource, max_resource)

            n_configs_in_round = len(param_list)
            inner_tqdm.set_description(f"SH Round i={i} (n={n_configs_in_round}, r={current_resource:.2f})")
            logging.info(f"  -- Round i={i}: Evaluating {n_configs_in_round} configs with resource={current_resource:.2f} --")

            round_scores = []
            # Use tqdm for the configurations within the round
            eval_tqdm = tqdm(param_list, desc=f"Evaluating Configs (i={i})", leave=False)
            for params in eval_tqdm:
                score = _train_and_eval(model_class, params, X_train, y_train, X_val, y_val,
                                        current_resource, resource_type, scoring_func,
                                        random_state)
                round_scores.append((score, params))
                total_configs_evaluated += 1 # Count unique evaluations

            # Sort by score (descending, higher is better)
            round_scores.sort(key=lambda x: x[0], reverse=True)

            # Track the best overall score and params seen so far *at max resource*
            # Only update if we are actually at max resource in this round
            if abs(current_resource - max_resource) < 1e-6: # Check if we are at max resource
                 if round_scores and round_scores[0][0] > best_score:
                      best_score = round_scores[0][0]
                      best_params = round_scores[0][1]
                      logging.info(f"  ** New Best Found (Score: {best_score:.4f}) at max resource ** Params: {best_params}")
                      # Update outer tqdm description with best score found so far
                      outer_tqdm.set_postfix_str(f"Best F1: {best_score:.4f}", refresh=True)


            # --- Halving Step ---
            n_keep = int(n_configs_in_round / eta)
            logging.info(f"  -- Round i={i}: Completed {len(round_scores)} evaluations. Keeping top {n_keep} configs. --")

            if n_keep < 1 or i == s: # Keep at least one, or if it's the last round
                # If it's the last round, ensure the best score from *this bracket* at *max resource* is considered
                if abs(current_resource - max_resource) < 1e-6 and round_scores:
                     bracket_best_score = round_scores[0][0]
                     bracket_best_params = round_scores[0][1]
                     logging.info(f"  Bracket s={s} final best score: {bracket_best_score:.4f}")
                     # No need to update global best here, already done above
                break # Exit inner loop

            # Prepare parameter list for the next round
            param_list = [params for score, params in round_scores[:n_keep]]
            if not param_list: # Safety break if list becomes empty unexpectedly
                 logging.warning(f"  Param list empty after halving round i={i}. Stopping bracket.")
                 break

    logging.info(f"\n--- Hyperband HPO Finished ---")
    logging.info(f"Total configurations evaluated (approx): {total_configs_evaluated}") # Might overcount if errors happened
    if best_params:
        logging.info(f"Best Overall Score ({scoring_func.__name__}): {best_score:.4f}")
        logging.info(f"Best Params: {best_params}")
    else:
        logging.warning("No best parameters found. Check logs for errors or increase resources/configs.")

    return best_params, best_score

---
### Calibration (Platt Scaling/Isotonic Regression/Cross Venn Abers Predictors)
---

#### Understanding CVAP Calibration: Raw Scores vs. Probabilities & Implementation Approaches

Cross Venn-Abers Prediction (CVAP) is a method for producing well-calibrated probability predictions from machine learning models by applying the Inductive Venn-Abers Predictor (IVAP) logic within a cross-validation framework. A key implementation detail arises depending on whether the underlying base model produces probabilities \[0, 1] or **raw scores** (e.g., decision function values, log-odds, margins). The reference code is the [venn-abers python library](https://github.com/ip200/venn-abers/blob/main/src/venn_abers.py), in particular its source file `venn_abers.py`.

**CVAP Core Idea:**

1.  Split the training data into *K* folds.
2.  For each fold *k*:
    *   Train a base model on the *K-1* other folds.
    *   Use this model to get **scores** $S_{cal}^k$ for the held-out fold *k*, creating the Out-Of-Fold (OOF) calibration set $(S_{cal}^k, y_{cal}^k)$.
3.  Train a final base model on *all* training data.
4.  For a new test instance $x_{test}$:
    *   Get its score $s_{test}$ using the final base model.
    *   For *each* fold *k*, calculate the IVAP lower/upper probability bounds $p_0^k(s_{test})$ and $p_1^k(s_{test})$ using the fixed calibration set $(S_{cal}^k, y_{cal}^k)$ and the test score $s_{test}$.
5.  Aggregate the $K$ pairs of $(p_0^k, p_1^k)$ across all folds to produce the final CVAP probability prediction for $x_{test}$.

The crucial part is **Step 4**. The method used to calculate $p_0^k$ and $p_1^k$ depends on how the IVAP step is implemented, especially concerning the nature of the scores.

##### Approach 1: Efficient Pre-Calculation / Lookup (Standard IVAP Algorithm)

This is the computationally efficient algorithm presented in Vovk et al. (2015, Algorithms 1-6) for implementing the IVAP calibration step. ([Large-scale probabilistic predictors...](https://proceedings.neurips.cc/paper_files/paper/2015/file/a9a1d5317a33ae8cef33961c34144f84-Paper.pdf), Proposition 2).

**Algorithm Explanation:**

1.  **Calibration Pre-computation (per fold k):** Using *only* the OOF calibration set $(S_{cal}^k, y_{cal}^k)$:
    *   Take the calibration scores $S_{cal}^k$. **Crucially, these only need to be totally ordered real numbers; they are *not* required by the theory to be probabilities in \[0, 1].**
    *   Sort the unique calibration scores to get $C^k = \{c_1^k, ..., c_m^k\}$.
    *   Compute the cumulative sum diagram (CSD) and use PAVA (or equivalent GCM/LCM construction, see Algorithms 1-4) to derive two structures (`p0_structure^k`, `p1_structure^k`). These structures implicitly store the upper ($p_0$) and lower ($p_1$) probability bounds corresponding to the intervals defined by $C^k$.
2.  **Prediction (per fold k):** For *all* test scores $S_{test}$:
    *   For each $s_{test}$ (which is on the same raw score scale as $S_{cal}^k$):
        *   Find its position relative to the sorted unique *raw* scores $C^k$ using binary search (Algorithm 6).
        *   Use this position to **lookup** the appropriate probability values from the pre-calculated `p0_structure^k` and `p1_structure^k`.

**Why Practical Implementations Often Assume Probabilities (The Source of Confusion):**

While the *algorithm* itself works perfectly with raw scores, popular *implementations* (like the cited `venn_abers.py`) often introduce constraints for convenience or specific use cases:
*   They might expect input arrays (`p_cal`, `p_test`) to have two columns (probabilities for class 0 and 1).
*   They might explicitly select the column corresponding to the positive class probability (e.g., `[:, 1]`).
*   They might include checks or internal logic assuming values are within \[0, 1].

Feeding raw scores (e.g., `[-10, 0, 50]`) directly into **such specific library functions** causes errors or warnings (like `Input probabilities p_test are outside [0, 1]`). This is an **implementation artifact**, not a flaw in the theoretical Approach 1 algorithm itself. The algorithm remains mathematically sound for raw scores.

*   *(Note: One could adapt the library to handle 1D raw score arrays directly, bypassing the probability-specific checks and indexing, to leverage this efficient O(log k) lookup.)*

##### Approach 2: Direct IVAP Definition via Re-fitting Isotonic Regression

This approach bypasses the efficient lookup algorithm and instead directly implements the *fundamental definition* of IVAP for each test point, correctly handling raw scores using standard isotonic regression tools.

**Algorithm Explanation:**

This happens *inside* the loop over the *K* folds of CVAP. For fold *k* with OOF data $(S_{cal}^k, y_{cal}^k)$ and raw test scores $S_{test}$:

1.  **Iterate through each test score $s_{test, j}$ in $S_{test}$:**
    *   **Calculate $p_0^k(s_{test, j})$:** Augment the *raw score* calibration data with $(s_{test, j}, 0)$, fit Isotonic Regression $IR_0$ to this augmented set, and get $p_0^k(s_{test, j}) = IR_0(s_{test, j})$.
    *   **Calculate $p_1^k(s_{test, j})$:** Augment with $(s_{test, j}, 1)$, fit $IR_1$, and get $p_1^k(s_{test, j}) = IR_1(s_{test, j})$.
2.  **Aggregate:** Collect all $(p_0^k, p_1^k)$ pairs and aggregate across folds.

**Justification & Equivalence:**

*   **Follows Definition:** This exactly mirrors the definition of IVAP given in Vovk et al. (2015, lines L11-L13):
    > "When a new test object x arrives, compute its score s. Fit isotonic regression to (s1, y1),..., (sk, yk), (s, 0) obtaining a function f0. Fit isotonic regression to (s1, y1), ..., (sk, yk), (s, 1) obtaining a function f1. The multiprobability prediction ... is the pair (p0, p1) := (f0(s), f1(s))"
*   **Mathematical Equivalence:** Importantly, **Approach 2 yields the exact same $(p_0, p_1)$ values as the correctly implemented Approach 1**. The efficient algorithms (Approach 1) were proven to be equivalent ways of computing the result of the definition (Approach 2). Approach 2 is essentially a brute-force evaluation of the functions defined by the GCM/LCM construction when the efficient lookup code isn't suitable.
*   **Handles Raw Scores:** Standard isotonic regression libraries (`sklearn.isotonic.IsotonicRegression`) naturally handle arbitrary real-valued inputs (raw scores).

**Why Use Approach 2?**

Given that Approach 1 is theoretically valid and much faster (O(log k) vs O(k) isotonic fits per test point), why use Approach 2?
1.  **Implementation Convenience:** It avoids modifying existing probability-based library code (like `venn_abers.py`). One can use standard IR tools directly.
2.  **Direct Definition:** It serves as a clear, direct implementation of the foundational IVAP definition.

The trade-off is **computational cost during prediction**, which becomes O(K \* n_test \* k_fold), where k_fold is the cost of the IR fit (roughly linear in the fold size). This can be prohibitive for very large test sets or calibration fold sizes.

**Why Approach 2 is Not "Online Training":**

This approach remains **inductive**:
1.  The base model generating scores is fixed during prediction.
2.  The OOF calibration sets $(S_{cal}^k, y_{cal}^k)$ are fixed for each fold.
3.  The repeated IR fitting computes the output of a *fixed* (though complex) calibration rule defined by the OOF set for that fold; it does not update the rule itself based on test data (beyond the single test point required by the definition).

**Conclusion:**

While the efficient Venn-Abers/IVAP algorithm (Approach 1) is theoretically sound for raw scores, practical implementations often assume probability inputs. If using such implementations or needing a direct application of the definition, Approach 2 (re-fitting IR twice per test point within each CVAP fold) is the correct method for handling raw scores. It is mathematically equivalent to Approach 1 but computationally more expensive at prediction time. This necessity arises from implementation details, not theoretical limitations of the efficient algorithm itself.

### APPROACH 1

In [11]:
from raw_cvap import RawVennAbers, CVAPPredictorRaw

# --- Helper Function to Get Scores ---
def get_scores(estimator, X, score_method):
    """Gets scores from an estimator based on the specified method."""
    if score_method == 'decision_function':
        if hasattr(estimator, 'decision_function'):
            scores = estimator.decision_function(X)
            # Ensure scores are 1D for binary classification
            if scores.ndim == 2 and scores.shape[1] == 1:
                 scores = scores.flatten()
            elif scores.ndim > 1:
                 # For binary, decision_function should be 1D. If not, maybe multiclass? Raise error.
                 raise ValueError(f"decision_function returned shape {scores.shape}, expected 1D for binary classification.")
            return scores
        else:
            raise AttributeError(f"{estimator.__class__.__name__} does not have 'decision_function' method.")
    elif score_method == 'predict_proba':
        if hasattr(estimator, 'predict_proba'):
            # Return probability of the positive class (class 1)
            proba = estimator.predict_proba(X)
            if proba.shape[1] != 2:
                 raise ValueError(f"predict_proba returned shape {proba.shape}, expected (n_samples, 2)")
            return proba[:, 1]
        else:
            raise AttributeError(f"{estimator.__class__.__name__} does not have 'predict_proba' method.")
    elif score_method == 'raw_margin_xgb':
        # Check if it looks like an XGBoost model (basic check)
        if hasattr(estimator, 'predict') and 'output_margin' in estimator.predict.__code__.co_varnames:
             try:
                 # XGBoost convention: predict with output_margin=True gives raw scores
                 # For binary classification, this is usually a single value per instance
                 scores = estimator.predict(X, output_margin=True)
                 return scores.flatten() # Ensure 1D
             except TypeError as e:
                 raise TypeError(f"Error calling predict with output_margin=True on {estimator.__class__.__name__}. Is it an XGBoost model? Original error: {e}")
        else:
             raise AttributeError(f"{estimator.__class__.__name__} might not be an XGBoost model supporting 'output_margin'.")
    elif score_method == 'raw_score_lgbm':
         # Check if it looks like a LightGBM model (basic check)
        if hasattr(estimator, 'predict') and 'raw_score' in estimator.predict.__code__.co_varnames:
             try:
                 # LightGBM convention: predict with raw_score=True gives raw scores
                 # For binary classification, output shape might depend on objective.
                 # Often (n_samples,) or (n_samples, 1) for binary logloss/cross_entropy
                 scores = estimator.predict(X, raw_score=True)
                 # Handle potential (n_samples, 1) output for binary
                 if scores.ndim == 2 and scores.shape[1] == 1:
                     scores = scores.flatten()
                 elif scores.ndim != 1:
                      # If multiclass raw_score=True might return (n_samples, n_classes)
                      raise ValueError(f"LightGBM raw_score returned shape {scores.shape}. Expected 1D for binary.")
                 return scores
             except TypeError as e:
                 raise TypeError(f"Error calling predict with raw_score=True on {estimator.__class__.__name__}. Is it a LightGBM model? Original error: {e}")
        else:
             raise AttributeError(f"{estimator.__class__.__name__} might not be a LightGBM model supporting 'raw_score'.")
    else:
        raise ValueError(f"Unsupported score_method: {score_method}. Choose 'decision_function', 'predict_proba', 'raw_margin_xgb', or 'raw_score_lgbm'.")

# --- Main Unified Function ---
def train_calibrate_model(base_estimator_class, best_params, X_train, y_train,
                          calibration_method='platt', # 'platt', 'isotonic', 'cvap'
                          n_splits=5, random_state=None,
                          # CVAP specific params
                          score_method='decision_function', # 'decision_function', 'predict_proba', 'raw_margin_xgb', 'raw_score_lgbm'
                          cvap_loss='log', # 'log' or 'brier' for aggregation
                          cvap_precision=None, # Precision for rounding scores in VA fit
                          # Platt/Isotonic specific params (CalibratedClassifierCV handles score method)
                         ):
    """
    Trains a base estimator and calibrates its outputs using the specified method.

    For Platt/Isotonic, uses sklearn's CalibratedClassifierCV.
    For CVAP, uses the provided VennAbers implementation with k-fold CV,
    operating on raw scores specified by `score_method`.

    Args:
        base_estimator_class: Class of the base estimator (e.g., SVC, RandomForestClassifier).
        best_params (dict): Dictionary of best hyperparameters for the base estimator.
        X_train (pd.DataFrame or np.ndarray): Training features.
        y_train (pd.Series or np.ndarray): Training labels (binary 0/1).
        calibration_method (str): 'platt', 'isotonic', or 'cvap'.
        n_splits (int): Number of folds for cross-validation (used by all methods).
        random_state (int): Random state for reproducibility.
        score_method (str): Method to get scores for CVAP calibration.
                            Options: 'decision_function', 'predict_proba',
                                     'raw_margin_xgb', 'raw_score_lgbm'.
                            Ignored for 'platt' and 'isotonic' methods.
        cvap_loss (str): Aggregation loss for CVAP ('log' or 'brier').
        cvap_precision (int, optional): Precision for rounding scores in CVAP's VennAbers fit.

    Returns:
        tuple: (fitted_base_estimator, fitted_calibrator_object)
               - fitted_base_estimator: The base estimator trained on the full training data.
               - fitted_calibrator_object: An object with a `predict_proba` method
                 that returns calibrated probabilities.
                 For Platt/Isotonic, this is a CalibratedClassifierCV instance.
                 For CVAP, this is the custom _CVAPPredictor instance.
               Returns (None, None) if an error occurs.
    """
    logging.info(f"--- Starting Model Training & Calibration ({calibration_method}) ---")

    # Input Type Handling
    if isinstance(X_train, pd.DataFrame):
        X_train_np = X_train.values
    else:
        X_train_np = np.asarray(X_train)
    if isinstance(y_train, pd.Series):
        y_train_np = y_train.values
    else:
        y_train_np = np.asarray(y_train)
    if len(np.unique(y_train_np)) != 2:
        raise ValueError(f"This function currently supports only binary classification. Found labels: {np.unique(y_train_np)}")

    # Instantiate the base estimator
    try:
        # Special handling for SVC probability if needed by Platt/Isotonic *internal* logic
        # CalibratedClassifierCV might internally require probability=True for some base estimators
        # even if we don't explicitly use predict_proba. Let's ensure it's set if method needs it.
        current_params = best_params.copy()
        is_svc = issubclass(base_estimator_class, SVC)

        # Check if the chosen calibration method *might* rely on predict_proba internally
        needs_proba = False
        if calibration_method in ['platt', 'isotonic']:
             # CalibratedClassifierCV's default ('auto') tries decision_function first,
             # but might fall back to predict_proba. Safest to enable for SVC.
             if is_svc: needs_proba = True
        elif calibration_method == 'cvap' and score_method == 'predict_proba':
             needs_proba = True

        if needs_proba and is_svc and not current_params.get('probability', False):
             logging.warning(f"Setting probability=True for SVC as required by calibration method '{calibration_method}' or score_method '{score_method}'.")
             current_params['probability'] = True

        base_estimator = base_estimator_class(**current_params)

    except Exception as e:
        logging.error(f"Error instantiating base estimator {base_estimator_class.__name__} with params {current_params}: {e}", exc_info=True)
        return None, None


    # --- Calibration Method Logic ---
    try:
        if calibration_method in ['platt', 'isotonic']:
            logging.info(f"Using CalibratedClassifierCV with method='{'sigmoid' if calibration_method == 'platt' else 'isotonic'}'")
            logging.info(f"(Ignoring 'score_method' parameter '{score_method}' for CalibratedClassifierCV)")

            cv_strategy = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
            # Pass the potentially modified estimator (e.g., SVC with probability=True)
            calibrator = CalibratedClassifierCV(
                base_estimator, # Use the instance created above
                method='sigmoid' if calibration_method == 'platt' else 'isotonic',
                cv=cv_strategy,
                n_jobs=-1
            )

            logging.info("Fitting CalibratedClassifierCV...")
            calibrator.fit(X_train_np, y_train_np)
            logging.info("CalibratedClassifierCV fitting complete.")

            # Extract final estimator (same logic as before)
            if isinstance(calibrator.base_estimator_, list):
                 logging.warning("CalibratedClassifierCV returned a list of base estimators. Returning the first one.")
                 final_base_estimator = calibrator.base_estimator_[0]
            else:
                 final_base_estimator = calibrator.base_estimator_

            logging.info(f"--- {calibration_method.capitalize()} Scaling Training Complete ---")
            return final_base_estimator, calibrator

        elif calibration_method == 'cvap':
            logging.info(f"Using Cross Venn-Abers Prediction (CVAP) with score_method='{score_method}'")

            # 1. Out-of-fold raw scores
            cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
            oof_scores_list, oof_y_cal_list = [], []
            for train_idx, val_idx in cv.split(X_train_np, y_train_np):
                est = clone(base_estimator).fit(X_train_np[train_idx], y_train_np[train_idx])
                scores = get_scores(est, X_train_np[val_idx], score_method)
                oof_scores_list.append(scores)
                oof_y_cal_list.append(y_train_np[val_idx])

            # 2. Final base model on all data
            final_base_estimator = clone(base_estimator).fit(X_train_np, y_train_np)

            # 3. Build one RawVennAbers per fold
            calibrators = [
                RawVennAbers(precision=cvap_precision).fit(scores, y_cal)
                for scores, y_cal in zip(oof_scores_list, oof_y_cal_list)
            ]

            # 4. Wrap them into CVAPPredictorRaw
            cvap_predictor = CVAPPredictorRaw(
                final_estimator_=final_base_estimator,
                calibrators_=calibrators,
                loss_=cvap_loss,
                score_method_=score_method
            )
            logging.info("--- CVAP Training Complete ---")
            return final_base_estimator, cvap_predictor
        
        else:
            raise ValueError(f"Unknown calibration_method: {calibration_method}. Choose 'platt', 'isotonic', or 'cvap'.")

    except Exception as e:
        logging.error(f"Error during {calibration_method} calibration: {e}", exc_info=True)
        return None, None


### APPROACH 2

In [12]:
# # --- Helper Function for aggregation of the p0 and p1 values across folds ---
# def geo_mean(a):
#     """Calculates geometric mean along axis 1."""
#     # Handle potential zeros or negative values before taking the product/root
#     # If any value in a row is <= 0, the geometric mean is typically considered 0.
#     if a.shape[1] == 0:
#         return np.ones(a.shape[0]) # Geometric mean of empty set is 1? Or NaN? Let's return 1.

#     # Check for non-positive values
#     has_non_positive = np.any(a <= 1e-9, axis=1) # Use tolerance

#     # Calculate product safely
#     log_a = np.log(np.maximum(a, 1e-9)) # Avoid log(0)
#     geo_mean_val = np.exp(np.mean(log_a, axis=1))

#     # Set geo_mean to 0 for rows that had non-positive values
#     geo_mean_val[has_non_positive] = 0.0

#     return geo_mean_val

# # --- Helper Function to Get Scores ---
# def get_scores(estimator, X, score_method):
#     """Gets scores from an estimator based on the specified method."""
#     if score_method == 'decision_function':
#         if hasattr(estimator, 'decision_function'):
#             scores = estimator.decision_function(X)
#             # Ensure scores are 1D for binary classification
#             if scores.ndim == 2 and scores.shape[1] == 1:
#                  scores = scores.flatten()
#             elif scores.ndim > 1:
#                  # For binary, decision_function should be 1D. If not, maybe multiclass? Raise error.
#                  raise ValueError(f"decision_function returned shape {scores.shape}, expected 1D for binary classification.")
#             return scores
#         else:
#             raise AttributeError(f"{estimator.__class__.__name__} does not have 'decision_function' method.")
#     elif score_method == 'predict_proba':
#         if hasattr(estimator, 'predict_proba'):
#             # Return probability of the positive class (class 1)
#             proba = estimator.predict_proba(X)
#             if proba.shape[1] != 2:
#                  raise ValueError(f"predict_proba returned shape {proba.shape}, expected (n_samples, 2)")
#             return proba[:, 1]
#         else:
#             raise AttributeError(f"{estimator.__class__.__name__} does not have 'predict_proba' method.")
#     elif score_method == 'raw_margin_xgb':
#         # Check if it looks like an XGBoost model (basic check)
#         if hasattr(estimator, 'predict') and 'output_margin' in estimator.predict.__code__.co_varnames:
#              try:
#                  # XGBoost convention: predict with output_margin=True gives raw scores
#                  # For binary classification, this is usually a single value per instance
#                  scores = estimator.predict(X, output_margin=True)
#                  return scores.flatten() # Ensure 1D
#              except TypeError as e:
#                  raise TypeError(f"Error calling predict with output_margin=True on {estimator.__class__.__name__}. Is it an XGBoost model? Original error: {e}")
#         else:
#              raise AttributeError(f"{estimator.__class__.__name__} might not be an XGBoost model supporting 'output_margin'.")
#     elif score_method == 'raw_score_lgbm':
#          # Check if it looks like a LightGBM model (basic check)
#         if hasattr(estimator, 'predict') and 'raw_score' in estimator.predict.__code__.co_varnames:
#              try:
#                  # LightGBM convention: predict with raw_score=True gives raw scores
#                  # For binary classification, output shape might depend on objective.
#                  # Often (n_samples,) or (n_samples, 1) for binary logloss/cross_entropy
#                  scores = estimator.predict(X, raw_score=True)
#                  # Handle potential (n_samples, 1) output for binary
#                  if scores.ndim == 2 and scores.shape[1] == 1:
#                      scores = scores.flatten()
#                  elif scores.ndim != 1:
#                       # If multiclass raw_score=True might return (n_samples, n_classes)
#                       raise ValueError(f"LightGBM raw_score returned shape {scores.shape}. Expected 1D for binary.")
#                  return scores
#              except TypeError as e:
#                  raise TypeError(f"Error calling predict with raw_score=True on {estimator.__class__.__name__}. Is it a LightGBM model? Original error: {e}")
#         else:
#              raise AttributeError(f"{estimator.__class__.__name__} might not be a LightGBM model supporting 'raw_score'.")
#     else:
#         raise ValueError(f"Unsupported score_method: {score_method}. Choose 'decision_function', 'predict_proba', 'raw_margin_xgb', or 'raw_score_lgbm'.")

# # --- Helper Class for CVAP Prediction ---
# class _CVAPPredictor(BaseEstimator, ClassifierMixin):
#     """Internal helper class to store CVAP results using raw scores and provide prediction."""
#     def __init__(self, final_base_estimator, oof_scores_list, oof_y_cal_list,
#                  score_method, precision=None, loss='log'): # precision is now unused here, but kept for consistency
#         self.final_base_estimator_ = final_base_estimator
#         self.oof_scores_list_ = oof_scores_list # List of raw score arrays (n_fold_samples,) from each fold
#         self.oof_y_cal_list_ = oof_y_cal_list # List of y_cal arrays from each fold
#         self.score_method_ = score_method     # How scores were obtained
#         # self.precision_ = precision # Precision was for the old VennAbers, not needed for IsotonicRegression
#         self.loss_ = loss
#         self.n_splits_ = len(oof_scores_list)
#         self.classes_ = np.array([0, 1]) # Hardcoded for binary

#         if len(self.oof_scores_list_) != len(self.oof_y_cal_list_):
#              raise ValueError("Mismatch between number of OOF score folds and label folds.")
#         if self.n_splits_ == 0:
#              raise ValueError("Cannot initialize _CVAPPredictor with zero folds.")
#         for i, scores in enumerate(self.oof_scores_list_):
#              if scores.ndim != 1:
#                   raise ValueError(f"OOF scores for fold {i} must be 1D, but got shape {scores.shape}")
#              if len(scores) != len(self.oof_y_cal_list_[i]):
#                   raise ValueError(f"Mismatch between score length ({len(scores)}) and label length ({len(self.oof_y_cal_list_[i])}) in fold {i}")

#     def predict_proba(self, X):
#         """Generates CVAP calibrated probabilities for new data X using raw scores."""
#         if not hasattr(self, 'final_base_estimator_') or self.final_base_estimator_ is None:
#              raise NotFittedError("The final base estimator for CVAP is not available.")

#         X = check_array(X, accept_sparse=True, force_all_finite=False)

#         logging.debug(f"Getting test scores using method: {self.score_method_}")
#         try:
#             raw_test_scores = get_scores(self.final_base_estimator_, X, self.score_method_)
#             if raw_test_scores.ndim != 1:
#                  raise ValueError(f"get_scores returned non-1D scores (shape {raw_test_scores.shape}) for test data.")
#             logging.debug(f"Raw test scores sample: {raw_test_scores[:5]}")
#         except Exception as e:
#             logging.error(f"Error getting scores for test data using method {self.score_method_}: {e}", exc_info=True)
#             raise

#         n_test_samples = X.shape[0]
#         p0p1_test_folds = np.zeros((self.n_splits_, n_test_samples, 2)) # Store p0, p1 for each fold and sample

#         # --- CORRECTED Calibration Loop ---
#         logging.info(f"Calculating CVAP probabilities using {self.n_splits_} folds...")
#         for i in tqdm(range(self.n_splits_), desc="CVAP Fold Calibration", leave=False):
#             cal_scores_fold = self.oof_scores_list_[i]
#             cal_y_fold = self.oof_y_cal_list_[i]

#             if len(cal_scores_fold) == 0:
#                  logging.warning(f"Fold {i} has empty calibration data. Assigning default 0.5 probabilities.")
#                  p0p1_test_folds[i, :, 0] = 0.5 # Default p0
#                  p0p1_test_folds[i, :, 1] = 0.5 # Default p1
#                  continue

#             # Pre-allocate results for this fold
#             p0_results_fold = np.zeros(n_test_samples)
#             p1_results_fold = np.zeros(n_test_samples)

#             # Fit Isotonic Regression twice for each test point (as per IVAP definition)
#             # This is necessary because the isotonic fit depends on the test point's score
#             # and its *hypothesized* label.
#             for j in range(n_test_samples):
#                 test_score = raw_test_scores[j]

#                 # Calculate p0 (assuming test label is 0)
#                 scores_aug_0 = np.append(cal_scores_fold, test_score)
#                 y_aug_0 = np.append(cal_y_fold, 0)
#                 ir_0 = IsotonicRegression(out_of_bounds='clip', y_min=0, y_max=1, increasing='auto')
#                 try:
#                     ir_0.fit(scores_aug_0, y_aug_0)
#                     p0_results_fold[j] = ir_0.predict([test_score])[0]
#                 except Exception as e:
#                     logging.warning(f"Isotonic fit for p0 failed fold {i}, sample {j}. Score: {test_score}. Error: {e}. Setting p0=0.5")
#                     p0_results_fold[j] = 0.5 # Fallback

#                 # Calculate p1 (assuming test label is 1)
#                 scores_aug_1 = np.append(cal_scores_fold, test_score)
#                 y_aug_1 = np.append(cal_y_fold, 1)
#                 ir_1 = IsotonicRegression(out_of_bounds='clip', y_min=0, y_max=1, increasing='auto')
#                 try:
#                     ir_1.fit(scores_aug_1, y_aug_1)
#                     p1_results_fold[j] = ir_1.predict([test_score])[0]
#                 except Exception as e:
#                      logging.warning(f"Isotonic fit for p1 failed fold {i}, sample {j}. Score: {test_score}. Error: {e}. Setting p1=0.5")
#                      p1_results_fold[j] = 0.5 # Fallback

#             # Store results for this fold
#             p0p1_test_folds[i, :, 0] = p0_results_fold
#             p0p1_test_folds[i, :, 1] = p1_results_fold

#             logging.debug(f"Fold {i} p0 sample: {p0_results_fold[:5]}")
#             logging.debug(f"Fold {i} p1 sample: {p1_results_fold[:5]}")


#         # 3. Aggregate p0, p1 probability bounds across folds
#         # Reshape for aggregation: (n_test_samples, n_splits)
#         p0_stack = p0p1_test_folds[:, :, 0].T
#         p1_stack = p0p1_test_folds[:, :, 1].T
#         logging.debug(f"p0_stack shape: {p0_stack.shape}, p1_stack shape: {p1_stack.shape}")

#         # 4. Calculate final calibrated probability based on loss function
#         p_prime = np.zeros((n_test_samples, 2))
#         if self.loss_ == 'log':
#             geo_mean_1_minus_p0 = geo_mean(1 - p0_stack)
#             geo_mean_p1 = geo_mean(p1_stack)
#             denominator = geo_mean_1_minus_p0 + geo_mean_p1
#             valid_denom = denominator > 1e-9
#             p_prime[valid_denom, 1] = geo_mean_p1[valid_denom] / denominator[valid_denom]
#             p_prime[~valid_denom, 1] = 0.5 # Default if denominator is zero (p0=1, p1=0 for all folds)
#         elif self.loss_ == 'brier':
#              # Avoid potential nan if a fold had no results (though handled above)
#              mean_p1 = np.nanmean(p1_stack, axis=1)
#              mean_p0_sq = np.nanmean(p0_stack**2, axis=1)
#              mean_p1_sq = np.nanmean(p1_stack**2, axis=1)
#              p_prime[:, 1] = mean_p1 + 0.5 * mean_p0_sq - 0.5 * mean_p1_sq
#              # Replace potential NaN with 0.5 if all folds failed for a sample
#              p_prime[np.isnan(p_prime[:, 1]), 1] = 0.5
#         else:
#             raise ValueError(f"Unsupported loss function for CVAP aggregation: {self.loss_}")

#         # Ensure probabilities are valid
#         p_prime[:, 1] = np.clip(p_prime[:, 1], 0, 1)
#         p_prime[:, 0] = 1 - p_prime[:, 1]

#         logging.debug(f"Final calibrated probs sample: {p_prime[:5, 1]}")
#         return p_prime

#     # --- predict and _more_tags remain unchanged ---
#     def predict(self, X):
#         """Predicts class labels."""
#         proba = self.predict_proba(X)
#         return self.classes_[np.argmax(proba, axis=1)]

#     def _more_tags(self):
#         return {'binary_only': True}

#     def fit(self, X, y):
#          # Dummy fit method needed for sklearn compatibility if used directly
#          check_X_y(X, y, accept_sparse=True, force_all_finite=False)
#          if not hasattr(self, 'final_base_estimator_'):
#               raise NotFittedError("Cannot call fit on _CVAPPredictor directly. It's fitted internally.")
#          self.classes_ = np.unique(y)
#          if len(self.classes_) != 2:
#               raise ValueError("CVAP Predictor internal error: Expected 2 classes.")
#          return self


# # --- Main Unified Function ---
# def train_calibrate_model(base_estimator_class, best_params, X_train, y_train,
#                           calibration_method='platt', # 'platt', 'isotonic', 'cvap'
#                           n_splits=5, random_state=None,
#                           # CVAP specific params
#                           score_method='decision_function', # 'decision_function', 'predict_proba', 'raw_margin_xgb', 'raw_score_lgbm'
#                           cvap_loss='log', # 'log' or 'brier' for aggregation
#                           cvap_precision=None, # Precision for rounding scores in VA fit
#                           # Platt/Isotonic specific params (CalibratedClassifierCV handles score method)
#                          ):
#     """
#     Trains a base estimator and calibrates its outputs using the specified method.

#     For Platt/Isotonic, uses sklearn's CalibratedClassifierCV.
#     For CVAP, uses the provided VennAbers implementation with k-fold CV,
#     operating on raw scores specified by `score_method`.

#     Args:
#         base_estimator_class: Class of the base estimator (e.g., SVC, RandomForestClassifier).
#         best_params (dict): Dictionary of best hyperparameters for the base estimator.
#         X_train (pd.DataFrame or np.ndarray): Training features.
#         y_train (pd.Series or np.ndarray): Training labels (binary 0/1).
#         calibration_method (str): 'platt', 'isotonic', or 'cvap'.
#         n_splits (int): Number of folds for cross-validation (used by all methods).
#         random_state (int): Random state for reproducibility.
#         score_method (str): Method to get scores for CVAP calibration.
#                             Options: 'decision_function', 'predict_proba',
#                                      'raw_margin_xgb', 'raw_score_lgbm'.
#                             Ignored for 'platt' and 'isotonic' methods.
#         cvap_loss (str): Aggregation loss for CVAP ('log' or 'brier').
#         cvap_precision (int, optional): Precision for rounding scores in CVAP's VennAbers fit.

#     Returns:
#         tuple: (fitted_base_estimator, fitted_calibrator_object)
#                - fitted_base_estimator: The base estimator trained on the full training data.
#                - fitted_calibrator_object: An object with a `predict_proba` method
#                  that returns calibrated probabilities.
#                  For Platt/Isotonic, this is a CalibratedClassifierCV instance.
#                  For CVAP, this is the custom _CVAPPredictor instance.
#                Returns (None, None) if an error occurs.
#     """
#     logging.info(f"--- Starting Model Training & Calibration ({calibration_method}) ---")

#     # Input Type Handling
#     if isinstance(X_train, pd.DataFrame):
#         X_train_np = X_train.values
#     else:
#         X_train_np = np.asarray(X_train)
#     if isinstance(y_train, pd.Series):
#         y_train_np = y_train.values
#     else:
#         y_train_np = np.asarray(y_train)
#     if len(np.unique(y_train_np)) != 2:
#         raise ValueError(f"This function currently supports only binary classification. Found labels: {np.unique(y_train_np)}")

#     # Instantiate the base estimator
#     try:
#         # Special handling for SVC probability if needed by Platt/Isotonic *internal* logic
#         # CalibratedClassifierCV might internally require probability=True for some base estimators
#         # even if we don't explicitly use predict_proba. Let's ensure it's set if method needs it.
#         current_params = best_params.copy()
#         is_svc = issubclass(base_estimator_class, SVC)

#         # Check if the chosen calibration method *might* rely on predict_proba internally
#         needs_proba = False
#         if calibration_method in ['platt', 'isotonic']:
#              # CalibratedClassifierCV's default ('auto') tries decision_function first,
#              # but might fall back to predict_proba. Safest to enable for SVC.
#              if is_svc: needs_proba = True
#         elif calibration_method == 'cvap' and score_method == 'predict_proba':
#              needs_proba = True

#         if needs_proba and is_svc and not current_params.get('probability', False):
#              logging.warning(f"Setting probability=True for SVC as required by calibration method '{calibration_method}' or score_method '{score_method}'.")
#              current_params['probability'] = True

#         base_estimator = base_estimator_class(**current_params)

#     except Exception as e:
#         logging.error(f"Error instantiating base estimator {base_estimator_class.__name__} with params {current_params}: {e}", exc_info=True)
#         return None, None


#     # --- Calibration Method Logic ---
#     try:
#         if calibration_method in ['platt', 'isotonic']:
#             logging.info(f"Using CalibratedClassifierCV with method='{'sigmoid' if calibration_method == 'platt' else 'isotonic'}'")
#             logging.info(f"(Ignoring 'score_method' parameter '{score_method}' for CalibratedClassifierCV)")

#             cv_strategy = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
#             # Pass the potentially modified estimator (e.g., SVC with probability=True)
#             calibrator = CalibratedClassifierCV(
#                 base_estimator, # Use the instance created above
#                 method='sigmoid' if calibration_method == 'platt' else 'isotonic',
#                 cv=cv_strategy,
#                 n_jobs=-1
#             )

#             logging.info("Fitting CalibratedClassifierCV...")
#             calibrator.fit(X_train_np, y_train_np)
#             logging.info("CalibratedClassifierCV fitting complete.")

#             # Extract final estimator (same logic as before)
#             if isinstance(calibrator.base_estimator_, list):
#                  logging.warning("CalibratedClassifierCV returned a list of base estimators. Returning the first one.")
#                  final_base_estimator = calibrator.base_estimator_[0]
#             else:
#                  final_base_estimator = calibrator.base_estimator_

#             logging.info(f"--- {calibration_method.capitalize()} Scaling Training Complete ---")
#             return final_base_estimator, calibrator

#         elif calibration_method == 'cvap':
#             logging.info(f"Using Cross Venn-Abers Prediction (CVAP) with score_method='{score_method}'")

#             # Check if the chosen score_method is available on a temp instance *before* CV
#             # try:
#             #      temp_estimator = clone(base_estimator)
#             #      # Fit on a tiny subset just to enable score method call
#             #      temp_estimator.fit(X_train_np[:2], y_train_np[:2])
#             #      _ = get_scores(temp_estimator, X_train_np[:2], score_method)
#             #      logging.info(f"Score method '{score_method}' seems available on {base_estimator_class.__name__}.")
#             # except (AttributeError, TypeError, ValueError) as e:
#             #      logging.error(f"Score method '{score_method}' not available or failed for estimator {base_estimator_class.__name__}: {e}", exc_info=True)
#             #      raise AttributeError(f"Estimator {base_estimator_class.__name__} does not support the required score_method '{score_method}'. Error: {e}") from e


#             # 1. Get out-of-fold *raw scores* using k-fold CV
#             logging.info(f"Performing {n_splits}-fold CV to get out-of-fold scores for CVAP...")
#             cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

#             oof_scores_list = [] # List to store raw score arrays (n_fold_samples,)
#             oof_y_cal_list = [] # List to store y_cal arrays

#             for fold, (train_idx, val_idx) in enumerate(tqdm(cv.split(X_train_np, y_train_np), total=n_splits, desc=f"CVAP OOF Scores ({score_method})", leave=False)):
#                 X_train_fold, X_val_fold = X_train_np[train_idx], X_train_np[val_idx]
#                 y_train_fold, y_val_fold = y_train_np[train_idx], y_train_np[val_idx]

#                 estimator_fold = clone(base_estimator)
#                 estimator_fold.fit(X_train_fold, y_train_fold)

#                 # Get RAW scores for the validation set using the specified method
#                 scores_fold = get_scores(estimator_fold, X_val_fold, score_method)
#                 if scores_fold.ndim != 1 or len(scores_fold) != len(y_val_fold):
#                      raise ValueError(f"Fold {fold} get_scores returned unexpected shape {scores_fold.shape} or length, expected ({len(y_val_fold)},)")

#                 oof_scores_list.append(scores_fold)
#                 oof_y_cal_list.append(y_val_fold)

#             logging.info("Out-of-fold scores collected.")

#             # 2. Train the final base model on the entire training set
#             logging.info("Training final base model on full training data...")
#             final_base_estimator = clone(base_estimator)
#             final_base_estimator.fit(X_train_np, y_train_np)
#             logging.info("Final base model trained.")

#             # 3. Create the CVAP Predictor object using raw scores
#             cvap_predictor = _CVAPPredictor(
#                 final_base_estimator=final_base_estimator,
#                 oof_scores_list=oof_scores_list, # Pass raw scores
#                 oof_y_cal_list=oof_y_cal_list,
#                 score_method=score_method,      # Pass score method
#                 precision=cvap_precision,
#                 loss=cvap_loss
#             )
#             logging.info("CVAP Predictor object created.")

#             logging.info("--- CVAP Training Complete ---")
#             return final_base_estimator, cvap_predictor # Return the predictor object

#         else:
#             raise ValueError(f"Unknown calibration_method: {calibration_method}. Choose 'platt', 'isotonic', or 'cvap'.")

#     except Exception as e:
#         logging.error(f"Error during {calibration_method} calibration: {e}", exc_info=True)
#         return None, None

### Mondrian Inductive Conformal Prediction

In [13]:
# Option 1: Define the helper function
def probs_to_alphas(p_mat: np.ndarray) -> np.ndarray:
    """
    Transform calibrated probability matrix
    (n_samples, n_classes) → non-conformity scores α
    using  α = 1 − p.
    """
    if not isinstance(p_mat, np.ndarray) or p_mat.ndim != 2:
         raise ValueError("Input must be a 2D numpy probability matrix.")
    return 1.0 - p_mat

# Option 2: If you imported `margin`(from crepes.extras import margin), you don't strictly need probs_to_alphas,
#           you can call margin(probs_cal) and margin(probs_test) later.
#           We'll use the defined function probs_to_alphas below for clarity.


# Add the Mondrian ICP wrapper function. First the fit function that uses the calibration split.
def fit_mondrian_classifier(probs_cal, bins_cal=None):
    """
    Fits a crepes ConformalClassifier using calibration probabilities.

    Args:
        probs_cal (np.ndarray): Calibrated probabilities (n_cal, n_classes).
        bins_cal (np.ndarray, optional): Mondrian bins for calibration set. Defaults to None (standard ICP).

    Returns:
        ConformalClassifier: The fitted crepes classifier object, or None if fitting fails.
    """
    logging.info("--- Fitting Mondrian Conformal Classifier ---")
    if probs_cal is None or len(probs_cal) == 0:
        logging.error("Calibration probabilities are empty or None. Cannot fit Mondrian classifier.")
        return None
    try:
        alphas_cal = probs_to_alphas(probs_cal) # Or use margin(probs_cal)
        cc = ConformalClassifier()
        cc.fit(alphas_cal, bins=bins_cal)
        logging.info("--- Mondrian Conformal Classifier Fitted ---")
        return cc
    except Exception as e:
        logging.error(f"Error fitting Mondrian classifier: {e}", exc_info=True)
        return None
    
# The function to evaluate MICP in the test split
def evaluate_mondrian_prediction(fitted_cc, probs_test, y_test_true, bins_test=None, alpha=0.1):
    """
    Evaluates the fitted Mondrian classifier on the test set, including class-specific coverage.

    Args:
        fitted_cc (ConformalClassifier): The pre-fitted crepes classifier object.
        probs_test (np.ndarray): Calibrated probabilities for test set (n_test, n_classes).
        y_test_true (np.ndarray): True labels for the test set.
        bins_test (np.ndarray, optional): Mondrian bins for test set. Defaults to None.
        alpha (float): The significance level (1 - confidence).

    Returns:
        tuple: (coverage, avg_set_size, prediction_sets, class_coverage_dict) or (None, None, None, None) if evaluation fails.
    """
    logging.info(f"--- Evaluating Mondrian Prediction (alpha={alpha}) ---")
    if fitted_cc is None:
        logging.error("Fitted classifier is None. Cannot evaluate.")
        return None, None, None, None
    if probs_test is None or len(probs_test) == 0 or y_test_true is None or len(y_test_true) == 0:
        logging.warning("Test probabilities or labels are empty/None. Skipping evaluation.")
        # Return values indicating skipped evaluation but not necessarily an error state
        return 0.0, 0.0, np.array([[]]), {}

    try:
        alphas_test = probs_to_alphas(probs_test) # Or use margin(probs_test)
        pred_sets = fitted_cc.predict_set(alphas_test,
                                          bins=bins_test,
                                          confidence=1 - alpha) # boolean array (n_test, n_classes)

        # Evaluate Coverage and Size
        y_test_true_np = np.asarray(y_test_true) # Ensure numpy array
        n_test = len(y_test_true_np)
        if n_test == 0:
            return 0.0, 0.0, np.array([[]]), {}

        # Check if true label is in the non-zero indices (predicted classes) of the set row
        contains = np.array([y_test_true_np[i] in np.where(pred_sets[i])[0] for i in range(n_test)])

        coverage = contains.mean() if n_test > 0 else 0.0
        avg_size = pred_sets.sum(axis=1).mean() if n_test > 0 else 0.0

        # Class-specific coverage
        class_coverage_dict = {}
        unique_classes = np.unique(y_test_true_np)
        for cls in unique_classes:
            idx = np.where(y_test_true_np == cls)[0]
            if len(idx) == 0:
                class_coverage = np.nan
            else:
                class_coverage = contains[idx].mean()
            class_coverage_dict[cls] = class_coverage
            logging.info(f"Mondrian CP Coverage for class {cls}: {class_coverage:.4f}")

        # Log a warning if any conformal set is empty
        empty_sets = np.where(pred_sets.sum(axis=1) == 0)[0]
        n_empty = len(empty_sets)
        if n_empty > 0:
            logging.warning(f"{n_empty} conformal prediction sets are empty out of {n_test} samples.")

        logging.info(f"Mondrian CP Coverage: {coverage:.4f}")
        logging.info(f"Mondrian CP Avg Set Size: {avg_size:.4f}")
        logging.info("--- Mondrian Prediction Evaluation Complete ---")

        return coverage, avg_size, pred_sets, class_coverage_dict

    except Exception as e:
        logging.error(f"Error during Mondrian prediction evaluation: {e}", exc_info=True)
        return None, None, None, None

### Metrics

In [14]:
# --- Define Comprehensive Metrics ---

def calculate_metrics(y_true, y_pred, y_proba, model_name="Model", conf_mat = False):
    """
    Calculates a comprehensive set of classification metrics.

    Args:
        y_true (array-like): Ground truth labels.
        y_pred (array-like): Predicted labels.
        y_proba (array-like): Predicted probabilities for the positive class (class 1).
        model_name (str): Name of the model for logging.

    Returns:
        dict: A dictionary containing calculated metrics.
              Returns None if input arrays are empty or invalid.
    """
    if len(y_true) == 0 or len(y_pred) == 0 or len(y_proba) == 0:
        logging.error(f"[{model_name}] Empty input arrays provided for metric calculation.")
        return None
    if len(y_true) != len(y_pred) or len(y_true) != len(y_proba):
        logging.error(f"[{model_name}] Mismatched lengths in input arrays for metric calculation.")
        return None

    metrics = {}

    # --- Threshold-based Metrics (using y_pred) ---
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary', zero_division=0)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['precision'] = precision
    metrics['recall_tpr'] = recall # True Positive Rate (Sensitivity)
    metrics['f1_score'] = f1

    # Specificity (True Negative Rate)
    metrics['specificity_tnr'] = tn / (tn + fp) if (tn + fp) > 0 else 0.0

    # Geometric Mean
    metrics['g_mean'] = np.sqrt(metrics['recall_tpr'] * metrics['specificity_tnr'])

    # Confusion Matrix
    metrics['confusion_matrix'] = {'tn': tn, 'fp': fp, 'fn': fn, 'tp': tp}

    # --- Ranking/Probabilistic Metrics (using y_proba) ---
    try:
        metrics['roc_auc'] = roc_auc_score(y_true, y_proba)
    except ValueError as e:
        logging.warning(f"[{model_name}] Could not calculate ROC AUC: {e}. Setting to 0.0.")
        metrics['roc_auc'] = 0.0 # Handle cases with only one class present

    # PR AUC
    pr_curve_precision, pr_curve_recall, _ = precision_recall_curve(y_true, y_proba)
    metrics['pr_auc'] = auc(pr_curve_recall, pr_curve_precision) # Note order: recall is x, precision is y

    # Brier Score
    metrics['brier_score'] = brier_score_loss(y_true, y_proba)

    logging.info(f"--- {model_name} Metrics ---")
    logging.info(f"Accuracy: {metrics['accuracy']:.4f}")
    logging.info(f"Precision: {metrics['precision']:.4f}")
    logging.info(f"Recall (TPR): {metrics['recall_tpr']:.4f}")
    logging.info(f"Specificity (TNR): {metrics['specificity_tnr']:.4f}")
    logging.info(f"F1-Score: {metrics['f1_score']:.4f}")
    logging.info(f"G-Mean: {metrics['g_mean']:.4f}")
    logging.info(f"ROC AUC: {metrics['roc_auc']:.4f}")
    logging.info(f"PR AUC: {metrics['pr_auc']:.4f}")
    logging.info(f"Brier Score: {metrics['brier_score']:.4f}")
    logging.info(f"Confusion Matrix (TN, FP, FN, TP): ({tn}, {fp}, {fn}, {tp})")

    # Optional: Plot Confusion Matrix
    if conf_mat:
        plt.figure(figsize=(6, 4))
        sns.heatmap([[tn, fp], [fn, tp]], annot=True, fmt='d', cmap='Blues',
                    xticklabels=['Predicted Galaxy (0)', 'Predicted Star (1)'],
                    yticklabels=['Actual Galaxy (0)', 'Actual Star (1)'])
        plt.title(f'{model_name} Confusion Matrix')
        plt.ylabel('Actual Label')
        plt.xlabel('Predicted Label')
        cm_filename = os.path.join(MODEL_DIR, f"{model_name}_confusion_matrix_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png")
        plt.savefig(cm_filename)
        plt.close()
        logging.info(f"Confusion matrix plot saved to {cm_filename}")


    return metrics

### Feature Scaling

In [15]:
# --- Feature Scaling ---
# Important for SVM, not used for the other models.
# Fit scaler ONLY on training data, then transform all sets.

# Check if training set and other datasets are non-empty before scaling
def apply_feature_scaling(
    X_train, X_val, X_test, X_cal,
    TRAIN_SIZE, VAL_SIZE, TEST_SIZE, CAL_SIZE,
    MODEL_DIR,
    save_scaler=True,  # New param: whether to save the scaler to disk
    group_name=None    # New param: optional, for unique scaler filename per group
):
    """
    Applies StandardScaler to the provided datasets if training data is available.
    Optionally saves the fitted scaler to disk.
    Returns: X_train_scaled, X_val_scaled, X_test_scaled, X_cal_scaled, scaler (or None)
    """

    scaler = None
    if len(X_train) > 0 and TRAIN_SIZE > 0:
        logging.info("Applying StandardScaler to features...")
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
    else:
        logging.info("Empty training set, NOT able to apply StandardScaler!")
        X_train_scaled = X_train

    if len(X_val) > 0 and VAL_SIZE > 0 and scaler is not None:
        X_val_scaled = scaler.transform(X_val)
    else:
        X_val_scaled = X_val

    if len(X_test) > 0 and TEST_SIZE > 0 and scaler is not None:
        X_test_scaled = scaler.transform(X_test)
    else:
        X_test_scaled = X_test

    if len(X_cal) > 0 and CAL_SIZE > 0 and scaler is not None:
        X_cal_scaled = scaler.transform(X_cal)
    else:
        X_cal_scaled = X_cal

    # Save the scaler if it was fitted and requested
    if scaler is not None and save_scaler:
        if group_name is not None:
            scaler_filename = os.path.join(
                MODEL_DIR, f"scaler_{group_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.joblib"
            )
        else:
            scaler_filename = os.path.join(
                MODEL_DIR, f"scaler_{datetime.now().strftime('%Y%m%d_%H%M%S')}.joblib"
            )
        joblib.dump(scaler, scaler_filename)
        logging.info(f"Scaler saved to {scaler_filename}")

    logging.info("Feature scaling complete.")
    return X_train_scaled, X_val_scaled, X_test_scaled, X_cal_scaled, scaler

# Example usage:
# X_train_scaled, X_val_scaled, X_test_scaled, X_cal_scaled, scaler = apply_feature_scaling(
#     X_train, X_val, X_test, X_cal, TRAIN_SIZE, VAL_SIZE, TEST_SIZE, CAL_SIZE, MODEL_DIR,
#     save_scaler=True, # Ensure scaler is saved
#     group_name=group_name # Pass group name for potentially unique scaler filename
# )

## 3. Model Workflows

In [16]:
all_results = {} # Dictionary to store metrics for each model

ALPHA = 0.8 #Coverage, mean size set will tend to 1 + ALPHA

CALIBRATOR = 'cvap'

### 3.1 SVM

In [17]:
# HPO Settings for SVM (using data fraction)
MAX_RESOURCE_SVM = 1.0  # Max data fraction
MIN_RESOURCE_SVM = 0.1  # Min data fraction (adjust based on minority class size)
ETA_SVM = 3
RESOURCE_TYPE_SVM = 'data_fraction'
model_name_svm = "SVM"

In [18]:
#¡MODIFICADO PARA PRUEBA RÁPIDA!
MAX_RESOURCE_SVM = 1.0  # Se mantiene en 1.0 para usar todos los datos al final
MIN_RESOURCE_SVM = 0.5  # Aumentado para reducir s_max (menos brackets/configs)
ETA_SVM = 4             # Aumentado para eliminar configuraciones más rápido

In [19]:
def svm_hpo(
    X_train_scaled, y_train, X_val_scaled, y_val,
    MAX_RESOURCE_SVM,
    MIN_RESOURCE_SVM,
    ETA_SVM,
    RESOURCE_TYPE_SVM,
    RANDOM_SEED,
    SVC,
    loguniform,
    hyperband_hpo,
    f1_score,
    logging,
    model_name_svm
):
    """
    Run Hyperband HPO for SVM and return best parameters and best score.
    """
    # --- SVM: Define Search Space and HPO Params ---
    param_space_svm = {
        'C': loguniform(1e-2, 1e3),
        'gamma': loguniform(1e-4, 1e1),
        'kernel': ['rbf'], # Example: Fixed RBF kernel
        # 'kernel': ['rbf', 'linear'], # Example: If you want to search kernels
        # class_weight is added automatically inside hyperband_hpo
        # random_state is added automatically inside hyperband_hpo
    }

    logging.info(f"--- [{model_name_svm}] Running Hyperband HPO ---")
    best_params_svm, best_score_hpo_svm = hyperband_hpo(
        model_class=SVC,
        param_space=param_space_svm,
        X_train=X_train_scaled, # USE SCALED DATA
        y_train=y_train,
        X_val=X_val_scaled,     # USE SCALED DATA
        y_val=y_val,
        max_resource=MAX_RESOURCE_SVM,
        eta=ETA_SVM,
        resource_type=RESOURCE_TYPE_SVM,
        min_resource=MIN_RESOURCE_SVM,
        scoring_func=f1_score,
        random_state=RANDOM_SEED
    )
    return best_params_svm, best_score_hpo_svm

def svm_workflow(
    X_train_scaled, y_train, X_val_scaled, y_val,
    X_cal_scaled, y_cal, X_test_scaled, y_test,
    MAX_RESOURCE_SVM, MIN_RESOURCE_SVM, ETA_SVM, RESOURCE_TYPE_SVM,
    model_name_svm, CALIBRATOR, ALPHA, RANDOM_SEED,
    SVC=SVC,
    loguniform=loguniform,
    hyperband_hpo=hyperband_hpo,
    f1_score=f1_score,
    logging=logging,
    best_params_svm=None,  # <-- Allow passing best_params directly
    best_score_hpo_svm=None
):
    """
    Complete SVM workflow: HPO, calibration, Mondrian ICP, and evaluation.

    This function performs the following steps for SVM:
      1. Defines the hyperparameter search space.
      2. Runs Hyperband HPO using the provided data splits.
      3. Trains the final SVM model and calibrates it (Platt/Isotonic/CVAP).
      4. Fits Mondrian Inductive Conformal Predictor (ICP) on the calibration set.
      5. Evaluates the model and Mondrian ICP on the test set.
      6. Stores results in the provided all_results dictionary.

    All logic is preserved from the original notebook cell.

    Args:
        X_train_scaled, y_train: Training data (scaled, labels).
        X_val_scaled, y_val: Validation data (scaled, labels).
        X_cal_scaled, y_cal: Calibration data (scaled, labels).
        X_test_scaled, y_test: Test data (scaled, labels).
        
        MAX_RESOURCE_SVM, MIN_RESOURCE_SVM, ETA_SVM, RESOURCE_TYPE_SVM: HPO settings.
        model_name_svm: Name for the model (e.g., "SVM").
        CALIBRATOR: Calibration method ('platt', 'isotonic', 'cvap').
        ALPHA: Conformal prediction significance level.
        RANDOM_SEED: Random seed for reproducibility.
        best_params_svm: (Optional) If provided, skips HPO and uses these params
        best_score_hpo_svm: (Optional) If provided, uses this as HPO score

    Returns:
        None. Results are stored in all_results[model_name_svm].
    """
    import time
    import numpy as np
    from scipy.stats import loguniform
    import logging

    logging.info(f"\n\n===== Starting Workflow for {model_name_svm} =====")
    timestamp_svm = datetime.now().strftime("%Y%m%d_%H%M%S")

    if best_params_svm is None:
        hpo_start_time_svm = time.time()
        best_params_svm, best_score_hpo_svm = svm_hpo(
            X_train_scaled, y_train, X_val_scaled, y_val,
            MAX_RESOURCE_SVM,
            MIN_RESOURCE_SVM,
            ETA_SVM,
            RESOURCE_TYPE_SVM,
            RANDOM_SEED,
            SVC,
            loguniform,
            hyperband_hpo,
            f1_score,
            logging,
            model_name_svm
        )
        hpo_duration_svm = time.time() - hpo_start_time_svm
        logging.info(f"--- [{model_name_svm}] HPO finished in {hpo_duration_svm:.2f} seconds ---")
    else:
        hpo_duration_svm = 0.0
        logging.info(f"--- [{model_name_svm}] Using provided hyperparameters, skipping HPO. ---")

    # --- 1.3 SVM: Train Final Model & Calibration (using Full Training Set) ---
    fitted_svm_base = None
    calibrator_svm = None
    if best_params_svm:
        logging.info(f"--- [{model_name_svm}] Training final model and Platt scaler ---")
        calibration_start_time_svm = time.time()
        # Ensure necessary fixed parameters are present for the final fit
        best_params_svm['random_state'] = RANDOM_SEED
        if 'class_weight' not in best_params_svm: best_params_svm['class_weight'] = 'balanced'
        if 'probability' in best_params_svm: del best_params_svm['probability'] # Use decision_function

        fitted_svm_base, calibrator_svm = train_calibrate_model(
            base_estimator_class=SVC, # Pass the class
            best_params=best_params_svm,
            X_train=X_train_scaled,   # Use scaled training data
            y_train=y_train,          # Use original y_train for CV indexing
            calibration_method=CALIBRATOR,# Use choose calibration method
            n_splits=5,               # Folds
            random_state=RANDOM_SEED,
            score_method='decision_function', # SVC supports decision_function
            cvap_loss='log',          # Use log-loss aggregation for CVAP
            cvap_precision=None       # Default precision
        )
        calibration_duration_svm = time.time() - calibration_start_time_svm
        if fitted_svm_base and calibrator_svm:
            logging.info(f"--- [{model_name_svm}] Calibration with {CALIBRATOR} finished in {calibration_duration_svm:.2f} seconds ---")
            # Optional: Save models
            # joblib.dump(...)
        else:
            logging.error(f"[{model_name_svm}] Failed to train base model or the calibration.")
    else:
        logging.warning(f"[{model_name_svm}] HPO did not find best parameters. Skipping subsequent steps.")

    # --- Section 1.4: Mondrian ICP Calibration ---
    fitted_cc_svm = None # Initialize classifier variable
    if fitted_svm_base and calibrator_svm:
        if not y_cal.empty:
            logging.info(f"--- [{model_name_svm}] Calibrating Mondrian Conformal Prediction ---")
            mcp_cal_start_time_svm = time.time()

            # Calculate probabilities needed for crepes on Calibration set
            probs_cal_svm = calibrator_svm.predict_proba(X_cal_scaled) # (n_cal, 2)

            # Define Mondrian Bins (Class-conditional example)
            bins_cal_svm = y_cal.values # Assumes y_cal is pd.Series/np.array

            # Fit the Mondrian classifier
            fitted_cc_svm = fit_mondrian_classifier(probs_cal_svm, bins_cal=bins_cal_svm)

            mcp_cal_duration_svm = time.time() - mcp_cal_start_time_svm
            if fitted_cc_svm:
                logging.info(f"--- [{model_name_svm}] Mondrian CP calibration finished in {mcp_cal_duration_svm:.2f} seconds ---")
                # Optional: Save the fitted_cc_svm object
                # cc_filename = ...
                # joblib.dump(...)
            else:
                logging.error(f"[{model_name_svm}] Failed to fit Mondrian classifier.")
        else:
            logging.warning(f"[{model_name_svm}] Calibration set is empty. Skipping Mondrian ICP calibration.")
    else:
        logging.warning(f"[{model_name_svm}] Base model or Calibrator not available. Skipping Mondrian ICP calibration.")

    # --- Section 1.5: Final Evaluation ---
    if fitted_svm_base and calibrator_svm: # Check base model availability
        logging.info(f"--- [{model_name_svm}] Final Evaluation on Test Set ---")
        eval_start_time_svm = time.time()

        # --- Calculate Base Metrics ---
        probs_test_svm_full = calibrator_svm.predict_proba(X_test_scaled) # (n_test, 2)

        # Derive probabilities and predictions from the calibrated output
        y_proba_test_svm = probs_test_svm_full[:, 1] # Prob positive class
        y_pred_test_svm = (y_proba_test_svm >= 0.5).astype(int) # Threshold calibrated probs
        metrics_svm = calculate_metrics(y_test, y_pred_test_svm, y_proba_test_svm, model_name=model_name_svm)

        # --- Mondrian Conformal Prediction Evaluation ---
        cp_coverage_mond_svm, cp_avg_set_size_mond_svm = None, None # Initialize results
        class_coverage_dict = None

        if fitted_cc_svm is not None: # Check if Mondrian classifier was fitted successfully
            mcp_eval_start_time_svm = time.time()
            # Define Mondrian Bins for test set (Class-conditional example)
            bins_test_svm = y_test.values if not y_test.empty else np.array([])
            y_test_true_np = y_test.values if not y_test.empty else np.array([])

            # Evaluate the fitted Mondrian classifier
            cp_coverage_mond_svm, cp_avg_set_size_mond_svm, _, class_coverage_dict = evaluate_mondrian_prediction(
                fitted_cc=fitted_cc_svm,          # Pass the fitted classifier
                probs_test=probs_test_svm_full,   # Pass test probabilities (n_test, 2)
                y_test_true=y_test_true_np,       # Pass true test labels
                bins_test=bins_test_svm,          # Pass test bins
                alpha=ALPHA
            )
            mcp_eval_duration_svm = time.time() - mcp_eval_start_time_svm
            logging.info(f"--- [{model_name_svm}] Mondrian CP evaluation finished in {mcp_eval_duration_svm:.2f} seconds ---")
        else:
            logging.warning(f"[{model_name_svm}] Skipping Mondrian CP evaluation: Classifier not fitted.")

        eval_duration_svm = time.time() - eval_start_time_svm # Total eval time
        logging.info(f"--- [{model_name_svm}] Total Evaluation finished in {eval_duration_svm:.2f} seconds ---")

        # --- Store results (Same as before, using the new variables) ---
        all_results = {model_name_svm : {
            'metrics': metrics_svm,
            'cp_coverage_mond': cp_coverage_mond_svm,           # Store Mondrian coverage
            'cp_class_coverage_dict': class_coverage_dict,
            'cp_avg_set_size_mond': cp_avg_set_size_mond_svm,     # Store Mondrian avg set size
            'best_hpo_params': best_params_svm,
            'hpo_f1_score': best_score_hpo_svm,
            'hpo_duration_s': hpo_duration_svm,
        }}
    else:
        logging.warning(f"[{model_name_svm}] Skipping final evaluation (Base model or Calibrator not available).")

    # Keep the final logging line:
    logging.info(f"===== Finished Workflow for {model_name_svm} =====")
    return all_results

### 3.2 CART

##### Why Log Loss with CVAP Can Be Overly Conservative (and How Brier Helps)

When using **Cross Venn-Abers Predictors (CVAP)** with **log-loss aggregation**, the final probability is chosen to **minimize worst-case regret** under log-loss. This leads to **very conservative predictions**, especially when the individual calibration folds assign wide or low-confidence probability intervals. In imbalanced datasets or weak classifiers (like shallow decision trees), this can push all probabilities close to 0 or 1 — often collapsing to always predicting the majority class.

**Brier-score aggregation**, on the other hand, minimizes expected squared error instead of log-loss. This encourages **less extreme probabilities**, making the final predictions more balanced and spread out. As a result, CVAP with Brier aggregation is less likely to ignore the minority class, improving its practical usability when log-loss calibration is too cautious.


In [20]:
# HPO Settings for CART (using data fraction)
MAX_RESOURCE_CART = 1.0
MIN_RESOURCE_CART = 0.1 # Can start with smaller fraction for trees
ETA_CART = 3
RESOURCE_TYPE_CART = 'data_fraction'
model_name_cart = "CART"

In [21]:
#¡MODIFICADO PARA PRUEBA RÁPIDA!
MAX_RESOURCE_CART = 1.0
MIN_RESOURCE_CART = 0.5 # Aumentado para reducir s_max
ETA_CART = 4            # Aumentado para eliminar configuraciones más rápido

In [22]:
def cart_hpo(
    X_train, y_train, X_val, y_val,
    MAX_RESOURCE_CART,
    MIN_RESOURCE_CART,
    ETA_CART,
    RESOURCE_TYPE_CART,
    RANDOM_SEED,
    DecisionTreeClassifier,
    randint,
    f1_score,
    hyperband_hpo,
    logging,
    model_name_cart
):
    """
    Run Hyperband HPO for CART (DecisionTreeClassifier) and return best parameters and best score.
    """
    # --- CART: Define Search Space and HPO Params ---
    param_space_cart = {
        'criterion': ['gini', 'entropy'],
        'max_depth': randint(3, 50),
        'min_samples_split': randint(2, 100),
        'min_samples_leaf': randint(1, 50),
        # class_weight added automatically
        # random_state added automatically
    }

    logging.info(f"--- [{model_name_cart}] Running Hyperband HPO ---")
    best_params_cart, best_score_hpo_cart = hyperband_hpo(
        model_class=DecisionTreeClassifier,
        param_space=param_space_cart,
        X_train=X_train, # USE UNSCALED DATA
        y_train=y_train,
        X_val=X_val,     # USE UNSCALED DATA
        y_val=y_val,
        max_resource=MAX_RESOURCE_CART,
        eta=ETA_CART,
        resource_type=RESOURCE_TYPE_CART,
        min_resource=MIN_RESOURCE_CART,
        scoring_func=f1_score,
        random_state=RANDOM_SEED
    )
    return best_params_cart, best_score_hpo_cart

def cart_workflow(
    X_train, y_train, X_val, y_val, X_cal, y_cal, X_test, y_test,
    model_name_cart,
    MAX_RESOURCE_CART,
    MIN_RESOURCE_CART,
    ETA_CART,
    RESOURCE_TYPE_CART,
    RANDOM_SEED,
    CALIBRATOR,
    ALPHA,
    DecisionTreeClassifier,
    randint,
    f1_score,
    hyperband_hpo,
    logging,
    best_params_cart=None,  # <-- Allow passing best_params directly
    best_score_hpo_cart=None
):
    """
    Complete workflow for CART (DecisionTreeClassifier) model:
    - Hyperparameter optimization using Hyperband
    - Final model training and calibration
    - Mondrian Inductive Conformal Prediction calibration
    - Final evaluation on test set (including Mondrian CP evaluation)
    - Results are stored in the provided all_results dictionary

    All logic is preserved exactly as in the original notebook code.

    Args:
        X_train, y_train: Training data (unscaled)
        X_val, y_val: Validation data (unscaled)
        X_cal, y_cal: Calibration data (unscaled)
        X_test, y_test: Test data (unscaled)
        all_results: dict to store results
        model_name_cart: str, name for the model (e.g. "CART")
        MAX_RESOURCE_CART, MIN_RESOURCE_CART, ETA_CART, RESOURCE_TYPE_CART: HPO settings
        RANDOM_SEED: int, random seed for reproducibility
        CALIBRATOR: str, calibration method ('cvap', 'platt', etc.)
        ALPHA: float, significance level for conformal prediction

    Returns:
        None (results are stored in all_results)
    """
    logging.info(f"\n\n===== Starting Workflow for {model_name_cart} =====")
    timestamp_cart = datetime.now().strftime("%Y%m%d_%H%M%S")

    if best_params_cart is None:
        hpo_start_time_cart = time.time()
        best_params_cart, best_score_hpo_cart = cart_hpo(
            X_train, y_train, X_val, y_val,
            MAX_RESOURCE_CART,
            MIN_RESOURCE_CART,
            ETA_CART,
            RESOURCE_TYPE_CART,
            RANDOM_SEED,
            DecisionTreeClassifier,
            randint,
            f1_score,
            hyperband_hpo,
            logging,
            model_name_cart
        )
        hpo_duration_cart = time.time() - hpo_start_time_cart
        logging.info(f"--- [{model_name_cart}] HPO finished in {hpo_duration_cart:.2f} seconds ---")
    else:
        hpo_duration_cart = 0.0
        logging.info(f"--- [{model_name_cart}] Using provided hyperparameters, skipping HPO. ---")

    # --- 2.3 CART: Train Final Model & Calibration (using Full Training Set) ---
    fitted_cart_base = None
    calibrator_cart = None
    if best_params_cart:
        logging.info(f"--- [{model_name_cart}] Training final model and calibrator ---")
        calibration_start_time_cart = time.time()
        # Ensure necessary fixed parameters are present for the final fit
        best_params_cart['random_state'] = RANDOM_SEED
        if 'class_weight' not in best_params_cart: best_params_cart['class_weight'] = 'balanced'

        fitted_cart_base, calibrator_cart = train_calibrate_model(
            base_estimator_class=DecisionTreeClassifier,
            best_params=best_params_cart,
            X_train=X_train,  # Use UNSCALED training data
            y_train=y_train,
            calibration_method=CALIBRATOR,  # Use chosen calibration method (e.g., 'cvap', 'platt', etc.)
            n_splits=5,
            random_state=RANDOM_SEED,
            score_method='predict_proba',  # For CART, use predict_proba
            cvap_loss='brier',  # or 'brier' if desired
            cvap_precision=None  # or set as needed
        )
        calibration_duration_cart = time.time() - calibration_start_time_cart
        if fitted_cart_base and calibrator_cart:
            logging.info(f"--- [{model_name_cart}] Calibration with {CALIBRATOR} finished in {calibration_duration_cart:.2f} seconds ---")
            # Optional: Save models
            # joblib.dump(...)
        else:
            logging.error(f"[{model_name_cart}] Failed to train base model or the calibration.")
    else:
        logging.warning(f"[{model_name_cart}] HPO did not find best parameters. Skipping subsequent steps.")

    # --- 2.4 CART: Mondrian ICP Calibration (using Calibration Set) ---
    fitted_cc_cart = None  # Initialize Mondrian classifier variable
    if fitted_cart_base and calibrator_cart:
        if not y_cal.empty:
            logging.info(f"--- [{model_name_cart}] Calibrating Mondrian Conformal Prediction ---")
            mcp_cal_start_time_cart = time.time()

            # Get calibrated probabilities from calibrator
            calibrated_probs_cal_cart = calibrator_cart.predict_proba(X_cal)  # (n_cal, 2)

            # Define Mondrian Bins (Class-conditional example)
            bins_cal_cart = y_cal.values  # Assumes y_cal is pd.Series/np.array

            # Fit the Mondrian classifier
            fitted_cc_cart = fit_mondrian_classifier(calibrated_probs_cal_cart, bins_cal=bins_cal_cart)

            mcp_cal_duration_cart = time.time() - mcp_cal_start_time_cart
            if fitted_cc_cart:
                logging.info(f"--- [{model_name_cart}] Mondrian CP calibration finished in {mcp_cal_duration_cart:.2f} seconds ---")
                # Optional: Save the fitted_cc_cart object using joblib
                # cc_filename = os.path.join(MODEL_DIR, f"{model_name_cart}_mondrian_classifier_{timestamp_cart}.joblib")
                # joblib.dump(fitted_cc_cart, cc_filename)
                # logging.info(f"Mondrian classifier saved to {cc_filename}")
            else:
                logging.error(f"[{model_name_cart}] Failed to fit Mondrian classifier.")
        else:
            logging.warning(f"[{model_name_cart}] Calibration set is empty. Skipping Mondrian ICP calibration.")
    else:
        logging.warning(f"[{model_name_cart}] Base model or Calibrator not available. Skipping Mondrian ICP calibration.")

    # --- 2.5 CART: Final Evaluation (using Test Set) ---
    if fitted_cart_base and calibrator_cart:
        logging.info(f"--- [{model_name_cart}] Final Evaluation on Test Set ---")
        eval_start_time_cart = time.time()

        # Get calibrated probabilities from calibrator
        calibrated_probs_test_cart = calibrator_cart.predict_proba(X_test)
        y_proba_test_cart = calibrated_probs_test_cart[:, 1]  # Probability of positive class
        logging.info(f"[{model_name_cart}] Calibrated test probabilities for class 1 (sample): {y_proba_test_cart[:20]}")
        logging.info(f"[{model_name_cart}] Calibrated test probabilities range: min={np.min(y_proba_test_cart):.4f}, max={np.max(y_proba_test_cart):.4f}, mean={np.mean(y_proba_test_cart):.4f}")
        y_pred_test_cart = (y_proba_test_cart >= 0.5).astype(int)  # Threshold calibrated probabilities

        metrics_cart = calculate_metrics(y_test, y_pred_test_cart, y_proba_test_cart, model_name=model_name_cart)

        # --- Mondrian Conformal Prediction Evaluation ---
        cp_coverage_mond_cart, cp_avg_set_size_mond_cart = None, None  # Initialize results

        if fitted_cc_cart is not None:  # Check if Mondrian classifier was fitted successfully
            mcp_eval_start_time_cart = time.time()
            # Define Mondrian Bins for test set (Class-conditional example)
            bins_test_cart = y_test.values if not y_test.empty else np.array([])
            y_test_true_np_cart = y_test.values if not y_test.empty else np.array([])

            # Evaluate the fitted Mondrian classifier
            cp_coverage_mond_cart, cp_avg_set_size_mond_cart, _, class_coverage_dict = evaluate_mondrian_prediction(
                fitted_cc=fitted_cc_cart,                # Pass the fitted classifier
                probs_test=calibrated_probs_test_cart,   # Pass test probabilities (n_test, 2)
                y_test_true=y_test_true_np_cart,         # Pass true test labels
                bins_test=bins_test_cart,                # Pass test bins
                alpha=ALPHA
            )
            mcp_eval_duration_cart = time.time() - mcp_eval_start_time_cart
            logging.info(f"--- [{model_name_cart}] Mondrian CP evaluation finished in {mcp_eval_duration_cart:.2f} seconds ---")
        else:
            logging.warning(f"[{model_name_cart}] Skipping Mondrian CP evaluation: Classifier not fitted.")

        eval_duration_cart = time.time() - eval_start_time_cart
        logging.info(f"--- [{model_name_cart}] Total Evaluation finished in {eval_duration_cart:.2f} seconds ---")

        # Store results
        all_results = {model_name_cart : {
            'metrics': metrics_cart,
            'cp_coverage_mond': cp_coverage_mond_cart,
            'cp_class_coverage_dict': class_coverage_dict,
            'cp_avg_set_size_mond': cp_avg_set_size_mond_cart,
            'best_hpo_params': best_params_cart,
            'hpo_f1_score': best_score_hpo_cart,
            'hpo_duration_s': hpo_duration_cart,
        }}
    else:
        logging.warning(f"[{model_name_cart}] Skipping final evaluation (Base model or Calibrator not available).")

    logging.info(f"===== Finished Workflow for {model_name_cart} =====")
    return all_results

### 3.3 Random Forest

In [23]:
# HPO Settings for RF (using iterations)
MAX_RESOURCE_RF = 300  # Max n_estimators
MIN_RESOURCE_RF = 20   # Min n_estimators
ETA_RF = 3
RESOURCE_TYPE_RF = 'iterations'
model_name_rf = "Random_Forest"

In [24]:
#¡MODIFICADO PARA PRUEBA RÁPIDA!
MAX_RESOURCE_RF = 20   # ¡Reducido drásticamente! (Antes 300)
MIN_RESOURCE_RF = 5    # Mínimo bajo pero cercano a max para pocos brackets
ETA_RF = 4             # Aumentado

In [25]:
def rf_hpo(
    X_train, y_train, X_val, y_val,
    MAX_RESOURCE_RF,
    MIN_RESOURCE_RF,
    ETA_RF,
    RESOURCE_TYPE_RF,
    RANDOM_SEED,
    RandomForestClassifier,
    randint,
    f1_score,
    hyperband_hpo,
    logging,
    model_name_rf
):
    """
    Run Hyperband HPO for Random Forest and return best parameters and best score.
    """
    # --- RF: Define Search Space and HPO Params ---
    param_space_rf = {
        # n_estimators is controlled by resource
        'max_depth': randint(5, 50),
        'min_samples_split': randint(2, 50),
        'min_samples_leaf': randint(1, 25),
        'max_features': ['sqrt', 'log2', None], # None means max_features=n_features
        'criterion': ['gini', 'entropy'],
        # class_weight added automatically
        # random_state added automatically
    }

    logging.info(f"--- [{model_name_rf}] Running Hyperband HPO ---")
    best_params_rf, best_score_hpo_rf = hyperband_hpo(
        model_class=RandomForestClassifier,
        param_space=param_space_rf,
        X_train=X_train, # USE UNSCALED DATA
        y_train=y_train,
        X_val=X_val,     # USE UNSCALED DATA
        y_val=y_val,
        max_resource=MAX_RESOURCE_RF,
        eta=ETA_RF,
        resource_type=RESOURCE_TYPE_RF,
        min_resource=MIN_RESOURCE_RF,
        scoring_func=f1_score,
        random_state=RANDOM_SEED
    )
    return best_params_rf, best_score_hpo_rf

def random_forest_workflow(
    X_train, y_train, X_val, y_val, X_cal, y_cal, X_test, y_test,
    model_name_rf,
    MAX_RESOURCE_RF,
    MIN_RESOURCE_RF,
    ETA_RF,
    RESOURCE_TYPE_RF,
    CALIBRATOR,
    ALPHA,
    RANDOM_SEED,
    RandomForestClassifier,
    randint,
    f1_score,
    hyperband_hpo,
    train_calibrate_model,
    fit_mondrian_classifier,
    evaluate_mondrian_prediction,
    calculate_metrics,
    logging,
    np,
    datetime,
    best_params_rf=None,  # <-- Allow passing best_params directly
    best_score_hpo_rf=None
):
    """
    Full Random Forest workflow: HPO, training, calibration, Mondrian conformal prediction, and evaluation.
    This function is a direct conversion of the notebook workflow to a callable function.
    All logic is preserved exactly as in the original notebook cell.

    Parameters
    ----------
    X_train, y_train : Training data and labels
    X_val, y_val     : Validation data and labels
    X_cal, y_cal     : Calibration data and labels
    X_test, y_test   : Test data and labels
    model_name_rf    : Name of the model (string)
    MAX_RESOURCE_RF, MIN_RESOURCE_RF, ETA_RF, RESOURCE_TYPE_RF : HPO params
    CALIBRATOR       : Calibration method (e.g., 'cvap', 'platt', etc.)
    ALPHA            : Conformal prediction significance level
    RANDOM_SEED      : Random seed for reproducibility
    RandomForestClassifier : RF class
    randint          : Distribution for HPO
    f1_score         : F1 scoring function
    hyperband_hpo    : Hyperband HPO function
    train_calibrate_model : Model training and calibration function
    fit_mondrian_classifier : Mondrian conformal classifier fitting function
    evaluate_mondrian_prediction : Mondrian conformal prediction evaluation function
    calculate_metrics : Function to calculate metrics
    logging          : Logging module
    np               : Numpy module
    datetime         : Datetime module
    best_params_rf   : (Optional) If provided, skips HPO and uses these params
    best_score_hpo_rf: (Optional) If provided, uses this as HPO score

    Returns
    -------
    None (results are stored in all_results[model_name_rf])
    """
    logging.info(f"\n\n===== Starting Workflow for {model_name_rf} =====")
    timestamp_rf = datetime.now().strftime("%Y%m%d_%H%M%S")

    if best_params_rf is None:
        hpo_start_time_rf = time.time()
        best_params_rf, best_score_hpo_rf = rf_hpo(
            X_train, y_train, X_val, y_val,
            MAX_RESOURCE_RF,
            MIN_RESOURCE_RF,
            ETA_RF,
            RESOURCE_TYPE_RF,
            RANDOM_SEED,
            RandomForestClassifier,
            randint,
            f1_score,
            hyperband_hpo,
            logging,
            model_name_rf
        )
        hpo_duration_rf = time.time() - hpo_start_time_rf
        logging.info(f"--- [{model_name_rf}] HPO finished in {hpo_duration_rf:.2f} seconds ---")
    else:
        hpo_duration_rf = 0.0
        logging.info(f"--- [{model_name_rf}] Using provided hyperparameters, skipping HPO. ---")


    # --- 3.3 RF: Train Final Model & Calibration (using Full Training Set) ---
    fitted_rf_base = None
    calibrator_rf = None
    if best_params_rf:
        logging.info(f"--- [{model_name_rf}] Training final model and calibrator ---")
        calibration_start_time_rf = time.time()
        # Ensure necessary fixed parameters are present for the final fit
        best_params_rf['random_state'] = RANDOM_SEED
        if 'class_weight' not in best_params_rf: best_params_rf['class_weight'] = 'balanced'
        best_params_rf['n_jobs'] = -1 # Use all cores

        fitted_rf_base, calibrator_rf = train_calibrate_model(
            base_estimator_class=RandomForestClassifier,
            best_params=best_params_rf,
            X_train=X_train,  # Use UNSCALED training data
            y_train=y_train,
            calibration_method=CALIBRATOR,  # Use chosen calibration method (e.g., 'cvap', 'platt', etc.)
            n_splits=5,
            random_state=RANDOM_SEED,
            score_method='predict_proba',  # For RF, use predict_proba
            cvap_loss='log',  # or 'brier' if desired
            cvap_precision=None  # or set as needed
        )
        calibration_duration_rf = time.time() - calibration_start_time_rf
        if fitted_rf_base and calibrator_rf:
            logging.info(f"--- [{model_name_rf}] Calibration with {CALIBRATOR} finished in {calibration_duration_rf:.2f} seconds ---")
            # Optional: Save models
            # joblib.dump(...)
        else:
            logging.error(f"[{model_name_rf}] Failed to train base model or the calibration.")
    else:
        logging.warning(f"[{model_name_rf}] HPO did not find best parameters. Skipping subsequent steps.")

    # --- 3.4 RF: Mondrian ICP Calibration (using Calibration Set) ---
    fitted_cc_rf = None  # Initialize Mondrian classifier variable
    if fitted_rf_base and calibrator_rf:
        if not y_cal.empty:
            logging.info(f"--- [{model_name_rf}] Calibrating Mondrian Conformal Prediction ---")
            mcp_cal_start_time_rf = time.time()

            # Get calibrated probabilities from calibrator
            calibrated_probs_cal_rf = calibrator_rf.predict_proba(X_cal)  # (n_cal, 2)

            # Define Mondrian Bins (Class-conditional example)
            bins_cal_rf = y_cal.values  # Assumes y_cal is pd.Series/np.array

            # Fit the Mondrian classifier
            fitted_cc_rf = fit_mondrian_classifier(calibrated_probs_cal_rf, bins_cal=bins_cal_rf)

            mcp_cal_duration_rf = time.time() - mcp_cal_start_time_rf
            if fitted_cc_rf:
                logging.info(f"--- [{model_name_rf}] Mondrian CP calibration finished in {mcp_cal_duration_rf:.2f} seconds ---")
                # Optional: Save the fitted_cc_rf object using joblib
                # cc_filename = os.path.join(MODEL_DIR, f"{model_name_rf}_mondrian_classifier_{timestamp_rf}.joblib")
                # joblib.dump(fitted_cc_rf, cc_filename)
                # logging.info(f"Mondrian classifier saved to {cc_filename}")
            else:
                logging.error(f"[{model_name_rf}] Failed to fit Mondrian classifier.")
        else:
            logging.warning(f"[{model_name_rf}] Calibration set is empty. Skipping Mondrian ICP calibration.")
    else:
        logging.warning(f"[{model_name_rf}] Base model or Calibrator not available. Skipping Mondrian ICP calibration.")

    # --- 3.5 RF: Final Evaluation (using Test Set) ---
    if fitted_rf_base and calibrator_rf:
        logging.info(f"--- [{model_name_rf}] Final Evaluation on Test Set ---")
        eval_start_time_rf = time.time()

        # Get calibrated probabilities from calibrator
        calibrated_probs_test_rf = calibrator_rf.predict_proba(X_test)
        y_proba_test_rf = calibrated_probs_test_rf[:, 1]  # Probability of positive class
        y_pred_test_rf = (y_proba_test_rf >= 0.5).astype(int)  # Threshold calibrated probabilities

        metrics_rf = calculate_metrics(y_test, y_pred_test_rf, y_proba_test_rf, model_name=model_name_rf)

        # --- Mondrian Conformal Prediction Evaluation ---
        cp_coverage_mond_rf, cp_avg_set_size_mond_rf = None, None  # Initialize results

        if fitted_cc_rf is not None:  # Check if Mondrian classifier was fitted successfully
            mcp_eval_start_time_rf = time.time()
            # Define Mondrian Bins for test set (Class-conditional example)
            bins_test_rf = y_test.values if not y_test.empty else np.array([])
            y_test_true_np_rf = y_test.values if not y_test.empty else np.array([])

            # Evaluate the fitted Mondrian classifier
            cp_coverage_mond_rf, cp_avg_set_size_mond_rf, _, class_coverage_dict = evaluate_mondrian_prediction(
                fitted_cc=fitted_cc_rf,                # Pass the fitted classifier
                probs_test=calibrated_probs_test_rf,   # Pass test probabilities (n_test, 2)
                y_test_true=y_test_true_np_rf,         # Pass true test labels
                bins_test=bins_test_rf,                # Pass test bins
                alpha=ALPHA
            )
            mcp_eval_duration_rf = time.time() - mcp_eval_start_time_rf
            logging.info(f"--- [{model_name_rf}] Mondrian CP evaluation finished in {mcp_eval_duration_rf:.2f} seconds ---")
        else:
            logging.warning(f"[{model_name_rf}] Skipping Mondrian CP evaluation: Classifier not fitted.")

        eval_duration_rf = time.time() - eval_start_time_rf  # Total eval time
        logging.info(f"--- [{model_name_rf}] Total Evaluation finished in {eval_duration_rf:.2f} seconds ---")

        # Store results
        all_results = {model_name_rf: {
            'metrics': metrics_rf,
            'cp_coverage_mond': cp_coverage_mond_rf,
            'cp_class_coverage_dict': class_coverage_dict,
            'cp_avg_set_size_mond': cp_avg_set_size_mond_rf,
            'best_hpo_params': best_params_rf,
            'hpo_f1_score': best_score_hpo_rf,
            'hpo_duration_s': hpo_duration_rf,
        }}
    else:
        logging.warning(f"[{model_name_rf}] Skipping final evaluation (Base model or Calibrator not available).")

    logging.info(f"===== Finished Workflow for {model_name_rf} =====")
    # Return best hyperparameters for the Random Forest model
    return all_results

### 3.4 XGBoost

In [26]:
# HPO Settings for XGB (using iterations)
MAX_RESOURCE_XGB = 500 # Max n_estimators
MIN_RESOURCE_XGB = 30  # Min n_estimators
ETA_XGB = 3
RESOURCE_TYPE_XGB = 'iterations'
model_name_xgb = "XGBoost"
ROUNDS = 20        # Number of rounds to wait for improvement

In [27]:
#¡MODIFICADO PARA PRUEBA RÁPIDA!
MAX_RESOURCE_XGB = 30  # ¡Reducido drásticamente! (Antes 500)
MIN_RESOURCE_XGB = 10  # Mínimo bajo pero cercano a max
ETA_XGB = 4            # Aumentado

In [28]:
def xgb_hpo(
    X_train, y_train, X_val, y_val,
    MAX_RESOURCE_XGB,
    MIN_RESOURCE_XGB,
    ETA_XGB,
    RESOURCE_TYPE_XGB,
    RANDOM_SEED,
    xgb,
    loguniform,
    randint,
    uniform,
    f1_score,
    hyperband_hpo,
    logging,
    model_name_xgb
):
    """
    Run Hyperband HPO for XGBoost and return best parameters and best score.
    """
    # --- XGB: Define Search Space and HPO Params ---
    param_space_xgb = {
        # n_estimators controlled by resource
        'learning_rate': loguniform(0.01, 0.3),
        'max_depth': randint(3, 10),
        'subsample': uniform(0.6, 0.4), # range [0.6, 1.0)
        'colsample_bytree': uniform(0.6, 0.4),
        'gamma': loguniform(1e-2, 1.0), # Min loss reduction
        'reg_alpha': loguniform(1e-3, 1.0), # L1 reg
        'reg_lambda': loguniform(1e-3, 1.0), # L2 reg
        # scale_pos_weight added automatically
        # random_state added automatically
        'objective': ['binary:logistic'], # Fixed objective
        'eval_metric': ['logloss'],        # Fixed eval metric for early stopping
    }

    logging.info(f"--- [{model_name_xgb}] Running Hyperband HPO ---")
    best_params_xgb, best_score_hpo_xgb = hyperband_hpo(
        model_class=xgb.XGBClassifier,
        param_space=param_space_xgb,
        X_train=X_train, # USE UNSCALED DATA
        y_train=y_train,
        X_val=X_val,     # USE UNSCALED DATA
        y_val=y_val,
        max_resource=MAX_RESOURCE_XGB,
        eta=ETA_XGB,
        resource_type=RESOURCE_TYPE_XGB,
        min_resource=MIN_RESOURCE_XGB,
        scoring_func=f1_score,
        random_state=RANDOM_SEED
    )
    return best_params_xgb, best_score_hpo_xgb

def xgb_workflow(
    X_train, y_train, X_val, y_val, X_cal, y_cal, X_test, y_test,
    all_results,
    model_name_xgb,
    MAX_RESOURCE_XGB,
    MIN_RESOURCE_XGB,
    ETA_XGB,
    RESOURCE_TYPE_XGB,
    ROUNDS,
    CALIBRATOR,
    ALPHA,
    RANDOM_SEED,
    loguniform,
    randint,
    uniform,
    xgb,
    hyperband_hpo,
    f1_score,
    EarlyStopping,
    train_calibrate_model,
    fit_mondrian_classifier,
    evaluate_mondrian_prediction,
    calculate_metrics,
    logging,
    np,
    best_params_xgb=None,  # <-- Allow passing best_params directly
    best_score_hpo_xgb=None
):
    """
    Complete XGBoost workflow: HPO, calibration, Mondrian conformal prediction, and evaluation.

    This function performs the following steps:
    1. Defines the XGBoost hyperparameter search space.
    2. Runs Hyperband HPO to find the best hyperparameters.
    3. Determines the best number of boosting rounds using early stopping.
    4. Trains the final XGBoost model and calibrator.
    5. Fits a Mondrian conformal predictor on the calibration set.
    6. Evaluates the model and conformal predictor on the test set.
    7. Stores all results in the provided all_results dictionary.

    All logic is preserved exactly as in the original notebook workflow.

    Parameters
    ----------
    X_train, y_train, X_val, y_val, X_cal, y_cal, X_test, y_test : pd.DataFrame/np.ndarray
        Data splits for training, validation, calibration, and testing.
    model_name_xgb : str
        Name of the model (e.g., "XGBoost").
    MAX_RESOURCE_XGB, MIN_RESOURCE_XGB, ETA_XGB, RESOURCE_TYPE_XGB, ROUNDS : int/str
        Hyperband and early stopping parameters.
    CALIBRATOR : str
        Calibration method to use.
    ALPHA : float
        Significance level for conformal prediction.
    RANDOM_SEED : int
        Random seed for reproducibility.
    loguniform, randint, uniform : scipy.stats distributions
        Distributions for hyperparameter sampling.
    xgb : module
        XGBoost module.
    hyperband_hpo : function
        Function to run Hyperband HPO.
    f1_score : function
        F1 scoring function.
    EarlyStopping : class
        XGBoost early stopping callback.
    train_calibrate_model : function
        Function to train and calibrate the model.
    fit_mondrian_classifier : function
        Function to fit Mondrian conformal classifier.
    evaluate_mondrian_prediction : function
        Function to evaluate Mondrian conformal prediction.
    calculate_metrics : function
        Function to calculate base metrics.
    logging : module
        Logging module.
    np : module
        Numpy module.
    best_params_xgb : (Optional) If provided, skips HPO and uses these params
    best_score_hpo_xgb : (Optional) If provided, uses this as HPO score

    Returns
    -------
    None
        Results are stored in all_results[model_name_xgb].
    """
    logging.info(f"\n\n===== Starting Workflow for {model_name_xgb} =====")
    timestamp_xgb = datetime.now().strftime("%Y%m%d_%H%M%S")

    if best_params_xgb is None:
        hpo_start_time_xgb = time.time()
        best_params_xgb, best_score_hpo_xgb = xgb_hpo(
            X_train, y_train, X_val, y_val,
            MAX_RESOURCE_XGB,
            MIN_RESOURCE_XGB,
            ETA_XGB,
            RESOURCE_TYPE_XGB,
            RANDOM_SEED,
            xgb,
            loguniform,
            randint,
            uniform,
            f1_score,
            hyperband_hpo,
            logging,
            model_name_xgb
        )
        hpo_duration_xgb = time.time() - hpo_start_time_xgb
        logging.info(f"--- [{model_name_xgb}] HPO finished in {hpo_duration_xgb:.2f} seconds ---")
    else:
        hpo_duration_xgb = 0.0
        logging.info(f"--- [{model_name_xgb}] Using provided hyperparameters, skipping HPO. ---")

    # --- 4.3 XGB: Train Final Model & Calibration (using Full Training Set) ---
    fitted_xgb_base = None
    calibrator_xgb = None
    final_best_params_xgb = None  # Initialize

    if best_params_xgb:
        logging.info(f"--- [{model_name_xgb}] Determining best iteration and training calibrator ---")
        calibration_start_time_xgb = time.time()

        # 1. Determine best iteration using early stopping on validation set
        temp_best_params_xgb = best_params_xgb.copy()  # Work with a copy
        temp_best_params_xgb['random_state'] = RANDOM_SEED
        if 'objective' not in temp_best_params_xgb: temp_best_params_xgb['objective'] = 'binary:logistic'
        if 'eval_metric' not in temp_best_params_xgb: temp_best_params_xgb['eval_metric'] = 'logloss'
        if 'n_jobs' not in temp_best_params_xgb: temp_best_params_xgb['n_jobs'] = -1
        if 'scale_pos_weight' not in temp_best_params_xgb:
            neg_count = (y_train == 0).sum()
            pos_count = (y_train == 1).sum()
            if pos_count > 0:
                temp_best_params_xgb['scale_pos_weight'] = neg_count / pos_count
        # Define callbacks for the fit that determines the best iteration
        xgb_final_iteration_callbacks = [
                EarlyStopping(rounds=ROUNDS,        # Number of rounds to wait for improvement
                                save_best=True,   # Saves the model from the best iteration
                                metric_name='logloss', # Explicitly state the metric to monitor (optional but good practice)
                                maximize=False)    # We want to minimize logloss
            ]

        logging.info("Training temporary XGBoost with early stopping to find best iteration...")
        temp_xgb_model = xgb.XGBClassifier(**temp_best_params_xgb, callbacks=xgb_final_iteration_callbacks)
        eval_set_final = [(X_val, y_val)]
        temp_xgb_model.fit(X_train, y_train, eval_set=eval_set_final, verbose=False)

        # Retrieve the best iteration
        best_iteration = temp_xgb_model.best_iteration
        if best_iteration is None or best_iteration <= 0:
            logging.warning(f"Early stopping did not trigger or returned invalid iteration ({best_iteration}). Using max_resource ({MAX_RESOURCE_XGB}) as n_estimators.")
            best_iteration = MAX_RESOURCE_XGB
        logging.info(f"Best iteration found: {best_iteration}")

        # Update best_params with the optimal number of estimators found
        final_best_params_xgb = temp_best_params_xgb.copy()
        final_best_params_xgb['n_estimators'] = best_iteration

        # 2. Train final model and calibrator using train_calibrate_model (like CART)
        logging.info(f"--- [{model_name_xgb}] Training final model ({final_best_params_xgb['n_estimators']} est.) and calibrator ---")
        fitted_xgb_base, calibrator_xgb = train_calibrate_model(
            base_estimator_class=xgb.XGBClassifier,
            best_params=final_best_params_xgb,
            X_train=X_train,
            y_train=y_train,
            calibration_method=CALIBRATOR,  # Use chosen calibration method (e.g., 'cvap', 'platt', etc.)
            n_splits=5,
            random_state=RANDOM_SEED,
            score_method='raw_margin_xgb',  # For XGB, use raw margin for Platt/CVAP
            cvap_loss='log',                # or 'brier' if desired
            cvap_precision=None             # or set as needed
        )
        calibration_duration_xgb = time.time() - calibration_start_time_xgb
        if fitted_xgb_base and calibrator_xgb:
            logging.info(f"--- [{model_name_xgb}] Calibration with {CALIBRATOR} finished in {calibration_duration_xgb:.2f} seconds ---")
            # Optional: Save models
            # joblib.dump(...)
        else:
            logging.error(f"[{model_name_xgb}] Failed to train base model or the calibration.")
    else:
        logging.warning(f"[{model_name_xgb}] HPO did not find best parameters. Skipping subsequent steps.")

    # --- 4.4 XGB: Mondrian ICP Calibration (using Calibration Set) ---
    fitted_cc_xgb = None  # Initialize Mondrian classifier variable
    if fitted_xgb_base and calibrator_xgb:
        if not y_cal.empty:
            logging.info(f"--- [{model_name_xgb}] Calibrating Mondrian Conformal Prediction ---")
            mcp_cal_start_time_xgb = time.time()

            # Calculate probabilities needed for Mondrian ICP on Calibration set
            probs_cal_xgb = calibrator_xgb.predict_proba(X_cal)  # Calibrated probs for BOTH classes

            # Define Mondrian Bins (Class-conditional example)
            bins_cal_xgb = y_cal.values  # Assumes y_cal is pd.Series/np.array

            # Fit the Mondrian classifier
            fitted_cc_xgb = fit_mondrian_classifier(probs_cal_xgb, bins_cal=bins_cal_xgb)

            mcp_cal_duration_xgb = time.time() - mcp_cal_start_time_xgb
            if fitted_cc_xgb:
                logging.info(f"--- [{model_name_xgb}] Mondrian CP calibration finished in {mcp_cal_duration_xgb:.2f} seconds ---")
                # Optional: Save the fitted_cc_xgb object using joblib alongside the base model and calibrator
                # cc_filename = os.path.join(MODEL_DIR, f"{model_name_xgb}_mondrian_classifier_{timestamp_xgb}.joblib")
                # joblib.dump(fitted_cc_xgb, cc_filename)
                # logging.info(f"Mondrian classifier saved to {cc_filename}")
            else:
                logging.error(f"[{model_name_xgb}] Failed to fit Mondrian classifier.")
        else:
            logging.warning(f"[{model_name_xgb}] Calibration set is empty. Skipping Mondrian ICP calibration.")
    else:
        logging.warning(f"[{model_name_xgb}] Base model or Calibrator not available. Skipping Mondrian ICP calibration.")

    # --- 4.5 XGB: Final Evaluation (using Test Set) ---
    if fitted_xgb_base and calibrator_xgb:
        logging.info(f"--- [{model_name_xgb}] Final Evaluation on Test Set ---")
        eval_start_time_xgb = time.time()

        # --- Calculate Base Metrics (Same as before) ---
        probs_test_xgb_full = calibrator_xgb.predict_proba(X_test)  # Calibrated probs for BOTH classes
        y_proba_test_xgb = probs_test_xgb_full[:, 1]
        y_pred_test_xgb = (y_proba_test_xgb >= 0.5).astype(int)
        metrics_xgb = calculate_metrics(y_test, y_pred_test_xgb, y_proba_test_xgb, model_name=model_name_xgb)

        # --- Mondrian Conformal Prediction Evaluation ---
        cp_coverage_mond_xgb, cp_avg_set_size_mond_xgb = None, None  # Initialize results

        if fitted_cc_xgb is not None:  # Check if Mondrian classifier was fitted successfully
            mcp_eval_start_time_xgb = time.time()
            # Define Mondrian Bins for test set (Class-conditional example)
            bins_test_xgb = y_test.values if not y_test.empty else np.array([])
            y_test_true_np = y_test.values if not y_test.empty else np.array([])

            # Evaluate the fitted Mondrian classifier
            cp_coverage_mond_xgb, cp_avg_set_size_mond_xgb, _, class_coverage_dict = evaluate_mondrian_prediction(
                fitted_cc=fitted_cc_xgb,            # Pass the fitted classifier
                probs_test=probs_test_xgb_full,     # Pass test probabilities (n_test, 2)
                y_test_true=y_test_true_np,         # Pass true test labels
                bins_test=bins_test_xgb,            # Pass test bins
                alpha=ALPHA
            )
            mcp_eval_duration_xgb = time.time() - mcp_eval_start_time_xgb
            logging.info(f"--- [{model_name_xgb}] Mondrian CP evaluation finished in {mcp_eval_duration_xgb:.2f} seconds ---")
        else:
            logging.warning(f"[{model_name_xgb}] Skipping Mondrian CP evaluation: Classifier not fitted.")

        eval_duration_xgb = time.time() - eval_start_time_xgb  # Total eval time
        logging.info(f"--- [{model_name_xgb}] Total Evaluation finished in {eval_duration_xgb:.2f} seconds ---")

        # --- Store results (Same as before, using the new variables) ---
        all_results = {model_name_xgb : {
            'metrics': metrics_xgb,
            'cp_coverage_mond': cp_coverage_mond_xgb,           # Store Mondrian coverage
            'cp_class_coverage_dict': class_coverage_dict,
            'cp_avg_set_size_mond': cp_avg_set_size_mond_xgb,   # Store Mondrian avg set size
            'best_hpo_params': best_params_xgb,
            'final_n_estimators': final_best_params_xgb.get('n_estimators', None) if final_best_params_xgb else None,
            'hpo_f1_score': best_score_hpo_xgb,
            'hpo_duration_s': hpo_duration_xgb,
        }}
    else:
        logging.warning(f"[{model_name_xgb}] Skipping final evaluation (Base model or Calibrator not available).")

    logging.info(f"===== Finished Workflow for {model_name_xgb} =====")
    return all_results

### 3.5 LightGBM

In [29]:
# HPO Settings for LGBM (using iterations)
MAX_RESOURCE_LGBM = 500 # Max n_estimators
MIN_RESOURCE_LGBM = 30  # Min n_estimators
ETA_LGBM = 3
RESOURCE_TYPE_LGBM = 'iterations'
model_name_lgbm = "LightGBM"
ROUNDS = 20

In [30]:
#¡MODIFICADO PARA PRUEBA RÁPIDA!
MAX_RESOURCE_LGBM = 30  # ¡Reducido drásticamente! (Antes 500)
MIN_RESOURCE_LGBM = 10  # Mínimo bajo pero cercano a max
ETA_LGBM = 4            # Aumentado
RESOURCE_TYPE_LGBM = 'iterations'

In [31]:
def lgbm_hpo(
    X_train, y_train, X_val, y_val,
    MAX_RESOURCE_LGBM,
    MIN_RESOURCE_LGBM,
    ETA_LGBM,
    RESOURCE_TYPE_LGBM,
    RANDOM_SEED,
    lgb,
    loguniform,
    randint,
    uniform,
    f1_score,
    hyperband_hpo,
    logging,
    model_name_lgb
):
    """
    Run Hyperband HPO for LightGBM and return best parameters and best score.
    """
    # --- LGBM: Define Search Space and HPO Params ---
    param_space_lgbm = {
        # n_estimators controlled by resource
        'learning_rate': loguniform(0.01, 0.3),
        'num_leaves': randint(20, 100),
        'max_depth': randint(3, 15), # Often kept lower than XGB depth
        'subsample': uniform(0.6, 0.4), # Aliased as bagging_fraction
        'colsample_bytree': uniform(0.6, 0.4), # Aliased as feature_fraction
        'reg_alpha': loguniform(1e-3, 1.0), # L1
        'reg_lambda': loguniform(1e-3, 1.0), # L2
        # scale_pos_weight or is_unbalance=True added automatically
        # random_state added automatically
        'objective': ['binary'], # Fixed objective
        'metric': ['logloss'],   # Fixed metric for early stopping
        'verbose': [-1]          # Suppress LightGBM's internal verbosity if desired
    }

    logging.info(f"--- [{model_name_lgbm}] Running Hyperband HPO ---")
    best_params_lgbm, best_score_hpo_lgbm = hyperband_hpo(
        model_class=lgb.LGBMClassifier,
        param_space=param_space_lgbm,
        X_train=X_train, # USE UNSCALED DATA
        y_train=y_train,
        X_val=X_val,     # USE UNSCALED DATA
        y_val=y_val,
        max_resource=MAX_RESOURCE_LGBM,
        eta=ETA_LGBM,
        resource_type=RESOURCE_TYPE_LGBM,
        min_resource=MIN_RESOURCE_LGBM,
        scoring_func=f1_score,
        random_state=RANDOM_SEED
    )
    return best_params_lgbm, best_score_hpo_lgbm

def lgbm_workflow(
    X_train, y_train, X_val, y_val, X_cal, y_cal, X_test, y_test,
    model_name_lgbm,
    MAX_RESOURCE_LGBM,
    MIN_RESOURCE_LGBM,
    ETA_LGBM,
    RESOURCE_TYPE_LGBM,
    ROUNDS,
    CALIBRATOR,
    ALPHA,
    RANDOM_SEED,
    lgb,
    loguniform,
    randint,
    uniform,
    f1_score,
    hyperband_hpo,
    early_stopping,
    train_calibrate_model,
    fit_mondrian_classifier,
    evaluate_mondrian_prediction,
    calculate_metrics,
    logging,
    np,
    datetime,
    best_params_lgbm=None,  # <-- Allow passing best_params directly
    best_score_hpo_lgbm=None
):
    """
    Full LightGBM workflow: HPO, training, calibration, Mondrian conformal prediction, and evaluation.
    This function is a direct conversion of the notebook workflow to a callable function.
    All logic is preserved exactly as in the original notebook cell.

    Parameters
    ----------
    X_train, y_train : Training data and labels
    X_val, y_val     : Validation data and labels
    X_cal, y_cal     : Calibration data and labels
    X_test, y_test   : Test data and labels
    model_name_lgbm  : Name of the model (string)
    MAX_RESOURCE_LGBM, MIN_RESOURCE_LGBM, ETA_LGBM, RESOURCE_TYPE_LGBM, ROUNDS : HPO and early stopping params
    CALIBRATOR       : Calibration method (e.g., 'cvap', 'platt', etc.)
    ALPHA            : Conformal prediction significance level
    RANDOM_SEED      : Random seed for reproducibility
    lgb              : LightGBM module
    loguniform, randint, uniform : Distributions for HPO
    f1_score         : F1 scoring function
    hyperband_hpo    : Hyperband HPO function
    early_stopping   : LightGBM early stopping callback
    train_calibrate_model : Model training and calibration function
    fit_mondrian_classifier : Mondrian conformal classifier fitting function
    evaluate_mondrian_prediction : Mondrian conformal prediction evaluation function
    calculate_metrics : Function to calculate metrics
    logging          : Logging module
    np               : Numpy module
    datetime         : Datetime module
    best_params_lgbm : (Optional) If provided, skips HPO and uses these params
    best_score_hpo_lgbm : (Optional) If provided, uses this as HPO score

    Returns
    -------
    None (results are stored in all_results[model_name_lgbm])
    """
    logging.info(f"\n\n===== Starting Workflow for {model_name_lgbm} =====")
    timestamp_lgbm = datetime.now().strftime("%Y%m%d_%H%M%S")

    if best_params_lgbm is None:
        hpo_start_time_lgbm = time.time()
        best_params_lgbm, best_score_hpo_lgbm = lgbm_hpo(
            X_train, y_train, X_val, y_val,
            MAX_RESOURCE_LGBM,
            MIN_RESOURCE_LGBM,
            ETA_LGBM,
            RESOURCE_TYPE_LGBM,
            RANDOM_SEED,
            lgb,
            loguniform,
            randint,
            uniform,
            f1_score,
            hyperband_hpo,
            logging,
            model_name_lgbm
        )
        hpo_duration_lgbm = time.time() - hpo_start_time_lgbm
        logging.info(f"--- [{model_name_lgbm}] HPO finished in {hpo_duration_lgbm:.2f} seconds ---")
    else:
        hpo_duration_lgbm = 0.0
        logging.info(f"--- [{model_name_lgbm}] Using provided hyperparameters, skipping HPO. ---")

    # --- 5.3 LGBM: Train Final Model & Calibration (using Full Training Set) ---
    fitted_lgbm_base = None
    calibrator_lgbm = None
    final_best_params_lgbm = None  # Initialize

    if best_params_lgbm:
        logging.info(f"--- [{model_name_lgbm}] Determining best iteration and training calibrator ---")
        calibration_start_time_lgbm = time.time()

        # 1. Determine best iteration using early stopping on validation set
        temp_best_params_lgbm = best_params_lgbm.copy()  # Work with a copy
        temp_best_params_lgbm['random_state'] = RANDOM_SEED
        if 'objective' not in temp_best_params_lgbm: temp_best_params_lgbm['objective'] = 'binary'
        if 'metric' not in temp_best_params_lgbm: temp_best_params_lgbm['metric'] = 'logloss'
        if 'n_jobs' not in temp_best_params_lgbm: temp_best_params_lgbm['n_jobs'] = -1
        if 'verbose' not in temp_best_params_lgbm: temp_best_params_lgbm['verbose'] = -1  # Control verbosity
        if 'scale_pos_weight' not in temp_best_params_lgbm:
            neg_count = (y_train == 0).sum()
            pos_count = (y_train == 1).sum()
            if pos_count > 0:
                temp_best_params_lgbm['scale_pos_weight'] = neg_count / pos_count
                if 'is_unbalance' in temp_best_params_lgbm: del temp_best_params_lgbm['is_unbalance']
            elif 'is_unbalance' not in temp_best_params_lgbm:
                temp_best_params_lgbm['is_unbalance'] = True

        # Define callbacks for the fit that determines the best iteration
        callbacks_final = [
            early_stopping(stopping_rounds=ROUNDS, verbose=False)
        ]
        metric_to_monitor = 'logloss'

        logging.info(f"Training temporary LightGBM with early stopping (monitoring '{metric_to_monitor}') to find best iteration...")
        temp_lgbm_model = lgb.LGBMClassifier(**temp_best_params_lgbm)
        eval_set_final_lgbm = [(X_val, y_val)]

        # Check if eval_set is valid before fitting
        if not eval_set_final_lgbm or not isinstance(eval_set_final_lgbm, list) or not eval_set_final_lgbm[0]:
            raise ValueError("eval_set_final_lgbm is not correctly defined before fitting.")
        if len(eval_set_final_lgbm[0]) != 2:
            raise ValueError("Each element in eval_set must be a tuple (X, y).")

        try:
            temp_lgbm_model.fit(
                X_train,
                y_train,
                eval_set=eval_set_final_lgbm,
                eval_metric=metric_to_monitor,
                callbacks=callbacks_final
            )

            # Retrieve the best iteration
            best_iteration_lgbm = temp_lgbm_model.best_iteration_
            if best_iteration_lgbm is None or best_iteration_lgbm <= 0:
                logging.warning(f"Early stopping did not trigger or returned invalid iteration ({best_iteration_lgbm}). Using max_resource ({MAX_RESOURCE_LGBM}) as n_estimators.")
                best_iteration_lgbm = MAX_RESOURCE_LGBM
            logging.info(f"Best iteration found: {best_iteration_lgbm}")

            # Update best_params with the optimal number of estimators found
            final_best_params_lgbm = temp_best_params_lgbm.copy()
            final_best_params_lgbm['n_estimators'] = best_iteration_lgbm

            # 2. Train final model and calibrator using train_calibrate_model (like XGB)
            logging.info(f"--- [{model_name_lgbm}] Training final model ({final_best_params_lgbm['n_estimators']} est.) and calibrator ---")
            fitted_lgbm_base, calibrator_lgbm = train_calibrate_model(
                base_estimator_class=lgb.LGBMClassifier,
                best_params=final_best_params_lgbm,
                X_train=X_train,
                y_train=y_train,
                calibration_method=CALIBRATOR,  # Use chosen calibration method (e.g., 'cvap', 'platt', etc.)
                n_splits=5,
                random_state=RANDOM_SEED,
                score_method='raw_score_lgbm',  # Specify score method for LGBM if needed
                cvap_loss='log',                # or 'brier' if desired
                cvap_precision=None             # or set as needed
            )
            calibration_duration_lgbm = time.time() - calibration_start_time_lgbm
            if fitted_lgbm_base and calibrator_lgbm:
                logging.info(f"--- [{model_name_lgbm}] Calibration finished in {calibration_duration_lgbm:.2f} seconds ---")
                # Optional: Save models
                # joblib.dump(...)
            else:
                logging.error(f"[{model_name_lgbm}] Failed to train base model or calibrator.")
        except ValueError as ve:
            logging.error(f"ValueError during temp_lgbm_model.fit: {ve}")
            logging.error(f"Shapes: X_train={X_train.shape}, y_train={y_train.shape}, X_val={X_val.shape}, y_val={y_val.shape}")
            raise

    # --- 5.4 LGBM: Mondrian ICP Calibration ---
    fitted_cc_lgbm = None  # Initialize Mondrian classifier variable
    if fitted_lgbm_base and calibrator_lgbm:
        if not y_cal.empty:
            logging.info(f"--- [{model_name_lgbm}] Calibrating Mondrian Conformal Prediction ---")
            mcp_cal_start_time_lgbm = time.time()

            # Calculate probabilities needed for Mondrian ICP on Calibration set
            probs_cal_lgbm = calibrator_lgbm.predict_proba(X_cal)  # Calibrated probs for BOTH classes

            # Define Mondrian Bins (Class-conditional example)
            bins_cal_lgbm = y_cal.values  # Assumes y_cal is pd.Series/np.array

            # Fit the Mondrian classifier
            fitted_cc_lgbm = fit_mondrian_classifier(probs_cal_lgbm, bins_cal=bins_cal_lgbm)

            mcp_cal_duration_lgbm = time.time() - mcp_cal_start_time_lgbm
            if fitted_cc_lgbm:
                logging.info(f"--- [{model_name_lgbm}] Mondrian CP calibration finished in {mcp_cal_duration_lgbm:.2f} seconds ---")
                # Optional: Save the fitted_cc_lgbm object using joblib alongside the base model and calibrator
                # cc_filename = os.path.join(MODEL_DIR, f"{model_name_lgbm}_mondrian_classifier_{timestamp_lgbm}.joblib")
                # joblib.dump(fitted_cc_lgbm, cc_filename)
                # logging.info(f"Mondrian classifier saved to {cc_filename}")
            else:
                logging.error(f"[{model_name_lgbm}] Failed to fit Mondrian classifier.")
        else:
            logging.warning(f"[{model_name_lgbm}] Calibration set is empty. Skipping Mondrian ICP calibration.")
    else:
        logging.warning(f"[{model_name_lgbm}] Base model or Calibrator not available. Skipping Mondrian ICP calibration.")

    # --- 5.5 LGBM: Final Evaluation ---
    if fitted_lgbm_base and calibrator_lgbm:
        logging.info(f"--- [{model_name_lgbm}] Final Evaluation on Test Set ---")
        eval_start_time_lgbm = time.time()

        # --- Calculate Base Metrics (Same as before) ---
        probs_test_lgbm_full = calibrator_lgbm.predict_proba(X_test)  # Calibrated probs for BOTH classes
        y_proba_test_lgbm = probs_test_lgbm_full[:, 1]
        y_pred_test_lgbm = (y_proba_test_lgbm >= 0.5).astype(int)
        metrics_lgbm = calculate_metrics(y_test, y_pred_test_lgbm, y_proba_test_lgbm, model_name=model_name_lgbm)

        # --- Mondrian Conformal Prediction Evaluation ---
        cp_coverage_mond_lgbm, cp_avg_set_size_mond_lgbm = None, None  # Initialize results

        if fitted_cc_lgbm is not None:  # Check if Mondrian classifier was fitted successfully
            mcp_eval_start_time_lgbm = time.time()
            # Define Mondrian Bins for test set (Class-conditional example)
            bins_test_lgbm = y_test.values if not y_test.empty else np.array([])
            y_test_true_np = y_test.values if not y_test.empty else np.array([])

            # Evaluate the fitted Mondrian classifier
            cp_coverage_mond_lgbm, cp_avg_set_size_mond_lgbm, _, class_coverage_dict = evaluate_mondrian_prediction(
                fitted_cc=fitted_cc_lgbm,            # Pass the fitted classifier
                probs_test=probs_test_lgbm_full,     # Pass test probabilities (n_test, 2)
                y_test_true=y_test_true_np,          # Pass true test labels
                bins_test=bins_test_lgbm,            # Pass test bins
                alpha=ALPHA
            )
            mcp_eval_duration_lgbm = time.time() - mcp_eval_start_time_lgbm
            logging.info(f"--- [{model_name_lgbm}] Mondrian CP evaluation finished in {mcp_eval_duration_lgbm:.2f} seconds ---")
        else:
            logging.warning(f"[{model_name_lgbm}] Skipping Mondrian CP evaluation: Classifier not fitted.")

        eval_duration_lgbm = time.time() - eval_start_time_lgbm  # Total eval time
        logging.info(f"--- [{model_name_lgbm}] Total Evaluation finished in {eval_duration_lgbm:.2f} seconds ---")

        # --- Store results (Same as before, using the new variables) ---
        all_results = {model_name_lgbm : {
            'metrics': metrics_lgbm,
            'cp_coverage_mond': cp_coverage_mond_lgbm,           # Store Mondrian coverage
            'cp_class_coverage_dict': class_coverage_dict,
            'cp_avg_set_size_mond': cp_avg_set_size_mond_lgbm,   # Store Mondrian avg set size
            'best_hpo_params': best_params_lgbm, # Original HPO params
            # Store actual used estimators if available
            'final_n_estimators': final_best_params_lgbm.get('n_estimators', None) if final_best_params_lgbm else None,
            'hpo_f1_score': best_score_hpo_lgbm,
            'hpo_duration_s': hpo_duration_lgbm,
        }}
    else:
        logging.warning(f"[{model_name_lgbm}] Skipping final evaluation (Base model or Calibrator not available).")
    logging.info(f"===== Finished Workflow for {model_name_lgbm} =====")
    return all_results


### 4 Model Evaluation

In [33]:
#################################################
# HPO Loop - Find and Save Best Hyperparameters #
#################################################

# --- Configuration ---
BEST_PARAMS_DIR = "best_params"
os.makedirs(BEST_PARAMS_DIR, exist_ok=True)

# List of feature groups to process (keys from the 'groups' dictionary)
# Adjust this list if you only want to run specific groups
GROUP_NAMES_TO_PROCESS = list(groups.keys()) # Process all defined groups
GROUP_NAMES_TO_PROCESS = ['group_1', 'group_2'] # Example: Process only specific groups

# Ensure necessary variables are defined in the global scope before running this cell:
# groups, df, TARGET_COLUMN, clean_data, split_data, apply_feature_scaling,
# MAX_RESOURCE_*, MIN_RESOURCE_*, ETA_*, RESOURCE_TYPE_*, RANDOM_SEED, MODEL_DIR,
# SVC, loguniform, hyperband_hpo, f1_score, model_name_svm,
# DecisionTreeClassifier, randint, model_name_cart,
# RandomForestClassifier, model_name_rf,
# xgb, uniform, model_name_xgb,
# lgb, model_name_lgbm,
# svm_hpo, cart_hpo, rf_hpo, xgb_hpo, lgbm_hpo

# Dictionary to store best params found (optional, mainly for logging/debugging here)
all_best_params_found = {}

logging.info(f"Starting HPO loop for groups: {GROUP_NAMES_TO_PROCESS}")
hpo_loop_start_time = time.time()

# Use tqdm for the outer loop to show progress over groups
for group_name in tqdm(GROUP_NAMES_TO_PROCESS, desc="Running HPO"):
    logging.info(f"\n{'='*30} Running HPO for Group: {group_name} {'='*30}")
    group_hpo_start_time = time.time()
    group_best_params = {} # Store best params for the current group

    try:
        # --- Data Preparation ---
        logging.info(f"[{group_name}] Cleaning data...")
        X, y, df_clean = clean_data(df, group_name, TARGET_COLUMN, logger=logging)
        if X.empty or y.empty:
            logging.warning(f"[{group_name}] Skipping HPO due to insufficient data after cleaning.")
            continue

        logging.info(f"[{group_name}] Splitting data (Train/Val only for HPO)...")
        # Only need train/val splits for Hyperparameter Optimization
        X_train, y_train, X_val, y_val, _, _, _, _ = split_data(X, y)

        if X_train.empty or y_train.empty or X_val.empty or y_val.empty:
             logging.warning(f"[{group_name}] Skipping HPO due to empty train or validation set after splitting.")
             continue

        logging.info(f"[{group_name}] Applying feature scaling (for SVM HPO)...")
        # Note: Only scale train/val needed for HPO. Don't save scaler yet.
        X_train_scaled, X_val_scaled, _, _, _ = apply_feature_scaling(
            X_train, X_val, pd.DataFrame(), pd.DataFrame(), # Pass empty DFs for test/cal
            len(X_train) / len(X) if len(X) > 0 else 0,
            len(X_val) / len(X) if len(X) > 0 else 0,
            0, 0,
            MODEL_DIR, save_scaler=False # Don't save scaler during HPO phase
        )

        # --- Run HPO for each model ---

        # SVM HPO
        try:
            logging.info(f"--- [{group_name}] Running SVM HPO ---")
            best_params_svm, best_score_hpo_svm = svm_hpo(
                X_train_scaled, y_train, X_val_scaled, y_val,
                MAX_RESOURCE_SVM, MIN_RESOURCE_SVM, ETA_SVM, RESOURCE_TYPE_SVM,
                RANDOM_SEED, SVC, loguniform, hyperband_hpo, f1_score, logging, model_name_svm
            )
            group_best_params[model_name_svm] = best_params_svm
            logging.info(f"[{group_name}] SVM Best Params: {best_params_svm}, Score: {best_score_hpo_svm:.4f}")
        except Exception as e:
            logging.error(f"[{group_name}] SVM HPO failed: {e}", exc_info=False) # Set exc_info=True for full traceback
            group_best_params[model_name_svm] = None # Indicate failure

        # CART HPO
        try:
            logging.info(f"--- [{group_name}] Running CART HPO ---")
            best_params_cart, best_score_hpo_cart = cart_hpo(
                X_train, y_train, X_val, y_val,
                MAX_RESOURCE_CART, MIN_RESOURCE_CART, ETA_CART, RESOURCE_TYPE_CART,
                RANDOM_SEED, DecisionTreeClassifier, randint, f1_score, hyperband_hpo, logging, model_name_cart
            )
            group_best_params[model_name_cart] = best_params_cart
            logging.info(f"[{group_name}] CART Best Params: {best_params_cart}, Score: {best_score_hpo_cart:.4f}")
        except Exception as e:
            logging.error(f"[{group_name}] CART HPO failed: {e}", exc_info=False)
            group_best_params[model_name_cart] = None

        # Random Forest HPO
        try:
            logging.info(f"--- [{group_name}] Running Random Forest HPO ---")
            best_params_rf, best_score_hpo_rf = rf_hpo(
                 X_train, y_train, X_val, y_val,
                 MAX_RESOURCE_RF, MIN_RESOURCE_RF, ETA_RF, RESOURCE_TYPE_RF,
                 RANDOM_SEED, RandomForestClassifier, randint, f1_score, hyperband_hpo, logging, model_name_rf
            )
            group_best_params[model_name_rf] = best_params_rf
            logging.info(f"[{group_name}] RF Best Params: {best_params_rf}, Score: {best_score_hpo_rf:.4f}")
        except Exception as e:
            logging.error(f"[{group_name}] Random Forest HPO failed: {e}", exc_info=False)
            group_best_params[model_name_rf] = None

        # XGBoost HPO
        try:
            logging.info(f"--- [{group_name}] Running XGBoost HPO ---")
            best_params_xgb, best_score_hpo_xgb = xgb_hpo(
                X_train, y_train, X_val, y_val,
                MAX_RESOURCE_XGB, MIN_RESOURCE_XGB, ETA_XGB, RESOURCE_TYPE_XGB,
                RANDOM_SEED, xgb, loguniform, randint, uniform, f1_score, hyperband_hpo, logging, model_name_xgb
            )
            group_best_params[model_name_xgb] = best_params_xgb
            logging.info(f"[{group_name}] XGB Best Params: {best_params_xgb}, Score: {best_score_hpo_xgb:.4f}")
        except Exception as e:
            logging.error(f"[{group_name}] XGBoost HPO failed: {e}", exc_info=False)
            group_best_params[model_name_xgb] = None

        # LightGBM HPO
        try:
            logging.info(f"--- [{group_name}] Running LightGBM HPO ---")
            best_params_lgbm, best_score_hpo_lgbm = lgbm_hpo(
                 X_train, y_train, X_val, y_val,
                 MAX_RESOURCE_LGBM, MIN_RESOURCE_LGBM, ETA_LGBM, RESOURCE_TYPE_LGBM,
                 RANDOM_SEED, lgb, loguniform, randint, uniform, f1_score, hyperband_hpo, logging, model_name_lgbm
            )
            group_best_params[model_name_lgbm] = best_params_lgbm
            logging.info(f"[{group_name}] LGBM Best Params: {best_params_lgbm}, Score: {best_score_hpo_lgbm:.4f}")
        except Exception as e:
             logging.error(f"[{group_name}] LightGBM HPO failed: {e}", exc_info=False)
             group_best_params[model_name_lgbm] = None

        # --- Save Best Params for the Group ---
        params_filepath = os.path.join(BEST_PARAMS_DIR, f"{group_name}_best_params.json")
        try:
            # Convert numpy types to standard Python types for JSON serialization
            serializable_params = {}
            for model, params in group_best_params.items():
                if params is not None:
                     serializable_params[model] = {k: (int(v) if isinstance(v, np.integer) else
                                                      float(v) if isinstance(v, np.floating) else
                                                      v)
                                                 for k, v in params.items()}
                else:
                    serializable_params[model] = None # Keep None for failed HPO

            with open(params_filepath, 'w') as f:
                json.dump(serializable_params, f, indent=4)
            logging.info(f"[{group_name}] Successfully saved best parameters to {params_filepath}")
            all_best_params_found[group_name] = serializable_params # Store the saved params
        except TypeError as e:
             logging.error(f"[{group_name}] Failed to serialize best parameters: {e}. Params: {group_best_params}", exc_info=True)
        except Exception as e:
            logging.error(f"[{group_name}] Failed to save best parameters to {params_filepath}: {e}", exc_info=True)


        group_hpo_duration = time.time() - group_hpo_start_time
        logging.info(f"--- Finished HPO for Group: {group_name} in {group_hpo_duration:.2f} seconds ---")

    except Exception as e:
        logging.error(f"[{group_name}] Unhandled exception during HPO processing for group: {e}", exc_info=True)
        # Optionally mark this group as failed in some way if needed later

# --- End of HPO Loop ---
hpo_loop_duration = time.time() - hpo_loop_start_time
logging.info(f"\n{'='*30} Finished HPO for all groups in {hpo_loop_duration:.2f} seconds {'='*30}")
print(f"\nHPO loop finished. Best parameters saved in '{BEST_PARAMS_DIR}' directory.")

Running HPO:   0%|          | 0/2 [00:00<?, ?it/s]

Selecting feature set group 'group_1' with 11 columns.



[A
[A

[A[A

[A[A

[A[A
Bracket s=0 (n=1, r0=1.00): 100%|██████████| 1/1 [00:08<00:00,  8.43s/it, Best F1: 0.5257]

[A
[A

[A[A

[A[A

[A[A
Bracket s=0 (n=1, r0=1.00): 100%|██████████| 1/1 [00:00<00:00,  4.54it/s, Best F1: 0.3817]

[A
[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A
[A
[A

[A[A

[A[A

[A[A
[A
[A
[A

[A[A

[A[A

[A[A

[A[A
Bracket s=0 (n=2, r0=20.00): 100%|██████████| 2/2 [00:08<00:00,  4.16s/it, Best F1: 0.6610]

[A
[A

[A[A

[A[A

[A[A
Bracket s=0 (n=1, r0=30.00): 100%|██████████| 1/1 [00:00<00:00,  7.01it/s, Best F1: 0.5339]

[A
[A

[A[A

[A[A
Bracket s=0 (n=1, r0=30.00): 100%|██████████| 1/1 [00:00<00:00,  8.97it/s, Best F1: 0.6061]
Running HPO:  50%|█████     | 1/2 [00:17<00:17, 17.36s/it]

Selecting feature set group 'group_2' with 25 columns.



[A
[A

[A[A

[A[A

[A[A
Bracket s=0 (n=1, r0=1.00): 100%|██████████| 1/1 [00:12<00:00, 13.00s/it, Best F1: 0.3018]

[A
[A

[A[A

[A[A

[A[A
Bracket s=0 (n=1, r0=1.00): 100%|██████████| 1/1 [00:00<00:00,  1.53it/s, Best F1: 0.3863]

[A
[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A
[A
[A

[A[A

[A[A

[A[A
[A
[A
[A

[A[A

[A[A

[A[A

[A[A
Bracket s=0 (n=2, r0=20.00): 100%|██████████| 2/2 [00:21<00:00, 10.52s/it, Best F1: 0.6389]

[A
[A

[A[A

[A[A

[A[A
Bracket s=0 (n=1, r0=30.00): 100%|██████████| 1/1 [00:00<00:00,  4.94it/s, Best F1: 0.3857]

[A
[A

[A[A

[A[A

[A[A
Bracket s=0 (n=1, r0=30.00): 100%|██████████| 1/1 [00:00<00:00,  6.35it/s, Best F1: 0.5409]
Running HPO: 100%|██████████| 2/2 [00:52<00:00, 26.26s/it]



HPO loop finished. Best parameters saved in 'best_params' directory.


In [None]:
################################################################
# Workflow Loop - Run Models with Best Parameters and Evaluate #
################################################################

# --- Configuration ---
BEST_PARAMS_DIR = "best_params" # Directory where HPO results are saved
RESULTS_DIR = "results" # Directory to save final results JSON
os.makedirs(RESULTS_DIR, exist_ok=True)
RESULTS_FILENAME = os.path.join(RESULTS_DIR, f"all_group_results_{datetime.now():%Y%m%d_%H%M%S}.json") # Timestamped results file

# List of feature groups to process (should match HPO loop or be a subset)
GROUP_NAMES_TO_PROCESS = list(groups.keys()) # Process all defined groups
GROUP_NAMES_TO_PROCESS = ['group_1', 'group_2'] # Example: Process only specific groups

# Ensure necessary variables and functions are defined in the global scope:
# groups, df, TARGET_COLUMN, clean_data, split_data, apply_feature_scaling,
# CALIBRATOR, ALPHA, RANDOM_SEED, MODEL_DIR, ROUNDS (for XGB/LGBM),
# svm_workflow, cart_workflow, random_forest_workflow, xgb_workflow, lgbm_workflow,
# All model names (model_name_svm, etc.),
# All HPO/Resource settings (MAX_RESOURCE_*, etc. - needed if best_params are missing),
# All necessary modules/functions passed to workflows (SVC, DecisionTreeClassifier,
# RandomForestClassifier, xgb, lgb, loguniform, randint, uniform, f1_score,
# hyperband_hpo, EarlyStopping, early_stopping, train_calibrate_model,
# fit_mondrian_classifier, evaluate_mondrian_prediction, calculate_metrics,
# logging, np, datetime)

# Dictionary to store results for all groups and models
all_group_results = {}

logging.info(f"Starting main workflow loop for groups: {GROUP_NAMES_TO_PROCESS}")
outer_loop_start_time = time.time()

# Use tqdm for the outer loop to show progress over groups
for group_name in tqdm(GROUP_NAMES_TO_PROCESS, desc="Processing Feature Groups"):
    logging.info(f"\n{'='*30} Processing Feature Group: {group_name} {'='*30}")
    group_start_time = time.time()
    group_results = {} # Store results for the current group
    group_best_params = {} # To store loaded best params

    # --- Load Best Params for the Group ---
    params_filepath = os.path.join(BEST_PARAMS_DIR, f"{group_name}_best_params.json")
    if os.path.exists(params_filepath):
        try:
            with open(params_filepath, 'r') as f:
                group_best_params = json.load(f)
            logging.info(f"[{group_name}] Successfully loaded best parameters from {params_filepath}")
        except Exception as e:
            logging.error(f"[{group_name}] Failed to load best parameters from {params_filepath}: {e}. Proceeding without pre-tuned params.", exc_info=True)
            group_best_params = {} # Reset if loading fails
    else:
        logging.warning(f"[{group_name}] Best parameters file not found: {params_filepath}. Workflows will run HPO internally.")
        group_best_params = {} # Ensure it's a dict

    # Extract best params for each model, defaulting to None if not found/loaded
    # The workflow functions are expected to handle None and run HPO if needed.
    best_params_svm = group_best_params.get(model_name_svm, None)
    best_params_cart = group_best_params.get(model_name_cart, None)
    best_params_rf = group_best_params.get(model_name_rf, None)
    best_params_xgb = group_best_params.get(model_name_xgb, None)
    best_params_lgbm = group_best_params.get(model_name_lgbm, None)

    try:
        # --- 4.1 Data Preparation for the Current Group ---
        logging.info(f"[{group_name}] Cleaning data...")
        X, y, df_clean = clean_data(df, group_name, TARGET_COLUMN, logger=logging)

        if X.empty or y.empty or df_clean.empty:
            logging.warning(f"[{group_name}] Skipping group due to insufficient data after cleaning (X: {X.shape}, y: {y.shape}).")
            all_group_results[group_name] = {'status': 'skipped_insufficient_data', 'results': group_results}
            continue

        logging.info(f"[{group_name}] Splitting data (Train/Val/Test/Cal)...")
        # Split into all necessary sets for training, validation (optional HPO and early stopping), calibration, and testing
        X_train, y_train, X_val, y_val, X_test, y_test, X_cal, y_cal = split_data(X, y)

        if X_train.empty or y_train.empty or X_test.empty or y_test.empty:
             logging.warning(f"[{group_name}] Skipping group due to empty train or test set after splitting.")
             all_group_results[group_name] = {'status': 'skipped_empty_splits', 'results': group_results}
             continue

        # Check if calibration/validation sets are needed and non-empty
        # Note: HPO might run internally if best_params were not loaded, requiring X_val/y_val
        hpo_might_run = any(p is None for p in [best_params_svm, best_params_cart, best_params_rf, best_params_xgb, best_params_lgbm])
        if (CALIBRATOR != 'none' and (X_cal.empty or y_cal.empty)):
             logging.warning(f"[{group_name}] Calibration set is empty, but CALIBRATOR is '{CALIBRATOR}'. Calibration will likely fail or be skipped.")
             # Decide whether to `continue` here based on strictness
        if (hpo_might_run and (X_val.empty or y_val.empty)):
             logging.warning(f"[{group_name}] Validation set is empty, and HPO might run internally (missing best params). HPO will likely fail.")
             # Decide whether to `continue` here

        logging.info(f"[{group_name}] Applying feature scaling (for SVM)...")
        # Scale all necessary splits and save the scaler this time
        X_train_scaled, X_val_scaled, X_test_scaled, X_cal_scaled, scaler = apply_feature_scaling(
            X_train, X_val, X_test, X_cal,
            len(X_train) / len(X) if len(X) > 0 else 0, # Calculate actual proportions for logging inside the function
            len(X_val) / len(X) if len(X) > 0 else 0,
            len(X_test) / len(X) if len(X) > 0 else 0,
            len(X_cal) / len(X) if len(X) > 0 else 0,
            MODEL_DIR, # Pass MODEL_DIR for saving the scaler
            save_scaler=True, # Ensure scaler is saved
            group_name=group_name # Pass group name for potentially unique scaler filename
        )

        # --- 4.2 Run Workflows for the Current Group ---

        # Workflow 1: SVM
        try:
            logging.info(f"--- [{group_name}] Running SVM Workflow ---")
            svm_results = svm_workflow(
                X_train_scaled, y_train, X_val_scaled, y_val,
                X_cal_scaled, y_cal, X_test_scaled, y_test,
                MAX_RESOURCE_SVM, MIN_RESOURCE_SVM, ETA_SVM, RESOURCE_TYPE_SVM,
                model_name_svm, CALIBRATOR, ALPHA, RANDOM_SEED,
                SVC=SVC, loguniform=loguniform, hyperband_hpo=hyperband_hpo, # Pass dependencies
                f1_score=f1_score, logging=logging, # Pass dependencies
                best_params_svm=best_params_svm, # Pass loaded/None params
                best_score_hpo_svm=None # HPO score not loaded, workflow calculates if needed
                
            )
            group_results.update(svm_results if svm_results else {model_name_svm: {"status": "failed"}})
        except Exception as e:
            logging.error(f"[{group_name}] SVM Workflow failed: {e}", exc_info=True)
            group_results[model_name_svm] = {"status": "failed", "error": str(e)}

        # Workflow 2: CART
        try:
            logging.info(f"--- [{group_name}] Running CART Workflow ---")
            cart_results = cart_workflow(
                X_train, y_train, X_val, y_val, X_cal, y_cal, X_test, y_test,
                model_name_cart,
                MAX_RESOURCE_CART, MIN_RESOURCE_CART, ETA_CART, RESOURCE_TYPE_CART,
                RANDOM_SEED, CALIBRATOR, ALPHA,
                DecisionTreeClassifier=DecisionTreeClassifier, randint=randint, # Pass dependencies
                f1_score=f1_score, hyperband_hpo=hyperband_hpo, logging=logging, # Pass dependencies
                best_params_cart=best_params_cart, # Pass loaded/None params
                best_score_hpo_cart=None # HPO score not loaded
                
            )
            group_results.update(cart_results if cart_results else {model_name_cart: {"status": "failed"}})
        except Exception as e:
            logging.error(f"[{group_name}] CART Workflow failed: {e}", exc_info=True)
            group_results[model_name_cart] = {"status": "failed", "error": str(e)}

        # Workflow 3: Random Forest
        try:
            logging.info(f"--- [{group_name}] Running Random Forest Workflow ---")
            # Ensure arguments match the function definition (file_context_9)
            rf_results = random_forest_workflow(
                X_train, y_train, X_val, y_val, X_cal, y_cal, X_test, y_test,
                model_name_rf, # model_name first
                MAX_RESOURCE_RF, MIN_RESOURCE_RF, ETA_RF, RESOURCE_TYPE_RF, # HPO settings
                CALIBRATOR, ALPHA, RANDOM_SEED,
                RandomForestClassifier=RandomForestClassifier, randint=randint, f1_score=f1_score, # Dependencies
                hyperband_hpo=hyperband_hpo, train_calibrate_model=train_calibrate_model, # Dependencies
                fit_mondrian_classifier=fit_mondrian_classifier, evaluate_mondrian_prediction=evaluate_mondrian_prediction, # Dependencies
                calculate_metrics=calculate_metrics, logging=logging, np=np, datetime=datetime, # Dependencies
                best_params_rf=best_params_rf, # Pass loaded/None params
                best_score_hpo_rf=None # HPO score not loaded
                
            )
            group_results.update(rf_results if rf_results else {model_name_rf: {"status": "failed"}})
        except Exception as e:
            logging.error(f"[{group_name}] Random Forest Workflow failed: {e}", exc_info=True)
            group_results[model_name_rf] = {"status": "failed", "error": str(e)}


        # Workflow 4: XGBoost
        try:
            logging.info(f"--- [{group_name}] Running XGBoost Workflow ---")
            # Ensure arguments match the function definition (file_context_4)
            xgb_results = xgb_workflow(
                X_train, y_train, X_val, y_val, X_cal, y_cal, X_test, y_test,
                {}, # Pass empty dict for all_results initially, workflow should manage its own return
                model_name_xgb,
                MAX_RESOURCE_XGB, MIN_RESOURCE_XGB, ETA_XGB, RESOURCE_TYPE_XGB, ROUNDS,
                CALIBRATOR, ALPHA, RANDOM_SEED,
                loguniform=loguniform, randint=randint, uniform=uniform, xgb=xgb, # Dependencies
                hyperband_hpo=hyperband_hpo, f1_score=f1_score, EarlyStopping=EarlyStopping, # Dependencies
                train_calibrate_model=train_calibrate_model, fit_mondrian_classifier=fit_mondrian_classifier, # Dependencies
                evaluate_mondrian_prediction=evaluate_mondrian_prediction, calculate_metrics=calculate_metrics, # Dependencies
                logging=logging, np=np, # Dependencies
                best_params_xgb=best_params_xgb, # Pass loaded/None params
                best_score_hpo_xgb=None # HPO score not loaded
                
            )
            group_results.update(xgb_results if xgb_results else {model_name_xgb: {"status": "failed"}})
        except Exception as e:
            logging.error(f"[{group_name}] XGBoost Workflow failed: {e}", exc_info=True)
            group_results[model_name_xgb] = {"status": "failed", "error": str(e)}


        # Workflow 5: LightGBM
        try:
            logging.info(f"--- [{group_name}] Running LightGBM Workflow ---")
            # Ensure arguments match the function definition (file_context_2)
            lgbm_results = lgbm_workflow(
                 X_train, y_train, X_val, y_val, X_cal, y_cal, X_test, y_test,
                 model_name_lgbm,
                 MAX_RESOURCE_LGBM, MIN_RESOURCE_LGBM, ETA_LGBM, RESOURCE_TYPE_LGBM, ROUNDS,
                 CALIBRATOR, ALPHA, RANDOM_SEED,
                 lgb=lgb, loguniform=loguniform, randint=randint, uniform=uniform, f1_score=f1_score, # Dependencies
                 hyperband_hpo=hyperband_hpo, early_stopping=early_stopping, train_calibrate_model=train_calibrate_model, # Dependencies
                 fit_mondrian_classifier=fit_mondrian_classifier, evaluate_mondrian_prediction=evaluate_mondrian_prediction, # Dependencies
                 calculate_metrics=calculate_metrics, logging=logging, np=np, datetime=datetime, # Dependencies
                 best_params_lgbm=best_params_lgbm, # Pass loaded/None params
                 best_score_hpo_lgbm=None # HPO score not loaded
                 
            )
            group_results.update(lgbm_results if lgbm_results else {model_name_lgbm: {"status": "failed"}})
        except Exception as e:
             logging.error(f"[{group_name}] LightGBM Workflow failed: {e}", exc_info=True)
             group_results[model_name_lgbm] = {"status": "failed", "error": str(e)}


        # --- 4.3 Store Results for the Group ---
        all_group_results[group_name] = {'status': 'completed', 'results': group_results}
        group_duration = time.time() - group_start_time
        logging.info(f"--- Finished processing Feature Group: {group_name} in {group_duration:.2f} seconds ---")

    except Exception as e:
        logging.error(f"[{group_name}] Unhandled exception during workflow processing for group: {e}", exc_info=True)
        all_group_results[group_name] = {'status': 'failed_outer', 'error': str(e), 'results': group_results}
        # Optionally `continue` or `break` depending on desired behavior

# --- End of Workflow Loop ---
outer_loop_duration = time.time() - outer_loop_start_time
logging.info(f"\n{'='*30} Finished processing all groups in {outer_loop_duration:.2f} seconds {'='*30}")

# --- Serialize Final Results ---
try:
    # Define a helper function to convert non-standard dictionary keys recursively
    def convert_keys_to_standard_types(obj):
        if isinstance(obj, dict):
            new_dict = {}
            for k, v in obj.items():
                new_key = k
                # Convert numpy integer keys to standard Python int
                if isinstance(k, np.integer):
                    new_key = int(k)
                # Add conversions for other non-standard key types if needed
                # elif isinstance(k, np.floating): new_key = float(k)
                # elif not isinstance(k, (str, int, float, bool, type(None))): new_key = str(k)
                new_dict[new_key] = convert_keys_to_standard_types(v) # Recurse on value
            return new_dict
        elif isinstance(obj, list):
            # Recursively process items in lists
            return [convert_keys_to_standard_types(item) for item in obj]
        else:
            # Return non-dict/list items as is
            return obj

    # Define a helper function to make result VALUES JSON serializable
    def default_serializer(obj):
        if isinstance(obj, np.integer):
            # Convert numpy integers to Python int
            return int(obj)
        elif isinstance(obj, np.floating):
            # Convert numpy floats to Python float, handle NaN/Inf
            if np.isnan(obj): return None # Represent NaN as null
            if np.isinf(obj): return None # Represent Inf as null
            return float(obj)
        elif isinstance(obj, np.ndarray):
            # Convert numpy arrays to lists
            return obj.tolist()
        elif isinstance(obj, (datetime, pd.Timestamp)):
             # Convert datetime/timestamp objects to ISO format string
             return obj.isoformat()
        elif isinstance(obj, pd.DataFrame):
            # Serialize DataFrames (example: to dict with 'split' orientation)
            try:
                return obj.to_dict(orient='split')
            except Exception:
                return f"DataFrame (shape {obj.shape}) - Not serialized"
        elif isinstance(obj, Exception): # Serialize exception objects to string
            return f"Error: {str(obj)}"
        # Fallback for other types: try converting to string
        try:
            return str(obj)
        except Exception:
            # If string conversion fails, represent as unserializable type
            return f"Unserializable type: {type(obj)}"

    # Convert keys in the results dictionary BEFORE dumping to JSON
    serializable_results = convert_keys_to_standard_types(all_group_results)

    # Dump the processed dictionary to JSON
    with open(RESULTS_FILENAME, 'w') as f:
        # Use the default_serializer for values; keys are now standard types
        json.dump(serializable_results, f, indent=4, default=default_serializer)

    logging.info(f"Successfully saved all group results to {RESULTS_FILENAME}")
    print(f"\nWorkflow loop finished. Results saved to '{RESULTS_FILENAME}'.")
except Exception as e:
    # Log the error and inform the user
    logging.error(f"Failed to serialize final results to {RESULTS_FILENAME}: {e}", exc_info=True)
    print("\nWorkflow loop finished, but failed to save results to JSON.")
    # Ensure the in-memory variable name matches what's used later if needed
    print("Results might be available in the 'all_group_results' dictionary variable in this session (keys might not be standard types).")
    print("Processed, serializable results might be available in 'serializable_results'.")


# The 'all_group_results' dictionary (and the saved JSON file) now holds the results.
# The next cell can load the JSON file or use the dictionary directly to create summary tables or plots.
# Example: Access results for SVM in group_1
# loaded_results = {}
# try:
#     with open(RESULTS_FILENAME, 'r') as f:
#         loaded_results = json.load(f)
#     print(loaded_results.get('group_1', {}).get('results', {}).get(model_name_svm))
# except Exception as e:
#     print(f"Error loading results file: {e}")
#     print("Using in-memory results:")
#     print(all_group_results.get('group_1', {}).get('results', {}).get(model_name_svm))

# The next cell should contain the code to parse 'all_group_results' (or loaded results)
# and display the final summary DataFrame.

Processing Feature Groups:   0%|          | 0/2 [00:00<?, ?it/s]

Selecting feature set group 'group_1' with 11 columns.


Processing Feature Groups:  50%|█████     | 1/2 [00:54<00:54, 54.76s/it]

Selecting feature set group 'group_2' with 25 columns.


Processing Feature Groups: 100%|██████████| 2/2 [02:30<00:00, 75.13s/it]


Workflow loop finished, but failed to save results to JSON.
Results are available in the 'all_group_results' dictionary variable in this session.





In [36]:
# --- Serialize Final Results ---
try:
    # Define a helper function to convert non-standard dictionary keys recursively
    def convert_keys_to_standard_types(obj):
        if isinstance(obj, dict):
            new_dict = {}
            for k, v in obj.items():
                new_key = k
                # Convert numpy integer keys to standard Python int
                if isinstance(k, np.integer):
                    new_key = int(k)
                # Add conversions for other non-standard key types if needed
                # elif isinstance(k, np.floating): new_key = float(k)
                # elif not isinstance(k, (str, int, float, bool, type(None))): new_key = str(k)
                new_dict[new_key] = convert_keys_to_standard_types(v) # Recurse on value
            return new_dict
        elif isinstance(obj, list):
            # Recursively process items in lists
            return [convert_keys_to_standard_types(item) for item in obj]
        else:
            # Return non-dict/list items as is
            return obj

    # Define a helper function to make result VALUES JSON serializable
    def default_serializer(obj):
        if isinstance(obj, np.integer):
            # Convert numpy integers to Python int
            return int(obj)
        elif isinstance(obj, np.floating):
            # Convert numpy floats to Python float, handle NaN/Inf
            if np.isnan(obj): return None # Represent NaN as null
            if np.isinf(obj): return None # Represent Inf as null
            return float(obj)
        elif isinstance(obj, np.ndarray):
            # Convert numpy arrays to lists
            return obj.tolist()
        elif isinstance(obj, (datetime, pd.Timestamp)):
             # Convert datetime/timestamp objects to ISO format string
             return obj.isoformat()
        elif isinstance(obj, pd.DataFrame):
            # Serialize DataFrames (example: to dict with 'split' orientation)
            try:
                return obj.to_dict(orient='split')
            except Exception:
                return f"DataFrame (shape {obj.shape}) - Not serialized"
        elif isinstance(obj, Exception): # Serialize exception objects to string
            return f"Error: {str(obj)}"
        # Fallback for other types: try converting to string
        try:
            return str(obj)
        except Exception:
            # If string conversion fails, represent as unserializable type
            return f"Unserializable type: {type(obj)}"

    # Convert keys in the results dictionary BEFORE dumping to JSON
    serializable_results = convert_keys_to_standard_types(all_group_results)

    # Dump the processed dictionary to JSON
    with open(RESULTS_FILENAME, 'w') as f:
        # Use the default_serializer for values; keys are now standard types
        json.dump(serializable_results, f, indent=4, default=default_serializer)

    logging.info(f"Successfully saved all group results to {RESULTS_FILENAME}")
    print(f"\nWorkflow loop finished. Results saved to '{RESULTS_FILENAME}'.")
except Exception as e:
    # Log the error and inform the user
    logging.error(f"Failed to serialize final results to {RESULTS_FILENAME}: {e}", exc_info=True)
    print("\nWorkflow loop finished, but failed to save results to JSON.")
    # Ensure the in-memory variable name matches what's used later if needed
    print("Results might be available in the 'all_group_results' dictionary variable in this session (keys might not be standard types).")
    print("Processed, serializable results might be available in 'serializable_results'.")


Workflow loop finished. Results saved to 'results\all_group_results_20250503_001415.json'.


### 5 Results


In [39]:
results_summary = []
for model_name, results_data in all_results.items():
    summary = {'Model': model_name}
    metrics = results_data.get('metrics')
    if metrics:
        summary.update(metrics)
        cm = summary.pop('confusion_matrix', None)
        if cm:
            summary['TN'] = cm.get('tn')
            summary['FP'] = cm.get('fp')
            summary['FN'] = cm.get('fn')
            summary['TP'] = cm.get('tp')

    # --- Use the new Mondrian CP results ---
    summary['CP Coverage (Mondrian)'] = results_data.get('cp_coverage_mond')
    summary['CP Avg Set Size (Mondrian)'] = results_data.get('cp_avg_set_size_mond')

    # --- Add per-class Mondrian CP coverage ---
    class_coverage_dict = results_data.get('cp_class_coverage_dict') or {}
    # Add columns for class 0 and class 1 coverage (use None if missing)
    summary['CP Coverage (Mondrian, class 0)'] = class_coverage_dict.get(0, None)
    summary['CP Coverage (Mondrian, class 1)'] = class_coverage_dict.get(1, None)

    summary['HPO F1'] = results_data.get('hpo_f1_score')
    summary['HPO Duration (s)'] = results_data.get('hpo_duration_s')
    summary['Final Estimators'] = results_data.get('final_n_estimators', 'N/A') # Keep if relevant (not for SVM)
    results_summary.append(summary)

results_df = pd.DataFrame(results_summary)

# Set display options for float formatting
pd.set_option('display.float_format', lambda x: f'{x:.4f}' if isinstance(x, float) else x)

print("\n===== Performance Metrics Summary =====")
from IPython.display import display # Make sure display is imported
if not results_df.empty:
    # Ensure all columns and all rows are displayed
    with pd.option_context('display.max_columns', None, 'display.max_rows', None):
        display(results_df)
else:
    print("No results to display.")

# --- Update CSV saving ---
results_csv_path = os.path.join(MODEL_DIR, f"model_comparison_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
try:
    results_df.to_csv(results_csv_path, index=False)
    logging.info(f"Results summary DataFrame saved to {results_csv_path}")
except Exception as e:
    logging.error(f"Failed to save results summary CSV: {e}")

# --- Optional Plotting (Update or Remove) ---
# The existing plotting code for 'CP Empty %' and 'CP Multi-Class %'
# might need to be removed or adapted if you don't calculate these stats
# with the new mondrian_icp function.
# For now, let's comment it out as crepes doesn't directly return these counts easily.

# if not results_df.empty and 'CP Empty Sets' in results_df.columns and 'CP Multi-Class Sets' in results_df.columns:
#    ... (keep the existing plot code commented out or remove it) ...
# else:
#    logging.warning("Could not plot CP set types: Results DataFrame is empty or missing required columns.")


===== Performance Metrics Summary =====


Unnamed: 0,Model,accuracy,precision,recall_tpr,f1_score,specificity_tnr,g_mean,roc_auc,pr_auc,brier_score,TN,FP,FN,TP,CP Coverage (Mondrian),CP Avg Set Size (Mondrian),"CP Coverage (Mondrian, class 0)","CP Coverage (Mondrian, class 1)",HPO F1,HPO Duration (s),Final Estimators
0,SVM,0.9644,0.8626,0.587,0.6986,0.9929,0.7635,0.9365,0.7622,0.0305,5057,36,159,226,,,,,0.5852,6.4635,
1,CART,0.956,0.9737,0.3844,0.5512,0.9992,0.6198,0.7106,0.5881,0.0654,5089,4,237,148,,,,,0.4768,0.9327,
2,Random_Forest,0.973,0.9438,0.6545,0.773,0.9971,0.8078,0.9483,0.8201,0.0245,5078,15,133,252,,,,,0.7362,33.8943,
3,XGBoost,0.9708,0.9518,0.6156,0.7476,0.9976,0.7837,0.9496,0.7952,0.0287,5081,12,148,237,,,,,0.5753,0.4922,99.0
4,LightGBM,0.9717,0.9389,0.639,0.7604,0.9969,0.7981,0.9525,0.8157,0.025,5077,16,139,246,,,,,0.7034,0.1797,100.0
