In [25]:
#jupyter nbconvert --to script Model.ipynb


#TODO: Calibración con Conformal Prediction.
#TODO: Hyperparameter selection con el validation dataset. `GridSearchCV` or `RandomizedSearchCV`
#TODO: Undersampling/weighted algorithms.
#TODO: Usar metricas para estudiar la performance.
#TODO: Estudiar relevancia de las features para cada modelo.

In [26]:
# Remove any existing log files
import os
import glob
import logging

# Reset logger to avoid any issues with permissions
logging.shutdown()
# Remove loggers
for log_file in glob.glob("*.log"):
    os.remove(log_file)



# Star-Galaxy Classification using ALHAMBRA Photometry

This notebook implements and evaluates several machine learning models for classifying astronomical objects as stars or galaxies based on multi-band photometric data from the ALHAMBRA survey, using labels derived from higher-resolution COSMOS2020 data.

**Target Variable:** `acs_mu_class` (from COSMOS2020)
 - Which is 1 for Galaxy and 2 for Star. We will remap this to 0 (Galaxy, majority class) and 1 (Star, minority class).

**Features:** Selected columns from the ALHAMBRA survey data.

**Models:**
1. Support Vector Machine (SVM)
2. Decision Tree (CART)
3. Random Forest
4. XGBoost
5. LightGBM

## 0. Setup and Configuration

In [27]:
import pandas as pd
import numpy as np
import os
import logging
from datetime import datetime
import joblib # For saving/loading models efficiently
import glob

# Scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, roc_auc_score
)

# Boosting models
import xgboost as xgb
import lightgbm as lgb


# Configure logging
logging.shutdown()
logging.basicConfig(
    filename=f'models_{datetime.now().strftime("%d_%H-%M-%S")}.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True
)
# Prevent logs from being printed to console
logging.getLogger().handlers = [h for h in logging.getLogger().handlers if isinstance(h, logging.FileHandler)]

## 1. Loading Dataset & Feature Selection

**Interesting Feature Combinations for Modeling:**

1.  `morphology_only`: Use only features from category 1.
2.  `photometry_magnitudes_only`: Use only features from category 2.
3.  `photometry_mags_errors`: Use features from categories 2 and 3. (Recommended over magnitudes alone).
4.  `photometry_colors_only`: *Requires deriving colors first* (e.g., F457W - F519W, J - KS). Not directly listed but a very common and powerful approach.
5.  `photometry_plus_morphology`: Use features from categories 1, 2, and 3.
6.  `photometry_no_redshift`: Use features from categories 1, 2, 3, and 5. Excludes all BPZ-derived outputs. Tests classification based purely on observed shape and flux.
7.  `redshift_related_only`: Use only features from category 4. Tests how much classification info is contained *just* in the photo-z code's output (which is itself based on photometry).
8.  `full_alhambra_photometry_based`: Use features from categories 2, 3, 4, and 5. All information derivable from ALHAMBRA photometry, including photo-z results and per-band quality.
9.  `full_alhambra_all`: Use features from categories 1, 2, 3, 4, and 5. The most comprehensive set using only ALHAMBRA-derived features.


In [28]:
# Read the df
df = pd.read_csv('data/match_alhambra_cosmos2020_ACS_class_0.8arcsec.csv')
logging.info(f"DataFrame created with shape: {df.shape}")
# Map ACS classification: 1 (Galaxy, Majority) -> 0, 2 (Star, minority) -> 1, 3 (Fake) -> drop
logging.info("Original class counts:")
logging.info(df['acs_mu_class'].value_counts().to_string())

# Drop fake detections (class 3)
# Drop fake detections
n_fakes = (df['acs_mu_class'] == 3).sum()
logging.info(f"Number of fake detections (class 3): {n_fakes}")
df = df[df['acs_mu_class'] != 3]

# Map classifications
df['acs_mu_class'] = df['acs_mu_class'].map({1: 0, 2: 1})

logging.info("After dropping fakes and mapping classes (0: Galaxy, 1: Star):")
logging.info(df['acs_mu_class'].value_counts().to_string())

In [29]:
# Input features

# --- Define feature categories based on ALHAMBRA data using exact names ---

# 1. ALHAMBRA Morphology Features (SExtractor-based)
morphology_features = [
    'area', 'fwhm', 'stell', 'ell', 'a', 'b', 'theta', 'rk', 'rf', 's2n'
]

# 2. ALHAMBRA Photometry Magnitudes (Optical + NIR + Synthetic)
OPTICAL_MAG_COLS = [
    'F365W', 'F396W', 'F427W', 'F458W', 'F489W', 'F520W', 'F551W',
    'F582W', 'F613W', 'F644W', 'F675W', 'F706W', 'F737W', 'F768W',
    'F799W', 'F830W', 'F861W', 'F892W', 'F923W', 'F954W'
]
photometry_magnitudes = (
    OPTICAL_MAG_COLS +
    ['J', 'H', 'KS', 'F814W']
)

# 3. ALHAMBRA Photometry Uncertainties
OPTICAL_ERR_COLS = [
    'dF365W', 'dF396W', 'dF427W', 'dF458W', 'dF489W', 'dF520W', 'dF551W',
    'dF582W', 'dF613W', 'dF644W', 'dF675W', 'dF706W', 'dF737W', 'dF768W',
    'dF799W', 'dF830W', 'dF861W', 'dF892W', 'dF923W', 'dF954W'
]
photometry_uncertainties = (
    OPTICAL_ERR_COLS +
    ['dJ', 'dH', 'dKS', 'dF814W']
)

# Combine Mags and Errors - often used together
photometry_mags_errors = photometry_magnitudes + photometry_uncertainties

# 4. ALHAMBRA Photometric Redshift & Derived Features (BPZ-based)
redshift_derived_features = [
    'zb_1', 'zb_Min_1', 'zb_Max_1', 'Tb_1', 'Odds_1',
    'z_ml', 't_ml', 'Chi2',
    'Stell_Mass_1', 'M_Abs_1', 'MagPrior'
]

# 5. ALHAMBRA Quality/Auxiliary Features (per-band quality etc.)
OPTICAL_IRMS_COLS = [
    'irms_F365W', 'irms_F396W', 'irms_F427W', 'irms_F458W', 'irms_F489W',
    'irms_F520W', 'irms_F551W', 'irms_F582W', 'irms_F613W', 'irms_F644W',
    'irms_F675W', 'irms_F706W', 'irms_F737W', 'irms_F768W', 'irms_F799W',
    'irms_F830W', 'irms_F861W', 'irms_F892W', 'irms_F923W', 'irms_F954W'
]
quality_aux_features = (
    ['nfobs'] +
    OPTICAL_IRMS_COLS +
    ['irms_J', 'irms_H', 'irms_KS', 'irms_F814W']
)

# --- Define lists of features NOT used for modeling ---

non_modeling_identifiers = ['ID_1', 'id_2'] # ALHAMBRA ID, COSMOS ID

non_modeling_astrometry = [
    'RA_1', 'Dec_1', 'x', 'y', # ALHAMBRA Astrometry
    'ra_2', 'dec_2',          # COSMOS Astrometry
    'Separation'              # Matching Quality
]

non_modeling_flags = [
    'photoflag', 'xray', 'PercW', 'Satur_Flag', # ALHAMBRA Object/Photometry Flags
    'irms_OPT_Flag', 'irms_NIR_Flag'           # ALHAMBRA Overall Quality Flags
]

non_modeling_alhambra_prediction = ['Stellar_Flag'] # ALHAMBRA's own classification

non_modeling_aperture_mags = [ # Specific aperture mags, usually use total mags
    'F814W_3arcs', 'dF814W_3arcs', 'F814W_3arcs_corr'
]

non_modeling_cosmos_features = [ # Measurements/flags derived from COSMOS data (HST, HSC, VISTA...)
    'model_flag',
    'flag_hsc', 'flag_supcam', 'flag_udeep', 'flag_uvista',
    'hsc_r_mag', 'hsc_r_magerr', 'hsc_r_valid',
    'hsc_i_mag', 'hsc_i_magerr', 'hsc_i_valid',
    'uvista_j_mag', 'uvista_j_magerr', 'uvista_j_valid',
    'uvista_ks_mag', 'uvista_ks_magerr', 'uvista_ks_valid',
    'acs_f814w_mag', 'acs_f814w_magerr',
    'acs_fwhm_world', 'acs_mu_max',
    'solution_model' # This is categorical, but still COSMOS-derived info
]

target_variable = ['acs_mu_class'] # The COSMOS classification label to predict

# --- Consolidate into the main dictionary for easy access ---

feature_sets = {
    # --- Potential Input Feature Sets ---
    'morphology_only': morphology_features,
    'photometry_magnitudes_only': photometry_magnitudes,
    'photometry_mags_errors': photometry_mags_errors,
    'photometry_plus_morphology': photometry_mags_errors + morphology_features,
    'photometry_no_redshift': photometry_mags_errors + morphology_features + quality_aux_features,
    'redshift_related_only': redshift_derived_features,
    'full_alhambra_photometry_based': photometry_mags_errors + redshift_derived_features + quality_aux_features,
    'full_alhambra_all': (morphology_features +
                          photometry_mags_errors +
                          redshift_derived_features +
                          quality_aux_features),

    # --- Excluded Feature Sets ---
    'non_modeling_identifiers': non_modeling_identifiers,
    'non_modeling_astrometry': non_modeling_astrometry,
    'non_modeling_flags': non_modeling_flags,
    'non_modeling_alhambra_prediction': non_modeling_alhambra_prediction,
    'non_modeling_aperture_mags': non_modeling_aperture_mags,
    'non_modeling_cosmos_features': non_modeling_cosmos_features,
    'target_variable': target_variable
}

# --- Function to get a specific feature set (Unchanged from before) ---

def get_feature_set(df, set_name, feature_sets_dict):
    """
    Selects columns from a DataFrame based on a predefined feature set name.

    Args:
        df (pd.DataFrame): The input DataFrame.
        set_name (str): The name of the desired feature set
                        (must be a key in feature_sets_dict).
        feature_sets_dict (dict): Dictionary containing feature set names
                                  as keys and lists of column names as values.

    Returns:
        pd.DataFrame: A DataFrame containing only the columns
                      belonging to the specified feature set.
                      Returns an empty DataFrame if no columns are found.
    """
    if set_name not in feature_sets_dict:
        raise ValueError(f"Feature set '{set_name}' not defined. "
                         f"Available sets: {list(feature_sets_dict.keys())}")

    # Get the list of columns for the requested set
    required_cols_in_set = feature_sets_dict[set_name]

    # Find which of these columns actually exist in the DataFrame
    available_cols = [col for col in required_cols_in_set if col in df.columns]

    # Warn if some columns from the set definition are missing
    missing_cols = [col for col in required_cols_in_set if col not in available_cols]
    if missing_cols:
        print(f"Warning: The following columns defined for feature set '{set_name}'"
              f" were not found in the DataFrame and will be excluded: {missing_cols}")

    if not available_cols:
        print(f"Warning: No columns for feature set '{set_name}' found in the DataFrame.")
        return pd.DataFrame() # Return empty DataFrame

    print(f"Selecting feature set '{set_name}' with {len(available_cols)} columns.")
    return df[available_cols]

# --- Example Usage ---
# Assuming 'df' is your loaded DataFrame

# Example: Get the DataFrame with only morphology features
# df_morphology = get_feature_set(df, 'morphology_only', feature_sets)
# print(f"Morphology features shape: {df_morphology.shape}")

# Example: Get the DataFrame with photometry (mags+errors) and morphology
# df_phot_morph = get_feature_set(df, 'photometry_plus_morphology', feature_sets)
# print(f"Photometry + Morphology features shape: {df_phot_morph.shape}")

# Example: Get the most comprehensive set of ALHAMBRA features
# df_full = get_feature_set(df, 'full_alhambra_all', feature_sets)
# print(f"Full ALHAMBRA features shape: {df_full.shape}")

# Example: Get the target variable
# target = df[feature_sets['target_variable'][0]]
# print(f"Target variable shape: {target.shape}")

# Example: See which COSMOS features were identified for exclusion
# df_cosmos_excluded = get_feature_set(df, 'non_modeling_cosmos_features', feature_sets)
# print(f"COSMOS features to exclude: {df_cosmos_excluded.columns.tolist()}")

## 2. Data Preprocessing and Splitting

In [30]:
# Data splitting parameters
TEST_SIZE = 0.10 # Test set proportion
VAL_SIZE = 0.10 # Validation set proportion
CAL_SIZE = 0.10 # Calibration set proportion
# Train size will be 1 - (TEST_SIZE + VAL_SIZE + CAL_SIZE) = 0.70

RANDOM_SEED = 42 # For reproducibility

# Model saving directory
MODEL_DIR = "trained_models"
os.makedirs(MODEL_DIR, exist_ok=True)

# Data splitting strategy ('stratified' or 'random')
SPLIT_STRATEGY = 'stratified' # Recommended for imbalanced datasets


In [31]:
# Target column
TARGET_COLUMN = target_variable[0]

# Feature columns sets: 
    # 'morphology_only', 'photometry_magnitudes_only', 'photometry_mags_errors','photometry_plus_morphology','photometry_no_redshift','redshift_related_only','full_alhambra_photometry_based','full_alhambra_all'
FEATURE_COLUMNS = 'full_alhambra_all'

In [32]:
# --- Data Cleaning ---
logging.info(f"Original dataset size: {df.shape}")

# Handle missing values (simple strategy: drop rows with NaNs in target or features)
# More sophisticated imputation could be used later.
df_clean = df.dropna(subset=[TARGET_COLUMN] + feature_sets[FEATURE_COLUMNS]).copy()
logging.info(f"Dataset size after dropping NaNs: {df_clean.shape}")

logging.info(f"Value counts for target:\n1 (Star): {(df_clean[TARGET_COLUMN] == 1).sum()}\n0 (Galaxy): {(df_clean[TARGET_COLUMN] == 0).sum()}")

# Separate features (X) and target (y)
X = get_feature_set(df_clean, FEATURE_COLUMNS, feature_sets)
y = df_clean[TARGET_COLUMN]

Selecting feature set 'full_alhambra_all' with 94 columns.


In [33]:
# --- Data Splitting ---

logging.info(f"Splitting data using '{SPLIT_STRATEGY}' strategy...")
logging.info(f"Split ratios: Train={1-TEST_SIZE-VAL_SIZE-CAL_SIZE:.2f}, Val={VAL_SIZE:.2f}, Test={TEST_SIZE:.2f}, Cal={CAL_SIZE:.2f}")

# Calculate intermediate split sizes
val_test_cal_size = VAL_SIZE + TEST_SIZE + CAL_SIZE # 0.30
val_rel_size = VAL_SIZE / val_test_cal_size         # 0.10 / 0.30 = 1/3
test_cal_size = TEST_SIZE + CAL_SIZE               # 0.20
test_rel_size = TEST_SIZE / test_cal_size            # 0.10 / 0.20 = 1/2

stratify_option = y if SPLIT_STRATEGY == 'stratified' else None

# First split: Train vs. Temp (Val + Test + Cal)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=val_test_cal_size,
    random_state=RANDOM_SEED,
    stratify=stratify_option
)
logging.info(f"Train set shape: {X_train.shape}")
logging.info(f"Train set class distribution:\nStars (1): {(y_train == 1).mean():.2%}\nGalaxies (0): {(y_train == 0).mean():.2%}")

# Second split: Temp -> Val vs. Temp2 (Test + Cal)
stratify_option_temp = y_temp if SPLIT_STRATEGY == 'stratified' else None
X_val, X_temp2, y_val, y_temp2 = train_test_split(
    X_temp, y_temp,
    test_size=(1 - val_rel_size), # Size of Temp2 relative to Temp
    random_state=RANDOM_SEED,
    stratify=stratify_option_temp
)
logging.info(f"Validation set shape: {X_val.shape}")


# Third split: Temp2 -> Test vs. Cal
stratify_option_temp2 = y_temp2 if SPLIT_STRATEGY == 'stratified' else None
X_test, X_cal, y_test, y_cal = train_test_split(
    X_temp2, y_temp2,
    test_size=test_rel_size, # Size of Cal relative to Temp2
    random_state=RANDOM_SEED,
    stratify=stratify_option_temp2
)
logging.info(f"Test set shape: {X_test.shape}")
logging.info(f"Calibration set shape: {X_cal.shape}")

# Verify splits (approximate due to stratification)
assert len(X_train) + len(X_val) + len(X_test) + len(X_cal) == len(X)
logging.info("Data splitting complete.")
logging.info(f"Train target distribution:\n{y_train.value_counts(normalize=True)}")
logging.info(f"Validation target distribution:\n{y_val.value_counts(normalize=True)}")
logging.info(f"Test target distribution:\n{y_test.value_counts(normalize=True)}")
logging.info(f"Calibration target distribution:\n{y_cal.value_counts(normalize=True)}")

In [34]:
# --- Feature Scaling ---
# Important for SVM, can be beneficial for others too.
# Fit scaler ONLY on training data, then transform all sets.

logging.info("Applying StandardScaler to features...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
X_cal_scaled = scaler.transform(X_cal)

# Save the scaler
scaler_filename = os.path.join(MODEL_DIR, f"scaler_{datetime.now().strftime('%Y%m%d_%H%M%S')}.joblib")
joblib.dump(scaler, scaler_filename)
logging.info(f"Scaler saved to {scaler_filename}")

# Use scaled data for models sensitive to scale (like SVM)
# For tree-based models, scaling is not strictly necessary, but using scaled
# data consistently here won't hurt and simplifies the workflow.
X_train_processed = X_train_scaled
X_val_processed = X_val_scaled
X_test_processed = X_test_scaled
X_cal_processed = X_cal_scaled

# (Optional) Convert back to DataFrames for easier column inspection if needed
# X_train_processed = pd.DataFrame(X_train_scaled, columns=FEATURE_COLUMNS, index=X_train.index)
# X_val_processed = pd.DataFrame(X_val_scaled, columns=FEATURE_COLUMNS, index=X_val.index)
# X_test_processed = pd.DataFrame(X_test_scaled, columns=FEATURE_COLUMNS, index=X_test.index)
# X_cal_processed = pd.DataFrame(X_cal_scaled, columns=FEATURE_COLUMNS, index=X_cal.index)

logging.info("Feature scaling complete.")

## 3. Model Implementation: Support Vector Machine (SVM)

In [35]:
# --- 3.1 SVM: Define Model ---
model_name_svm = "svm_rbf"
model_svm = None # Initialize variable

# Hyperparameters (Good defaults, tune using validation set later)
svm_params = {
    'C': 1.0,            # Regularization parameter
    'kernel': 'rbf',       # Kernel type ('linear', 'poly', 'rbf', 'sigmoid')
    'gamma': 'scale',    # Kernel coefficient for 'rbf', 'poly', 'sigmoid' ('scale' or 'auto' or float)
    'probability': True, # Enable probability estimates (needed for ROC AUC, calibration)
    'random_state': RANDOM_SEED,
    'class_weight': 'balanced' # Useful for imbalanced datasets
}

model_svm = SVC(**svm_params)
logging.info(f"Defined SVM model '{model_name_svm}' with params: {svm_params}")

In [36]:
# --- 3.2 SVM: Train Model ---
logging.info(f"--- Training {model_name_svm} ---")
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_filename_svm = os.path.join(MODEL_DIR, f"{model_name_svm}_{timestamp}.joblib")

if os.path.exists(model_filename_svm):
    logging.warning(f"Model file {model_filename_svm} already exists. Skipping training.")
    # Optionally load the existing one here if needed immediately
    # model_svm = joblib.load(model_filename_svm)
else:
    logging.info(f"Starting training for {model_name_svm}...")
    start_time = datetime.now()

    # Use scaled data for SVM
    model_svm.fit(X_train_processed, y_train)

    end_time = datetime.now()
    training_duration = end_time - start_time
    logging.info(f"Training finished in: {training_duration}")
    logging.info(f"Saving trained model to {model_filename_svm}")
    joblib.dump(model_svm, model_filename_svm)
    logging.info("Model saved successfully.")


In [37]:
# --- 3.3 SVM: Load Model ---
# Find the latest SVM model file
list_of_svm_files = glob.glob(os.path.join(MODEL_DIR, f"{model_name_svm}_*.joblib"))
if list_of_svm_files:
    latest_svm_file = max(list_of_svm_files, key=os.path.getctime)
    logging.info(f"Loading latest SVM model: {latest_svm_file}")
    try:
        model_svm_loaded = joblib.load(latest_svm_file)
        logging.info("SVM model loaded successfully.")
    except Exception as e:
        logging.error(f"Error loading SVM model: {e}")
        model_svm_loaded = None
else:
    logging.warning(f"No saved models found for {model_name_svm} in {MODEL_DIR}")
    model_svm_loaded = None

In [38]:
# --- 3.4 SVM: Test Model ---
if model_svm_loaded:
    logging.info(f"--- Testing {model_name_svm} ---")
    # Use scaled test data
    y_pred_svm = model_svm_loaded.predict(X_test_processed)
    y_proba_svm = model_svm_loaded.predict_proba(X_test_processed)[:, 1] # Probability of class 1 (Star)

    # Evaluate
    accuracy_svm = accuracy_score(y_test, y_pred_svm)
    roc_auc_svm = roc_auc_score(y_test, y_proba_svm)
    report_svm = classification_report(y_test, y_pred_svm, target_names=['Galaxy (0)', 'Star (1)'])
    cm_svm = confusion_matrix(y_test, y_pred_svm)

    logging.info(f"SVM Test Accuracy: {accuracy_svm:.4f}")
    logging.info(f"SVM Test ROC AUC: {roc_auc_svm:.4f}")
    logging.info(f"SVM Classification Report:\n{report_svm}")
    logging.info(f"SVM Confusion Matrix:\n{cm_svm}")
else:
    logging.warning("SVM model not loaded. Skipping testing.")

## 4. Model Implementation: Decision Tree (CART)

In [39]:
# --- 4.1 CART: Define Model ---
model_name_cart = "cart"
model_cart = None

# Hyperparameters (Defaults tend to overfit, apply some basic constraints)
cart_params = {
    'criterion': 'gini',        # Split quality measure ('gini' or 'entropy')
    'max_depth': 15,            # Max depth to prevent overfitting (tune later)
    'min_samples_split': 10,    # Min samples required to split an internal node (tune later)
    'min_samples_leaf': 5,      # Min samples required at a leaf node (tune later)
    'random_state': RANDOM_SEED,
    'class_weight': 'balanced' # Handle imbalance
}

model_cart = DecisionTreeClassifier(**cart_params)
logging.info(f"Defined CART model '{model_name_cart}' with params: {cart_params}")

In [40]:
# --- 4.2 CART: Train Model ---
logging.info(f"--- Training {model_name_cart} ---")
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_filename_cart = os.path.join(MODEL_DIR, f"{model_name_cart}_{timestamp}.joblib")

if os.path.exists(model_filename_cart):
    logging.warning(f"Model file {model_filename_cart} already exists. Skipping training.")
else:
    logging.info(f"Starting training for {model_name_cart}...")
    start_time = datetime.now()

    # Tree models don't strictly need scaling, but we use processed data for consistency
    model_cart.fit(X_train_processed, y_train)

    end_time = datetime.now()
    training_duration = end_time - start_time
    logging.info(f"Training finished in: {training_duration}")
    logging.info(f"Saving trained model to {model_filename_cart}")
    joblib.dump(model_cart, model_filename_cart)
    logging.info("Model saved successfully.")

In [41]:
# --- 4.3 CART: Load Model ---
list_of_cart_files = glob.glob(os.path.join(MODEL_DIR, f"{model_name_cart}_*.joblib"))
if list_of_cart_files:
    latest_cart_file = max(list_of_cart_files, key=os.path.getctime)
    logging.info(f"Loading latest CART model: {latest_cart_file}")
    try:
        model_cart_loaded = joblib.load(latest_cart_file)
        logging.info("CART model loaded successfully.")
    except Exception as e:
        logging.error(f"Error loading CART model: {e}")
        model_cart_loaded = None
else:
    logging.warning(f"No saved models found for {model_name_cart} in {MODEL_DIR}")
    model_cart_loaded = None

In [42]:
# --- 4.4 CART: Test Model ---
if model_cart_loaded:
    logging.info(f"--- Testing {model_name_cart} ---")
    y_pred_cart = model_cart_loaded.predict(X_test_processed)
    y_proba_cart = model_cart_loaded.predict_proba(X_test_processed)[:, 1]

    # Evaluate
    accuracy_cart = accuracy_score(y_test, y_pred_cart)
    roc_auc_cart = roc_auc_score(y_test, y_proba_cart)
    report_cart = classification_report(y_test, y_pred_cart, target_names=['Galaxy (0)', 'Star (1)'])
    cm_cart = confusion_matrix(y_test, y_pred_cart)

    logging.info(f"CART Test Accuracy: {accuracy_cart:.4f}")
    logging.info(f"CART Test ROC AUC: {roc_auc_cart:.4f}")
    logging.info(f"CART Classification Report:\n{report_cart}")
    logging.info(f"CART Confusion Matrix:\n{cm_cart}")
else:
    logging.warning("CART model not loaded. Skipping testing.")

## 5. Model Implementation: Random Forest

In [43]:
# --- 5.1 RF: Define Model ---
model_name_rf = "random_forest"
model_rf = None

# Hyperparameters (Good starting point)
rf_params = {
    'n_estimators': 200,        # Number of trees in the forest
    'criterion': 'gini',
    'max_depth': None,          # Grow trees fully (or set a limit like CART)
    'min_samples_split': 2,     # Default
    'min_samples_leaf': 1,      # Default (can increase for regularization)
    'max_features': 'sqrt',     # Number of features to consider for best split ('sqrt', 'log2', or int/float)
    'bootstrap': True,          # Use bootstrap samples
    'random_state': RANDOM_SEED,
    'n_jobs': -1,               # Use all available CPU cores
    'class_weight': 'balanced'  # Handle imbalance
}

model_rf = RandomForestClassifier(**rf_params)
logging.info(f"Defined RF model '{model_name_rf}' with params: {rf_params}")

In [44]:
# --- 5.2 RF: Train Model ---
logging.info(f"--- Training {model_name_rf} ---")
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_filename_rf = os.path.join(MODEL_DIR, f"{model_name_rf}_{timestamp}.joblib")

if os.path.exists(model_filename_rf):
    logging.warning(f"Model file {model_filename_rf} already exists. Skipping training.")
else:
    logging.info(f"Starting training for {model_name_rf}...")
    start_time = datetime.now()

    model_rf.fit(X_train_processed, y_train)

    end_time = datetime.now()
    training_duration = end_time - start_time
    logging.info(f"Training finished in: {training_duration}")
    logging.info(f"Saving trained model to {model_filename_rf}")
    joblib.dump(model_rf, model_filename_rf)
    logging.info("Model saved successfully.")


In [45]:
# --- 5.3 RF: Load Model ---
list_of_rf_files = glob.glob(os.path.join(MODEL_DIR, f"{model_name_rf}_*.joblib"))
if list_of_rf_files:
    latest_rf_file = max(list_of_rf_files, key=os.path.getctime)
    logging.info(f"Loading latest RF model: {latest_rf_file}")
    try:
        model_rf_loaded = joblib.load(latest_rf_file)
        logging.info("RF model loaded successfully.")
    except Exception as e:
        logging.error(f"Error loading RF model: {e}")
        model_rf_loaded = None
else:
    logging.warning(f"No saved models found for {model_name_rf} in {MODEL_DIR}")
    model_rf_loaded = None

In [46]:
# --- 5.4 RF: Test Model ---
if model_rf_loaded:
    logging.info(f"--- Testing {model_name_rf} ---")
    y_pred_rf = model_rf_loaded.predict(X_test_processed)
    y_proba_rf = model_rf_loaded.predict_proba(X_test_processed)[:, 1]

    # Evaluate
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    roc_auc_rf = roc_auc_score(y_test, y_proba_rf)
    report_rf = classification_report(y_test, y_pred_rf, target_names=['Galaxy (0)', 'Star (1)'])
    cm_rf = confusion_matrix(y_test, y_pred_rf)

    logging.info(f"RF Test Accuracy: {accuracy_rf:.4f}")
    logging.info(f"RF Test ROC AUC: {roc_auc_rf:.4f}")
    logging.info(f"RF Classification Report:\n{report_rf}")
    logging.info(f"RF Confusion Matrix:\n{cm_rf}")
else:
    logging.warning("RF model not loaded. Skipping testing.")

In [47]:
# --- 6.1 XGB: Define Model ---
model_name_xgb = "xgboost"
model_xgb = None

# Hyperparameters
xgb_params = {
    'objective': 'binary:logistic', # Objective function for binary classification
    'eval_metric': 'auc',           # Evaluation metric ('logloss', 'auc', 'error')
    'n_estimators': 200,            # Number of boosting rounds/trees
    'learning_rate': 0.1,           # Step size shrinkage
    'max_depth': 5,                 # Maximum tree depth
    'subsample': 0.8,               # Fraction of samples used per tree
    'colsample_bytree': 0.8,        # Fraction of features used per tree
    'gamma': 0,                     # Minimum loss reduction required to make a split
    'reg_alpha': 0,                 # L1 regularization
    'reg_lambda': 1,                # L2 regularization (default)
    'use_label_encoder': False,     # Recommended setting for recent versions
    'random_state': RANDOM_SEED,
    'n_jobs': -1
    # scale_pos_weight can be used for imbalance, but often handled by eval_metric='auc' and tuning
}

model_xgb = xgb.XGBClassifier(**xgb_params)
logging.info(f"Defined XGBoost model '{model_name_xgb}' with params: {xgb_params}")

In [48]:
# --- 6.2 XGB: Train Model ---
logging.info(f"--- Training {model_name_xgb} ---")
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# XGBoost has its own save method, but joblib works too for consistency here
model_filename_xgb = os.path.join(MODEL_DIR, f"{model_name_xgb}_{timestamp}.joblib")

if os.path.exists(model_filename_xgb):
    logging.warning(f"Model file {model_filename_xgb} already exists. Skipping training.")
else:
    logging.info(f"Starting training for {model_name_xgb}...")
    start_time = datetime.now()

    # Use validation set for early stopping
    eval_set = [(X_val_processed, y_val)]
    early_stopping_rounds = 15 # Stop if no improvement on eval_set for 15 rounds

    model_xgb.fit(X_train_processed, y_train,
                  eval_set=eval_set,
                  early_stopping_rounds=early_stopping_rounds,
                  verbose=False) # Set verbose=True or integer for progress logs

    end_time = datetime.now()
    training_duration = end_time - start_time
    logging.info(f"Training finished in: {training_duration}")
    logging.info(f"Best iteration: {model_xgb.best_iteration}, Best score ({xgb_params['eval_metric']}): {model_xgb.best_score:.4f}")
    logging.info(f"Saving trained model to {model_filename_xgb}")
    joblib.dump(model_xgb, model_filename_xgb)
    # Alternatively: model_xgb.save_model(model_filename_xgb.replace('.joblib', '.xgb'))
    logging.info("Model saved successfully.")

TypeError: fit() got an unexpected keyword argument 'early_stopping_rounds'

In [None]:
# --- 6.3 XGB: Load Model ---
list_of_xgb_files = glob.glob(os.path.join(MODEL_DIR, f"{model_name_xgb}_*.joblib"))
if list_of_xgb_files:
    latest_xgb_file = max(list_of_xgb_files, key=os.path.getctime)
    logging.info(f"Loading latest XGBoost model: {latest_xgb_file}")
    try:
        model_xgb_loaded = joblib.load(latest_xgb_file)
        # If using native save:
        # model_xgb_loaded = xgb.XGBClassifier()
        # model_xgb_loaded.load_model(latest_xgb_file.replace('.joblib', '.xgb'))
        logging.info("XGBoost model loaded successfully.")
    except Exception as e:
        logging.error(f"Error loading XGBoost model: {e}")
        model_xgb_loaded = None
else:
    logging.warning(f"No saved models found for {model_name_xgb} in {MODEL_DIR}")
    model_xgb_loaded = None

In [None]:
# --- 6.4 XGB: Test Model ---
if model_xgb_loaded:
    logging.info(f"--- Testing {model_name_xgb} ---")
    y_pred_xgb = model_xgb_loaded.predict(X_test_processed)
    y_proba_xgb = model_xgb_loaded.predict_proba(X_test_processed)[:, 1]

    # Evaluate
    accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
    roc_auc_xgb = roc_auc_score(y_test, y_proba_xgb)
    report_xgb = classification_report(y_test, y_pred_xgb, target_names=['Galaxy (0)', 'Star (1)'])
    cm_xgb = confusion_matrix(y_test, y_pred_xgb)

    logging.info(f"XGBoost Test Accuracy: {accuracy_xgb:.4f}")
    logging.info(f"XGBoost Test ROC AUC: {roc_auc_xgb:.4f}")
    logging.info(f"XGBoost Classification Report:\n{report_xgb}")
    logging.info(f"XGBoost Confusion Matrix:\n{cm_xgb}")
else:
    logging.warning("XGBoost model not loaded. Skipping testing.")


## 7. Model Implementation: LightGBM

In [None]:
# --- 7.1 LGBM: Define Model ---
model_name_lgbm = "lightgbm"
model_lgbm = None

# Hyperparameters
lgbm_params = {
    'objective': 'binary',          # Binary classification
    'metric': 'auc',                # Evaluation metric ('auc', 'binary_logloss')
    'n_estimators': 200,
    'learning_rate': 0.1,
    'num_leaves': 31,               # Default, main parameter to control complexity
    'max_depth': -1,                # Default: no limit (num_leaves is often preferred)
    'feature_fraction': 0.8,        # Equivalent to colsample_bytree
    'bagging_fraction': 0.8,        # Equivalent to subsample
    'bagging_freq': 1,              # Perform bagging at every iteration
    'reg_alpha': 0,
    'reg_lambda': 0,                # Default: 0 for LightGBM
    'random_state': RANDOM_SEED,
    'n_jobs': -1,
    'class_weight': 'balanced'      # Handle imbalance
}

model_lgbm = lgb.LGBMClassifier(**lgbm_params)
logging.info(f"Defined LightGBM model '{model_name_lgbm}' with params: {lgbm_params}")

In [None]:
# --- 7.2 LGBM: Train Model ---
logging.info(f"--- Training {model_name_lgbm} ---")
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# LightGBM also has native save, using joblib here
model_filename_lgbm = os.path.join(MODEL_DIR, f"{model_name_lgbm}_{timestamp}.joblib")

if os.path.exists(model_filename_lgbm):
    logging.warning(f"Model file {model_filename_lgbm} already exists. Skipping training.")
else:
    logging.info(f"Starting training for {model_name_lgbm}...")
    start_time = datetime.now()

    # Use validation set for early stopping
    eval_set = [(X_val_processed, y_val)]
    early_stopping_rounds = 15

    # Need callbacks for early stopping in older versions, but newer ones have direct param
    # from lightgbm import early_stopping
    # callbacks = [early_stopping(stopping_rounds=early_stopping_rounds, verbose=1)]

    model_lgbm.fit(X_train_processed, y_train,
                   eval_set=eval_set,
                   eval_metric=lgbm_params['metric'],
                   # callbacks=callbacks # Use this if early_stopping_rounds is not in fit()
                   # For newer versions:
                   early_stopping_rounds=early_stopping_rounds
                   )

    end_time = datetime.now()
    training_duration = end_time - start_time
    logging.info(f"Training finished in: {training_duration}")
    logging.info(f"Best iteration: {model_lgbm.best_iteration_}, Best score ({lgbm_params['metric']}): {model_lgbm.best_score_['valid_0'][lgbm_params['metric']]:.4f}")
    logging.info(f"Saving trained model to {model_filename_lgbm}")
    joblib.dump(model_lgbm, model_filename_lgbm)
    # Alternatively: model_lgbm.booster_.save_model(model_filename_lgbm.replace('.joblib', '.txt'))
    logging.info("Model saved successfully.")

In [None]:
# --- 7.3 LGBM: Load Model ---
list_of_lgbm_files = glob.glob(os.path.join(MODEL_DIR, f"{model_name_lgbm}_*.joblib"))
if list_of_lgbm_files:
    latest_lgbm_file = max(list_of_lgbm_files, key=os.path.getctime)
    logging.info(f"Loading latest LightGBM model: {latest_lgbm_file}")
    try:
        model_lgbm_loaded = joblib.load(latest_lgbm_file)
        # If using native save:
        # model_lgbm_loaded = lgb.Booster(model_file=latest_lgbm_file.replace('.joblib', '.txt')) # Note: loads Booster, need wrapper for predict
        # model_lgbm_loaded_clf = lgb.LGBMClassifier()
        # model_lgbm_loaded_clf.booster_ = model_lgbm_loaded
        logging.info("LightGBM model loaded successfully.")
    except Exception as e:
        logging.error(f"Error loading LightGBM model: {e}")
        model_lgbm_loaded = None
else:
    logging.warning(f"No saved models found for {model_name_lgbm} in {MODEL_DIR}")
    model_lgbm_loaded = None

In [None]:
# --- 7.4 LGBM: Test Model ---
if model_lgbm_loaded:
    logging.info(f"--- Testing {model_name_lgbm} ---")
    # If loaded Booster directly, need wrapper or use booster_.predict
    # y_pred_lgbm = (model_lgbm_loaded.predict(X_test_processed) > 0.5).astype(int) # Booster predicts scores
    # y_proba_lgbm = model_lgbm_loaded.predict(X_test_processed)
    # If loaded via joblib (as LGBMClassifier):
    y_pred_lgbm = model_lgbm_loaded.predict(X_test_processed)
    y_proba_lgbm = model_lgbm_loaded.predict_proba(X_test_processed)[:, 1]


    # Evaluate
    accuracy_lgbm = accuracy_score(y_test, y_pred_lgbm)
    roc_auc_lgbm = roc_auc_score(y_test, y_proba_lgbm)
    report_lgbm = classification_report(y_test, y_pred_lgbm, target_names=['Galaxy (0)', 'Star (1)'])
    cm_lgbm = confusion_matrix(y_test, y_pred_lgbm)

    logging.info(f"LightGBM Test Accuracy: {accuracy_lgbm:.4f}")
    logging.info(f"LightGBM Test ROC AUC: {roc_auc_lgbm:.4f}")
    logging.info(f"LightGBM Classification Report:\n{report_lgbm}")
    logging.info(f"LightGBM Confusion Matrix:\n{cm_lgbm}")
else:
    logging.warning("LightGBM model not loaded. Skipping testing.")

In [None]:
# ## 9. Next Steps
#
# - **Hyperparameter Tuning:** Use the validation set (`X_val_processed`, `y_val`) with techniques like `GridSearchCV` or `RandomizedSearchCV` to find optimal hyperparameters for each model.
# - **Feature Engineering:** Create new features (e.g., colors like F365W - F814W) and evaluate their impact.
# - **Feature Importance:** Analyze feature importance plots (especially for tree-based models) to understand which ALHAMBRA measurements are most predictive.
# - **Calibration:** Implement calibration methods using the calibration set.
# - **Error Analysis:** Investigate misclassified examples in the test set to understand model weaknesses.
# - **Comparison:** Systematically compare the performance metrics of all tuned and calibrated models.