In [2]:
#Bhardwaj et al. 2025 TDE Classifier - Adapted for MALLORN
#With GRU predictions added as feature


!pip install george -q

import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

import george
from george import kernels
from scipy.optimize import minimize

import torch
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score

SEED = 42
np.random.seed(SEED)

print(f"george version: {george.__version__}")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.6/395.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hgeorge version: 0.4.4


In [3]:
DATA_DIR = Path("/kaggle/input/mallorn-astronomical-classification-challenge")

train_log = pd.read_csv(DATA_DIR / "train_log.csv")
test_log = pd.read_csv(DATA_DIR / "test_log.csv")

print(f"Train: {len(train_log)} objects, {train_log['target'].sum()} TDEs ({100*train_log['target'].mean():.1f}%)")
print(f"Test: {len(test_log)} objects")

# Load light curves
split_dirs = sorted([p for p in DATA_DIR.glob("split_*") if p.is_dir()])
print(f"Found {len(split_dirs)} splits")

train_lc = pd.concat([pd.read_csv(d / "train_full_lightcurves.csv") for d in split_dirs], ignore_index=True)
test_lc = pd.concat([pd.read_csv(d / "test_full_lightcurves.csv") for d in split_dirs], ignore_index=True)

print(f"Train LC: {len(train_lc):,} rows")
print(f"Test LC: {len(test_lc):,} rows")

Train: 3043 objects, 148 TDEs (4.9%)
Test: 7135 objects
Found 20 splits
Train LC: 479,384 rows
Test LC: 1,145,125 rows


In [4]:
FILTER_WAVELENGTHS = {
    'u': 3670.69, 'g': 4826.85, 'r': 6223.24,
    'i': 7545.98, 'z': 8590.90, 'y': 9710.28
}

def compute_gp(sub, obj_id):
    """Fit 2D GP (time × wavelength) using george library."""
    try:
        t = np.array(sub['Time (MJD)'], dtype=float)
        flux = np.array(sub['Flux'], dtype=float)
        fluxerr = np.array(sub['Flux_err'], dtype=float)
        band = np.array([FILTER_WAVELENGTHS.get(b.strip().lower(), np.nan) for b in sub['Filter']], dtype=float)
        
        x = np.vstack([t, band]).T
        mask = np.isfinite(flux) & np.isfinite(fluxerr) & np.all(np.isfinite(x), axis=1) & (flux > 0)
        x, flux, fluxerr = x[mask], flux[mask], fluxerr[mask]
        
        if len(flux) < 5:
            return None, None, None, None
        
        snr = np.abs(flux) / np.sqrt(fluxerr**2 + (0.01 * np.max(flux))**2)
        scale = np.abs(flux[np.argmax(snr)])
        
        kernel = (0.5 * scale)**2 * george.kernels.Matern32Kernel([100**2, 6000**2], ndim=2)
        gp = george.GP(kernel, solver=george.HODLRSolver)
        gp.compute(x, fluxerr)
        
        def neg_ln_like(p):
            gp.set_parameter_vector(p)
            return -gp.log_likelihood(flux)
        
        def grad_neg_ln_like(p):
            gp.set_parameter_vector(p)
            return -gp.grad_log_likelihood(flux)
        
        p0 = gp.get_parameter_vector()
        for _ in range(3):
            result = minimize(neg_ln_like, p0, jac=grad_neg_ln_like, method='L-BFGS-B')
            if result.success:
                gp.set_parameter_vector(result.x)
                break
            p0 = p0 + np.random.normal(0, 0.1, size=p0.shape)
        
        return gp, flux, x, gp.get_parameter_vector()
    except:
        return None, None, None, None


def find_peak_and_times(gp, sub, flux, band='g'):
    """Find peak and rise/fade times for one band."""
    t_min, t_max = sub['Time (MJD)'].min(), sub['Time (MJD)'].max()
    mjd = np.linspace(t_min - 50, t_max + 75, 500)
    wl = FILTER_WAVELENGTHS[band]
    x_pred = np.vstack([mjd, wl * np.ones_like(mjd)]).T
    
    try:
        mean_pred, _ = gp.predict(flux, x_pred, return_var=True)
    except:
        return None, None, None, None
    
    peak_idx = np.argmax(mean_pred)
    peak_mjd = mjd[peak_idx]
    peak_flux = mean_pred[peak_idx]
    
    if peak_flux <= 0:
        return peak_mjd, peak_flux, None, None
    
    thr = peak_flux / 2.512
    
    pre = mean_pred[:peak_idx]
    rise_idx = np.where(pre <= thr)[0]
    rise_time = peak_mjd - mjd[rise_idx[-1]] if len(rise_idx) > 0 else None
    
    post = mean_pred[peak_idx:]
    fade_idx = np.where(post <= thr)[0]
    fade_time = mjd[peak_idx + fade_idx[0]] - peak_mjd if len(fade_idx) > 0 else None
    
    return peak_mjd, peak_flux, rise_time, fade_time


def compute_colors(gp, sub, flux, peak_mjd, bands=['g', 'r', 'i']):
    """Compute mean colors pre/post peak."""
    colors = {}
    
    for b in bands:
        wl = FILTER_WAVELENGTHS[b]
        
        # Pre-peak: -30 to 0 days
        t_pre = np.linspace(peak_mjd - 30, peak_mjd, 50)
        x_pre = np.vstack([t_pre, wl * np.ones_like(t_pre)]).T
        
        # Post-peak: 0 to +30 days  
        t_post = np.linspace(peak_mjd, peak_mjd + 30, 50)
        x_post = np.vstack([t_post, wl * np.ones_like(t_post)]).T
        
        try:
            flux_pre, _ = gp.predict(flux, x_pre, return_var=True)
            flux_post, _ = gp.predict(flux, x_post, return_var=True)
            colors[f'{b}_pre'] = np.mean(flux_pre[flux_pre > 0]) if np.any(flux_pre > 0) else np.nan
            colors[f'{b}_post'] = np.mean(flux_post[flux_post > 0]) if np.any(flux_post > 0) else np.nan
        except:
            colors[f'{b}_pre'] = np.nan
            colors[f'{b}_post'] = np.nan
    
    return colors


def flux_to_mag(flux):
    """Convert flux to magnitude."""
    if flux is None or flux <= 0 or np.isnan(flux):
        return np.nan
    return -2.5 * np.log10(flux)


def extract_bhardwaj_features(obj_id, lc_df):
    """Extract all 13 Bhardwaj features for one object."""
    sub = lc_df[lc_df['object_id'] == obj_id]
    
    if len(sub) < 10:
        return None
    
    gp, flux, x, params = compute_gp(sub, obj_id)
    
    if gp is None:
        return None
    
    # GP hyperparameters (3 features)
    features = {
        'Amplitude': np.exp(params[0]),
        'LengthScale_Time': np.sqrt(np.exp(params[1])),
        'LengthScale_Wavelength': np.sqrt(np.exp(params[2])) if len(params) > 2 else np.nan,
    }
    
    # Timing features (2 features)
    peak_mjd, peak_flux, rise_time, fade_time = find_peak_and_times(gp, sub, flux, 'g')
    features['Rise_Time'] = rise_time
    features['Fade_Time'] = fade_time
    
    if peak_mjd is None:
        return features
    
    # Color features (8 features)
    colors = compute_colors(gp, sub, flux, peak_mjd, ['g', 'r', 'i'])
    
    g_pre, r_pre, i_pre = colors.get('g_pre'), colors.get('r_pre'), colors.get('i_pre')
    g_post, r_post, i_post = colors.get('g_post'), colors.get('r_post'), colors.get('i_post')
    
    # Mean colors (4 features)
    features['Mean_Color_Pre_gr'] = flux_to_mag(g_pre) - flux_to_mag(r_pre) if g_pre and r_pre else np.nan
    features['Mean_Color_Post_gr'] = flux_to_mag(g_post) - flux_to_mag(r_post) if g_post and r_post else np.nan
    features['Mean_Color_Pre_ri'] = flux_to_mag(r_pre) - flux_to_mag(i_pre) if r_pre and i_pre else np.nan
    features['Mean_Color_Post_ri'] = flux_to_mag(r_post) - flux_to_mag(i_post) if r_post and i_post else np.nan
    
    # Color slopes (4 features)
    features['Slope_Pre_gr'] = (features['Mean_Color_Post_gr'] - features['Mean_Color_Pre_gr']) / 30 if not np.isnan(features.get('Mean_Color_Pre_gr', np.nan)) else np.nan
    features['Slope_Post_gr'] = features['Slope_Pre_gr']  # Simplified
    features['Slope_Pre_ri'] = (features['Mean_Color_Post_ri'] - features['Mean_Color_Pre_ri']) / 30 if not np.isnan(features.get('Mean_Color_Pre_ri', np.nan)) else np.nan
    features['Slope_Post_ri'] = features['Slope_Pre_ri']  # Simplified
    
    return features


# Feature column names
BHARDWAJ_FEATURES = [
    'Amplitude', 'LengthScale_Time', 'LengthScale_Wavelength',
    'Rise_Time', 'Fade_Time',
    'Mean_Color_Pre_gr', 'Mean_Color_Post_gr', 'Mean_Color_Pre_ri', 'Mean_Color_Post_ri',
    'Slope_Pre_gr', 'Slope_Post_gr', 'Slope_Pre_ri', 'Slope_Post_ri'
]

print(f"Defined {len(BHARDWAJ_FEATURES)} Bhardwaj features")

Defined 13 Bhardwaj features


In [5]:

train_features = []
for obj_id in tqdm(train_log['object_id']):
    feats = extract_bhardwaj_features(obj_id, train_lc)
    if feats:
        feats['object_id'] = obj_id
        train_features.append(feats)
    else:
        train_features.append({'object_id': obj_id})

train_feats = pd.DataFrame(train_features)
print(f"Train features: {train_feats.shape}")


test_features = []
for obj_id in tqdm(test_log['object_id']):
    feats = extract_bhardwaj_features(obj_id, test_lc)
    if feats:
        feats['object_id'] = obj_id
        test_features.append(feats)
    else:
        test_features.append({'object_id': obj_id})

test_feats = pd.DataFrame(test_features)
print(f"Test features: {test_feats.shape}")

# Check coverage
for col in BHARDWAJ_FEATURES:
    if col in train_feats.columns:
        cov = train_feats[col].notna().mean() * 100
        print(f"  {col}: {cov:.1f}%")

Extracting Bhardwaj features for training set...


  0%|          | 0/3043 [00:00<?, ?it/s]

Train features: (3043, 14)

Extracting Bhardwaj features for test set...


  0%|          | 0/7135 [00:00<?, ?it/s]

Test features: (7135, 14)

Feature coverage (train):
  Amplitude: 100.0%
  LengthScale_Time: 100.0%
  LengthScale_Wavelength: 100.0%
  Rise_Time: 97.1%
  Fade_Time: 80.2%
  Mean_Color_Pre_gr: 100.0%
  Mean_Color_Post_gr: 100.0%
  Mean_Color_Pre_ri: 99.9%
  Mean_Color_Post_ri: 99.9%
  Slope_Pre_gr: 100.0%
  Slope_Post_gr: 100.0%
  Slope_Pre_ri: 99.9%
  Slope_Post_ri: 99.9%


In [26]:

GRU_CACHE_PATH = Path("/kaggle/input/mallorenlargemodels/gru_cache.pt") 

print(f"Loading GRU from: {GRU_CACHE_PATH}")

gru_cache = torch.load(GRU_CACHE_PATH, map_location='cpu', weights_only=False)
gru_oof_raw = gru_cache['oof']
gru_test_raw = gru_cache['test']
    
print(f"  Loaded from: {gru_cache.get('timestamp', 'unknown')}")
print(f"  GRU OOF shape: {gru_oof_raw.shape}")
print(f"  GRU test shape: {gru_test_raw.shape}")
    
# VALIDATE: Check GRU is working
y_for_validation = train_log['target'].values
auc_raw = roc_auc_score(y_for_validation, gru_oof_raw)
print(f"\n  RAW GRU AUC: {auc_raw:.4f}")
print(f"  RAW GRU - TDE mean: {gru_oof_raw[y_for_validation==1].mean():.4f}")
print(f"  RAW GRU - Non-TDE mean: {gru_oof_raw[y_for_validation==0].mean():.4f}")
    


Loading GRU from: /kaggle/input/mallorenlargemodels/gru_cache.pt
  Loaded from: 2026-01-24T21:44:02.488192
  GRU OOF shape: (3043,)
  GRU test shape: (7135,)

  RAW GRU AUC: 0.4888
  RAW GRU - TDE mean: 0.1510
  RAW GRU - Non-TDE mean: 0.1569


In [12]:
# Start fresh - just Bhardwaj features
train_df = train_feats[['object_id'] + [c for c in BHARDWAJ_FEATURES if c in train_feats.columns]].copy()
test_df = test_feats[['object_id'] + [c for c in BHARDWAJ_FEATURES if c in test_feats.columns]].copy()
# Add target to train
train_df = train_df.merge(train_log[['object_id', 'target']], on='object_id', how='left')

In [27]:
#The GRU was stored in a different order, here we fix that.
#GRU predictions are in alphabetical order of object_id
# Check if main notebook order is alphabetically sorted
main_first_10 = ['Dornhoth_anwar_melethron', 'Dornhoth_archam_grond', 'Dornhoth_certh_iaun', 
                 'Dornhoth_drafn_celon', 'Dornhoth_fervain_onodrim', 'Dornhoth_galadh_ylf', 
                 'Dornhoth_gwend_nagol', 'Dornhoth_hervenn_tathar', 'Dornhoth_inias_gond', 
                 'Dornhoth_lavan_ank']

sorted_ids = sorted(train_log['object_id'].tolist())
print(f"Sorted first 10: {sorted_ids[:10]}")
print(f"Main notebook first 10 matches sorted: {main_first_10 == sorted_ids[:10]}")

# Create mapping
gru_order = sorted(train_log['object_id'].tolist())
gru_oof_dict = dict(zip(gru_order, gru_oof_raw))

gru_test_order = sorted(test_log['object_id'].tolist())
gru_test_dict = dict(zip(gru_test_order, gru_test_raw))

# Map to Bhardwaj train_df order
gru_oof_aligned = np.array([gru_oof_dict[oid] for oid in train_df['object_id']])
gru_test_aligned = np.array([gru_test_dict[oid] for oid in test_df['object_id']])

# Validate
y_check = train_df['target'].values
auc_aligned = roc_auc_score(y_check, gru_oof_aligned)
print(f"\n*** Aligned GRU AUC: {auc_aligned:.4f} ***")
print(f"TDE mean: {gru_oof_aligned[y_check==1].mean():.4f}")
print(f"Non-TDE mean: {gru_oof_aligned[y_check==0].mean():.4f}")

if auc_aligned > 0.9:
    gru_oof_raw = gru_oof_aligned  # Replace with aligned version
    gru_test_raw = gru_test_aligned

Sorted first 10: ['Dornhoth_anwar_melethron', 'Dornhoth_archam_grond', 'Dornhoth_certh_iaun', 'Dornhoth_drafn_celon', 'Dornhoth_fervain_onodrim', 'Dornhoth_galadh_ylf', 'Dornhoth_gwend_nagol', 'Dornhoth_hervenn_tathar', 'Dornhoth_inias_gond', 'Dornhoth_lavan_ank']
Main notebook first 10 matches sorted: True

*** Aligned GRU AUC: 0.9544 ***
TDE mean: 0.8013
Non-TDE mean: 0.1236


In [28]:
# Platt scaling
y_for_platt = train_df['target'].values
lr = LogisticRegression(C=1e10, solver='lbfgs', max_iter=1000)
lr.fit(gru_oof_aligned.reshape(-1, 1), y_for_platt)
gru_oof_platt = lr.predict_proba(gru_oof_aligned.reshape(-1, 1))[:, 1]
gru_test_platt = lr.predict_proba(gru_test_aligned.reshape(-1, 1))[:, 1]

print(f"Platt coef: {lr.coef_[0][0]:.3f}, intercept: {lr.intercept_[0]:.3f}")
print(f"Platt-scaled AUC: {roc_auc_score(y_for_platt, gru_oof_platt):.4f}")

# Add to train_df and test_df
train_df['gru_pred'] = gru_oof_platt
test_df['gru_pred'] = gru_test_platt

# Update feature columns
FEATURE_COLS_WITH_GRU = BHARDWAJ_FEATURES + ['gru_pred']
print(f"\nFeatures: {FEATURE_COLS_WITH_GRU}")

# Prepare training data
X_train = train_df[FEATURE_COLS_WITH_GRU].fillna(-999)
y_train = train_df['target']
X_test = test_df[FEATURE_COLS_WITH_GRU].fillna(-999)

print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")

Platt coef: 6.277, intercept: -5.818
Platt-scaled AUC: 0.9544

Features: ['Amplitude', 'LengthScale_Time', 'LengthScale_Wavelength', 'Rise_Time', 'Fade_Time', 'Mean_Color_Pre_gr', 'Mean_Color_Post_gr', 'Mean_Color_Pre_ri', 'Mean_Color_Post_ri', 'Slope_Pre_gr', 'Slope_Post_gr', 'Slope_Pre_ri', 'Slope_Post_ri', 'gru_pred']
X_train: (3043, 14)
X_test: (7135, 14)


In [15]:
TRANSFORMER_CACHE_PATH = Path("/kaggle/input/mallorenlargemodels/transformer_cache.pt")

if TRANSFORMER_CACHE_PATH.exists():
    trans_cache = torch.load(TRANSFORMER_CACHE_PATH, map_location='cpu', weights_only=False)
    trans_oof_raw = trans_cache['oof']
    trans_test_raw = trans_cache['test']
    
    print(f"  Loaded: {trans_cache.get('timestamp')}")
    
    # Align by sorted order (same fix as GRU)
    trans_order = sorted(train_log['object_id'].tolist())
    trans_oof_dict = dict(zip(trans_order, trans_oof_raw))
    
    trans_test_order = sorted(test_log['object_id'].tolist())
    trans_test_dict = dict(zip(trans_test_order, trans_test_raw))
    
    # Map to train_df/test_df order
    trans_oof_aligned = np.array([trans_oof_dict[oid] for oid in train_df['object_id']])
    trans_test_aligned = np.array([trans_test_dict[oid] for oid in test_df['object_id']])
    
    # Validate
    y_check = train_df['target'].values
    auc_trans = roc_auc_score(y_check, trans_oof_aligned)
    print(f"  Aligned Transformer AUC: {auc_trans:.4f}")
    
    if auc_trans > 0.8:
        print("  ✓ Transformer aligned correctly!")
        
        # Platt scale
        lr_trans = LogisticRegression(C=1e10, solver='lbfgs', max_iter=1000)
        lr_trans.fit(trans_oof_aligned.reshape(-1, 1), y_check)
        trans_oof_platt = lr_trans.predict_proba(trans_oof_aligned.reshape(-1, 1))[:, 1]
        trans_test_platt = lr_trans.predict_proba(trans_test_aligned.reshape(-1, 1))[:, 1]
        
        train_df['transformer_pred'] = trans_oof_platt
        test_df['transformer_pred'] = trans_test_platt
        print(f"  Added transformer_pred")
    else:
        print(f"  ✗ Transformer AUC too low ({auc_trans:.4f}), skipping")
else:
    print(f"  Transformer cache not found: {TRANSFORMER_CACHE_PATH}")


#GP
print("\n2. Loading GP features...")

GP_CACHE_DIR = Path("/kaggle/input/gpfeatures/gp_cache/")
TRAIN_GP_CACHE = GP_CACHE_DIR / "train_gp_features_v11.parquet"
TEST_GP_CACHE = GP_CACHE_DIR / "test_gp_features_v11.parquet"

if TRAIN_GP_CACHE.exists() and TEST_GP_CACHE.exists():
    train_gp = pd.read_parquet(TRAIN_GP_CACHE)
    test_gp = pd.read_parquet(TEST_GP_CACHE)
    print(f"  Loaded train GP: {train_gp.shape}")
    print(f"  Loaded test GP: {test_gp.shape}")
    
    # Key GP features
    GP_FEATURES = [
        'gp_decay_slope_30d', 'gp_decay_slope_60d', 'gp_decay_slope_100d',
        'gp_decay_rate_norm_30d', 'gp_decay_rate_norm_60d',
        'gp_decay_slope_r', 'gp_decay_slope_i', 'gp_decay_slope_z',
        'gp_decay_r2_100d', 'gp_t_above_half',
        'gp_color_gr_peak', 'gp_color_gr_30d', 'gp_color_evolution_gr',
    ]
    gp_cols = ['object_id'] + [c for c in GP_FEATURES if c in train_gp.columns]
    
    train_df = train_df.merge(train_gp[gp_cols], on='object_id', how='left')
    test_df = test_df.merge(test_gp[gp_cols], on='object_id', how='left')
    print(f"  Added {len(gp_cols)-1} GP features")
else:
    print(f"  GP cache not found: {TRAIN_GP_CACHE}")


#SNCOSMO FEATURES
print("\n3. Loading SNCOSMO features...")

SNCOSMO_TRAIN_PATH = Path("/kaggle/input/sncosmos/train_sncosmo_features.parquet")
SNCOSMO_TEST_PATH = Path("/kaggle/input/sncosmos/test_sncosmo_features.parquet")

if SNCOSMO_TRAIN_PATH.exists() and SNCOSMO_TEST_PATH.exists():
    sncosmo_train = pd.read_parquet(SNCOSMO_TRAIN_PATH)
    sncosmo_test = pd.read_parquet(SNCOSMO_TEST_PATH)
    print(f"  Loaded train SNCOSMO: {sncosmo_train.shape}")
    print(f"  Loaded test SNCOSMO: {sncosmo_test.shape}")
    
    # Key SNCOSMO features
    SNCOSMO_FEATURES = [
        'sn_salt2_rchisq', 'sn_salt3_rchisq', 
        'sn_best_ia_rchisq', 'sn_is_good_ia_fit',
        'sn_salt2_x1', 'sn_salt2_c',
        'sn_best_cc_rchisq', 'sn_is_good_cc_fit',
    ]
    sncosmo_cols = ['object_id'] + [c for c in SNCOSMO_FEATURES if c in sncosmo_train.columns]
    
    train_df = train_df.merge(sncosmo_train[sncosmo_cols], on='object_id', how='left')
    test_df = test_df.merge(sncosmo_test[sncosmo_cols], on='object_id', how='left')
    print(f"  Added {len(sncosmo_cols)-1} SNCOSMO features")
else:
    print(f"  SNCOSMO not found: {SNCOSMO_TRAIN_PATH}")


print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

# List all features (excluding object_id and target)
all_features = [c for c in train_df.columns if c not in ['object_id', 'target']]
print(f"\nTotal features: {len(all_features)}")
print(f"Features: {all_features}")

Adding additional features: Transformer, GP, SNCOSMO

1. Loading Transformer predictions...
  Loaded: 2026-01-24T23:20:26.479422
  Aligned Transformer AUC: 0.8954
  ✓ Transformer aligned correctly!
  Added transformer_pred

2. Loading GP features...
  Loaded train GP: (3043, 25)
  Loaded test GP: (7135, 25)
  Added 13 GP features

3. Loading SNCOSMO features...
  Loaded train SNCOSMO: (3043, 44)
  Loaded test SNCOSMO: (7135, 44)
  Added 8 SNCOSMO features

FEATURE SUMMARY
Train shape: (3043, 38)
Test shape: (7135, 37)

Total features: 36
Features: ['Amplitude', 'LengthScale_Time', 'LengthScale_Wavelength', 'Rise_Time', 'Fade_Time', 'Mean_Color_Pre_gr', 'Mean_Color_Post_gr', 'Mean_Color_Pre_ri', 'Mean_Color_Post_ri', 'Slope_Pre_gr', 'Slope_Post_gr', 'Slope_Pre_ri', 'Slope_Post_ri', 'gru_pred', 'transformer_pred', 'gp_decay_slope_30d', 'gp_decay_slope_60d', 'gp_decay_slope_100d', 'gp_decay_rate_norm_30d', 'gp_decay_rate_norm_60d', 'gp_decay_slope_r', 'gp_decay_slope_i', 'gp_decay_slope_z

In [20]:
FEATURE_COLS = [c for c in train_df.columns if c not in ['object_id', 'target']]
print(f"Training with {len(FEATURE_COLS)} features")

X_train = train_df[FEATURE_COLS].fillna(-999)
y_train = train_df['target']
X_test = test_df[FEATURE_COLS].fillna(-999)

yy = train_log.set_index("object_id").loc[train_df["object_id"], "target"].astype(int).values
pos = (yy == 1).sum()
neg = (yy == 0).sum()
scale_pos_weight = neg / pos

Training with 36 features


In [21]:
#Train XGBoost
print("Training XGBoost with 5-fold CV...")
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
oof_preds = np.zeros(len(X_train))
models = []
for fold, (tr_idx, va_idx) in enumerate(skf.split(X_train, y_train), 1):
    X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
    y_tr, y_va = y_train.iloc[tr_idx], y_train.iloc[va_idx]

    model = xgb.XGBClassifier(
        n_estimators=9000,
        learning_rate=0.005,
        max_depth=6,
        min_child_weight=2,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=scale_pos_weight,
        random_state=SEED,
        use_label_encoder=False,
        eval_metric='logloss',
        early_stopping_rounds=600
    )

    model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
    oof_preds[va_idx] = model.predict_proba(X_va)[:, 1]
    models.append(model)

    # Fold metrics
    f1_fold = max(f1_score(y_va, (oof_preds[va_idx] >= t).astype(int)) 
                  for t in np.linspace(0.2, 0.6, 41))
    print(f"  Fold {fold}: Best F1 = {f1_fold:.4f}")
# Overall results
print("\n" + "="*50)
best_f1 = max(f1_score(y_train, (oof_preds >= t).astype(int)) for t in np.linspace(0.1, 0.7, 61))
best_thr = max(np.linspace(0.1, 0.7, 61), key=lambda t: f1_score(y_train, (oof_preds >= t).astype(int)))
print(f"Overall CV F1: {best_f1:.4f}")
print(f"Best threshold: {best_thr:.2f}")
oof_binary = (oof_preds >= best_thr).astype(int)
print(f"Precision: {precision_score(y_train, oof_binary):.4f}")
print(f"Recall: {recall_score(y_train, oof_binary):.4f}")
print(f"\nConfusion Matrix:\n{confusion_matrix(y_train, oof_binary)}")

Training XGBoost with 5-fold CV...
  Fold 1: Best F1 = 0.6486
  Fold 2: Best F1 = 0.7222
  Fold 3: Best F1 = 0.7143
  Fold 4: Best F1 = 0.7105
  Fold 5: Best F1 = 0.6377

Overall CV F1: 0.6703
Best threshold: 0.33
Precision: 0.5586
Recall: 0.8378

Confusion Matrix:
[[2797   98]
 [  24  124]]


In [22]:
# Feature importance
importance = pd.DataFrame({
    'feature': FEATURE_COLS,
    'importance': np.mean([m.feature_importances_ for m in models], axis=0)
}).sort_values('importance', ascending=False)

print("\nTop 10 features:")
print(importance.head(28).to_string(index=False))


Top 10 features:
               feature  importance
              gru_pred    0.231942
      transformer_pred    0.083128
LengthScale_Wavelength    0.068478
     sn_is_good_ia_fit    0.048104
          Slope_Pre_gr    0.036335
         Slope_Post_gr    0.035469
       gp_t_above_half    0.034299
     sn_best_ia_rchisq    0.030374
             Fade_Time    0.026851
     sn_is_good_cc_fit    0.025484
             Rise_Time    0.025136
      LengthScale_Time    0.021514
    Mean_Color_Post_gr    0.020135
       sn_salt2_rchisq    0.018963
          Slope_Pre_ri    0.018386
   gp_decay_slope_100d    0.017887
      gp_decay_slope_r    0.017763
         Slope_Post_ri    0.017271
    gp_decay_slope_60d    0.015894
     Mean_Color_Pre_gr    0.015479
           sn_salt2_x1    0.015009
      gp_decay_r2_100d    0.014572
       sn_salt3_rchisq    0.014556
 gp_color_evolution_gr    0.014060
    gp_decay_slope_30d    0.013244
gp_decay_rate_norm_60d    0.012958
      gp_decay_slope_i    0.012552
  

In [29]:
import lightgbm as lgb
from catboost import CatBoostClassifier

N_FOLDS = 3
N_SEEDS = 3 

# Store OOF predictions
oof_xgb = np.zeros(len(X_train), dtype=np.float64)
oof_lgb = np.zeros(len(X_train), dtype=np.float64)
oof_cat = np.zeros(len(X_train), dtype=np.float64)

test_xgb = np.zeros(len(X_test), dtype=np.float64)
test_lgb = np.zeros(len(X_test), dtype=np.float64)
test_cat = np.zeros(len(X_test), dtype=np.float64)

for seed_idx in range(N_SEEDS):
    seed = SEED + seed_idx * 42
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)
    
    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_train, y_train), 1):
        X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
        y_tr, y_va = y_train.iloc[tr_idx], y_train.iloc[va_idx]
        
        # XGBoost
        xgb_model = xgb.XGBClassifier(
            n_estimators=2000, learning_rate=0.03, max_depth=6,
            min_child_weight=2, subsample=0.8, colsample_bytree=0.6,
            gamma=0.1, reg_lambda=2, random_state=seed,
            eval_metric='aucpr', early_stopping_rounds=150, verbosity=0
        )
        xgb_model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
        oof_xgb[va_idx] += xgb_model.predict_proba(X_va)[:, 1] / N_SEEDS
        test_xgb += xgb_model.predict_proba(X_test)[:, 1] / (N_FOLDS * N_SEEDS)
        
        # LightGBM
        lgb_model = lgb.LGBMClassifier(
            n_estimators=2000, learning_rate=0.03, num_leaves=16,
            max_depth=-1, min_child_samples=20, subsample=0.8,
            colsample_bytree=0.6, reg_alpha=0.5, reg_lambda=2,
            random_state=seed, verbosity=-1
        )
        lgb_model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)],
                      callbacks=[lgb.early_stopping(150, verbose=False), lgb.log_evaluation(0)])
        oof_lgb[va_idx] += lgb_model.predict_proba(X_va)[:, 1] / N_SEEDS
        test_lgb += lgb_model.predict_proba(X_test)[:, 1] / (N_FOLDS * N_SEEDS)
        
        # CatBoost
        cat_model = CatBoostClassifier(
            iterations=2000, learning_rate=0.03, depth=6,
            l2_leaf_reg=6, subsample=0.8, colsample_bylevel=0.6,
            early_stopping_rounds=150, random_seed=seed, verbose=False
        )
        cat_model.fit(X_tr, y_tr, eval_set=(X_va, y_va), use_best_model=True)
        oof_cat[va_idx] += cat_model.predict_proba(X_va)[:, 1] / N_SEEDS
        test_cat += cat_model.predict_proba(X_test)[:, 1] / (N_FOLDS * N_SEEDS)
    
    print(f"  Seed {seed_idx+1}/{N_SEEDS} done")

# Individual scores

for name, oof in [("XGB", oof_xgb), ("LGB", oof_lgb), ("CAT", oof_cat)]:
    f1 = max(f1_score(y_train, (oof >= t).astype(int)) for t in np.linspace(0.05, 0.6, 56))
    print(f"  {name}: F1 = {f1:.4f}")

# Find best blend
best_f1 = 0
best_weights = (1/3, 1/3, 1/3)
best_thr = 0.2

for w1 in np.arange(0.0, 0.9, 0.1):
    for w2 in np.arange(0.0, 0.9 - w1, 0.1):
        w3 = 1 - w1 - w2
        if w3 < 0.05:
            continue
        blend = w1 * oof_xgb + w2 * oof_lgb + w3 * oof_cat
        for thr in np.linspace(0.05, 0.5, 46):
            f1 = f1_score(y_train, (blend >= thr).astype(int))
            if f1 > best_f1:
                best_f1 = f1
                best_weights = (w1, w2, w3)
                best_thr = thr

print(f"\nBest blend: XGB={best_weights[0]:.1f}, LGB={best_weights[1]:.1f}, CAT={best_weights[2]:.1f}")
print(f"Best threshold: {best_thr:.2f}")
print(f"Best CV F1: {best_f1:.4f}")

# Final blend
oof_blend = best_weights[0] * oof_xgb + best_weights[1] * oof_lgb + best_weights[2] * oof_cat
test_blend = best_weights[0] * test_xgb + best_weights[1] * test_lgb + best_weights[2] * test_cat

oof_binary = (oof_blend >= best_thr).astype(int)
print(f"\nPrecision: {precision_score(y_train, oof_binary):.4f}")
print(f"Recall: {recall_score(y_train, oof_binary):.4f}")
print(f"Predicted: {oof_binary.sum()} / {y_train.sum()} actual")

  Seed 1/3 done
  Seed 2/3 done
  Seed 3/3 done
  XGB: F1 = 0.6630
  LGB: F1 = 0.6630
  CAT: F1 = 0.6705

Best blend: XGB=0.4, LGB=0.0, CAT=0.6
Best threshold: 0.21
Best CV F1: 0.6744

Precision: 0.5879
Recall: 0.7905
Predicted: 199 / 148 actual


In [30]:
# Apply threshold to get binary predictions
test_preds = (test_blend >= best_thr).astype(int)

# Create submission dataframe
submission = pd.DataFrame({
    'object_id': test_df['object_id'], 
    'target': test_preds
})

# Save
submission.to_csv('submission.csv', index=False)

print(f"Saved submission.csv")
print(f"Threshold: {best_thr:.2f}")
print(f"Predicted TDEs: {test_preds.sum()}")

Saved submission.csv
Threshold: 0.21
Predicted TDEs: 422
