In [None]:
# Cell 1: Install and import dependencies
# conda create -n kf_py312 python=3.12
# conda activate kf_py312
# pip install ipykernel uproot h5py numpy pandas tensorflow scikit-learn
## In Ubuntu do:
# sudo apt update
# sudo apt --fix-broken install
# sudo apt install \
#  libsqlite3-0=3.37.2-2ubuntu0.4 \
#  libsqlite3-dev=3.37.2-2ubuntu0.4 \
#  sqlite3=3.37.2-2ubuntu0.4
# sudo apt-mark hold libsqlite3-0 libsqlite3-dev sqlite3
# # OR with conda (not required)
# conda install -c conda-forge openssl sqlite
# conda install -c conda-forge cudatoolkit-dev cudnn tensorflow ipykernel uproot\
# h5py pandas scikit-learn matplotlib awkward numpy

import sys, os, glob
import numpy as np
import pandas as pd
import h5py
import uproot
import awkward as ak
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Report the interpreter and key library versions so a run can be traced back
# to the environment that produced it.
_versions = [
    ("Python", sys.version.split()[0]),
    ("NumPy ", np.__version__),
    ("Pandas", pd.__version__),
    ("Awkward", ak.__version__),
    ("TensorFlow", tf.__version__),
]
for _label, _version in _versions:
    print(_label, _version)

In [None]:
"""
# Cell 2: Define Data Loader
from sklearn.utils import shuffle as sk_shuffle

# ───────────────────────── 1. Helper functions ──────────────────────────
def load_h5_data(files, label):
    dfs, ys, ws = [], [], []
    for f in files:
        with h5py.File(f, "r") as hf:
            c0 = [n.decode() for n in hf["df/block0_items"][:]]
            v0 = pd.DataFrame(hf["df/block0_values"][:], columns=c0)
            c1 = [n.decode() for n in hf["df/block1_items"][:]]
            v1 = pd.DataFrame(hf["df/block1_values"][:], columns=c1)
        df = pd.concat([v0, v1], axis=1)
        dfs.append(df[selected_variables])
        ws.append(df["weight"].values if "weight" in df.columns else np.ones(len(df)))
        ys.append(np.full(len(df), label))
    return (
        pd.concat(dfs, axis=0).reset_index(drop=True),
        np.concatenate(ys),
        np.concatenate(ws),
    )

def load_root_data(files, label, tree="sel_tree"):
    dfs, ys, ws = [], [], []
    for f in files:
        with uproot.open(f) as rf:
            if tree not in rf: continue
            arr = rf[tree].arrays(selected_variables + ["weight"], library="np")
        dfs.append(pd.DataFrame({v: arr[v] for v in selected_variables}))
        ws.append(arr["weight"]) # Preserving original weight
        ys.append(np.full(arr[selected_variables[0]].shape, label))
    return (
        pd.concat(dfs, axis=0).reset_index(drop=True),
        np.concatenate(ys),
        np.concatenate(ws),
    )

# ───────────────────────── 2. Configuration ─────────────────────────────
selected_variables = ['jet1_pt','jet1_eta','jet1met_dphi','met_sig','met_pt']

base_dir_LQ         = "/home/sgoswami/monobcntuples/"
signal_masses_LQ    = ["500","1000","1400","2000","2500","2800"]
base_dir_stop       = "/home/sgoswami/monobcntuples/run3_btag/all"
signal_pattern_stop = os.path.join(base_dir_stop, "singlestop", "basicSel_sT_*_*.root")

bkg_procs = ["ttbar","singletop","dijet","diboson","wlnu","zll","znunu"]

# ───────────────────────── 3. Discover files ────────────────────────────
def filter_ok(paths):
    return [f for f in paths if "_histogram" not in f and "_cutflow" not in f]

sig_files_LQ = []
for m in signal_masses_LQ:
    sig_files_LQ += filter_ok(glob.glob(os.path.join(base_dir_LQ, f"mass_{m}", "basicSel_mass_*.h5")))
sig_files_stop = filter_ok(glob.glob(signal_pattern_stop))

bkg_files_LQ_flat, bkg_files_stop_flat = [], []
for p in bkg_procs:
    bkg_files_LQ_flat.extend(filter_ok(glob.glob(os.path.join(base_dir_LQ, f"{p}_mc20e", "*.h5"))))
    root_path = os.path.join(base_dir_stop, p, f"basicSel_{p}.root")
    if os.path.exists(root_path):
        bkg_files_stop_flat.append(root_path)

print(">>> DEBUG: File Discovery Complete.")
print(f"    Found {len(sig_files_LQ)} LQ signal files and {len(bkg_files_LQ_flat)} LQ background files.")
print(f"    Found {len(sig_files_stop)} Stop signal files and {len(bkg_files_stop_flat)} Stop background files.")

# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# NEW LOADING AND BALANCING STRATEGY
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# ────────────────── 4. Process LQ Dataset (Signal + Background) ───────────────
print("\n--- Processing LQ Dataset ---")
# 4.1: Load LQ signal
X_LQ_sig, y_LQ_sig, w_LQ_sig = load_h5_data(sig_files_LQ, 1)
N_sig_LQ = len(X_LQ_sig)
print(f"Loaded {N_sig_LQ} LQ signal events. Target for background is {N_sig_LQ}.")

# 4.2: Load all LQ background file-by-file to get their sizes
lq_bkg_dfs, lq_bkg_sizes, lq_bkg_weights = [], [], []
for f in bkg_files_LQ_flat:
    Xf, _, wf = load_h5_data([f], 0)
    lq_bkg_dfs.append(Xf)
    lq_bkg_sizes.append(len(Xf))
    lq_bkg_weights.append(wf)
total_lq_bkg_events = sum(lq_bkg_sizes)
print(f"Found {total_lq_bkg_events} total available LQ background events across {len(lq_bkg_dfs)} files.")

# 4.3: Determine how many events to draw from each LQ background file proportionally
target_per_file_lq = [int(round(N_sig_LQ * (sz / total_lq_bkg_events))) for sz in lq_bkg_sizes]
while sum(target_per_file_lq) < N_sig_LQ: target_per_file_lq[np.argmax(lq_bkg_sizes)] += 1
while sum(target_per_file_lq) > N_sig_LQ: target_per_file_lq[np.argmax(target_per_file_lq)] -= 1

# 4.4: Sample from each LQ background file and shuffle the result
X_LQ_bkg_parts, y_LQ_bkg_parts, w_LQ_bkg_parts = [], [], []
for df, wt, k in zip(lq_bkg_dfs, lq_bkg_weights, target_per_file_lq):
    if k == 0: continue
    sel = np.random.choice(len(df), k, replace=False)
    X_LQ_bkg_parts.append(df.iloc[sel])
    y_LQ_bkg_parts.append(np.zeros(k, dtype=np.int64))
    w_LQ_bkg_parts.append(wt[sel])

X_LQ_bkg = pd.concat(X_LQ_bkg_parts, axis=0)
y_LQ_bkg = np.concatenate(y_LQ_bkg_parts)
w_LQ_bkg = np.concatenate(w_LQ_bkg_parts)

lq_bkg_perm = np.random.permutation(len(X_LQ_bkg))
X_LQ_bkg, y_LQ_bkg, w_LQ_bkg = X_LQ_bkg.iloc[lq_bkg_perm], y_LQ_bkg[lq_bkg_perm], w_LQ_bkg[lq_bkg_perm]
print(f"Sampled and shuffled {len(X_LQ_bkg)} LQ background events.")

# ────────────────── 5. Process Stop Dataset (Signal + Background) ──────────────
print("\n--- Processing Stop Dataset ---")
# 5.1: Load Stop signal
X_stop_sig, y_stop_sig, w_stop_sig = load_root_data(sig_files_stop, 1)
N_sig_stop = len(X_stop_sig)
print(f"Loaded {N_sig_stop} Stop signal events. Target for background is {N_sig_stop}.")

# 5.2: Load all Stop background file-by-file
stop_bkg_dfs, stop_bkg_sizes, stop_bkg_weights = [], [], []
for f in bkg_files_stop_flat:
    Xf, _, wf = load_root_data([f], 0)
    stop_bkg_dfs.append(Xf)
    stop_bkg_sizes.append(len(Xf))
    stop_bkg_weights.append(wf)
total_stop_bkg_events = sum(stop_bkg_sizes)
print(f"Found {total_stop_bkg_events} total available Stop background events across {len(stop_bkg_dfs)} files.")

# 5.3: Determine how many events to draw from each Stop background file proportionally
target_per_file_stop = [int(round(N_sig_stop * (sz / total_stop_bkg_events))) for sz in stop_bkg_sizes]
while sum(target_per_file_stop) < N_sig_stop: target_per_file_stop[np.argmax(stop_bkg_sizes)] += 1
while sum(target_per_file_stop) > N_sig_stop: target_per_file_stop[np.argmax(target_per_file_stop)] -= 1

# 5.4: Sample from each Stop background file and shuffle the result
X_stop_bkg_parts, y_stop_bkg_parts, w_stop_bkg_parts = [], [], []
for df, wt, k in zip(stop_bkg_dfs, stop_bkg_weights, target_per_file_stop):
    if k == 0: continue
    sel = np.random.choice(len(df), k, replace=False)
    X_stop_bkg_parts.append(df.iloc[sel])
    y_stop_bkg_parts.append(np.zeros(k, dtype=np.int64))
    w_stop_bkg_parts.append(wt[sel])

X_stop_bkg = pd.concat(X_stop_bkg_parts, axis=0)
y_stop_bkg = np.concatenate(y_stop_bkg_parts)
w_stop_bkg = np.concatenate(w_stop_bkg_parts)

stop_bkg_perm = np.random.permutation(len(X_stop_bkg))
X_stop_bkg, y_stop_bkg, w_stop_bkg = X_stop_bkg.iloc[stop_bkg_perm], y_stop_bkg[stop_bkg_perm], w_stop_bkg[stop_bkg_perm]
print(f"Sampled and shuffled {len(X_stop_bkg)} Stop background events.")


# ────────────────── 6. Concatenate All Datasets ─────────────────────────
print("\n--- Concatenating and Final Shuffling ---")
X_full = pd.concat([X_LQ_sig, X_LQ_bkg, X_stop_sig, X_stop_bkg], axis=0)
y_full = np.concatenate([y_LQ_sig, y_LQ_bkg, y_stop_sig, y_stop_bkg])
w_full = np.concatenate([w_LQ_sig, w_LQ_bkg, w_stop_sig, w_stop_bkg])
print(f"Concatenated all four datasets. Total events: {len(X_full)}")


# ────────────────── 7. Final Double Shuffle ─────────────────────────────
# First shuffle using numpy permutation
print("Performing first shuffle (numpy)")
perm1 = np.random.permutation(len(X_full))
X_full = X_full.iloc[perm1].reset_index(drop=True)
y_full = y_full[perm1]
w_full = w_full[perm1]

# Second shuffle using scikit-learn's utility (as a different method)
print("Performing second shuffle (sklearn)")
X_full, y_full, w_full = sk_shuffle(X_full, y_full, w_full)
X_full = X_full.reset_index(drop=True)


# ────────────────── 8. Final Output ─────────────────────────────────────
print("\n>>> FINAL DATASET STATS <<<")
print(f"    Total events: {len(X_full)}")
print(f"    Features shape: {X_full.shape}")
print(f"    Final class counts (0=bkg, 1=sig): {np.bincount(y_full.astype(int))}")
print(f"    (Signal should be {N_sig_LQ + N_sig_stop}, Background should be {N_sig_LQ + N_sig_stop})")
"""


In [None]:
# Cell 2: Revised data loader (cross-section-weighted background sampling)
import os
import glob
import numpy as np
import pandas as pd
import uproot
import h5py
import awkward as ak
from sklearn.model_selection import train_test_split

# --- 1. Configuration ---
# Input features read from every ntuple; the list order is the column order
# of the feature matrix handed to the network.
selected_variables = [
    "jet1_pt",
    "jet1_eta",
    "jet1met_dphi",
    "jet2met_dphi",
    "met_sig",
    "mjj",
    "pTjj",
    "mbb",
    "pTbb",
    "dRjj",
    "dEtajj",
    "dPhijj",
    "dRbb",
    "dEtabb",
    "dPhibb",
    "jet2_pt",
    "jet2_eta",
]

# LQ sample: HDF5 files under one "mass_<m>" directory per signal mass point.
base_dir_LQ = "/home/sgoswami/monobcntuples/"
signal_masses_LQ = [str(mass) for mass in (500, 1000, 1400, 2000, 2500, 2800)]

# Stop sample: ROOT files, signal matched by a glob pattern.
base_dir_stop = "/home/sgoswami/monobcntuples/run3_btag/all"
signal_pattern_stop = os.path.join(base_dir_stop, "singlestop", "basicSel_sT_*.root")

# Background process names; also the keys of the cross-section table below.
bkg_procs = "ttbar singletop dijet diboson wlnu zll znunu".split()


# --- 2. Helper Functions ---
def importance_shuffle(X, y=None):
    """Reorder X (and y, when given) with one shared random permutation.

    Despite the name, the permutation is uniform: no importance weighting is
    applied. Randomness comes from NumPy's global RNG.

    Parameters
    ----------
    X : pandas.DataFrame
        Rows to shuffle (accessed through ``.iloc``).
    y : array-like supporting integer-array indexing, optional
        Per-row values reordered with the same permutation as ``X``.

    Returns
    -------
    DataFrame, or (DataFrame, reordered y) when ``y`` is given. The returned
    frame has a fresh 0..n-1 index.
    """
    order = np.random.permutation(len(X))
    shuffled_X = X.iloc[order].reset_index(drop=True)
    return shuffled_X if y is None else (shuffled_X, y[order])

def filter_ok(paths):
    """Return the paths that contain neither "_histogram" nor "_cutflow".

    The check is a plain substring match on the whole path string, so a
    directory component containing either marker also excludes the file.
    Input order is preserved.
    """
    excluded_markers = ("_histogram", "_cutflow")
    kept = []
    for path in paths:
        if any(marker in path for marker in excluded_markers):
            continue
        kept.append(path)
    return kept

def load_root_data(file_list, label, tree_name="sel_tree", max_samples=None):
    """Read ``selected_variables`` from ROOT files into one shuffled DataFrame.

    Parameters
    ----------
    file_list : iterable of str
        ROOT file paths. Files that lack ``tree_name`` are reported and skipped.
    label : int
        Class label assigned to every returned row.
    tree_name : str
        Name of the TTree to read from each file.
    max_samples : int or None
        ``None`` returns every row in random order. An integer draws exactly
        that many rows; rows are drawn with replacement (i.e. duplicated) only
        when more rows are requested than are available. ``0`` returns no rows.

    Returns
    -------
    (DataFrame, ndarray)
        Feature rows with a fresh 0..n-1 index, and a label array of the same
        length. The label array takes its dtype from ``label`` even when it
        is empty, so integer labels stay integer.
    """
    frames = []
    for path in file_list:
        with uproot.open(path) as rf:
            if tree_name not in rf:
                print(f"Skipping {path}: no '{tree_name}'")
                continue
            arr = rf[tree_name].arrays(selected_variables, library="ak")
        frames.append(ak.to_dataframe(arr))

    if frames:
        combined = pd.concat(frames, axis=0).reset_index(drop=True)
    else:
        combined = pd.DataFrame(columns=selected_variables)

    # Nothing to sample from: return early. Asking pandas for n > 0 rows from
    # an empty frame raises, and np.full(0, label) keeps the label dtype
    # (a bare np.array([]) would be float64 and break np.bincount downstream).
    if len(combined) == 0:
        return combined, np.full(0, label)

    if max_samples is not None:
        n_draw = int(max_samples)
        with_replacement = n_draw > len(combined)
        combined = combined.sample(n=n_draw, replace=with_replacement)
    else:
        combined = combined.sample(frac=1)
    combined = combined.reset_index(drop=True)

    labels = np.full(len(combined), label)
    return combined, labels

def load_h5_data(file_list, label):
    """Read ``selected_variables`` from HDF5 files into one DataFrame.

    Each file is expected to hold a ``df`` group with ``block0_*`` and
    ``block1_*`` datasets (``*_items`` = column names as bytes,
    ``*_values`` = a 2-D value array); the two blocks are joined column-wise
    before the selected columns are picked.

    Parameters
    ----------
    file_list : iterable of str
        HDF5 file paths.
    label : int
        Class label assigned to every returned row.

    Returns
    -------
    (DataFrame, ndarray)
        Feature rows in file order with a fresh 0..n-1 index, and a label
        array of the same length. The label array takes its dtype from
        ``label`` even when it is empty, so integer labels stay integer.

    Raises
    ------
    KeyError
        If a file lacks one of the expected datasets or a selected column.
    """
    frames = []
    for path in file_list:
        with h5py.File(path, "r") as hf:
            blocks = []
            for block in ("block0", "block1"):
                columns = [name.decode() for name in hf[f"df/{block}_items"][:]]
                blocks.append(pd.DataFrame(hf[f"df/{block}_values"][:], columns=columns))
        frames.append(pd.concat(blocks, axis=1)[selected_variables])

    if not frames:
        # np.full(0, label) keeps the label dtype; a bare np.array([]) is
        # float64, which later makes np.bincount on the labels raise.
        return pd.DataFrame(columns=selected_variables), np.full(0, label)

    combined = pd.concat(frames, axis=0).reset_index(drop=True)
    labels = np.full(len(combined), label)
    return combined, labels


# --- 3. Physics-Aware Sampling Setup ---
# Per-process weight used to decide how many background events to draw from
# each process (see bkg_probs below).
# NOTE(review): each term looks like (cross-section x filter/selection
# efficiency) summed over the sub-samples of a process; units and the source
# of the numbers are not stated here -- confirm.
# NOTE(review): the three ttbar factors sum to ~1.1055 (> 1) -- confirm intended.
# The first dijet term alone (2.4331e9*0.00986 ~ 2.4e7) is orders of magnitude
# larger than every other entry, so bkg_probs['dijet'] is very close to 1.
eff_xs = {
    'ttbar':     729.77 * (0.10546 + 0.45623 + 0.54382),
    'singletop': 2.0267 + 1.2675 + 36.995 + 22.173,
    'dijet': (2.4331e9*0.00986+2.6450e7*0.01166+2.5461e5*0.01337+4.5532e3*0.01453+
              257.54*0.00947+16.215*0.01110+0.62506*0.01015+0.01964*0.01206),
    'diboson':   1.2974 + 4.661 + 12.079 + 0.02221 + 3.1081 + 0.57762,
    'wlnu': (21745.0*0.14699+21814.0*0.00923+21814.0*0.14722+21814.0*0.14343+
             21815.0*0.00912),
    'zll': (2221.3*0.02494+2221.3*0.12896+2221.4*0.02439+2221.4*0.12920+
            2239.7*0.02487+2239.6*0.12915+2239.6*0.84596),
    'znunu':     448.77 * 0.20291,
}
# Normalise so the per-process fractions sum to 1.
_total_eff = sum(eff_xs.values())
bkg_probs   = {p: xs/_total_eff for p, xs in eff_xs.items()}


# --- 4. Dataset Preparation Functions ---
def allocate_bkg_counts(n_total, probs, procs):
    """Split ``n_total`` events across ``procs`` in proportion to ``probs``.

    Per-process counts are rounded to integers, then corrected so they sum
    exactly to ``n_total`` and never go negative:

    * a shortfall is added to the process with the largest probability;
    * an excess is removed one event at a time from whichever process
      currently holds the most events.

    (Putting the whole correction on a fixed process can drive a process that
    was allotted 0 events to a negative count, which ``DataFrame.sample``
    rejects.)

    Parameters
    ----------
    n_total : int
        Total number of events to distribute.
    probs : mapping of str -> float
        Relative fraction for each process.
    procs : sequence of str
        Processes to allocate to; also the key order of the result.

    Returns
    -------
    dict of str -> int
        Non-negative counts summing to ``n_total`` (empty if ``procs`` is empty).
    """
    if not procs:
        return {}
    counts = {p: int(round(n_total * probs[p])) for p in procs}
    remainder = n_total - sum(counts.values())
    if remainder > 0:
        counts[max(procs, key=lambda p: probs[p])] += remainder
    while remainder < 0:
        largest = max(counts, key=counts.get)
        counts[largest] -= 1
        remainder += 1
    return counts


def _sample_rows(df, n_req):
    """Draw ``n_req`` rows from ``df``; rows repeat only if ``n_req`` exceeds ``len(df)``."""
    # NOTE(review): sampling with replacement duplicates events, and the
    # duplicates can later fall on both sides of a train/validation split.
    with_replacement = n_req > len(df)
    return df.sample(n=n_req, replace=with_replacement).reset_index(drop=True)


def _combine_signal_background(sig_df, sig_lbl, bkg_parts):
    """Stack signal and sampled background, label them 1/0-style and shuffle."""
    if bkg_parts:
        bkg_df = pd.concat(bkg_parts, axis=0).reset_index(drop=True)
        bkg_df = importance_shuffle(bkg_df)
    else:
        # pd.concat([]) raises; fall back to an empty frame with the right columns.
        bkg_df = pd.DataFrame(columns=selected_variables)

    X = pd.concat([sig_df, bkg_df], axis=0).reset_index(drop=True)
    # Force integer labels: an empty signal label array may arrive as float64,
    # which would otherwise turn the whole label vector into floats.
    y = np.concatenate([sig_lbl, np.zeros(len(bkg_df), dtype=int)]).astype(int)
    return importance_shuffle(X, y)


def prepare_LQ():
    """Build the shuffled LQ dataset from the HDF5 ntuples.

    Loads every LQ signal mass point, then draws as many background events as
    there are signal events, split across ``bkg_procs`` according to
    ``bkg_probs``. Processes with no files or no events are reported and
    skipped, so the background can end up smaller than the signal.

    Returns
    -------
    (DataFrame, ndarray)
        Shuffled features and integer labels (1 = signal, 0 = background).
    """
    sig_files = []
    for m in signal_masses_LQ:
        sig_files += filter_ok(glob.glob(os.path.join(base_dir_LQ, f"mass_{m}", "basicSel_mass_*.h5")))

    sig_df, sig_lbl = load_h5_data(sig_files, label=1)
    bkg_counts = allocate_bkg_counts(len(sig_df), bkg_probs, bkg_procs)

    # W and Z backgrounds are stored per lepton flavour; every other process
    # lives in a single "<proc>_mc20e" directory.
    split_dirs = {
        'wlnu': ['wenu_mc20e', 'wmunu_mc20e', 'wtaunu_mc20e'],
        'zll': ['zee_mc20e', 'zmumu_mc20e', 'ztautau_mc20e'],
    }

    bkg_parts = []
    for proc in bkg_procs:
        n_req = bkg_counts[proc]
        if n_req == 0:
            continue

        files = []
        for d in split_dirs.get(proc, [f"{proc}_mc20e"]):
            files += glob.glob(os.path.join(base_dir_LQ, d, "*.h5"))
        files = filter_ok(files)
        if not files:
            print(f"Warning: no LQ background files found for '{proc}'; skipping {n_req} events")
            continue

        df_all, _ = load_h5_data(files, label=0)
        if len(df_all) == 0:
            print(f"Warning: LQ background '{proc}' has no events; skipping {n_req} events")
            continue

        bkg_parts.append(_sample_rows(df_all, n_req))

    return _combine_signal_background(sig_df, sig_lbl, bkg_parts)


def prepare_stop():
    """Build the shuffled single-stop dataset from the ROOT ntuples.

    Same procedure as ``prepare_LQ``: the background is sized to the signal
    and split across ``bkg_procs`` according to ``bkg_probs``. Processes with
    no file or no events are reported and skipped.

    Returns
    -------
    (DataFrame, ndarray)
        Shuffled features and integer labels (1 = signal, 0 = background).
    """
    sig_files = filter_ok(glob.glob(signal_pattern_stop))
    sig_df, sig_lbl = load_root_data(sig_files, label=1)
    bkg_counts = allocate_bkg_counts(len(sig_df), bkg_probs, bkg_procs)

    bkg_parts = []
    for proc in bkg_procs:
        n_req = bkg_counts[proc]
        if n_req == 0:
            continue

        files = filter_ok(glob.glob(os.path.join(base_dir_stop, proc, f"basicSel_{proc}.root")))
        if not files:
            print(f"Warning: no Stop background files found for '{proc}'; skipping {n_req} events")
            continue

        # Load everything first and sample here, so an empty tree is skipped
        # instead of being sampled from.
        df_all, _ = load_root_data(files, label=0)
        if len(df_all) == 0:
            print(f"Warning: Stop background '{proc}' has no events; skipping {n_req} events")
            continue

        bkg_parts.append(_sample_rows(df_all, n_req))

    return _combine_signal_background(sig_df, sig_lbl, bkg_parts)


# --- 5. Final Data Loading ---
# Seed NumPy's global RNG before any sampling/shuffling so the dataset built
# below is the same on every Restart & Run All (np.random.permutation and
# pandas' DataFrame.sample without random_state both draw from it).
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

X_LQ, y_LQ = prepare_LQ()
print(f"Loaded LQ dataset. Shape: {X_LQ.shape}, {y_LQ.shape}")
X_stop, y_stop = prepare_stop()
print(f"Loaded Stop dataset. Shape: {X_stop.shape}, {y_stop.shape}")

# Merge both analyses into one training set and shuffle once more so the two
# sources are interleaved.
X_full_df = pd.concat([X_LQ, X_stop], axis=0).reset_index(drop=True)
# Cast to int: np.bincount rejects float labels, which is what an empty
# float label array from either source would turn the concatenation into.
y_full = np.concatenate([y_LQ, y_stop]).astype(int)
X_full_df, y_full = importance_shuffle(X_full_df, y_full)

assert len(X_full_df) == len(y_full), "features and labels must have the same length"

print(f"\nCombined dataset shape: {X_full_df.shape}")
print(f"Combined class distribution: {np.bincount(y_full)}")

In [None]:
# Cell 3: Model Architecture, Optimizer, and Callbacks

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU, BatchNormalization, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.metrics import AUC, Precision, Recall

# Number of cross-validation folds.
FOLDS = 5
# Input width of the network. Derived from the feature list defined in Cell 2
# instead of a hard-coded placeholder, so the summary printed in this cell
# shows the same architecture that is trained later.
NUM_FEATURES = len(selected_variables)

def build_model(input_dim):
    """Create the (uncompiled) binary classifier.

    Architecture: an input of width ``input_dim``, three fully connected
    hidden layers of 128, 64 and 32 units (GELU activation, He-normal kernel
    initialisation), and a single sigmoid output unit.

    Parameters
    ----------
    input_dim : int
        Number of input features.

    Returns
    -------
    A Keras ``Sequential`` model; the caller is responsible for compiling it.
    """
    hidden_widths = (128, 64, 32)

    stack = [Input(shape=(input_dim,))]
    for width in hidden_widths:
        stack.append(Dense(width, kernel_initializer='he_normal', activation='gelu'))
    stack.append(Dense(1, activation='sigmoid'))

    return Sequential(stack)

# Reference model: built and compiled here so its layer summary can be printed.
model = build_model(input_dim=NUM_FEATURES)

_reference_metrics = [
    AUC(name='auc'),
    'accuracy',
    Precision(name='precision'),
    Recall(name='recall'),
]
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=_reference_metrics,
)

# All three callbacks watch the validation AUC and treat larger as better.
_watch_val_auc = dict(monitor='val_auc', mode='max')
callbacks = [
    # Give up after 15 epochs without improvement.
    EarlyStopping(patience=15, verbose=1, **_watch_val_auc),
    # Multiply the learning rate by 0.2 after 5 stagnant epochs (floor 1e-7).
    ReduceLROnPlateau(factor=0.2, patience=5, verbose=1, min_lr=1e-7, **_watch_val_auc),
    # Keep only the best-scoring model on disk.
    ModelCheckpoint("model_best.keras", save_best_only=True, **_watch_val_auc),
]

model.summary()

In [None]:
# Cell 4 - Revised Training Loop

import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import ModelCheckpoint

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC, Precision, Recall

# Metrics reported during training. They are passed as `weighted_metrics`
# below, so they take the per-event sample weights into account.
metric_list = [
    AUC(name='auc'),
    'accuracy',
    Precision(name='precision'),
    Recall(name='recall')
]

# NOTE(review): this learning rate is 1000x smaller than the 0.001 used for the
# reference model compiled in Cell 3 -- confirm it is intended.
lr = 1e-6
epochs = 100
batch_size = 1024
FOLDS = 5

# Use the outputs of the active data-loading cell (X_full_df, y_full). The
# names X_full / w_full only appear inside the disabled, string-wrapped
# legacy cell and are never defined on a fresh kernel.
X_arr = X_full_df.values.astype(np.float32)
y_arr = y_full.astype(np.int64)
# The active loader carries no per-event weights, so every event counts once.
# The weight plumbing is kept so real weights can be swapped in here later.
w_arr = np.ones(len(y_arr), dtype=np.float32)
assert len(X_arr) == len(y_arr) == len(w_arr), "features, labels and weights must align"

print(f"Starting training with LR={lr}, BS={batch_size}")

skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
all_histories = []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_arr, y_arr), 1):
    print(f"\n=== Fold {fold}/{FOLDS} ===")

    X_tr_raw, X_va_raw = X_arr[tr_idx], X_arr[va_idx]
    y_tr, y_va = y_arr[tr_idx], y_arr[va_idx]
    w_tr, w_va = w_arr[tr_idx], w_arr[va_idx]

    # Fit the scaler on the training part only so no validation statistics
    # leak into training.
    scaler = StandardScaler()
    X_tr = scaler.fit_transform(X_tr_raw)
    X_va = scaler.transform(X_va_raw)

    print("Train:", X_tr.shape, "Val:", X_va.shape)

    # Fresh model per fold.
    model = build_model(input_dim=X_tr.shape[1])
    model.compile(
        optimizer=Adam(learning_rate=lr),
        loss='binary_crossentropy',
        weighted_metrics=metric_list
    )

    # Reuse the early-stopping and LR-schedule callbacks from Cell 3 (its
    # first two entries) and add a checkpoint that writes one file per fold.
    # A new list is built so the shared `callbacks` list is left untouched.
    fold_callbacks = [
        *callbacks[:2],
        ModelCheckpoint(
            f"model_fold_{fold}.h5",
            monitor='val_auc',
            save_best_only=True,
            mode='max'
        ),
    ]

    history = model.fit(
        X_tr,
        y_tr,
        sample_weight=w_tr,
        validation_data=(X_va, y_va, w_va),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=fold_callbacks,
        verbose=1
    )
    all_histories.append(history)
    print("\n--- Fold", fold, "Loss Summary ---")
    val_loss_history = history.history['val_loss']
    print("Validation loss for the first 5 epochs:", val_loss_history[:5])

In [None]:
# Cell 5: Final Evaluation Cell (run after all folds are trained)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.preprocessing import StandardScaler

print("--- Starting Final K-Fold Evaluation ---")

# Same splitter settings as the training cell, so each saved model is scored
# on the fold it was validated on.
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

all_true_labels, all_pred_probs, per_fold_aucs = [], [], []

fold = 0
for tr_idx, va_idx in skf.split(X_arr, y_arr):
    fold += 1
    print(f"Loading and evaluating Fold {fold}...")

    X_tr_raw = X_arr[tr_idx]
    X_va_raw = X_arr[va_idx]
    y_va = y_arr[va_idx]

    # Rebuild the fold's scaler from its training rows, then scale the held-out rows.
    scaler = StandardScaler().fit(X_tr_raw)
    X_va = scaler.transform(X_va_raw)

    model = tf.keras.models.load_model(f"model_fold_{fold}.h5")
    fold_pred_probs = model.predict(X_va, batch_size=batch_size * 2).squeeze()
    fold_auc = roc_auc_score(y_va, fold_pred_probs)

    all_pred_probs.append(fold_pred_probs)
    all_true_labels.append(y_va)
    per_fold_aucs.append(fold_auc)
    print(f"  AUC for Fold {fold}: {fold_auc:.4f}")

# Out-of-fold predictions for the whole dataset.
y_true_full = np.concatenate(all_true_labels)
y_pred_full = np.concatenate(all_pred_probs)

mean_auc, std_auc = np.mean(per_fold_aucs), np.std(per_fold_aucs)
print(f"\nOverall Model Performance:")
print(f"  Mean AUC = {mean_auc:.4f}")
print(f"  AUC Std. Dev. = {std_auc:.4f}")

# ROC curve over the pooled out-of-fold predictions.
fpr, tpr, _ = roc_curve(y_true_full, y_pred_full)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, label=f'Overall AUC = {roc_auc:.4f}', lw=2)
plt.plot([0, 1], [0, 1], 'k--', lw=1, label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Overall ROC Curve (from all folds)')
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()

# Classifier output distribution, one normalised histogram per true class.
plt.figure(figsize=(6, 5))
for class_value, class_name in ((0, 'Background'), (1, 'Signal')):
    plt.hist(y_pred_full[y_true_full==class_value], bins=50, alpha=0.6, label=class_name, density=True)
plt.xlabel('Predicted Probability for Signal Class')
plt.ylabel('Density')
plt.title('Overall Output Distribution (from all folds)')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Cell 6: ROC Curve and Confusion Matrix

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc

# --- 1. Prepare Data for Plotting ---
# Hard class decisions at a 0.5 probability threshold (needed for the confusion matrix).
y_pred_class = (y_pred_full > 0.5).astype(int)
cm = confusion_matrix(y_true_full, y_pred_class)

# ROC points and area from the pooled out-of-fold probabilities.
fpr, tpr, _ = roc_curve(y_true_full, y_pred_full)
roc_auc = auc(fpr, tpr)


# --- 2. Create the Plots ---
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: ROC curve with the chance diagonal for reference.
ax1.plot(fpr, tpr, label=f'Overall AUC = {roc_auc:.4f}', lw=2)
ax1.plot([0, 1], [0, 1], 'k--', lw=1, label='Chance')
ax1.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Overall ROC Curve',
)
ax1.legend(loc='lower right')

# Right panel: confusion matrix with raw integer counts.
disp = ConfusionMatrixDisplay(cm, display_labels=['Background','Signal'])
disp.plot(ax=ax2, cmap='Blues', colorbar=False, values_format='d')
ax2.set_title('Overall Confusion Matrix')

plt.tight_layout()
plt.show()