In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import polars as pl
from sklearn.model_selection import train_test_split
import xgboost as xgb
from tqdm.notebook import tqdm
import pickle

import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.signal_categories import train_category_labels
from src.file_locations import intermediate_files_location

from src.ntuple_variables.variables import wc_training_vars, combined_training_vars, lantern_training_vars, glee_training_vars, pandora_training_vars, pandora_scalar_training_vars, combined_postprocessing_training_vars

# Log the installed xgboost version alongside the notebook output.
print("xgboost version:", xgb.__version__)

# Use a 14pt default font for every matplotlib figure drawn below.
plt.rcParams['font.size'] = 14


In [None]:
# Name of the reference ("nominal") training whose outputs are read below.
nominal_training = "all_vars"

# Per-feature importances of the nominal training, highest weight importance first.
nominal_training_importances = (
    pd.read_csv(f"../training_outputs/{nominal_training}/feature_importances.csv")
    .sort_values(by="weight_importance", ascending=False)
)
# Feature names in that same ranked order; later cells slice this list.
sorted_training_vars = nominal_training_importances["feature"].to_list()

# Loss curves recorded during the nominal training.
nominal_training_curves = pd.read_csv(f"../training_outputs/{nominal_training}/training_curves.csv")

display(nominal_training_importances)
display(nominal_training_curves)


In [None]:
# Draw the weight-importance distribution twice with 200 equal-width bins:
# once over the range [0, 2000] and once zoomed in to [0, 10].
for upper_edge in (2000, 10):
    bins = np.linspace(0, upper_edge, 201)
    plt.figure(figsize=(10, 6))
    plt.hist(nominal_training_importances["weight_importance"], bins=bins, histtype="step")
    plt.xlabel("Weight Importance")
    plt.ylabel("Count")
    plt.show()


In [None]:

# Line plot of the nominal training and validation loss versus iteration.
plt.figure(figsize=(10, 6))
for loss_column, legend_label in (
    ("train_loss", "Nominal Training Loss"),
    ("test_loss", "Nominal Validation Loss"),
):
    plt.plot(nominal_training_curves["iteration"], nominal_training_curves[loss_column], label=legend_label)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

In [None]:
# Same nominal curves as scatter points, restricted to iterations 0-19 and
# losses between 0.6 and 1.6.
plt.figure(figsize=(10, 6))
for loss_column, legend_label in (
    ("train_loss", "Nominal Training Loss"),
    ("test_loss", "Nominal Validation Loss"),
):
    plt.scatter(nominal_training_curves["iteration"], nominal_training_curves[loss_column], label=legend_label)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xlim(0, 19)
plt.ylim(0.6, 1.6)
plt.show()


# New Trainings

In [None]:
# Number of boosting rounds for each short training (used as n_estimators).
num_epochs = 20
# Default feature list; the experiment cells below reassign training_vars.
training_vars = combined_training_vars

In [None]:
# Target column and the list of category labels used for training.
signal_category_var = "del1g_simple_signal_category"
signal_category_labels = train_category_labels

# Read the training-variable dataframe and drop rows whose filetype is "data".
all_df = (
    pl.read_parquet(f"{intermediate_files_location}/presel_df_train_vars.parquet")
    .filter(pl.col("filetype") != "data")
)

# Preselection: WC generic neutrino selection
# (should already be applied in the presel_df_train_vars.parquet file, so this
# is expected to keep every row and serves as a cross-check).
# NOTE(review): train_model reads all_df, not presel_df.
original_num_events = all_df.height
presel_df = all_df.filter(pl.col("wc_kine_reco_Enu") > 0)
preselected_num_events = presel_df.height
print(f"Preselected {preselected_num_events} / {original_num_events} events")

# Number of classes handed to the classifier.
num_categories = len(signal_category_labels)
print(f"{num_categories=}")


In [None]:
def get_importance_array(importance_dict, feature_names):
    """Return importances as an array aligned with ``feature_names``.

    The lookup is positional: entry ``i`` is read from the key ``f{i}``
    (XGBoost's f0, f1, f2, ... feature naming), not from the feature's own
    name. Positions with no entry in ``importance_dict`` get 0.0.

    Parameters
    ----------
    importance_dict : dict
        Mapping from ``'f<index>'`` keys to importance values.
    feature_names : iterable
        Features in training order; only their positions are used.

    Returns
    -------
    numpy.ndarray
        One importance value per entry of ``feature_names``.
    """
    return np.array([
        importance_dict.get(f'f{position}', 0.0)
        for position, _ in enumerate(feature_names)
    ])

In [None]:
def train_model(training_vars, test_frac, subsample, colsample_bytree):
    """Fit an XGBoost multi-class classifier on a train/test split of ``all_df``.

    Reads the notebook-level names ``all_df``, ``signal_category_var``,
    ``num_categories`` and ``num_epochs``.
    NOTE(review): the events come from ``all_df``, not ``presel_df``.

    Parameters
    ----------
    training_vars : list of str
        Columns of ``all_df`` used as input features, in this order.
    test_frac : float
        Passed to ``train_test_split`` as ``test_size``.
    subsample : float
        Forwarded to ``XGBClassifier(subsample=...)``.
    colsample_bytree : float
        Forwarded to ``XGBClassifier(colsample_bytree=...)``.

    Returns
    -------
    xgb.XGBClassifier
        The fitted model. Its evaluation history holds ``mlogloss`` and
        ``merror`` for the training sample (first eval set) and the test
        sample (second eval set).
    """

    print("loading variables...")
    x = all_df.select(training_vars).to_numpy()
    # select(...).to_numpy() gives an (n, 1) column; flatten labels and
    # weights so both are 1-D arrays with one entry per event.
    y = all_df.select(signal_category_var).to_numpy().flatten()
    w = all_df.select("wc_net_weight").to_numpy().flatten()

    x = x.astype(np.float64)
    # Values beyond +-1e10 are replaced by NaN, which XGBoost handles as
    # missing by default.
    x[(x > 1e10) | (x < -1e10)] = np.nan

    print("splitting into train and test...")
    # Fixed random_state keeps the split reproducible between runs.
    train_indices, test_indices = train_test_split(np.arange(all_df.height), test_size=test_frac, random_state=42)

    x_train = x[train_indices]
    y_train = y[train_indices]
    w_train = w[train_indices]

    x_test = x[test_indices]
    y_test = y[test_indices]
    w_test = w[test_indices]

    # Training sample first, test sample second; the weights follow the same order.
    eval_set = [(x_train, y_train), (x_test, y_test)]
    eval_weights = [w_train, w_test]

    print("creating model...")
    model = xgb.XGBClassifier(
        objective='multi:softprob',
        num_class=num_categories,
        n_estimators=num_epochs,
        eval_metric=['mlogloss', 'merror'],
        subsample=subsample,
        colsample_bytree=colsample_bytree,
    )

    print("fitting model...")
    model.fit(
        x_train, y_train,
        sample_weight=w_train,
        eval_set=eval_set,
        sample_weight_eval_set=eval_weights,
        verbose=1
    )
    print("done")

    return model



In [None]:
def get_dfs(model, training_vars):
    """Collect feature importances and loss curves from a fitted model.

    Parameters
    ----------
    model : xgb.XGBClassifier
        Fitted model that was trained with two evaluation sets (training
        sample first, test sample second).
    training_vars : list of str
        Feature names in the order used for training.

    Returns
    -------
    importance_df : pl.DataFrame
        Columns ``feature``, ``weight_importance``, ``gain_importance`` and
        ``cover_importance``, one row per entry of ``training_vars``.
    training_curves_df : pl.DataFrame
        Columns ``iteration``, ``train_loss`` and ``test_loss``, one row per
        recorded boosting round.
    """

    booster = model.get_booster()

    # One aligned array per importance type; features the booster reports no
    # score for get 0.0 from get_importance_array.
    importance_columns = {
        f'{importance_type}_importance': get_importance_array(
            booster.get_score(importance_type=importance_type), training_vars
        )
        for importance_type in ('weight', 'gain', 'cover')
    }
    importance_df = pl.DataFrame({'feature': training_vars, **importance_columns})

    evals_result = model.evals_result()
    train_metrics = evals_result['validation_0']
    test_metrics = evals_result['validation_1']
    # Pick the loss metric that was actually recorded, not one inferred from
    # the notebook-level class count.
    loss_key = 'mlogloss' if 'mlogloss' in train_metrics else 'logloss'
    train_loss = train_metrics[loss_key]
    test_loss = test_metrics[loss_key]

    # Size the iteration axis from the recorded curve so it always matches the
    # loss columns, even if the global num_epochs changed after training.
    training_curves_df = pl.DataFrame({
        'iteration': list(range(len(train_loss))),
        'train_loss': train_loss,
        'test_loss': test_loss
    })

    return importance_df, training_curves_df


In [None]:
# Results of every short training, keyed by
# (included_training_var_frac, test_frac, subsample, colsample_bytree);
# each value is the pair (importance_df, training_curves_df).
# Re-running this cell discards all stored results.
short_training_experiments = dict()


In [None]:
# Scan the fraction of importance-ranked training variables that is kept,
# with every other setting at its baseline value.
subsample = 1.0
colsample_bytree = 1.0
test_frac = 0.5
for included_training_var_frac in tqdm([0.1, 0.5, 0.9, 1.0]):
    # Keep the leading slice of the ranked list (the count is rounded down).
    num_included_training_vars = int(included_training_var_frac * len(sorted_training_vars))
    training_vars = sorted_training_vars[:num_included_training_vars]

    model = train_model(training_vars, test_frac, subsample, colsample_bytree)
    importance_df, training_curves_df = get_dfs(model, training_vars)

    experiment_key = (included_training_var_frac, test_frac, subsample, colsample_bytree)
    short_training_experiments[experiment_key] = (importance_df, training_curves_df)
    print(f"added {experiment_key} to short_training_experiments")


In [None]:
# Scan the test fraction using the full variable list and no subsampling.
# (test_frac = 0.5 is already stored by the variable-fraction scan.)
subsample = 1.0
colsample_bytree = 1.0
training_vars, included_training_var_frac = sorted_training_vars, 1.0
for test_frac in tqdm([0.1, 0.9]):
    model = train_model(training_vars, test_frac, subsample, colsample_bytree)
    importance_df, training_curves_df = get_dfs(model, training_vars)

    experiment_key = (included_training_var_frac, test_frac, subsample, colsample_bytree)
    short_training_experiments[experiment_key] = (importance_df, training_curves_df)
    print(f"added {experiment_key} to short_training_experiments")


In [None]:
# Scan colsample_bytree using the full variable list, a 50% test split and
# no row subsampling.
subsample = 1.0
training_vars, included_training_var_frac = sorted_training_vars, 1.0
test_frac = 0.5
for colsample_bytree in tqdm([0.1, 0.5, 0.9]):
    model = train_model(training_vars, test_frac, subsample, colsample_bytree)
    importance_df, training_curves_df = get_dfs(model, training_vars)

    experiment_key = (included_training_var_frac, test_frac, subsample, colsample_bytree)
    short_training_experiments[experiment_key] = (importance_df, training_curves_df)
    print(f"added {experiment_key} to short_training_experiments")
    

In [None]:
# Scan the row subsample fraction using the full variable list, a 50% test
# split and no column subsampling.
colsample_bytree = 1.0
training_vars, included_training_var_frac = sorted_training_vars, 1.0
test_frac = 0.5
for subsample in tqdm([0.1, 0.5, 0.9]):
    model = train_model(training_vars, test_frac, subsample, colsample_bytree)
    importance_df, training_curves_df = get_dfs(model, training_vars)

    experiment_key = (included_training_var_frac, test_frac, subsample, colsample_bytree)
    short_training_experiments[experiment_key] = (importance_df, training_curves_df)
    print(f"added {experiment_key} to short_training_experiments")
    

In [None]:
# Pickle the full experiments dictionary into the intermediate-files directory.
experiments_pickle_path = f"{intermediate_files_location}/short_training_experiments.pkl"
with open(experiments_pickle_path, "wb") as pickle_file:
    pickle.dump(short_training_experiments, pickle_file)


In [None]:
# List the key of every stored experiment:
# (included_training_var_frac, test_frac, subsample, colsample_bytree).
for k in short_training_experiments:
    print(k)

In [None]:
def plot_short_training_scan(scan_index, describe_setting, output_name):
    """Overlay the loss curves of one scan on the nominal curves and save the figure.

    Parameters
    ----------
    scan_index : int
        Position, inside the ``short_training_experiments`` key tuple
        ``(included_training_var_frac, test_frac, subsample, colsample_bytree)``,
        of the setting being scanned.
    describe_setting : callable
        Maps the scanned value to the text that follows "Validation with " and
        "Training with " in the legend.
    output_name : str
        File stem of the PNG written to ``../plots``.
    """
    # Values every non-scanned setting must have for an experiment to be drawn.
    baseline_key = (1.0, 0.5, 1.0, 1.0)

    plt.figure(figsize=(10, 6))
    plt.scatter(nominal_training_curves["iteration"], nominal_training_curves["test_loss"], label="Nominal Validation", c="black", s=100)
    plt.scatter(nominal_training_curves["iteration"], nominal_training_curves["train_loss"], label="Nominal Training", c="black", marker="x", s=100)

    colornum = 0
    for experiment_key, (_, training_curves_df) in short_training_experiments.items():
        # Skip experiments in which any setting other than the scanned one
        # differs from the baseline.
        off_baseline = any(
            value != baseline
            for position, (value, baseline) in enumerate(zip(experiment_key, baseline_key))
            if position != scan_index
        )
        if off_baseline:
            continue

        # One colour per experiment, shared by its validation and training points.
        color = f"C{colornum}"
        colornum += 1
        setting_text = describe_setting(experiment_key[scan_index])
        plt.scatter(training_curves_df["iteration"], training_curves_df["test_loss"], label=f"Validation with {setting_text}", c=color)
        plt.scatter(training_curves_df["iteration"], training_curves_df["train_loss"], label=f"Training with {setting_text}", marker="x", c=color)

    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend(fontsize=12)
    plt.xlim(0, 19)
    plt.ylim(0.6, 1.6)
    # Create the output directory if needed so savefig does not fail on a fresh checkout.
    os.makedirs("../plots", exist_ok=True)
    plt.savefig(f"../plots/{output_name}.png")


plot_short_training_scan(0, lambda frac: f"{frac*100:.0f}% variables", "short_training_experiments_variable_frac")
plot_short_training_scan(1, lambda frac: f"{(1-frac)*100:.0f}% train events", "short_training_experiments_test_frac")
plot_short_training_scan(2, lambda value: f"subsample={value}", "short_training_experiments_subsample")
plot_short_training_scan(3, lambda value: f"colsample_bytree={value}", "short_training_experiments_colsample_bytree")
