In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['hatch.linewidth'] = 0.2
import numpy as np
import pandas as pd
import pickle
from tqdm.notebook import tqdm
import polars as pl
import xgboost as xgb
print("xgboost version:", xgb.__version__)

import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.signal_categories import topological_category_labels, topological_category_colors, topological_category_labels_latex, topological_category_hatches, topological_categories_dic
from src.signal_categories import filetype_category_labels, filetype_category_colors, filetype_category_hatches
from src.signal_categories import del1g_detailed_category_labels, del1g_detailed_category_colors, del1g_detailed_category_labels_latex, del1g_detailed_category_hatches, del1g_detailed_categories_dic
from src.signal_categories import del1g_simple_category_labels, del1g_simple_category_colors, del1g_simple_category_labels_latex, del1g_simple_category_hatches, del1g_simple_categories_dic
from src.signal_categories import train_category_labels, train_category_labels_latex

from src.ntuple_variables.pandora_variables import pandora_scalar_second_half_training_vars

from src.file_locations import intermediate_files_location

from src.plot_helpers import make_histogram_plot

from src.ntuple_variables.variables import combined_training_vars


# File Loading

In [None]:
# Reconstructed categories (and their LaTeX labels) are taken directly from the
# label lists used when training the BDT.
reco_category_labels_latex = train_category_labels_latex
reco_categories = train_category_labels

# `training_vars` is the list of input columns handed to the BDT;
# `training` names the sub-directory of ../training_outputs that is read later.
training_vars = combined_training_vars
training = "all_vars"


In [None]:
# Load the full event table and the BDT predictions, then attach the predictions
# to every event with a left join on the event identifiers.
print("loading all_df.parquet...")
all_df = pl.read_parquet(f"{intermediate_files_location}/all_df.parquet")
print(f"{all_df.shape=}")

# this only includes predictions for events passing the preselection used during training
print("loading predictions.parquet...")
pred_df = pl.read_parquet(f"../training_outputs/{training}/predictions.parquet")
print(f"{pred_df.shape=}")

# The predictions come from a parquet file, so the progress message names that file.
print("merging all_df and predictions.parquet...")
merged_df_no_data_drop = all_df.join(
    pred_df,
    on=["filetype", "run", "subrun", "event"],
    how="left",
)
# Only the merged frame is used from here on; release the two inputs.
del all_df
del pred_df

# Rows with wc_kine_reco_Enu < 0 are forced to used_for_training=False and
# used_for_testing=True; every other row keeps whatever value the join produced.
negative_reco_enu_expr = pl.col("wc_kine_reco_Enu") < 0
merged_df_no_data_drop = merged_df_no_data_drop.with_columns([
    pl.when(negative_reco_enu_expr)
      .then(False)
      .otherwise(pl.col("used_for_training"))
      .alias("used_for_training"),
    pl.when(negative_reco_enu_expr)
      .then(True)
      .otherwise(pl.col("used_for_testing"))
      .alias("used_for_testing"),
])



In [None]:
# Separate the merged frame into the prediction (everything except data and the
# two one-gamma overlay file types) and the real data.
non_prediction_filetypes = ["data", "isotropic_one_gamma_overlay", "delete_one_gamma_overlay"]
full_pred = merged_df_no_data_drop.filter(~pl.col("filetype").is_in(non_prediction_filetypes))
full_data = merged_df_no_data_drop.filter(pl.col("filetype") == "data")

# Probability columns that are null are filled with the sentinel value -1,
# for the prediction and the data alike.
prob_categories = ["prob_" + category for category in reco_categories]
null_prob_fill_exprs = [pl.col(prob_name).fill_null(-1) for prob_name in prob_categories]
full_pred = full_pred.with_columns(null_prob_fill_exprs)
full_data = full_data.with_columns(null_prob_fill_exprs)

# Prediction rows with positive / negative reconstructed neutrino energy are handled separately.
reco_enu = pl.col("wc_kine_reco_Enu")
generic_pred_df = full_pred.filter(reco_enu > 0)
non_generic_pred_df = full_pred.filter(reco_enu < 0)
del full_pred

# Count how the positive-energy prediction rows were split between training and testing.
num_train_events = len(generic_pred_df.filter(pl.col("used_for_training").eq(True)))
num_test_events = len(generic_pred_df.filter(pl.col("used_for_testing").eq(True)))
print(f"num_train_events: {num_train_events}, num_test_events: {num_test_events}")
frac_test = num_test_events / (num_train_events + num_test_events)
print(f"weighting up preselected prediction events by the fraction of test/train events: {frac_test:.3f}")

# Test rows have their weight divided by the test fraction; all other rows keep their weight.
scaled_up_weight = pl.col("wc_net_weight") / frac_test
generic_pred_df = generic_pred_df.with_columns(
    pl.when(pl.col("used_for_testing"))
    .then(scaled_up_weight)
    .otherwise(pl.col("wc_net_weight"))
    .alias("wc_net_weight")
)

# Reassemble the prediction and keep only the rows flagged for testing.
full_pred = pl.concat([generic_pred_df, non_generic_pred_df])
del generic_pred_df, non_generic_pred_df

test_pred = full_pred.filter(pl.col("used_for_testing").eq(True))


In [None]:
# Stack the test-set prediction on top of the data, then drop the two inputs.
merged_df = pl.concat([test_pred, full_data])
del test_pred, full_data

# Preselection: keep rows with a positive reconstructed neutrino energy.
presel_merged_df = merged_df.filter(pl.col("wc_kine_reco_Enu") > 0)

# Split the preselected rows into data and prediction by file type.
filetype_col = pl.col("filetype")
presel_merged_pred_df = presel_merged_df.filter(filetype_col != "data")
presel_merged_data_df = presel_merged_df.filter(filetype_col == "data")


In [None]:
# Create boolean expressions for each condition
shw_sp_n_20mev_showers_expr = pl.col("wc_shw_sp_n_20mev_showers") > 0
reco_nuvtxX_expr = (pl.col("wc_reco_nuvtxX") > 5.0) & (pl.col("wc_reco_nuvtxX") < 250.0)
single_photon_numu_score_expr = pl.col("wc_single_photon_numu_score") > 0.4
single_photon_other_score_expr = pl.col("wc_single_photon_other_score") > 0.2
single_photon_ncpi0_score_expr = pl.col("wc_single_photon_ncpi0_score") > -0.05
single_photon_nue_score_expr = pl.col("wc_single_photon_nue_score") > -1.0
shw_sp_n_20br1_showers_expr = pl.col("wc_shw_sp_n_20br1_showers") == 1

# Combine all conditions
selection_expr = (
    shw_sp_n_20mev_showers_expr &
    reco_nuvtxX_expr &
    single_photon_numu_score_expr &
    single_photon_other_score_expr &
    single_photon_ncpi0_score_expr &
    single_photon_nue_score_expr &
    shw_sp_n_20br1_showers_expr
)

# Add selection column using when/then/otherwise
presel_merged_df = presel_merged_df.with_columns(
    pl.when(selection_expr)
    .then(1)
    .otherwise(0)
    .alias("erin_inclusive_1g_sel")
)


In [None]:
# Get the probabilities and find argmax index for each row
presel_merged_df = presel_merged_df.with_columns(
    pl.concat_list(prob_categories).list.arg_max().alias("reco_category_argmax_index")
)

# Build list of query strings
reco_category_argmax_queries = []
for i, signal_category in enumerate(reco_categories):
    reco_category_argmax_queries.append(pl.col("reco_category_argmax_index") == i)


In [None]:
# Score the detector-variation sample with the trained BDT and attach one
# prob_<category> column per output class.
presel_detvar_df = pl.read_parquet(f"{intermediate_files_location}/detvar_presel_df_train_vars.parquet")

# Feature matrix in training-variable order; infinities are replaced by NaN before scoring.
x = presel_detvar_df.select(training_vars).to_numpy()
x = x.astype(np.float64)
x[np.isinf(x)] = np.nan

# Load the trained classifier from the JSON model file of the chosen training.
model = xgb.XGBClassifier()
model.load_model(f"../training_outputs/{training}/bdt.json")

all_probabilities = model.predict_proba(x)

# Fail loudly if the model's classes do not line up with the label list, instead of
# silently attaching too few columns or hitting a bare IndexError.
num_probabilities = all_probabilities.shape[1]
if num_probabilities != len(train_category_labels):
    raise ValueError(
        f"model returned {num_probabilities} class probabilities but "
        f"train_category_labels has {len(train_category_labels)} entries"
    )

# Attach every probability column in a single with_columns call.
presel_detvar_df = presel_detvar_df.with_columns([
    pl.Series(f'prob_{label}', np.ascontiguousarray(all_probabilities[:, class_index]))
    for class_index, label in enumerate(train_category_labels)
])


# Preselection Histogram

In [None]:
# Histogram of wc_kine_reco_Enu for the preselected sample: 20 equal bins from 0 to 2000.
make_histogram_plot(
    pred_and_data_sel_df=presel_merged_df,
    detvar_df=presel_detvar_df,
    selname="generic_presel",
    title="Preselection",
    var="wc_kine_reco_Enu",
    display_var=r"WC Reconstructed $E_\nu$ (MeV)",
    bins=np.linspace(0, 2000, 21),
    include_ratio=True,
    include_decomposition=True,
    # systematics configuration
    use_rw_systematics=True,
    use_detvar_systematics=True,
    use_detvar_bootstrapping=True,
    dont_load_rw_from_systematic_cache=False,
    dont_load_detvar_from_systematic_cache=False,
    # extra panels
    plot_det_variations=True,
    plot_sys_breakdown=True,
    # Optional breakdown switches, currently disabled:
    #include_data_stat=False, include_pred_stat=False, include_total=False, just_genie_breakdown=False, include_rw=False,
    #just_detvar_breakdown=True,
    #print_sys_breakdown=False,
)


# Grid Plot Cuts

In [None]:
# Hand-tuned cut per reco category. The list is positional: entry i belongs to
# reco_categories[i]. None means "use the argmax query for that category".
custom_cut_values = [
    pl.col("prob_1gNp") > 0.3,
    pl.col("prob_1g0p") > 0.9,
    pl.col("prob_1gNp1mu") > 0.5,
    pl.col("prob_1g0p1mu") > 0.2,
    pl.col("prob_1g_outFV") > 0.5,
    None, #pl.col("prob_NC1pi0_Np") > 0.5,
    None, #pl.col("prob_NC1pi0_0p") > 0.5,
    (pl.col("prob_numuCC1pi0_Np") > 0.3) & (pl.col("prob_numuCC1pi0_0p") < 0.1),
    pl.col("prob_numuCC1pi0_0p") > 0.1,
    pl.col("prob_1pi0_outFV") > 0.1,
    (pl.col("prob_nueCC_Np") > 0.05) & (pl.col("prob_nueCC_0p") < 0.05),
    pl.col("prob_nueCC_0p") > 0.05,
    pl.col("prob_numuCC_Np") > 0.5,
    pl.col("prob_numuCC_0p") > 0.5,
    pl.col("prob_pi0_dalitz_decay") > 0.1,
    pl.col("prob_multi_pi0") > 0.02,
    pl.col("prob_eta_other") > 0.01,
    None, #pl.col("prob_NC_no_gamma") > 0.5,
    None, #pl.col("prob_other_outFV_dirt") > 0.5,
    None, #pl.col("prob_ext") > 0.5,
]

# Because the cuts are matched to categories purely by position, a list that has
# drifted out of step with reco_categories must be rejected rather than silently used.
if len(custom_cut_values) != len(reco_categories):
    raise ValueError(
        f"custom_cut_values has {len(custom_cut_values)} entries but reco_categories has "
        f"{len(reco_categories)}; the two must stay aligned index by index"
    )

# Resolve each entry to either its custom cut or the argmax fallback.
reco_category_queries_possible_overlap = [
    argmax_query if custom_cut_value is None else custom_cut_value
    for custom_cut_value, argmax_query in zip(custom_cut_values, reco_category_argmax_queries)
]

# Make the categories mutually exclusive: category i additionally requires that
# none of the queries for categories 0..i-1 pass, so earlier categories take priority.
reco_category_queries = []
for i, curr_query in enumerate(reco_category_queries_possible_overlap):
    for earlier_query in reco_category_queries_possible_overlap[:i]:
        curr_query = curr_query & ~earlier_query

    reco_category_queries.append(curr_query)

# saving this to a file so we can apply the same cuts in the efficiency plots
with open(f"{intermediate_files_location}/reco_category_queries.pkl", "wb") as f:
    pickle.dump(reco_category_queries, f)


# 1gNp1mu and 1g0p1mu studies

In [None]:
# Histogram of the prob_1gNp1mu score for the preselected sample:
# 20 equal bins on [0, 1], log-scale y axis, no overflow/underflow bins, no legend.
make_histogram_plot(
    pred_and_data_sel_df=presel_merged_df,
    detvar_df=presel_detvar_df,
    selname="generic_presel",
    title="Preselection",
    var="prob_1gNp1mu",
    bins=np.linspace(0, 1, 21),
    include_overflow=False,
    include_underflow=False,
    log_y=True,
    include_legend=False,
    include_ratio=True,
    include_decomposition=True,
    # systematics configuration
    use_rw_systematics=True,
    use_detvar_systematics=True,
    # systematic-breakdown panel options
    plot_sys_breakdown=True,
    just_detvar_breakdown=True,
    just_genie_breakdown=False,
    include_data_stat=False,
    include_pred_stat=False,
    include_total=False,
    include_rw=False,
    print_sys_breakdown=False,
)


In [None]:
# Look up the non-overlapping query for each muon-tagged single-photon category once,
# then apply it to both the detector-variation sample and the prediction+data sample.
query_1gNp1mu = reco_category_queries[reco_categories.index("1gNp1mu")]
query_1g0p1mu = reco_category_queries[reco_categories.index("1g0p1mu")]

detvar_sel_1gNp1mu_df = presel_detvar_df.filter(query_1gNp1mu)
pred_and_data_sel_1gNp1mu_df = presel_merged_df.filter(query_1gNp1mu)

detvar_sel_1g0p1mu_df = presel_detvar_df.filter(query_1g0p1mu)
pred_and_data_sel_1g0p1mu_df = presel_merged_df.filter(query_1g0p1mu)

In [None]:
# wc_kine_reco_Enu in 10 equal bins from 0 to 2000 for the two selections.
# Options common to both plots are collected once.
enu_plot_opts = dict(
    var="wc_kine_reco_Enu",
    include_overflow=True,
    include_underflow=False,
    log_y=False,
    include_legend=False,
    include_ratio=True,
    include_decomposition=True,
    use_rw_systematics=True,
    use_detvar_systematics=True,
)

make_histogram_plot(
    pred_and_data_sel_df=pred_and_data_sel_1gNp1mu_df,
    detvar_df=detvar_sel_1gNp1mu_df,
    bins=np.linspace(0, 2000, 11),
    title="1gNp1mu Selection",
    selname="1gNp1mu_sel",
    **enu_plot_opts,
)

make_histogram_plot(
    pred_and_data_sel_df=pred_and_data_sel_1g0p1mu_df,
    detvar_df=detvar_sel_1g0p1mu_df,
    bins=np.linspace(0, 2000, 11),
    title="1g0p1mu Selection",
    selname="1g0p1mu_sel",
    **enu_plot_opts,
)

In [None]:
# wc_reco_showerKE in 10 equal bins from 0 to 1 for the two selections.
# Options common to both plots are collected once.
shower_ke_plot_opts = dict(
    var="wc_reco_showerKE",
    include_overflow=True,
    include_underflow=False,
    log_y=False,
    include_legend=False,
    include_ratio=True,
    include_decomposition=True,
    use_rw_systematics=True,
    use_detvar_systematics=True,
)

make_histogram_plot(
    pred_and_data_sel_df=pred_and_data_sel_1gNp1mu_df,
    detvar_df=detvar_sel_1gNp1mu_df,
    bins=np.linspace(0, 1, 11),
    title="1gNp1mu Selection",
    selname="1gNp1mu_sel",
    **shower_ke_plot_opts,
)

make_histogram_plot(
    pred_and_data_sel_df=pred_and_data_sel_1g0p1mu_df,
    detvar_df=detvar_sel_1g0p1mu_df,
    bins=np.linspace(0, 1, 11),
    title="1g0p1mu Selection",
    selname="1g0p1mu_sel",
    **shower_ke_plot_opts,
)

In [None]:
# wc_reco_muonMomentum_3 in 10 equal bins from 0 to 2 for the two selections.
# Options common to both plots are collected once.
muon_momentum_plot_opts = dict(
    var="wc_reco_muonMomentum_3",
    include_overflow=True,
    include_underflow=False,
    log_y=False,
    include_legend=False,
    include_ratio=True,
    include_decomposition=True,
    use_rw_systematics=True,
    use_detvar_systematics=True,
)

make_histogram_plot(
    pred_and_data_sel_df=pred_and_data_sel_1gNp1mu_df,
    detvar_df=detvar_sel_1gNp1mu_df,
    bins=np.linspace(0, 2, 11),
    title="1gNp1mu Selection",
    selname="1gNp1mu_sel",
    **muon_momentum_plot_opts,
)

make_histogram_plot(
    pred_and_data_sel_df=pred_and_data_sel_1g0p1mu_df,
    detvar_df=detvar_sel_1g0p1mu_df,
    bins=np.linspace(0, 2, 11),
    title="1g0p1mu Selection",
    selname="1g0p1mu_sel",
    **muon_momentum_plot_opts,
)


In [None]:
# wc_reco_num_protons_35_MeV for the two selections, using unit-width bins
# centred on the integers 0 through 10 (edges at -0.5, 0.5, ..., 10.5).
# Options common to both plots are collected once.
proton_count_plot_opts = dict(
    var="wc_reco_num_protons_35_MeV",
    include_overflow=True,
    include_underflow=False,
    log_y=False,
    include_legend=False,
    include_ratio=True,
    include_decomposition=True,
    use_rw_systematics=True,
    use_detvar_systematics=True,
)

make_histogram_plot(
    pred_and_data_sel_df=pred_and_data_sel_1gNp1mu_df,
    detvar_df=detvar_sel_1gNp1mu_df,
    bins=np.linspace(-0.5, 10.5, 12),
    title="1gNp1mu Selection",
    selname="1gNp1mu_sel",
    **proton_count_plot_opts,
)

make_histogram_plot(
    pred_and_data_sel_df=pred_and_data_sel_1g0p1mu_df,
    detvar_df=detvar_sel_1g0p1mu_df,
    bins=np.linspace(-0.5, 10.5, 12),
    title="1g0p1mu Selection",
    selname="1g0p1mu_sel",
    **proton_count_plot_opts,
)
