In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from tqdm.notebook import tqdm

import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.signal_categories import topological_category_labels, topological_category_colors
from src.signal_categories import physics_category_labels, physics_category_colors
from src.signal_categories import filetype_category_labels, filetype_category_colors

from src.file_locations import intermediate_files_location


# File Loading

In [None]:
# Name of the training run to evaluate. The loading cell reads
# ../training_outputs/<training>/predictions.pkl, so this must match a
# directory name there. Swap which line is uncommented to evaluate another run.
training = "first_combined_training"
#training = "first_wc_training"
#training = "first_lantern_training"

In [None]:
# Load the full event table and the network predictions, merge them, rescale
# the test-event weights, and build the successively tighter selections used
# by the rest of the notebook.
#
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles produced by this project's own pipeline.
print("loading all_df.pkl...")
with open(f"{intermediate_files_location}/all_df.pkl", "rb") as f:
    all_df = pickle.load(f)
print(f"{all_df.shape=}")

# this only includes predictions for events passing the preselection used during training
print("loading predictions.pkl...")
with open(f"../training_outputs/{training}/predictions.pkl", "rb") as f:
    pred_df = pickle.load(f)
print(f"{pred_df.shape=}")

# Left merge: every row of all_df is kept; rows with no matching prediction
# get NaN in the columns that come from pred_df.
print("merging all_df and predictions.pkl...")
merged_df = pd.merge(all_df, pred_df, on=["filetype", "run", "subrun", "event"], how="left")

sig_categories = topological_category_labels

# One probability column per signal category, in the same order as sig_categories.
prob_categories = ["prob_" + cat for cat in sig_categories]

# Use -1 as the "no prediction" sentinel for all probability columns at once.
merged_df[prob_categories] = merged_df[prob_categories].fillna(-1)

num_train_events = len(merged_df.query("used_for_training == True"))
num_test_events = len(merged_df.query("used_for_testing == True"))
frac_test = num_test_events / (num_train_events + num_test_events)
print(f"weighting up by the fraction of test events: {frac_test:.3f}")

# Divide the weight of every test event by frac_test (vectorized; no per-row
# Python loop). Presumably this makes the test subset alone stand in for the
# full train+test sample -- TODO confirm. Rows that are not flagged as test
# events are dropped just below, so only the test-row scaling is observable.
used_for_testing = merged_df["used_for_testing"].eq(True).to_numpy()
wc_net_weights = merged_df["wc_net_weight"].to_numpy()
modified_net_weights = np.where(used_for_testing, wc_net_weights / frac_test, wc_net_weights)
merged_df["wc_net_weight"] = modified_net_weights

# Successively tighter selections: test events -> positive reconstructed
# neutrino energy -> at least one counted shower.
merged_df = merged_df.query("used_for_testing == True")
generic_merged_df = merged_df.query("wc_kine_reco_Enu > 0")
presel_merged_df = generic_merged_df.query("wc_shw_sp_n_20mev_showers > 0")

print(f"{merged_df.shape=}")
print(f"{generic_merged_df.shape=}")
print(f"{presel_merged_df.shape=}")


In [None]:
# For each preselected event, record the index of the highest-probability
# category. Column order follows prob_categories, so index k refers to the
# k-th entry of that list.
probs_2d_arr = presel_merged_df[prob_categories].to_numpy()
reco_categories_argmax = np.argmax(probs_2d_arr, axis=1)

# presel_merged_df comes from chained .query() calls; setting a column on it
# directly can raise SettingWithCopyWarning. .assign returns a new frame with
# the column added, and re-running this cell gives the same result.
presel_merged_df = presel_merged_df.assign(reco_category_argmax_index=reco_categories_argmax)


# Preselection Efficiencies

In [None]:
# Weighted event totals per truth topological category at each selection stage,
# and the efficiency of each stage relative to all events in merged_df.
breakdown_queries = [
    f"topological_signal_category == '{label}'" for label in topological_category_labels
]

total_num_truth_by_category = [np.sum(merged_df.query(query)["wc_net_weight"]) for query in breakdown_queries]
total_num_generic_truth_by_category = [np.sum(generic_merged_df.query(query)["wc_net_weight"]) for query in breakdown_queries]
total_num_presel_truth_by_category = [np.sum(presel_merged_df.query(query)["wc_net_weight"]) for query in breakdown_queries]

# Each stage is reported with its own numerator. Previously the generic
# selection section printed the preselection totals, so both sections showed
# identical numbers and the generic totals were never used.
efficiency_stages = (
    ("WC Generic Selection Topological Efficiencies:", total_num_generic_truth_by_category),
    ("Preselection Topological Efficiencies:", total_num_presel_truth_by_category),
)
for stage_title, passing_by_category in efficiency_stages:
    print(stage_title)
    for label, num_passing, num_total in zip(
        topological_category_labels, passing_by_category, total_num_truth_by_category
    ):
        print(f"{label}: {num_passing} / {num_total} = {num_passing / num_total:.3f}")


# Argmax Efficiencies

In [None]:
# Weighted counts of preselected events: rows are truth categories, columns
# are the argmax-predicted categories.
n_categories = len(topological_category_labels)
argmax_sel_matrix = np.zeros((n_categories, n_categories))

truth_values = presel_merged_df["topological_signal_category"]
reco_indices = presel_merged_df["reco_category_argmax_index"]
event_weights = presel_merged_df["wc_net_weight"]

for truth_idx, truth_label in enumerate(topological_category_labels):
    is_truth = truth_values == truth_label
    for reco_idx in range(n_categories):
        in_cell = is_truth & (reco_indices == reco_idx)
        argmax_sel_matrix[truth_idx, reco_idx] = event_weights[in_cell].sum()

# Heatmap with the weighted count written in every cell.
fig, ax = plt.subplots(figsize=(10, 10))
heatmap = ax.imshow(argmax_sel_matrix, cmap="Blues")
for (row, col), cell_value in np.ndenumerate(argmax_sel_matrix):
    ax.text(col, row, f'{cell_value:.1f}', ha='center', va='center')
fig.colorbar(heatmap, ax=ax, label="Number of Events")

tick_positions = range(n_categories)
ax.set_xticks(tick_positions)
ax.set_xticklabels(topological_category_labels, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(topological_category_labels)
ax.set_xlabel("Reconstructed Category")
ax.set_ylabel("Truth Category")
ax.set_title("Argmax Selection Efficiency Matrix")
plt.show()


# 1g Efficiencies By Cut Value

In [None]:
# TODO: for each 1g category, and for the combined 1g categories:
#   - plot efficiency vs. purity as a function of the cut value
#   - compare with the NC Delta selection curve
#   - compare with Erin's inclusive 1g curve and point
#   - maybe ask Mark about including the gLEE point? It might be totally different with nugraph...

