In [None]:
import numpy as np
import matplotlib.pyplot as plt
import polars as pl
import pickle
import uproot

import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.file_locations import intermediate_files_location, data_files_location
from src.plotting_3d import plot_event
from src.signal_categories import train_category_labels, train_category_labels_latex


In [None]:
# Name of the training run; it selects ../training_outputs/<training>/predictions.parquet
# when the predictions are loaded.
training = "all_vars"

# Reconstructed-category names and their LaTeX labels, imported from src.signal_categories.
# The names are later prefixed with "prob_" to find the per-category probability columns.
reco_categories = train_category_labels
reco_category_labels_latex = train_category_labels_latex


In [None]:
# Load the per-event table and the model predictions, then attach the predictions to
# every event with a left join on (filetype, run, subrun, event).
print("loading all_df.parquet...")
all_df = pl.read_parquet(f"{intermediate_files_location}/all_df.parquet")
print(f"{all_df.shape=}")

# this only includes predictions for events passing the preselection used during training
print("loading predictions.parquet...")
pred_df = pl.read_parquet(f"../training_outputs/{training}/predictions.parquet")
print(f"{pred_df.shape=}")

# The predictions are read from a parquet file, so the message names that file
# (it used to say "predictions.pkl").
print("merging all_df and predictions.parquet...")
merged_df_no_data_drop = all_df.join(
    pred_df,
    on=["filetype", "run", "subrun", "event"],
    how="left",
)
# Release the two input frames now that they have been joined.
del all_df
del pred_df

# Rows with wc_kine_reco_Enu < 0 are forced to used_for_training=False and
# used_for_testing=True; every other row keeps the flags it already has.
negative_reco_energy = pl.col("wc_kine_reco_Enu") < 0
merged_df_no_data_drop = merged_df_no_data_drop.with_columns([
    pl.when(negative_reco_energy)
      .then(False)
      .otherwise(pl.col("used_for_training"))
      .alias("used_for_training"),
    pl.when(negative_reco_energy)
      .then(True)
      .otherwise(pl.col("used_for_testing"))
      .alias("used_for_testing"),
])



In [None]:
# Split the merged table: full_data holds the rows with filetype == "data", while
# full_pred holds everything except data and the two one-gamma overlay samples.
excluded_from_pred = ["data", "isotropic_one_gamma_overlay", "delete_one_gamma_overlay"]
full_pred = merged_df_no_data_drop.filter(~pl.col("filetype").is_in(excluded_from_pred))
full_data = merged_df_no_data_drop.filter(pl.col("filetype") == "data")

# Missing probabilities (rows without a prediction after the left join) become -1.
prob_categories = [f"prob_{cat}" for cat in reco_categories]
fill_missing_probs = [pl.col(name).fill_null(-1) for name in prob_categories]
full_pred = full_pred.with_columns(fill_missing_probs)
full_data = full_data.with_columns(fill_missing_probs)

# Separate rows with positive and negative reconstructed neutrino energy.
reco_energy = pl.col("wc_kine_reco_Enu")
positive_energy_pred = full_pred.filter(reco_energy > 0)
negative_energy_pred = full_pred.filter(reco_energy < 0)
del full_pred

# Count the train / test rows among the positive-energy events.
num_train_events = positive_energy_pred.filter(pl.col("used_for_training") == True).height
num_test_events = positive_energy_pred.filter(pl.col("used_for_testing") == True).height
print(f"num_train_events: {num_train_events}, num_test_events: {num_test_events}")
frac_test = num_test_events / (num_train_events + num_test_events)
print(f"weighting up preselected prediction events by the fraction of test/train events: {frac_test:.3f}")

# Test rows have their weight divided by frac_test; all other rows keep their weight.
original_weight = pl.col("wc_net_weight")
reweighted = (
    pl.when(pl.col("used_for_testing"))
    .then(original_weight / frac_test)
    .otherwise(original_weight)
    .alias("wc_net_weight")
)

# Reassemble the prediction table (reweighted positive-energy rows first).
full_pred = pl.concat([positive_energy_pred.with_columns(reweighted), negative_energy_pred])
del positive_energy_pred, negative_energy_pred

test_pred = full_pred.filter(pl.col("used_for_testing") == True)


In [None]:
# Position (within prob_categories) of the largest per-category probability of each
# row, stored as "reco_category_argmax_index" on both the test and the data frame.
argmax_category = (
    pl.concat_list(prob_categories)
    .list.arg_max()
    .alias("reco_category_argmax_index")
)

test_pred = test_pred.with_columns(argmax_category)
full_data = full_data.with_columns(argmax_category)

In [None]:
# test_pred is not referenced by any later cell in this notebook, so it is dropped here.
del test_pred # not using this for now

In [None]:
# Load the per-category selection queries. They are looked up by the position of the
# category in reco_categories and passed to DataFrame.filter when picking example events.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so this file
# must come from a trusted source (presumably written by this project's own pipeline — confirm).
with open(f"{intermediate_files_location}/reco_category_queries.pkl", "rb") as f:
    reco_category_queries = pickle.load(f)


In [None]:
# Maximum number of example data events kept for each reconstructed category.
num_plots_per_cat = 10

# For every category, keep the first data events passing its query, tag them with the
# category name, and reduce them to the columns that identify an event.
event_id_columns = ["filename", "filetype", "run", "subrun", "event", "reco_category"]

all_reco_cat_data_dfs = []
for reco_i, category_name in enumerate(reco_categories):
    passing_events = full_data.filter(reco_category_queries[reco_i])
    all_reco_cat_data_dfs.append(
        passing_events
        .head(min(num_plots_per_cat, passing_events.height))
        .with_columns(pl.lit(category_name).alias("reco_category"))
        .select(event_id_columns)
    )

all_reco_cat_data_df = pl.concat(all_reco_cat_data_dfs)

all_reco_cat_data_df


In [None]:
# Branches of the T_spacepoints tree that are read for every event.
spacepoint_vars = [
    "Trecchargeblob_spacepoints_x",
    "Trecchargeblob_spacepoints_y",
    "Trecchargeblob_spacepoints_z",
    "Trecchargeblob_spacepoints_q",
]

# The file name is used twice: to open the ROOT file and as the "filename" join key.
# Keeping it in one variable guarantees the two uses cannot drift apart.
spacepoints_filename = "MCC9.10_Run4b_v10_04_07_11_BNB_beam_on_surprise_reco2_hist.root"

# Read the space points and the run/subrun/event identifiers as numpy arrays.
# NOTE(review): assumes T_spacepoints and T_eval list their entries in the same
# order, since the columns are combined row by row — confirm.
with uproot.open(f"{data_files_location}/{spacepoints_filename}") as root_file:
    dic = {}
    dic.update(root_file["wcpselection"]["T_spacepoints"].arrays(spacepoint_vars, library="np"))
    dic.update(root_file["wcpselection"]["T_eval"].arrays(["run", "subrun", "event"], library="np"))

# Convert numpy arrays to lists so Polars recognizes them as List types
list_dic = {
    var: [arr.tolist() if isinstance(arr, np.ndarray) else arr for arr in values]
    for var, values in dic.items()
}

spacepoints_df = pl.DataFrame(list_dic)

# Tag every row with the source file so it can be joined on (filename, run, subrun, event).
spacepoints_df = spacepoints_df.with_columns(pl.lit(spacepoints_filename).alias("filename"))

# filter out events with no spacepoints
spacepoints_df = spacepoints_df.filter(pl.col(spacepoint_vars[0]).list.len() > 0)


In [None]:
# Attach the space points to the selected events. The left join keeps every selected
# event; rows with no matching entry in spacepoints_df get null space-point columns.
merged_df = all_reco_cat_data_df.join(spacepoints_df, on=["filename", "run", "subrun", "event"], how="left")

# Display the joined table.
merged_df


In [None]:
# Draw a z-x charge display for every selected event and print a banner each time the
# category differs from the previous row's category.
prev_cat = ""
for row in merged_df.iter_rows(named=True):
    run = row["run"]
    subrun = row["subrun"]
    event = row["event"]
    reco_category = row["reco_category"]

    if reco_category != prev_cat:
        print(f"######################### {reco_category} #########################")
        prev_cat = reco_category

    # The left join leaves the space-point columns null for events without a match in
    # spacepoints_df; there is nothing to draw for those, so report and move on.
    if row["Trecchargeblob_spacepoints_z"] is None:
        print(f"no spacepoints available for RSE {run} {subrun} {event}, skipping plot")
        continue

    fig, ax = plt.subplots(figsize=(7, 5))
    # Points are coloured by charge on a fixed 0..10000 scale so events are comparable.
    ax.scatter(
        row["Trecchargeblob_spacepoints_z"],
        row["Trecchargeblob_spacepoints_x"],
        c=row["Trecchargeblob_spacepoints_q"],
        cmap="jet",
        s=2,
        vmin=0,
        vmax=10_000,
    )
    ax.set_xlabel("z")
    ax.set_ylabel("x")
    ax.set_title(f"{reco_category} Candidate From Open Data\nRSE {run} {subrun} {event}")
    plt.show()
    # Release the figure explicitly so the loop does not accumulate open figures.
    plt.close(fig)

In [None]:
# 1/0 raises ZeroDivisionError before print() is ever called, so this cell always fails.
# NOTE(review): presumably a deliberate stop so that "Run All" does not continue into
# the cell below — confirm, and consider an explicit `raise` with a message instead.
print(1/0)

In [None]:
# event with a proton blip upstream of one shower
event_index = 7

# `all_df` is deleted in the loading cell, so referencing it here raised NameError on a
# fresh kernel. The merged frame keeps every all_df column through the left join, so
# the row is read from there instead, and only once rather than once per column.
# NOTE(review): confirm that merged_df_no_data_drop really contains the columns used
# below and that row `event_index` is still the intended event (the join may not
# preserve row order or row count).
event_row = merged_df_no_data_drop.row(event_index, named=True)


def _vertex(prefix):
    """Return the 3-vector stored in the <prefix>X, <prefix>Y and <prefix>Z columns."""
    return np.array([event_row[f"{prefix}{axis}"] for axis in "XYZ"])


def _cloud(prefix):
    """Return an (N, 3) array built from the <prefix>_x, <prefix>_y and <prefix>_z lists."""
    return np.vstack([event_row[f"{prefix}_{axis}"] for axis in "xyz"]).T


# each entry is name : (points, color, cmap, size, visible)
points_dic = {
    "WC_reco_nu_vtx": (_vertex("wc_reco_nuvtx"), "green", None, 10, "legendonly"),
    "true_nu_vtx": (_vertex("wc_truth_vtx"), "red", None, 10, "legendonly"),
    "true_corr_nu_vtx": (_vertex("wc_truth_corr_nuvtx"), "yellow", None, 10, "legendonly"),
    "Trec_spacepoints": (_cloud("wc_Trec_spacepoints"), "red", None, 1, "legendonly"),
    "Treccharge_spacepoints": (_cloud("wc_Treccharge_spacepoints"), "blue", None, 1, "legendonly"),
    # The charge-blob points are coloured by their charge using the "jet" colormap.
    "Trecchargeblob_spacepoints": (
        _cloud("wc_Trecchargeblob_spacepoints"),
        np.asarray(event_row["wc_Trecchargeblob_spacepoints_q"]),
        "jet",
        1,
        True,
    ),
    "electron blips": (_cloud("electron_blip"), "orange", None, 5, True),
    "proton blips": (_cloud("proton_blip"), "blue", None, 5, True),
    "other blips": (_cloud("other_blip"), "purple", None, 5, True),
}

plot_event(event_index, points_dic)
