In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['hatch.linewidth'] = 0.2
import numpy as np
import pandas as pd
import pickle
from tqdm.notebook import tqdm
import polars as pl
import xgboost as xgb
print("xgboost version:", xgb.__version__)

import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.signal_categories import topological_category_labels, topological_category_colors, topological_category_labels_latex, topological_category_hatches, topological_categories_dic
from src.signal_categories import filetype_category_labels, filetype_category_colors, filetype_category_hatches
from src.signal_categories import del1g_detailed_category_labels, del1g_detailed_category_colors, del1g_detailed_category_labels_latex, del1g_detailed_category_hatches, del1g_detailed_categories_dic
from src.signal_categories import del1g_simple_category_labels, del1g_simple_category_colors, del1g_simple_category_labels_latex, del1g_simple_category_hatches, del1g_simple_categories_dic
from src.signal_categories import train_category_labels, train_category_labels_latex

from src.ntuple_variables.pandora_variables import pandora_scalar_second_half_training_vars

from src.file_locations import intermediate_files_location

from src.plot_helpers import make_histogram_plot

from src.ntuple_variables.variables import combined_training_vars

from src.df_helpers import lazy_height


# File Loading

In [None]:
# --- Notebook configuration -------------------------------------------------
# Label for this training configuration together with the variable list it
# refers to (the full combined set imported from src.ntuple_variables.variables).
# NOTE(review): `training` is not read by any cell visible here; presumably it
# tags outputs further down the notebook - confirm.
training_vars = combined_training_vars
training = "all_vars"

# Reco category labels (plain and LaTeX forms) are taken from the training
# category definitions in src.signal_categories.
reco_category_labels_latex = train_category_labels_latex
reco_categories = train_category_labels


In [None]:
# Open the merged parquet file as a polars LazyFrame via scan_parquet; the
# event count is obtained through the project's lazy_height helper.
all_df_path = f"{intermediate_files_location}/all_df.parquet"

print("loading all_df.parquet...")
all_df = pl.scan_parquet(all_df_path)

num_all_df_events = lazy_height(all_df)
print(f"num events in all_df: {num_all_df_events}")


In [None]:
# Filetypes removed from the prediction/data sample used in this notebook.
excluded_overlay_filetypes = ["isotropic_one_gamma_overlay", "delete_one_gamma_overlay"]
is_excluded_overlay = pl.col("filetype").is_in(excluded_overlay_filetypes)

# Everything except the two overlay filetypes listed above.
full_pred_data = all_df.filter(~is_excluded_overlay)

# Preselection: keep only rows with a strictly positive wc_kine_reco_Enu.
passes_preselection = pl.col("wc_kine_reco_Enu") > 0
presel_merged_df_allvars = full_pred_data.filter(passes_preselection)


# Preselection Histogram

In [None]:
# Build `load_vars`: the list of columns to materialise from the lazy
# preselected frame. Start from every column in its schema.
all_presel_columns = list(presel_merged_df_allvars.collect_schema().names())

# Drop every column listed in combined_training_vars - there are a great many
# of them and most are not needed here. A set is used so each membership test
# is O(1) instead of a linear scan over the full training-variable sequence.
training_var_names = set(combined_training_vars)
load_vars = [col for col in all_presel_columns if col not in training_var_names]

# TEMPORARY, since we didn't exclude all the pandora postprocessing variables.
# (The earlier form `not (["pandora_max" in col])` wrapped the test in a list,
# which is always truthy, so it would have removed every column if enabled.)
#load_vars = [col for col in load_vars if "pandora_max" not in col]

# Variables that must be loaded even if the filter above removed them.
extra_vars = [
    "wc_kine_reco_Enu",
    "wc_reco_num_protons_35_MeV",
    "wc_reco_backwards_projected_dist",
    "wc_reco_distance_to_boundary",
    "wc_reco_shower_theta",
    "wc_reco_shower_phi",
    "wc_kine_pio_mass",
    "lantern_diphoton_mass",
]

# Append each extra variable that is not already selected, preserving the
# existing column order and never introducing a duplicate.
selected = set(load_vars)
for var in extra_vars:
    if var not in selected:
        load_vars.append(var)
        selected.add(var)

print(load_vars)

In [None]:
# Materialise only the columns chosen in `load_vars` from the lazy frame.
presel_lazy_selected = presel_merged_df_allvars.select(load_vars)
presel_merged_df = presel_lazy_selected.collect()

# 20 equal-width bins spanning 0 to 2000 (21 edges).
presel_enu_bin_edges = np.linspace(0, 2000, 21)

# Preselection-level histogram of wc_kine_reco_Enu.
make_histogram_plot(
    pred_and_data_sel_df=presel_merged_df,
    bins=presel_enu_bin_edges,
    var="wc_kine_reco_Enu",
    display_var=r"WC Reconstructed $E_\nu$ (MeV)",
    title="Preselection",
    selname="generic_presel",
)


In [None]:
# Checking the splitting of the 4a and 4nota5 open data sets.
# WARNING: plotting one run period at a time makes the plot labels wrong!

bins = np.linspace(0, 2000, 21)
#bins = np.linspace(0, 3000, 2)


def _collect_presel_for_run_period(run_period):
    """Collect the preselected rows whose normalizing_run_period equals `run_period`.

    All columns of `presel_merged_df_allvars` are kept (no column selection).
    """
    return presel_merged_df_allvars.filter(
        pl.col("normalizing_run_period") == run_period
    ).collect()


def _plot_presel_reco_enu(sel_df, bin_edges):
    """Draw the preselection wc_kine_reco_Enu histogram for `sel_df`.

    Returns whatever make_histogram_plot returns, so that calling this as the
    last expression of a cell displays the same thing as a direct call would.
    """
    return make_histogram_plot(
        pred_and_data_sel_df=sel_df,
        bins=bin_edges,
        var="wc_kine_reco_Enu",
        display_var=r"WC Reconstructed $E_\nu$ (MeV)",
        title="Preselection",
        selname="generic_presel",
    )


# Same order as before: collect then plot 4a, then collect and plot 4nota5.
presel_merged_df_run4a = _collect_presel_for_run_period("4a")
_plot_presel_reco_enu(presel_merged_df_run4a, bins)

presel_merged_df_run4nota5 = _collect_presel_for_run_period("4nota5")
_plot_presel_reco_enu(presel_merged_df_run4nota5, bins)