In [None]:
import uproot
print("uproot version: ", uproot.__version__)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import polars as pl
from tqdm.notebook import tqdm
import pickle

import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.file_locations import data_files_location, intermediate_files_location
from src.ntuple_variables.variables import pandora_training_vars, glee_training_vars, lantern_training_vars


In [None]:
# Read presel_weights_df.parquet into a polars DataFrame and display it.
# NOTE(review): the directory is hardcoded here although `intermediate_files_location`
# is imported at the top of the notebook and not used in this cell. Presumably it
# points at this same directory; confirm, then build the path from it.
rw_sys_df = pl.read_parquet("/nevis/riverside/data/leehagaman/ngem/intermediate_files/presel_weights_df.parquet")
rw_sys_df

In [None]:
# Read the detector-variation frame, then preview a handful of columns for the
# rows whose lantern_second_max_nonprimary_shower_RecoE is strictly positive.
detvar_df = pl.read_parquet("/nevis/riverside/data/leehagaman/ngem/intermediate_files/detvar_presel_df_train_vars.parquet")

detvar_preview_cols = [
    "filename",
    "filetype",
    "vartype",
    "run",
    "subrun",
    "event",
    "wc_kine_reco_Enu",
    "lantern_second_max_nonprimary_shower_RecoE",
]
positive_second_shower = pl.col("lantern_second_max_nonprimary_shower_RecoE") > 0
detvar_df.select(detvar_preview_cols).filter(positive_second_shower)

In [None]:
# Read all_df.parquet into a polars DataFrame; the later cells in this notebook
# index into `all_df`. The trailing bare name displays the frame.
# NOTE(review): hardcoded absolute path. `intermediate_files_location` is imported at
# the top of the notebook; presumably it is the same directory, confirm before swapping.
all_df = pl.read_parquet("/nevis/riverside/data/leehagaman/ngem/intermediate_files/all_df.parquet")
all_df

In [None]:
# Every column whose name mentions the second-max non-primary shower, restricted
# to rows where that shower's RecoE is strictly positive.
second_shower_cols = [name for name in all_df.columns if "lantern_second_max_nonprimary_shower" in name]
all_df.select(second_shower_cols).filter(pl.col("lantern_second_max_nonprimary_shower_RecoE") > 0)



In [None]:
# Deliberate stop: this cell always raises, so "Run All" halts here and the cells
# below only execute when run by hand. The original spelled this as print(1/0),
# which looks like an arithmetic bug; an explicit raise with a message makes the
# intent visible in the traceback.
# NOTE(review): presumably the halt is intentional. If it was leftover debugging,
# delete this cell instead.
raise RuntimeError("Stopping here on purpose: run the cells below manually.")

In [None]:
# All "trackstub" columns, for rows with a strictly positive
# glee_energy_ranked_trackstub_candidate_energy.
load_cols = [name for name in all_df.columns if "trackstub" in name]
positive_candidate_energy = pl.col("glee_energy_ranked_trackstub_candidate_energy") > 0
all_df.select(load_cols).filter(positive_candidate_energy)


In [None]:
# All "isolation" columns, for rows with a strictly positive
# glee_dist_ranked_isolation_min_dist_trk_shr.
load_cols = [name for name in all_df.columns if "isolation" in name]
positive_min_dist = pl.col("glee_dist_ranked_isolation_min_dist_trk_shr") > 0
all_df.select(load_cols).filter(positive_min_dist)


In [None]:
# Preview the wc_true_* proton and shower columns for rows whose
# wc_true_max_prim_proton_energy is strictly positive.
load_cols = [
    "wc_true_max_prim_proton_energy",
    "wc_true_max_prim_proton_costheta",
    "wc_true_max_prim_proton_phi",
    "wc_true_leading_shower_energy",
    "wc_true_leading_shower_costheta",
    "wc_true_leading_shower_phi",
    "wc_true_subleading_shower_energy",
]
positive_true_proton_energy = pl.col("wc_true_max_prim_proton_energy") > 0
all_df.select(load_cols).filter(positive_true_proton_energy)


In [None]:
# Preview the wc_reco_* proton and other-track columns for rows whose
# wc_reco_max_prim_proton_energy is strictly positive.
load_cols = [
    "wc_reco_max_prim_proton_energy",
    "wc_reco_max_prim_proton_costheta",
    "wc_reco_max_prim_proton_phi",
    "wc_reco_max_prim_other_track_energy",
    "wc_reco_max_prim_other_track_costheta",
    "wc_reco_max_prim_other_track_phi",
]
positive_reco_proton_energy = pl.col("wc_reco_max_prim_proton_energy") > 0
all_df.select(load_cols).filter(positive_reco_proton_energy)


In [None]:
# Start of the dtype-compression experiment.
# `all_df` is a polars DataFrame (it comes from pl.read_parquet), and polars frames
# have neither .copy() nor .info(); the original `all_df.copy()` raised
# AttributeError. Converting with to_pandas() yields a separate pandas DataFrame,
# which is what .info(), .select_dtypes() and .astype() in this and the following
# compression cells expect. `all_df` itself is left untouched.
# Note: polars' to_pandas() relies on pyarrow being installed.
compressed_all_df = all_df.to_pandas()

# Baseline memory footprint; "deep" also counts the payload of object columns.
compressed_all_df.info(memory_usage="deep")


In [None]:
# Re-cast every float64 column to float32 in a single astype call,
# then report the memory footprint again.
float64_cols = compressed_all_df.select_dtypes("float64").columns
compressed_all_df = compressed_all_df.astype({name: "float32" for name in float64_cols})

compressed_all_df.info(memory_usage="deep")

In [None]:
# Shrink each int64 column to the smallest integer dtype that holds its values
# (pd.to_numeric with downcast="integer"), then report the memory footprint again.
int64_frame = compressed_all_df.select_dtypes("int64")
for name, values in int64_frame.items():
    compressed_all_df[name] = pd.to_numeric(values, downcast="integer")

compressed_all_df.info(memory_usage="deep")

In [None]:
# For each object-dtype column, print its cardinality and convert it to a pandas
# categorical when fewer than half of the rows hold distinct values.
n_rows = len(compressed_all_df)
for name in compressed_all_df.select_dtypes("object"):
    column = compressed_all_df[name]
    n_unique = column.nunique()
    unique_ratio = n_unique / n_rows
    print(name, column.dtype, n_unique, "/", n_rows, unique_ratio)
    if unique_ratio < 0.5:
        compressed_all_df[name] = column.astype("category")

compressed_all_df.info(memory_usage="deep")

In [None]:
# Names of the columns in `all_df` that are neither numeric nor boolean.
# The decision is made from the frame's schema rather than from the Python type
# of the first value: polars hands back native int/float/bool scalars, so the
# original comparison against "<class 'numpy.int64'>" etc. never matched and
# flagged every numeric column. It also failed with an IndexError on an empty frame.
# is_numeric() covers all integer and float widths (and Decimal).
non_numerical_cols = [
    name
    for name, dtype in all_df.schema.items()
    if not (dtype.is_numeric() or dtype == pl.Boolean)
]

non_numerical_cols

In [None]:
# Print each name in pandora_training_vars followed by the first five values of
# that column and a blank separator line.
for var_name in pandora_training_vars:
    print(var_name)
    print(all_df.get_column(var_name).head(5))
    print()


In [None]:
# Print each name in glee_training_vars followed by the first five values of
# that column and a blank separator line.
for var_name in glee_training_vars:
    print(var_name)
    print(all_df.get_column(var_name).head(5))
    print()


In [None]:
# Compare per-entry proton counts at each threshold encoded in the column names
# (5, 10, ..., 50 MeV), once for the wc_reco_* columns and once for the lantern_*
# columns. One figure per column family; one step histogram per threshold.
# The twenty hand-written plt.hist calls are generated from the two naming
# patterns, so adding or removing a threshold is a one-line change.

# 11 unit-width bins with edges at -0.5, 0.5, ..., 10.5, i.e. centred on 0..10.
bins = np.linspace(-0.5, 10.5, 12)
thresholds_mev = range(5, 55, 5)

# (figure title, column-name pattern); "{}" is replaced by the threshold value.
proton_count_families = [
    ("wc_reco_num_protons_<threshold>_MeV", "wc_reco_num_protons_{}_MeV"),
    ("lantern_prim_track_proton_num_<threshold>MeV", "lantern_prim_track_proton_num_{}MeV"),
]

for title, column_pattern in proton_count_families:
    fig, ax = plt.subplots()
    for threshold in thresholds_mev:
        ax.hist(
            all_df[column_pattern.format(threshold)],
            bins=bins,
            label=f"{threshold} MeV",
            histtype="step",
        )
    ax.set(title=title, xlabel="Number of protons", ylabel="Entries")
    ax.legend()
    # Log scale keeps the sparsely populated high-count bins visible.
    ax.set_yscale("log")
    plt.show()


In [None]:
# Largest value in the lantern_prim_track_proton_num_5MeV column.
# Uses the Series' own max(): for non-ndarray inputs np.max() delegates to the
# object's .max(axis=..., out=...) method, and a polars Series' max() accepts no
# such keyword arguments, so the original np.max(...) call raised a TypeError.
all_df["lantern_prim_track_proton_num_5MeV"].max()

In [None]:
# Sub-frame holding only the columns whose name contains "lantern".
lantern_cols = [name for name in all_df.columns if "lantern" in name]
lantern_df = all_df[lantern_cols]

# Print "<column>: <value>" for every column, using the first row of lantern_df.
for column in lantern_df.get_columns():
    print(f"{column.name}: {column[0]}")

lantern_df

In [None]:
# Display the full list of column names in all_df.
all_df.columns

In [None]:
# Display glee_max_ssv_score as a numpy array with NaN entries replaced by the
# sentinel -999. The result is not stored anywhere.
np.nan_to_num(all_df["glee_max_ssv_score"], nan=-999)

In [None]:
# Histogram glee_max_ssv_score in 100 bins after mapping NaN to the sentinel -999,
# so the NaN entries show up in the lowest bin.
ssv_scores_filled = np.nan_to_num(all_df["glee_max_ssv_score"], nan=-999)
plt.hist(ssv_scores_filled, bins=100)

In [None]:
# Display the sub-frame of all columns whose name contains "glee".
glee_cols = [name for name in all_df.columns if "glee" in name]
all_df[glee_cols]

In [None]:
# Print "<column>: <value>" from the first row of all_df for every column whose
# name contains "glee".
glee_names = (name for name in all_df.columns if "glee" in name)
for name in glee_names:
    print(f"{name}: {all_df[name][0]}")

In [None]:
# Display the sub-frame of all columns whose name contains "blip".
blip_cols = [name for name in all_df.columns if "blip" in name]
all_df[blip_cols]

In [None]:
# Display the glee_sss_candidate_veto_score column of all_df.
all_df["glee_sss_candidate_veto_score"]

In [None]:
# Overlay the five *_dist columns of all_df as step histograms on shared binning.
# The original cell began with a bare column selection of these five columns;
# because it was not the last expression in the cell its value was never shown,
# so it is dropped here. The hist calls still fail if a column is missing.

# Column name -> legend label, in plotting order.
dist_labels = {
    'wc_pandora_dist': 'WC Pandora',
    'wc_pandora_sce_dist': 'WC Pandora SCE',
    'wc_lantern_dist': 'WC Lantern',
    'lantern_pandora_dist': 'Lantern Pandora',
    'lantern_pandora_sce_dist': 'Lantern Pandora SCE',
}

# 99 equal-width bins on [0, 10]; values outside this range are not drawn.
bins = np.linspace(0, 10, 100)

fig, ax = plt.subplots(figsize=(10, 5))
for column, label in dist_labels.items():
    ax.hist(all_df[column], histtype='step', bins=bins, label=label)
# NOTE(review): the units of the *_dist columns are not visible in this notebook;
# add them to the x label once confirmed.
ax.set(xlabel="Distance", ylabel="Entries")
ax.legend()
plt.show()

In [None]:
# Display the wc_reco_nuvtxX column of all_df.
all_df["wc_reco_nuvtxX"]

In [None]:
# Display the pelee_reco_nu_vtx_x column of all_df.
all_df["pelee_reco_nu_vtx_x"]

In [None]:
# Display the lantern_vtxX column of all_df.
all_df["lantern_vtxX"]