In [1]:
import xgboost as xgb
import matplotlib.pyplot as plt
import uproot3 as uproot
import pandas as pd
import numpy as np
from scipy.special import logit
import math
from tqdm import tqdm

In [2]:
f_1g0p_BNBext = uproot.open("gLEE_files/sbnfit_files/1g0p/sbnfit_1g0pMar2020_stage_4_BNBext.root")["singlephoton"]

f_1g0p_BNBext.keys()

[b'vertex_tree;1',
 b'pot_tree;1',
 b'run_subrun_tree;1',
 b'eventweight_tree;1',
 b'ncdelta_slice_tree;1',
 b'simple_tree;1',
 b'weight_formula;1',
 b';1',
 b'POT_value;1']

In [3]:
f_1g0p_BNBext["vertex_tree"].show()

run_number                 (no streamer)              asdtype('>i4')
subrun_number              (no streamer)              asdtype('>i4')
event_number               (no streamer)              asdtype('>i4')
pot_per_event              (no streamer)              asdtype('>f8')
pot_per_subrun             (no streamer)              asdtype('>f8')
number_of_events_in_subrun (no streamer)              asdtype('>f8')
genie_spline_weight        (no streamer)              asdtype('>f8')
genie_CV_tune_weight       (no streamer)              asdtype('>f8')
test_matched_hits          (no streamer)              asdtype('>i4')
reco_vertex_size           (no streamer)              asdtype('>i4')
reco_vertex_x              (no streamer)              asdtype('>f8')
reco_vertex_y              (no streamer)              asdtype('>f8')
reco_vertex_z              (no streamer)              asdtype('>f8')
reco_vertex_in_SCB         (no streamer)              asdtype('>i4')
reco_vertex_dist_to_SCB    (no str

In [4]:
# this is all of the gLEE variables in the vertex_tree that we want to look at
gLEE_vars = [
    "run_number", 
    "subrun_number", 
    "event_number",
    "reco_shower_energy_plane0",
    "reco_shower_energy_plane1",
    "reco_shower_energy_plane2",
    "reco_vertex_x",
    "reco_vertex_y",
    "reco_vertex_z",
]

# this is all of the wire-cell variables that we want to look at (including variables we construct in this code)
WC_vars = [
    "data_or_pred",
    "run", 
    "subrun", 
    "event", 
    "category",
    "WC_file",
    "match_isFC",
    "kine_reco_Enu", 
    "reco_showerKE", 
    "nc_delta_score",
    "WC_reco_num_protons",
    "WC_reco_num_other_tracks",
    "reco_showervtxX",
    "reco_showervtxY",
    "reco_showervtxZ",
    "reco_nuvtxX",
    "reco_nuvtxY",
    "reco_nuvtxZ",
    "truth_vtxX",
    "truth_vtxY",
    "truth_vtxZ",
]

WC_data_vars = [
    "data_or_pred",
    "run", 
    "subrun", 
    "event", 
    "category",
    "WC_file",
    "match_isFC",
    "kine_reco_Enu", 
    "reco_showerKE", 
    "nc_delta_score",
    "WC_reco_num_protons",
    "WC_reco_num_other_tracks",
    "reco_showervtxX",
    "reco_showervtxY",
    "reco_showervtxZ",
    "reco_nuvtxX",
    "reco_nuvtxY",
    "reco_nuvtxZ",
]

In [5]:
f_1g0p_BNBext = uproot.open("gLEE_files/sbnfit_files/1g0p/sbnfit_1g0pMar2020_stage_4_BNBext.root")["singlephoton"]
f_1g0p_BNBext_vertex = f_1g0p_BNBext["vertex_tree"].pandas.df(gLEE_vars, flatten=False)
f_1g0p_BNBext_simple = f_1g0p_BNBext["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False)
df_1g0p_BNBext = pd.concat([f_1g0p_BNBext_vertex, f_1g0p_BNBext_simple], axis=1, sort=False)
del f_1g0p_BNBext
del f_1g0p_BNBext_vertex
del f_1g0p_BNBext_simple
df_1g0p_BNBext["gLEE_file"] = "BNBext"
df_1g0p_BNBext["gLEE_selection"] = "1g0p"

f_1g0p_BNBOther = uproot.open("gLEE_files/sbnfit_files/1g0p/sbnfit_1g0pMar2020_stage_4_BNBOtherExtra.root")["singlephoton"]
f_1g0p_BNBOther_vertex = f_1g0p_BNBOther["vertex_tree"].pandas.df(gLEE_vars, flatten=False)
f_1g0p_BNBOther_simple = f_1g0p_BNBOther["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False)
df_1g0p_BNBOther = pd.concat([f_1g0p_BNBOther_vertex, f_1g0p_BNBOther_simple], axis=1, sort=False)
del f_1g0p_BNBOther
del f_1g0p_BNBOther_vertex
del f_1g0p_BNBOther_simple
df_1g0p_BNBOther["gLEE_file"] = "BNBOther"
df_1g0p_BNBOther["gLEE_selection"] = "1g0p"

f_1g0p_CC1Pi0 = uproot.open("gLEE_files/sbnfit_files/1g0p/sbnfit_1g0pMar2020_stage_4_CC1Pi0.root")["singlephoton"]
f_1g0p_CC1Pi0_vertex = f_1g0p_CC1Pi0["vertex_tree"].pandas.df(gLEE_vars, flatten=False)
f_1g0p_CC1Pi0_simple = f_1g0p_CC1Pi0["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False)
df_1g0p_CC1Pi0 = pd.concat([f_1g0p_CC1Pi0_vertex, f_1g0p_CC1Pi0_simple], axis=1, sort=False)
del f_1g0p_CC1Pi0
del f_1g0p_CC1Pi0_vertex
del f_1g0p_CC1Pi0_simple
df_1g0p_CC1Pi0["gLEE_file"] = "CC1Pi0"
df_1g0p_CC1Pi0["gLEE_selection"] = "1g0p"

f_1g0p_Dirt = uproot.open("gLEE_files/sbnfit_files/1g0p/sbnfit_1g0pMar2020_stage_4_Dirt.root")["singlephoton"]
f_1g0p_Dirt_vertex = f_1g0p_Dirt["vertex_tree"].pandas.df(gLEE_vars, flatten=False)
f_1g0p_Dirt_simple = f_1g0p_Dirt["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False)
df_1g0p_Dirt = pd.concat([f_1g0p_Dirt_vertex, f_1g0p_Dirt_simple], axis=1, sort=False)
del f_1g0p_Dirt
del f_1g0p_Dirt_vertex
del f_1g0p_Dirt_simple
df_1g0p_Dirt["gLEE_file"] = "Dirt"
df_1g0p_Dirt["gLEE_selection"] = "1g0p"

f_1g0p_NCDelta = uproot.open("gLEE_files/sbnfit_files/1g0p/sbnfit_1g0pMar2020_stage_4_NCDeltaRadOverlaySM.root")["singlephoton"]
f_1g0p_NCDelta_vertex = f_1g0p_NCDelta["vertex_tree"].pandas.df(gLEE_vars, flatten=False)
f_1g0p_NCDelta_simple = f_1g0p_NCDelta["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False)
df_1g0p_NCDelta = pd.concat([f_1g0p_NCDelta_vertex, f_1g0p_NCDelta_simple], axis=1, sort=False)
del f_1g0p_NCDelta
del f_1g0p_NCDelta_vertex
del f_1g0p_NCDelta_simple
df_1g0p_NCDelta["gLEE_file"] = "NCDelta"
df_1g0p_NCDelta["gLEE_selection"] = "1g0p"

f_1g0p_NCPi0Coh = uproot.open("gLEE_files/sbnfit_files/1g0p/sbnfit_1g0pMar2020_stage_4_NCPi0Coh.root")["singlephoton"]
f_1g0p_NCPi0Coh_vertex = f_1g0p_NCPi0Coh["vertex_tree"].pandas.df(gLEE_vars, flatten=False)
f_1g0p_NCPi0Coh_simple = f_1g0p_NCPi0Coh["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False)
df_1g0p_NCPi0Coh = pd.concat([f_1g0p_NCPi0Coh_vertex, f_1g0p_NCPi0Coh_simple], axis=1, sort=False)
del f_1g0p_NCPi0Coh
del f_1g0p_NCPi0Coh_vertex
del f_1g0p_NCPi0Coh_simple
df_1g0p_NCPi0Coh["gLEE_file"] = "NCPi0Coh"
df_1g0p_NCPi0Coh["gLEE_selection"] = "1g0p"

f_1g0p_NCPi0NotCoh = uproot.open("gLEE_files/sbnfit_files/1g0p/sbnfit_1g0pMar2020_stage_4_NCPi0NotCoh.root")["singlephoton"]
f_1g0p_NCPi0NotCoh_vertex = f_1g0p_NCPi0NotCoh["vertex_tree"].pandas.df(gLEE_vars, flatten=False)
f_1g0p_NCPi0NotCoh_simple = f_1g0p_NCPi0NotCoh["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False)
df_1g0p_NCPi0NotCoh = pd.concat([f_1g0p_NCPi0NotCoh_vertex, f_1g0p_NCPi0NotCoh_simple], axis=1, sort=False)
del f_1g0p_NCPi0NotCoh
del f_1g0p_NCPi0NotCoh_vertex
del f_1g0p_NCPi0NotCoh_simple
df_1g0p_NCPi0NotCoh["gLEE_file"] = "NCPi0NotCoh"
df_1g0p_NCPi0NotCoh["gLEE_selection"] = "1g0p"

f_1g0p_Nue = uproot.open("gLEE_files/sbnfit_files/1g0p/sbnfit_1g0pMar2020_stage_4_NueOverlays.root")["singlephoton"]
f_1g0p_Nue_vertex = f_1g0p_Nue["vertex_tree"].pandas.df(gLEE_vars, flatten=False)
f_1g0p_Nue_simple = f_1g0p_Nue["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False)
df_1g0p_Nue = pd.concat([f_1g0p_Nue_vertex, f_1g0p_Nue_simple], axis=1, sort=False)
del f_1g0p_Nue
del f_1g0p_Nue_vertex
del f_1g0p_Nue_simple
df_1g0p_Nue["gLEE_file"] = "Nue"
df_1g0p_Nue["gLEE_selection"] = "1g0p"

f_1g0p_OTPC = uproot.open("gLEE_files/sbnfit_files/1g0p/sbnfit_1g0pMar2020_stage_4_OTPCExtra.root")["singlephoton"]
f_1g0p_OTPC_vertex = f_1g0p_OTPC["vertex_tree"].pandas.df(gLEE_vars, flatten=False)
f_1g0p_OTPC_simple = f_1g0p_OTPC["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False)
df_1g0p_OTPC = pd.concat([f_1g0p_OTPC_vertex, f_1g0p_OTPC_simple], axis=1, sort=False)
del f_1g0p_OTPC
del f_1g0p_OTPC_vertex
del f_1g0p_OTPC_simple
df_1g0p_OTPC["gLEE_file"] = "OTPC"
df_1g0p_OTPC["gLEE_selection"] = "1g0p"


In [6]:
f_1g1p_BNBext = uproot.open("gLEE_files/sbnfit_files/sbnfit_1g1pMar2020_v4_stage_6_BNBext.root")["singlephoton"]
f_1g1p_BNBext_vertex = f_1g1p_BNBext["vertex_tree"].pandas.df(gLEE_vars, flatten=False)
f_1g1p_BNBext_simple = f_1g1p_BNBext["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False)
df_1g1p_BNBext = pd.concat([f_1g1p_BNBext_vertex, f_1g1p_BNBext_simple], axis=1, sort=False)
del f_1g1p_BNBext
del f_1g1p_BNBext_vertex
del f_1g1p_BNBext_simple
df_1g1p_BNBext["gLEE_file"] = "BNBext"
df_1g1p_BNBext["gLEE_selection"] = "1g1p"

f_1g1p_BNBOther = uproot.open("gLEE_files/sbnfit_files/sbnfit_1g1pMar2020_v4_stage_6_BNBOtherExtra.root")["singlephoton"]
f_1g1p_BNBOther_vertex = f_1g1p_BNBOther["vertex_tree"].pandas.df(gLEE_vars, flatten=False)
f_1g1p_BNBOther_simple = f_1g1p_BNBOther["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False)
df_1g1p_BNBOther = pd.concat([f_1g1p_BNBOther_vertex, f_1g1p_BNBOther_simple], axis=1, sort=False)
del f_1g1p_BNBOther
del f_1g1p_BNBOther_vertex
del f_1g1p_BNBOther_simple
df_1g1p_BNBOther["gLEE_file"] = "BNBOther"
df_1g1p_BNBOther["gLEE_selection"] = "1g1p"

f_1g1p_CC1Pi0 = uproot.open("gLEE_files/sbnfit_files/sbnfit_1g1pMar2020_v4_stage_6_CC1Pi0.root")["singlephoton"]
f_1g1p_CC1Pi0_vertex = f_1g1p_CC1Pi0["vertex_tree"].pandas.df(gLEE_vars, flatten=False)
f_1g1p_CC1Pi0_simple = f_1g1p_CC1Pi0["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False)
df_1g1p_CC1Pi0 = pd.concat([f_1g1p_CC1Pi0_vertex, f_1g1p_CC1Pi0_simple], axis=1, sort=False)
del f_1g1p_CC1Pi0
del f_1g1p_CC1Pi0_vertex
del f_1g1p_CC1Pi0_simple
df_1g1p_CC1Pi0["gLEE_file"] = "CC1Pi0"
df_1g1p_CC1Pi0["gLEE_selection"] = "1g1p"

f_1g1p_Dirt = uproot.open("gLEE_files/sbnfit_files/sbnfit_1g1pMar2020_v4_stage_6_Dirt.root")["singlephoton"]
f_1g1p_Dirt_vertex = f_1g1p_Dirt["vertex_tree"].pandas.df(gLEE_vars, flatten=False)
f_1g1p_Dirt_simple = f_1g1p_Dirt["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False)
df_1g1p_Dirt = pd.concat([f_1g1p_Dirt_vertex, f_1g1p_Dirt_simple], axis=1, sort=False)
del f_1g1p_Dirt
del f_1g1p_Dirt_vertex
del f_1g1p_Dirt_simple
df_1g1p_Dirt["gLEE_file"] = "Dirt"
df_1g1p_Dirt["gLEE_selection"] = "1g1p"

f_1g1p_NCDelta = uproot.open("gLEE_files/sbnfit_files/sbnfit_1g1pMar2020_v4_stage_6_NCDeltaRadOverlaySM.root")["singlephoton"]
f_1g1p_NCDelta_vertex = f_1g1p_NCDelta["vertex_tree"].pandas.df(gLEE_vars, flatten=False)
f_1g1p_NCDelta_simple = f_1g1p_NCDelta["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False)
df_1g1p_NCDelta = pd.concat([f_1g1p_NCDelta_vertex, f_1g1p_NCDelta_simple], axis=1, sort=False)
del f_1g1p_NCDelta
del f_1g1p_NCDelta_vertex
del f_1g1p_NCDelta_simple
df_1g1p_NCDelta["gLEE_file"] = "NCDelta"
df_1g1p_NCDelta["gLEE_selection"] = "1g1p"

f_1g1p_NCPi0Coh = uproot.open("gLEE_files/sbnfit_files/sbnfit_1g1pMar2020_v4_stage_6_NCPi0Coh.root")["singlephoton"]
f_1g1p_NCPi0Coh_vertex = f_1g1p_NCPi0Coh["vertex_tree"].pandas.df(gLEE_vars, flatten=False)
f_1g1p_NCPi0Coh_simple = f_1g1p_NCPi0Coh["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False)
df_1g1p_NCPi0Coh = pd.concat([f_1g1p_NCPi0Coh_vertex, f_1g1p_NCPi0Coh_simple], axis=1, sort=False)
del f_1g1p_NCPi0Coh
del f_1g1p_NCPi0Coh_vertex
del f_1g1p_NCPi0Coh_simple
df_1g1p_NCPi0Coh["gLEE_file"] = "NCPi0Coh"
df_1g1p_NCPi0Coh["gLEE_selection"] = "1g1p"

f_1g1p_NCPi0NotCoh = uproot.open("gLEE_files/sbnfit_files/sbnfit_1g1pMar2020_v4_stage_6_NCPi0NotCoh.root")["singlephoton"]
f_1g1p_NCPi0NotCoh_vertex = f_1g1p_NCPi0NotCoh["vertex_tree"].pandas.df(gLEE_vars, flatten=False)
f_1g1p_NCPi0NotCoh_simple = f_1g1p_NCPi0NotCoh["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False)
df_1g1p_NCPi0NotCoh = pd.concat([f_1g1p_NCPi0NotCoh_vertex, f_1g1p_NCPi0NotCoh_simple], axis=1, sort=False)
del f_1g1p_NCPi0NotCoh
del f_1g1p_NCPi0NotCoh_vertex
del f_1g1p_NCPi0NotCoh_simple
df_1g1p_NCPi0NotCoh["gLEE_file"] = "NCPi0NotCoh"
df_1g1p_NCPi0NotCoh["gLEE_selection"] = "1g1p"

f_1g1p_Nue = uproot.open("gLEE_files/sbnfit_files/sbnfit_1g1pMar2020_v4_stage_6_NueOverlays.root")["singlephoton"]
f_1g1p_Nue_vertex = f_1g1p_Nue["vertex_tree"].pandas.df(gLEE_vars, flatten=False)
f_1g1p_Nue_simple = f_1g1p_Nue["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False)
df_1g1p_Nue = pd.concat([f_1g1p_Nue_vertex, f_1g1p_Nue_simple], axis=1, sort=False)
del f_1g1p_Nue
del f_1g1p_Nue_vertex
del f_1g1p_Nue_simple
df_1g1p_Nue["gLEE_file"] = "Nue"
df_1g1p_Nue["gLEE_selection"] = "1g1p"

f_1g1p_OTPC = uproot.open("gLEE_files/sbnfit_files/sbnfit_1g1pMar2020_v4_stage_6_OTPCExtra.root")["singlephoton"]
f_1g1p_OTPC_vertex = f_1g1p_OTPC["vertex_tree"].pandas.df(gLEE_vars, flatten=False)
f_1g1p_OTPC_simple = f_1g1p_OTPC["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False)
df_1g1p_OTPC = pd.concat([f_1g1p_OTPC_vertex, f_1g1p_OTPC_simple], axis=1, sort=False)
del f_1g1p_OTPC
del f_1g1p_OTPC_vertex
del f_1g1p_OTPC_simple
df_1g1p_OTPC["gLEE_file"] = "OTPC"
df_1g1p_OTPC["gLEE_selection"] = "1g1p"

In [7]:
gLEE_mc_df = pd.concat([
        df_1g0p_BNBext,
        df_1g0p_BNBOther,
        df_1g0p_CC1Pi0,
        df_1g0p_Dirt,
        df_1g0p_NCDelta,
        df_1g0p_NCPi0Coh,
        df_1g0p_NCPi0NotCoh,
        df_1g0p_Nue,
        df_1g0p_OTPC,
        df_1g1p_BNBext,
        df_1g1p_BNBOther,
        df_1g1p_CC1Pi0,
        df_1g1p_Dirt,
        df_1g1p_NCDelta,
        df_1g1p_NCPi0Coh,
        df_1g1p_NCPi0NotCoh,
        df_1g1p_Nue,
        df_1g1p_OTPC,
    ], sort=False)

gLEE_mc_df["data_or_pred"] = ["pred" for i in range(gLEE_mc_df.shape[0])]

In [8]:
f_1g0p_data = uproot.open("gLEE_files/sbnfit_files/1g0p/sbnfit_1g0pMar2020_stage_4_Data5e19.root")["singlephoton"]
f_1g0p_data_vertex = f_1g0p_data["vertex_tree"].pandas.df(gLEE_vars, flatten=False)
f_1g0p_data_simple = f_1g0p_data["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False)
df_1g0p_data = pd.concat([f_1g0p_data_vertex, f_1g0p_data_simple], axis=1, sort=False)
del f_1g0p_data
del f_1g0p_data_vertex
del f_1g0p_data_simple
df_1g0p_data["gLEE_file"] = "data"
df_1g0p_data["gLEE_selection"] = "1g0p"

f_1g1p_data = uproot.open("gLEE_files/sbnfit_files/sbnfit_1g1pMar2020_v4_stage_6_Data5e19.root")["singlephoton"]
f_1g1p_data_vertex = f_1g1p_data["vertex_tree"].pandas.df(gLEE_vars, flatten=False)
f_1g1p_data_simple = f_1g1p_data["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False)
df_1g1p_data = pd.concat([f_1g1p_data_vertex, f_1g1p_data_simple], axis=1, sort=False)
del f_1g1p_data
del f_1g1p_data_vertex
del f_1g1p_data_simple
df_1g1p_data["gLEE_file"] = "data"
df_1g1p_data["gLEE_selection"] = "1g1p"

In [9]:
gLEE_data_df = pd.concat([
        df_1g0p_data,
        df_1g1p_data,
    ], sort=False)

gLEE_data_df["data_or_pred"] = ["data" for i in range(gLEE_data_df.shape[0])]

In [10]:
gLEE_all_df = gLEE_data_df = pd.concat([
        gLEE_mc_df,
        gLEE_data_df,
    ], sort=False)

gLEE_all_df.reset_index(inplace=True)


In [11]:
# check to see if any RSE numbers are shared between files, would be an issue when merging

prev_set = set()
for i, row in gLEE_all_df.iterrows():
    rse_num = row["run_number"] * 1000000000000 + row["subrun_number"] * 1000000 + row["event_number"]
    if rse_num in prev_set:
        print("duplicate!")
        print(row["run_number"], row["subrun_number"], row["event_number"], row["gLEE_file"])
    else:
        prev_set.add(rse_num)


In [12]:
# remove duplicate RSE nums, would be an issue when merging

gLEE_duplicate_row_list = gLEE_all_df[["run_number", "subrun_number", "event_number"]].duplicated().to_numpy()

gLEE_duplicate_indices = list(np.nonzero(gLEE_duplicate_row_list)[0])

print(gLEE_duplicate_indices)

gLEE_all_df.drop(gLEE_duplicate_indices, inplace=True)

[]


In [13]:
gLEE_all_df.query("run_number == 5762 and subrun_number == 114 and event_number == 5732")

Unnamed: 0,entry,run_number,subrun_number,event_number,reco_shower_energy_plane0,reco_shower_energy_plane1,reco_shower_energy_plane2,reco_vertex_x,reco_vertex_y,reco_vertex_z,simple_pot_weight,gLEE_file,gLEE_selection,data_or_pred
5553,0,5762,114,5732,[143.9895468460497],[186.02024226135157],[186.8563924864959],113.383087,60.917236,541.001648,1.0,data,1g1p,data


In [14]:
bdt_vars = [
    "nc_delta_score",
]

In [15]:
extra_variables = [
    "run",
    "subrun",
    "event",
    "nuvtx_diff",
    "showervtx_diff",
    "muonvtx_diff",
    "truth_isCC",
    "truth_vtxInside",
    "truth_nuPdg",
    "truth_nuEnergy",
    "truth_nuIntType",
    "truth_energyInside",
    "weight_spline",
    "weight_cv",
    "weight_lee",
    "event_type",
    "weight",
    "lowEweight"
]

kine_scalar_vars = [
    "kine_reco_add_energy",
    "kine_pio_mass",
    "kine_pio_flag",
    "kine_pio_vtx_dis",
    "kine_pio_energy_1",
    "kine_pio_theta_1",
    "kine_pio_phi_1",
    "kine_pio_dis_1",
    "kine_pio_energy_2",
    "kine_pio_theta_2",
    "kine_pio_phi_2",
    "kine_pio_dis_2",
    "kine_pio_angle"
]

kine_vector_vars = [
    "kine_energy_particle",
    "kine_energy_info",
    "kine_particle_type",
    "kine_energy_included",
]

In [16]:
eval_mc_variables = [
    "run",
    "subrun",
    "event",
    "flash_time",
    "weight_spline", # this and remaining only make sense for MC
    "weight_cv",
    "match_completeness_energy",
    "truth_nuEnergy",
    "truth_energyInside",
    "truth_electronInside",
    "truth_nuPdg",
    "truth_isCC",
    "truth_isFC",
    "truth_vtxInside",
    "truth_vtxX",
    "truth_vtxY",
    "truth_vtxZ",
    "truth_nuTime",
]

eval_data_variables = [
    "run",
    "subrun",
    "event",
    "flash_time",
]


pf_eval_mc_variables = [
    "truth_NprimPio",
    "truth_NCDelta",
    "nuvtx_diff",
    "showervtx_diff",
    "reco_showerKE",
    "truth_pio_energy_1",
    "truth_pio_energy_2",
    "reco_nuvtxX",
    "reco_nuvtxY",
    "reco_nuvtxZ",
    "reco_showervtxX",
    "reco_showervtxY",
    "reco_showervtxZ",
]

pf_eval_data_variables = [ # also use these for dirt
    "reco_showerKE",
    "reco_nuvtxX",
    "reco_nuvtxY",
    "reco_nuvtxZ",
    "reco_showervtxX",
    "reco_showervtxY",
    "reco_showervtxZ",
]

In [17]:
# loading NC Delta files

f_nc_delta_run1 = uproot.open("data/processed_checkout_rootfiles/checkout_prodgenie_nc_delta_overlay_run1.root")["wcpselection"]
#print(f_nc_delta_run1.keys())
#f_nc_delta_run1["T_KINEvars"].show()
f_nc_delta_run1_bdt = f_nc_delta_run1["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_nc_delta_run1_eval = f_nc_delta_run1["T_eval"].pandas.df(eval_mc_variables + ["match_isFC"], flatten=False)
f_nc_delta_run1_pfeval = f_nc_delta_run1["T_PFeval"].pandas.df(pf_eval_mc_variables, flatten=False)
f_nc_delta_run1_kine = f_nc_delta_run1["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_nc_delta_run1_pot = f_nc_delta_run1["T_pot"].pandas.df("pot_tor875good", flatten=False)
nc_delta_run1_file_POT = np.sum(f_nc_delta_run1_pot["pot_tor875good"].to_numpy())
nc_delta_run1_df = pd.concat([f_nc_delta_run1_bdt, f_nc_delta_run1_eval, f_nc_delta_run1_pfeval, f_nc_delta_run1_kine], axis=1, sort=False).query(
    "kine_reco_Enu >= 0 and match_isFC==1")
del f_nc_delta_run1
del f_nc_delta_run1_bdt
del f_nc_delta_run1_eval
del f_nc_delta_run1_pfeval
del f_nc_delta_run1_kine
del f_nc_delta_run1_pot
nc_delta_run1_df["isEXT"] = [0 for i in range(nc_delta_run1_df.shape[0])]
nc_delta_run1_df["isDirt"] = [0 for i in range(nc_delta_run1_df.shape[0])]
nc_delta_run1_df["WC_file"] = ["nc_delta_run1" for i in range(nc_delta_run1_df.shape[0])]
nc_delta_run1_df["run_num"] = [1 for i in range(nc_delta_run1_df.shape[0])]

print(nc_delta_run1_file_POT)
print(nc_delta_run1_df.shape[0])

f_nc_delta_run2 = uproot.open("data/processed_checkout_rootfiles/checkout_prodgenie_nc_delta_overlay_run2.root")["wcpselection"]
f_nc_delta_run2_bdt = f_nc_delta_run2["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_nc_delta_run2_eval = f_nc_delta_run2["T_eval"].pandas.df(eval_mc_variables + ["match_isFC"], flatten=False)
f_nc_delta_run2_pfeval = f_nc_delta_run2["T_PFeval"].pandas.df(pf_eval_mc_variables, flatten=False)
f_nc_delta_run2_kine = f_nc_delta_run2["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_nc_delta_run2_pot = f_nc_delta_run2["T_pot"].pandas.df("pot_tor875good", flatten=False)
nc_delta_run2_file_POT = np.sum(f_nc_delta_run2_pot["pot_tor875good"].to_numpy())
nc_delta_run2_df = pd.concat([f_nc_delta_run2_bdt, f_nc_delta_run2_eval, f_nc_delta_run2_pfeval, f_nc_delta_run2_kine], axis=1, sort=False).query(
    "kine_reco_Enu >= 0 and match_isFC==1")
del f_nc_delta_run2
del f_nc_delta_run2_bdt
del f_nc_delta_run2_eval
del f_nc_delta_run2_pfeval
del f_nc_delta_run2_kine
del f_nc_delta_run2_pot
nc_delta_run2_df["isEXT"] = [0 for i in range(nc_delta_run2_df.shape[0])]
nc_delta_run2_df["isDirt"] = [0 for i in range(nc_delta_run2_df.shape[0])]
nc_delta_run2_df["WC_file"] = ["nc_delta_run2" for i in range(nc_delta_run2_df.shape[0])]
nc_delta_run2_df["run_num"] = [2 for i in range(nc_delta_run2_df.shape[0])]

print(nc_delta_run2_file_POT)
print(nc_delta_run2_df.shape[0])

f_nc_delta_run3 = uproot.open("data/processed_checkout_rootfiles/checkout_prodgenie_nc_delta_overlay_run3.root")["wcpselection"]
f_nc_delta_run3_bdt = f_nc_delta_run3["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_nc_delta_run3_eval = f_nc_delta_run3["T_eval"].pandas.df(eval_mc_variables + ["match_isFC"], flatten=False)
f_nc_delta_run3_pfeval = f_nc_delta_run3["T_PFeval"].pandas.df(pf_eval_mc_variables, flatten=False)
f_nc_delta_run3_kine = f_nc_delta_run3["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_nc_delta_run3_pot = f_nc_delta_run3["T_pot"].pandas.df("pot_tor875good", flatten=False)
nc_delta_run3_file_POT = np.sum(f_nc_delta_run3_pot["pot_tor875good"].to_numpy())
nc_delta_run3_df = pd.concat([f_nc_delta_run3_bdt, f_nc_delta_run3_eval, f_nc_delta_run3_pfeval, f_nc_delta_run3_kine], axis=1, sort=False).query(
    "kine_reco_Enu >= 0 and match_isFC==1")
del f_nc_delta_run3
del f_nc_delta_run3_bdt
del f_nc_delta_run3_eval
del f_nc_delta_run3_pfeval
del f_nc_delta_run3_kine
del f_nc_delta_run3_pot
nc_delta_run3_df["isEXT"] = [0 for i in range(nc_delta_run3_df.shape[0])]
nc_delta_run3_df["isDirt"] = [0 for i in range(nc_delta_run3_df.shape[0])]
nc_delta_run3_df["WC_file"] = ["nc_delta_run3" for i in range(nc_delta_run3_df.shape[0])]
nc_delta_run3_df["run_num"] = [3 for i in range(nc_delta_run3_df.shape[0])]

print(nc_delta_run3_file_POT)
print(nc_delta_run3_df.shape[0])

nc_delta_df = pd.concat([nc_delta_run1_df, nc_delta_run2_df, nc_delta_run3_df], sort=False).query("truth_isCC==0 and truth_NCDelta==1 and kine_reco_Enu >= 0 and match_isFC==1")



2.2142960619858217e+23
24496
4.78045456048476e+23
53012
5.20631219812051e+23
58367


In [18]:
# loading NC Pi0 files

f_ncpi0_run1 = uproot.open("data/processed_checkout_rootfiles/checkout_prodgenie_nc_pi0_overlay_run1.root")["wcpselection"]
f_ncpi0_run1_bdt = f_ncpi0_run1["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_ncpi0_run1_eval = f_ncpi0_run1["T_eval"].pandas.df(eval_mc_variables + ["match_isFC"], flatten=False)
f_ncpi0_run1_pfeval = f_ncpi0_run1["T_PFeval"].pandas.df(pf_eval_mc_variables, flatten=False)
f_ncpi0_run1_kine = f_ncpi0_run1["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_ncpi0_run1_pot = f_ncpi0_run1["T_pot"].pandas.df("pot_tor875good", flatten=False)
nc_pi0_run1_file_POT = np.sum(f_ncpi0_run1_pot["pot_tor875good"].to_numpy())
nc_pi0_run1_df = pd.concat([f_ncpi0_run1_bdt, f_ncpi0_run1_eval, f_ncpi0_run1_pfeval, f_ncpi0_run1_kine], axis=1, sort=False)
del f_ncpi0_run1
del f_ncpi0_run1_bdt
del f_ncpi0_run1_eval
del f_ncpi0_run1_pfeval
del f_ncpi0_run1_kine
del f_ncpi0_run1_pot
nc_pi0_run1_df["isEXT"] = [0 for i in range(nc_pi0_run1_df.shape[0])]
nc_pi0_run1_df["isDirt"] = [0 for i in range(nc_pi0_run1_df.shape[0])]
nc_pi0_run1_df["WC_file"] = ["NC_Pi0_run1" for i in range(nc_pi0_run1_df.shape[0])]
nc_pi0_run1_df["run_num"] = [1 for i in range(nc_pi0_run1_df.shape[0])]

f_ncpi0_run2 = uproot.open("data/processed_checkout_rootfiles/checkout_prodgenie_nc_pi0_overlay_run2.root")["wcpselection"]
f_ncpi0_run2_bdt = f_ncpi0_run2["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_ncpi0_run2_eval = f_ncpi0_run2["T_eval"].pandas.df(eval_mc_variables + ["match_isFC"], flatten=False)
f_ncpi0_run2_pfeval = f_ncpi0_run2["T_PFeval"].pandas.df(pf_eval_mc_variables, flatten=False)
f_ncpi0_run2_kine = f_ncpi0_run2["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_ncpi0_run2_pot = f_ncpi0_run2["T_pot"].pandas.df("pot_tor875good", flatten=False)
nc_pi0_run2_file_POT = np.sum(f_ncpi0_run2_pot["pot_tor875good"].to_numpy())
nc_pi0_run2_df = pd.concat([f_ncpi0_run2_bdt, f_ncpi0_run2_eval, f_ncpi0_run2_pfeval, f_ncpi0_run2_kine], axis=1, sort=False)
del f_ncpi0_run2
del f_ncpi0_run2_bdt
del f_ncpi0_run2_eval
del f_ncpi0_run2_pfeval
del f_ncpi0_run2_kine
del f_ncpi0_run2_pot
nc_pi0_run2_df["isEXT"] = [0 for i in range(nc_pi0_run2_df.shape[0])]
nc_pi0_run2_df["isDirt"] = [0 for i in range(nc_pi0_run2_df.shape[0])]
nc_pi0_run2_df["WC_file"] = ["NC_Pi0_run2" for i in range(nc_pi0_run2_df.shape[0])]
nc_pi0_run2_df["run_num"] = [2 for i in range(nc_pi0_run2_df.shape[0])]


f_ncpi0_run3 = uproot.open("data/processed_checkout_rootfiles/checkout_prodgenie_nc_pi0_overlay_run3.root")["wcpselection"]
f_ncpi0_run3_bdt = f_ncpi0_run3["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_ncpi0_run3_eval = f_ncpi0_run3["T_eval"].pandas.df(eval_mc_variables + ["match_isFC"], flatten=False)
f_ncpi0_run3_pfeval = f_ncpi0_run3["T_PFeval"].pandas.df(pf_eval_mc_variables, flatten=False)
f_ncpi0_run3_kine = f_ncpi0_run3["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_ncpi0_run3_pot = f_ncpi0_run3["T_pot"].pandas.df("pot_tor875good", flatten=False)
nc_pi0_run3_file_POT = np.sum(f_ncpi0_run3_pot["pot_tor875good"].to_numpy())
nc_pi0_run3_df = pd.concat([f_ncpi0_run3_bdt, f_ncpi0_run3_eval, f_ncpi0_run3_pfeval, f_ncpi0_run3_kine], axis=1, sort=False)
del f_ncpi0_run3
del f_ncpi0_run3_bdt
del f_ncpi0_run3_eval
del f_ncpi0_run3_pfeval
del f_ncpi0_run3_kine
del f_ncpi0_run3_pot
nc_pi0_run3_df["isEXT"] = [0 for i in range(nc_pi0_run3_df.shape[0])]
nc_pi0_run3_df["isDirt"] = [0 for i in range(nc_pi0_run3_df.shape[0])]
nc_pi0_run3_df["WC_file"] = ["NC_Pi0_run3" for i in range(nc_pi0_run3_df.shape[0])]
nc_pi0_run3_df["run_num"] = [3 for i in range(nc_pi0_run3_df.shape[0])]

print(nc_pi0_run1_file_POT)
print(nc_pi0_run1_df.shape[0])

print(nc_pi0_run3_file_POT)
print(nc_pi0_run3_df.shape[0])

nc_pi0_df = pd.concat([nc_pi0_run1_df, nc_pi0_run2_df, nc_pi0_run3_df], sort=False).query("truth_isCC==0 and truth_NprimPio>0 and not (truth_NCDelta==1) and kine_reco_Enu >= 0 and match_isFC==1")

del nc_pi0_run1_df
del nc_pi0_run2_df
del nc_pi0_run3_df

1.249265551566517e+21
48200
1.2277865498371458e+21
47066


In [19]:
f_intrinsic_nue = uproot.open("data/processed_checkout_rootfiles/checkout_prodgenie_bnb_intrinsic_nue_overlay_run123_all.root")["wcpselection"]
f_intrinsic_nue_bdt = f_intrinsic_nue["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_intrinsic_nue_eval = f_intrinsic_nue["T_eval"].pandas.df(eval_mc_variables + ["match_isFC"], flatten=False)
f_intrinsic_nue_pfeval = f_intrinsic_nue["T_PFeval"].pandas.df(pf_eval_mc_variables, flatten=False)
f_intrinsic_nue_kine = f_intrinsic_nue["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_intrinsic_nue_pot = f_intrinsic_nue["T_pot"].pandas.df("pot_tor875good", flatten=False)
intrinsic_nue_file_POT = np.sum(f_intrinsic_nue_pot["pot_tor875good"].to_numpy())
intrinsic_nue_df = pd.concat([f_intrinsic_nue_bdt, f_intrinsic_nue_eval, f_intrinsic_nue_pfeval, f_intrinsic_nue_kine], axis=1, sort=False)
del f_intrinsic_nue
del f_intrinsic_nue_bdt
del f_intrinsic_nue_eval
del f_intrinsic_nue_pfeval
del f_intrinsic_nue_kine
del f_intrinsic_nue_pot
intrinsic_nue_df["isEXT"] = [0 for i in range(intrinsic_nue_df.shape[0])]
intrinsic_nue_df["isDirt"] = [0 for i in range(intrinsic_nue_df.shape[0])]
intrinsic_nue_df["WC_file"] = ["intrinsic_nue_run123_all" for i in range(intrinsic_nue_df.shape[0])]

print(intrinsic_nue_file_POT)
print(intrinsic_nue_df.shape[0])

2.5297848996675555e+23
426571


In [20]:
# loading all overlay, ext, and dirt files

# nu overlay runs 1, 2, and 3
f_nu_overlay_run1 = uproot.open("data/processed_checkout_rootfiles/checkout_prodgenie_bnb_nu_overlay_run1.root")["wcpselection"]
f_nu_overlay_run1_bdt = f_nu_overlay_run1["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_nu_overlay_run1_eval = f_nu_overlay_run1["T_eval"].pandas.df(eval_mc_variables + ["match_isFC"], flatten=False)
f_nu_overlay_run1_pfeval = f_nu_overlay_run1["T_PFeval"].pandas.df(pf_eval_mc_variables, flatten=False)
f_nu_overlay_run1_kine = f_nu_overlay_run1["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_nu_overlay_run1_pot = f_nu_overlay_run1["T_pot"].pandas.df("pot_tor875good", flatten=False)
nu_overlay_run1_POT = np.sum(f_nu_overlay_run1_pot["pot_tor875good"].to_numpy())
nu_overlay_run1_df = pd.concat([f_nu_overlay_run1_bdt, f_nu_overlay_run1_eval, f_nu_overlay_run1_pfeval, f_nu_overlay_run1_kine], axis=1, sort=False)
del f_nu_overlay_run1
del f_nu_overlay_run1_bdt
del f_nu_overlay_run1_eval
del f_nu_overlay_run1_pfeval
del f_nu_overlay_run1_kine
del f_nu_overlay_run1_pot
nu_overlay_run1_df["isEXT"] = [0 for i in range(nu_overlay_run1_df.shape[0])]
nu_overlay_run1_df["isDirt"] = [0 for i in range(nu_overlay_run1_df.shape[0])]
nu_overlay_run1_df["WC_file"] = ["nu_overlay_run1" for i in range(nu_overlay_run1_df.shape[0])]
nu_overlay_run1_df["run_num"] = [1 for i in range(nu_overlay_run1_df.shape[0])]

f_nu_overlay_run2 = uproot.open("data/processed_checkout_rootfiles/checkout_prodgenie_bnb_nu_overlay_run2.root")["wcpselection"]
f_nu_overlay_run2_bdt = f_nu_overlay_run2["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_nu_overlay_run2_eval = f_nu_overlay_run2["T_eval"].pandas.df(eval_mc_variables + ["match_isFC"], flatten=False)
f_nu_overlay_run2_pfeval = f_nu_overlay_run2["T_PFeval"].pandas.df(pf_eval_mc_variables, flatten=False)
f_nu_overlay_run2_kine = f_nu_overlay_run2["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_nu_overlay_run2_pot = f_nu_overlay_run2["T_pot"].pandas.df("pot_tor875good", flatten=False)
nu_overlay_run2_POT = np.sum(f_nu_overlay_run2_pot["pot_tor875good"].to_numpy())
nu_overlay_run2_df = pd.concat([f_nu_overlay_run2_bdt, f_nu_overlay_run2_eval, f_nu_overlay_run2_pfeval, f_nu_overlay_run2_kine], axis=1, sort=False)
del f_nu_overlay_run2
del f_nu_overlay_run2_bdt
del f_nu_overlay_run2_eval
del f_nu_overlay_run2_pfeval
del f_nu_overlay_run2_kine
del f_nu_overlay_run2_pot
nu_overlay_run2_df["isEXT"] = [0 for i in range(nu_overlay_run2_df.shape[0])]
nu_overlay_run2_df["isDirt"] = [0 for i in range(nu_overlay_run2_df.shape[0])]
nu_overlay_run2_df["WC_file"] = ["nu_overlay_run2" for i in range(nu_overlay_run2_df.shape[0])]
nu_overlay_run2_df["run_num"] = [2 for i in range(nu_overlay_run2_df.shape[0])]

f_nu_overlay_run3 = uproot.open("data/processed_checkout_rootfiles/checkout_prodgenie_bnb_nu_overlay_run3.root")["wcpselection"]
f_nu_overlay_run3_bdt = f_nu_overlay_run3["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_nu_overlay_run3_eval = f_nu_overlay_run3["T_eval"].pandas.df(eval_mc_variables + ["match_isFC"], flatten=False)
f_nu_overlay_run3_pfeval = f_nu_overlay_run3["T_PFeval"].pandas.df(pf_eval_mc_variables, flatten=False)
f_nu_overlay_run3_kine = f_nu_overlay_run3["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_nu_overlay_run3_pot = f_nu_overlay_run3["T_pot"].pandas.df("pot_tor875good", flatten=False)
nu_overlay_run3_POT = np.sum(f_nu_overlay_run3_pot["pot_tor875good"].to_numpy())
nu_overlay_run3_df = pd.concat([f_nu_overlay_run3_bdt, f_nu_overlay_run3_eval, f_nu_overlay_run3_pfeval, f_nu_overlay_run3_kine], axis=1, sort=False)
del f_nu_overlay_run3
del f_nu_overlay_run3_bdt
del f_nu_overlay_run3_eval
del f_nu_overlay_run3_pfeval
del f_nu_overlay_run3_kine
del f_nu_overlay_run3_pot
nu_overlay_run3_df["isEXT"] = [0 for i in range(nu_overlay_run3_df.shape[0])]
nu_overlay_run3_df["isDirt"] = [0 for i in range(nu_overlay_run3_df.shape[0])]
nu_overlay_run3_df["WC_file"] = ["nu_overlay_run3" for i in range(nu_overlay_run3_df.shape[0])]
nu_overlay_run3_df["run_num"] = [3 for i in range(nu_overlay_run3_df.shape[0])]

overlay_df = pd.concat([nu_overlay_run1_df, nu_overlay_run2_df, nu_overlay_run3_df], sort=False).query("not (truth_NCDelta==1)")
del nu_overlay_run1_df
del nu_overlay_run2_df
del nu_overlay_run3_df

#print(f_nu_overlay_run1.keys())
#f_nu_overlay_run1["T_PFeval"].show()

In [21]:
# EXT runs 1, 2, and 3

f_ext_run1 = uproot.open("data/processed_checkout_rootfiles/wcp_data_extbnb_run1_mcc9_v08_00_00_53_checkout.root")["wcpselection"]
f_ext_run1_bdt = f_ext_run1["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_ext_run1_eval = f_ext_run1["T_eval"].pandas.df(eval_data_variables + ["match_isFC"], flatten=False)
f_ext_run1_pfeval = f_ext_run1["T_PFeval"].pandas.df(pf_eval_data_variables, flatten=False)
f_ext_run1_kine = f_ext_run1["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_ext_run1_pot = f_ext_run1["T_pot"].pandas.df("pot_tor875good", flatten=False)
ext_run1_POT = np.sum(f_ext_run1_pot["pot_tor875good"].to_numpy())
ext_run1_df = pd.concat([f_ext_run1_bdt, f_ext_run1_eval, f_ext_run1_pfeval, f_ext_run1_kine], axis=1, sort=False)
del f_ext_run1
del f_ext_run1_bdt
del f_ext_run1_eval
del f_ext_run1_pfeval
del f_ext_run1_kine
del f_ext_run1_pot
ext_run1_df["isEXT"] = [1 for i in range(ext_run1_df.shape[0])]
ext_run1_df["isDirt"] = [0 for i in range(ext_run1_df.shape[0])]
ext_run1_df["WC_file"] = ["ext_run1" for i in range(ext_run1_df.shape[0])]
ext_run1_df["run_num"] = [1 for i in range(ext_run1_df.shape[0])]

f_ext_run2 = uproot.open("data/processed_checkout_rootfiles/wcp_data_extbnb_run2_mcc9_v08_00_00_53_checkout.root")["wcpselection"]
f_ext_run2_bdt = f_ext_run2["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_ext_run2_eval = f_ext_run2["T_eval"].pandas.df(eval_data_variables + ["match_isFC"], flatten=False)
f_ext_run2_pfeval = f_ext_run2["T_PFeval"].pandas.df(pf_eval_data_variables, flatten=False)
f_ext_run2_kine = f_ext_run2["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_ext_run2_pot = f_ext_run2["T_pot"].pandas.df("pot_tor875good", flatten=False)
ext_run2_POT = np.sum(f_ext_run2_pot["pot_tor875good"].to_numpy())
ext_run2_df = pd.concat([f_ext_run2_bdt, f_ext_run2_eval, f_ext_run2_pfeval, f_ext_run2_kine], axis=1, sort=False)
del f_ext_run2
del f_ext_run2_bdt
del f_ext_run2_eval
del f_ext_run2_pfeval
del f_ext_run2_kine
del f_ext_run2_pot
ext_run2_df["isEXT"] = [1 for i in range(ext_run2_df.shape[0])]
ext_run2_df["isDirt"] = [0 for i in range(ext_run2_df.shape[0])]
ext_run2_df["WC_file"] = ["ext_run2" for i in range(ext_run2_df.shape[0])]
ext_run2_df["run_num"] = [2 for i in range(ext_run2_df.shape[0])]

f_ext_run3 = uproot.open("data/processed_checkout_rootfiles/wcp_data_extbnb_run3_mcc9_v08_00_00_53_checkout.root")["wcpselection"]
f_ext_run3_bdt = f_ext_run3["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_ext_run3_eval = f_ext_run3["T_eval"].pandas.df(eval_data_variables + ["match_isFC"], flatten=False)
f_ext_run3_pfeval = f_ext_run3["T_PFeval"].pandas.df(pf_eval_data_variables, flatten=False)
f_ext_run3_kine = f_ext_run3["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_ext_run3_pot = f_ext_run3["T_pot"].pandas.df("pot_tor875good", flatten=False)
ext_run3_POT = np.sum(f_ext_run3_pot["pot_tor875good"].to_numpy())
ext_run3_df = pd.concat([f_ext_run3_bdt, f_ext_run3_eval, f_ext_run3_pfeval, f_ext_run3_kine], axis=1, sort=False)
del f_ext_run3
del f_ext_run3_bdt
del f_ext_run3_eval
del f_ext_run3_pfeval
del f_ext_run3_kine
del f_ext_run3_pot
ext_run3_df["isEXT"] = [1 for i in range(ext_run3_df.shape[0])]
ext_run3_df["isDirt"] = [0 for i in range(ext_run3_df.shape[0])]
ext_run3_df["WC_file"] = ["ext_run3" for i in range(ext_run3_df.shape[0])]
ext_run3_df["run_num"] = [3 for i in range(ext_run3_df.shape[0])]

ext_df = pd.concat([ext_run1_df, ext_run2_df, ext_run3_df], sort=False)
del ext_run1_df
del ext_run2_df
del ext_run3_df


In [22]:
# Dirt runs 1, 2, and 3
f_dirt_run1 = uproot.open("data/processed_checkout_rootfiles/checkout_prodgenie_dirt_overlay_run1_all.root")["wcpselection"]
f_dirt_run1_bdt = f_dirt_run1["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_dirt_run1_eval = f_dirt_run1["T_eval"].pandas.df(eval_data_variables + ["match_isFC"], flatten=False)
f_dirt_run1_pfeval = f_dirt_run1["T_PFeval"].pandas.df(pf_eval_data_variables, flatten=False)
f_dirt_run1_kine = f_dirt_run1["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_dirt_run1_pot = f_dirt_run1["T_pot"].pandas.df("pot_tor875good", flatten=False)
dirt_run1_POT = np.sum(f_dirt_run1_pot["pot_tor875good"].to_numpy())
dirt_run1_df = pd.concat([f_dirt_run1_bdt, f_dirt_run1_eval, f_dirt_run1_pfeval, f_dirt_run1_kine], axis=1, sort=False)
del f_dirt_run1
del f_dirt_run1_bdt
del f_dirt_run1_eval
del f_dirt_run1_pfeval
del f_dirt_run1_kine
del f_dirt_run1_pot
dirt_run1_df["isEXT"] = [0 for i in range(dirt_run1_df.shape[0])]
dirt_run1_df["isDirt"] = [1 for i in range(dirt_run1_df.shape[0])]
dirt_run1_df["WC_file"] = ["dirt_run1" for i in range(dirt_run1_df.shape[0])]
dirt_run1_df["run_num"] = [1 for i in range(dirt_run1_df.shape[0])]

f_dirt_run2 = uproot.open("data/processed_checkout_rootfiles/checkout_prodgenie_dirt_overlay_run2_all.root")["wcpselection"]
f_dirt_run2_bdt = f_dirt_run2["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_dirt_run2_eval = f_dirt_run2["T_eval"].pandas.df(eval_data_variables + ["match_isFC"], flatten=False)
f_dirt_run2_pfeval = f_dirt_run2["T_PFeval"].pandas.df(pf_eval_data_variables, flatten=False)
f_dirt_run2_kine = f_dirt_run2["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_dirt_run2_pot = f_dirt_run2["T_pot"].pandas.df("pot_tor875good", flatten=False)
dirt_run2_POT = np.sum(f_dirt_run2_pot["pot_tor875good"].to_numpy())
dirt_run2_df = pd.concat([f_dirt_run2_bdt, f_dirt_run2_eval, f_dirt_run2_pfeval, f_dirt_run2_kine], axis=1, sort=False)
del f_dirt_run2
del f_dirt_run2_bdt
del f_dirt_run2_eval
del f_dirt_run2_pfeval
del f_dirt_run2_kine
del f_dirt_run2_pot
dirt_run2_df["isEXT"] = [0 for i in range(dirt_run2_df.shape[0])]
dirt_run2_df["isDirt"] = [1 for i in range(dirt_run2_df.shape[0])]
dirt_run2_df["WC_file"] = ["dirt_run2" for i in range(dirt_run2_df.shape[0])]
dirt_run2_df["run_num"] = [2 for i in range(dirt_run2_df.shape[0])]

f_dirt_run3 = uproot.open("data/processed_checkout_rootfiles/checkout_prodgenie_dirt_overlay_run3_all.root")["wcpselection"]
f_dirt_run3_bdt = f_dirt_run3["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_dirt_run3_eval = f_dirt_run3["T_eval"].pandas.df(eval_data_variables + ["match_isFC"], flatten=False)
f_dirt_run3_pfeval = f_dirt_run3["T_PFeval"].pandas.df(pf_eval_data_variables, flatten=False)
f_dirt_run3_kine = f_dirt_run3["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_dirt_run3_pot = f_dirt_run3["T_pot"].pandas.df("pot_tor875good", flatten=False)
dirt_run3_POT = np.sum(f_dirt_run3_pot["pot_tor875good"].to_numpy())
dirt_run3_df = pd.concat([f_dirt_run3_bdt, f_dirt_run3_eval, f_dirt_run3_pfeval, f_dirt_run3_kine], axis=1, sort=False)
del f_dirt_run3
del f_dirt_run3_bdt
del f_dirt_run3_eval
del f_dirt_run3_pfeval
del f_dirt_run3_kine
del f_dirt_run3_pot
dirt_run3_df["isEXT"] = [0 for i in range(dirt_run3_df.shape[0])]
dirt_run3_df["isDirt"] = [1 for i in range(dirt_run3_df.shape[0])]
dirt_run3_df["WC_file"] = ["dirt_run3" for i in range(dirt_run3_df.shape[0])]
dirt_run3_df["run_num"] = [3 for i in range(dirt_run3_df.shape[0])]

dirt_df = pd.concat([dirt_run1_df, dirt_run2_df, dirt_run3_df], sort=False)
del dirt_run1_df
del dirt_run2_df
del dirt_run3_df

In [23]:
# Open data

f_data_run1 = uproot.open("data/processed_checkout_rootfiles/checkout_data_bnb_run1_5e19.root")["wcpselection"]
f_data_run1_bdt = f_data_run1["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_data_run1_eval = f_data_run1["T_eval"].pandas.df(eval_data_variables + ["match_isFC"], flatten=False)
f_data_run1_pfeval = f_data_run1["T_PFeval"].pandas.df(pf_eval_data_variables, flatten=False)
f_data_run1_kine = f_data_run1["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
data_run1_df = pd.concat([f_data_run1_bdt, f_data_run1_eval, f_data_run1_pfeval, f_data_run1_kine], axis=1, sort=False)
del f_data_run1
del f_data_run1_bdt
del f_data_run1_eval
del f_data_run1_pfeval
del f_data_run1_kine
data_run1_df["isEXT"] = [0 for i in range(data_run1_df.shape[0])]
data_run1_df["isDirt"] = [0 for i in range(data_run1_df.shape[0])]
data_run1_df["WC_file"] = ["open_data_run1" for i in range(data_run1_df.shape[0])]
data_run1_df["category"] = ["data" for i in range(data_run1_df.shape[0])]
data_run1_df["run_num"] = [1 for i in range(data_run1_df.shape[0])]

f_data_run3 = uproot.open("data/processed_checkout_rootfiles/checkout_data_bnb_run3_1e19.root")["wcpselection"]
f_data_run3_bdt = f_data_run3["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_data_run3_eval = f_data_run3["T_eval"].pandas.df(eval_data_variables + ["match_isFC"], flatten=False)
f_data_run3_pfeval = f_data_run3["T_PFeval"].pandas.df(pf_eval_data_variables, flatten=False)
f_data_run3_kine = f_data_run3["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
data_run3_df = pd.concat([f_data_run3_bdt, f_data_run3_eval, f_data_run3_pfeval, f_data_run3_kine], axis=1, sort=False)
del f_data_run3
del f_data_run3_bdt
del f_data_run3_pfeval
del f_data_run3_eval
del f_data_run3_kine
data_run3_df["isEXT"] = [0 for i in range(data_run3_df.shape[0])]
data_run3_df["isDirt"] = [0 for i in range(data_run3_df.shape[0])]
data_run3_df["WC_file"] = ["open_data_run3" for i in range(data_run3_df.shape[0])]
data_run3_df["category"] = ["data" for i in range(data_run3_df.shape[0])]
data_run3_df["run_num"] = [3 for i in range(data_run3_df.shape[0])]

data_all_df = pd.concat([data_run1_df, data_run3_df], sort=False)

del data_run1_df
del data_run3_df

In [24]:
# combining prediction files

all_df = pd.concat([overlay_df, ext_df, dirt_df, nc_delta_df, intrinsic_nue_df, nc_pi0_df], sort=False)

del overlay_df
del ext_df
del dirt_df
del nc_delta_df
del intrinsic_nue_df
del nc_pi0_df

In [25]:
em_charge_scale = 0.95

uncorrected_reco_showerKE = all_df["reco_showerKE"].to_numpy()
all_df["reco_showerKE"] = uncorrected_reco_showerKE * 1000.

uncorrected_reco_showerKE = data_all_df["reco_showerKE"].to_numpy()
data_all_df["reco_showerKE"] = em_charge_scale * uncorrected_reco_showerKE * 1000.

In [26]:
# adding WC reco proton num

proton_nums = []
track_nums = []
energy_lists = all_df["kine_energy_particle"].to_numpy()
pdg_lists = all_df["kine_particle_type"].to_numpy()
for i in tqdm(range(all_df.shape[0])):
    proton_num = 0
    track_num = 0
    energy_list = energy_lists[i]
    pdg_list = pdg_lists[i]
    for i in range(len(energy_list)):
        if abs(pdg_list[i]) == 2212 and energy_list[i] > 35.:
            proton_num += 1
        if abs(pdg_list[i]) == 13 or abs(pdg_list[i]) == 211 and energy_list[i] > 10.: # see N_tracks at https://github.com/BNLIF/wcp-uboone-bdt/blob/main/inc/WCPLEEANA/cuts.h
            track_num += 1
    proton_nums.append(proton_num)
    track_nums.append(track_num)
all_df["WC_reco_num_protons"] = proton_nums
all_df["WC_reco_num_other_tracks"] = track_nums

proton_nums = []
track_nums = []
energy_lists = data_all_df["kine_energy_particle"].to_numpy()
pdg_lists = data_all_df["kine_particle_type"].to_numpy()
for i in tqdm(range(data_all_df.shape[0])):
    proton_num = 0
    track_num = 0
    energy_list = energy_lists[i]
    pdg_list = pdg_lists[i]
    for i in range(len(energy_list)):
        if abs(pdg_list[i]) == 2212 and energy_list[i] > 35.:
            proton_num += 1
        if abs(pdg_list[i]) == 13 or abs(pdg_list[i]) == 211 and energy_list[i] > 10.: # see N_tracks at https://github.com/BNLIF/wcp-uboone-bdt/blob/main/inc/WCPLEEANA/cuts.h
            track_num += 1
    proton_nums.append(proton_num)
    track_nums.append(track_num)
data_all_df["WC_reco_num_protons"] = proton_nums
data_all_df["WC_reco_num_other_tracks"] = track_nums

100%|██████████| 7986901/7986901 [02:14<00:00, 59203.20it/s] 
100%|██████████| 212235/212235 [00:01<00:00, 134646.90it/s]


In [27]:
# adding WC truth category information

categories = ["NC Delta Radiative", "NC 1 Pi0", "numuCC 1 Pi0", "nueCC", "numuCC other", "NC other", "outFV", "badmatch/cosmic", "dirt", "ext"]

queries = [
    "not (isDirt==1) and not (isEXT==1) and match_completeness_energy/truth_energyInside>=0.1 and truth_vtxInside==1 and truth_isCC==0 and truth_NCDelta==1",
    "not (isDirt==1) and not (isEXT==1) and match_completeness_energy/truth_energyInside>=0.1 and truth_vtxInside==1 and truth_isCC==0 and truth_NprimPio==1 and not (truth_NCDelta==1)",
    "not (isDirt==1) and not (isEXT==1) and match_completeness_energy/truth_energyInside>=0.1 and truth_vtxInside==1 and truth_isCC==1 and abs(truth_nuPdg)==14 and truth_NprimPio==1",
    "not (isDirt==1) and not (isEXT==1) and match_completeness_energy/truth_energyInside>=0.1 and truth_vtxInside==1 and truth_isCC==1 and abs(truth_nuPdg)==12",
    "not (isDirt==1) and not (isEXT==1) and match_completeness_energy/truth_energyInside>=0.1 and truth_vtxInside==1 and truth_isCC==1 and abs(truth_nuPdg)==14 and truth_NprimPio!=1",
    "not (isDirt==1) and not (isEXT==1) and match_completeness_energy/truth_energyInside>=0.1 and truth_vtxInside==1 and truth_isCC==0 and truth_NprimPio!=1 and not (truth_NCDelta==1)",
    "not (isDirt==1) and not (isEXT==1) and match_completeness_energy/truth_energyInside>=0.1 and not (truth_vtxInside==1)",
    "not (isDirt==1) and not (isEXT==1) and not (match_completeness_energy/truth_energyInside>=0.1)",
    "isDirt==1",
    "isEXT==1",
]

dfs_with_categories = []
for i in tqdm(range(len(categories))):
    category = categories[i]
    query = queries[i]
    cat_df = all_df.query(query)
    cat_df["category"] = category
    dfs_with_categories.append(cat_df)
    
del all_df
all_df = pd.concat(dfs_with_categories, sort=False)

del cat_df
del dfs_with_categories

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|██████████| 10/10 [01:37<00:00,  9.71s/it]


In [28]:
all_df["data_or_pred"] = ["pred" for i in range(all_df.shape[0])]
data_all_df["data_or_pred"] = ["data" for i in range(data_all_df.shape[0])]

In [29]:
# putting them in order so that we don't remove the only generic selected event when we remove duplicates
all_df.sort_values(by=["kine_reco_Enu"], inplace=True, ascending=False)
data_all_df.sort_values(by=["kine_reco_Enu"], inplace=True, ascending=False)

In [30]:
all_df.reset_index(inplace=True)
data_all_df.reset_index(inplace=True)

In [31]:
all_df.query("run == 8975 and subrun == 106 and event == 5335")

Unnamed: 0,entry,nc_delta_score,run,subrun,event,flash_time,weight_spline,weight_cv,match_completeness_energy,truth_nuEnergy,...,kine_energy_included,kine_reco_Enu,isEXT,isDirt,WC_file,run_num,WC_reco_num_protons,WC_reco_num_other_tracks,category,data_or_pred
5506595,471145,-2.275953,8975,106,5335,5.140625,1.0,1.0,34.550514,2583.854492,...,[],-1.0,0,0,nu_overlay_run2,2.0,0,0,outFV,pred


In [32]:
print("duplicates not removed:")
print(all_df.shape[0])
print(data_all_df.shape[0])

duplicates not removed:
7986901
212235


In [33]:
# remove duplicate RSE nums, would be an issue when merging

data_duplicate_row_list = data_all_df[["run", "subrun", "event"]].duplicated().to_numpy()
pred_duplicate_row_list = all_df[["run", "subrun", "event"]].duplicated().to_numpy()

data_duplicate_indices = list(np.nonzero(data_duplicate_row_list)[0])
pred_duplicate_indices = list(np.nonzero(pred_duplicate_row_list)[0])
data_duplicate_row_list = data_all_df[["run", "subrun", "event"]].duplicated().to_numpy()
pred_duplicate_row_list = all_df[["run", "subrun", "event"]].duplicated().to_numpy()

data_duplicate_indices = list(np.nonzero(data_duplicate_row_list)[0])
pred_duplicate_indices = list(np.nonzero(pred_duplicate_row_list)[0])

data_all_df.drop(data_duplicate_indices, inplace=True)
all_df.drop(pred_duplicate_indices, inplace=True)


In [34]:
# throwing away excess information, combining data and MC files, randomizing order
WC_all_df = pd.concat([all_df[WC_vars], data_all_df[WC_data_vars]], sort=False).sample(frac=1)

In [35]:
print("duplicates removed:")
print(all_df.shape[0])
print(data_all_df.shape[0])

print("\ncombined:")
print(WC_all_df.shape[0])

duplicates removed:
7957158
212235

combined:
8169393


In [36]:
np.sum(WC_all_df[["run", "subrun", "event"]].duplicated().to_numpy())

0

In [37]:
all_df.query("run == 8975 and subrun == 106 and event == 5335")

Unnamed: 0,entry,nc_delta_score,run,subrun,event,flash_time,weight_spline,weight_cv,match_completeness_energy,truth_nuEnergy,...,kine_energy_included,kine_reco_Enu,isEXT,isDirt,WC_file,run_num,WC_reco_num_protons,WC_reco_num_other_tracks,category,data_or_pred
5506595,471145,-2.275953,8975,106,5335,5.140625,1.0,1.0,34.550514,2583.854492,...,[],-1.0,0,0,nu_overlay_run2,2.0,0,0,outFV,pred


In [38]:
# changing one element lists to floats

reco_shower_energy_plane0_arr = []
reco_shower_energy_plane1_arr = []
reco_shower_energy_plane2_arr = []
reco_shower_energy_plane0_lists = gLEE_all_df["reco_shower_energy_plane0"].to_numpy()
reco_shower_energy_plane1_lists = gLEE_all_df["reco_shower_energy_plane1"].to_numpy()
reco_shower_energy_plane2_lists = gLEE_all_df["reco_shower_energy_plane2"].to_numpy()

for i in tqdm(range(gLEE_all_df.shape[0])):    
    reco_shower_energy_plane0_arr.append(reco_shower_energy_plane0_lists[i][0])
    reco_shower_energy_plane1_arr.append(reco_shower_energy_plane0_lists[i][0])
    reco_shower_energy_plane2_arr.append(reco_shower_energy_plane0_lists[i][0])
    
gLEE_all_df["reco_shower_energy_plane0"] = reco_shower_energy_plane0_arr
gLEE_all_df["reco_shower_energy_plane1"] = reco_shower_energy_plane1_arr
gLEE_all_df["reco_shower_energy_plane2"] = reco_shower_energy_plane2_arr


100%|██████████| 5554/5554 [00:00<00:00, 71269.11it/s]


In [39]:
# remove list from gLEE's variables

gLEE_all_df = gLEE_all_df.rename(columns={"run_number": "run", "subrun_number": "subrun", "event_number":"event"})

In [40]:
gLEE_all_df

Unnamed: 0,entry,run,subrun,event,reco_shower_energy_plane0,reco_shower_energy_plane1,reco_shower_energy_plane2,reco_vertex_x,reco_vertex_y,reco_vertex_z,simple_pot_weight,gLEE_file,gLEE_selection,data_or_pred
0,0,6922,108,5430,73.706699,73.706699,73.706699,152.345856,-83.034393,640.097046,0.074065,BNBext,1g0p,pred
1,1,10990,41,2057,170.503905,170.503905,170.503905,127.398247,94.970177,577.528809,0.049936,BNBext,1g0p,pred
2,2,10445,195,9762,115.481219,115.481219,115.481219,214.369736,-19.314178,255.065582,0.049936,BNBext,1g0p,pred
3,3,11150,25,1282,123.496592,123.496592,123.496592,152.424973,-88.547760,402.611511,0.049936,BNBext,1g0p,pred
4,4,12454,277,13854,184.957459,184.957459,184.957459,59.854961,84.578667,595.745789,0.049936,BNBext,1g0p,pred
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5549,3,5904,22,1128,641.165635,641.165635,641.165635,85.821220,-37.485920,673.314392,1.000000,data,1g0p,data
5550,4,5506,30,1506,350.072526,350.072526,350.072526,159.181702,-43.122734,671.978577,1.000000,data,1g0p,data
5551,5,5203,89,4499,228.842836,228.842836,228.842836,210.058868,-79.360077,384.346039,1.000000,data,1g0p,data
5552,6,5783,129,6464,217.081220,217.081220,217.081220,100.292343,-54.400337,648.154968,1.000000,data,1g0p,data


In [41]:
WC_all_df

Unnamed: 0,data_or_pred,run,subrun,event,category,WC_file,match_isFC,kine_reco_Enu,reco_showerKE,nc_delta_score,...,WC_reco_num_other_tracks,reco_showervtxX,reco_showervtxY,reco_showervtxZ,reco_nuvtxX,reco_nuvtxY,reco_nuvtxZ,truth_vtxX,truth_vtxY,truth_vtxZ
3451699,pred,18796,5,282,ext,ext_run3,False,-1.000000,-1000.000000,-2.275953,...,0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,,,
5036313,pred,8703,8,419,badmatch/cosmic,nu_overlay_run2,True,-1.000000,-1000.000000,-2.275953,...,0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-59.453846,-34.715961,472.850922
6121384,pred,7014,1045,52263,badmatch/cosmic,nu_overlay_run1,True,-1.000000,-1000.000000,-2.275953,...,0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,194.251938,-48.578243,477.155792
536492,pred,8548,47,2367,NC 1 Pi0,NC_Pi0_run2,True,766.519531,199.750534,-7.368141,...,1,85.624710,27.544836,259.345215,85.624710,27.544836,259.345215,81.921501,27.339878,259.434875
1186274,pred,17722,156,7823,ext,ext_run3,True,250.381760,244.374405,-1.980812,...,0,197.010513,91.683258,257.083069,197.010513,91.683258,257.083069,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5489799,pred,10547,71,3562,outFV,nu_overlay_run2,False,-1.000000,-1000.000000,-2.275953,...,0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,156.414734,0.328779,-50.261715
771838,pred,8958,63,3198,NC other,nu_overlay_run2,True,560.540222,-1000.000000,-4.377742,...,1,-1.000000,-1.000000,-1.000000,26.254660,24.624323,677.363892,15.119493,48.429062,611.396301
2530376,pred,11637,62,3122,ext,ext_run2,False,-1.000000,-1000.000000,-2.275953,...,0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,,,
5372541,pred,7014,1553,77668,dirt,dirt_run1,False,-1.000000,-1000.000000,-2.275953,...,0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,,,


In [42]:
merged_gLEE_WC_comparison_df = WC_all_df.merge(gLEE_all_df, how="outer", on=["data_or_pred", "run", "subrun", "event"])

In [43]:
merged_gLEE_WC_comparison_df.to_pickle("merged_gLEE_WC_comparison_df_v3_mar_2022.pkl")
print("saved!")

saved!


In [44]:
# older version was 7959310 long

merged_gLEE_WC_comparison_df

Unnamed: 0,data_or_pred,run,subrun,event,category,WC_file,match_isFC,kine_reco_Enu,reco_showerKE,nc_delta_score,...,entry,reco_shower_energy_plane0,reco_shower_energy_plane1,reco_shower_energy_plane2,reco_vertex_x,reco_vertex_y,reco_vertex_z,simple_pot_weight,gLEE_file,gLEE_selection
0,pred,18796,5,282,ext,ext_run3,False,-1.000000,-1000.000000,-2.275953,...,,,,,,,,,,
1,pred,8703,8,419,badmatch/cosmic,nu_overlay_run2,True,-1.000000,-1000.000000,-2.275953,...,,,,,,,,,,
2,pred,7014,1045,52263,badmatch/cosmic,nu_overlay_run1,True,-1.000000,-1000.000000,-2.275953,...,,,,,,,,,,
3,pred,8548,47,2367,NC 1 Pi0,NC_Pi0_run2,True,766.519531,199.750534,-7.368141,...,,,,,,,,,,
4,pred,17722,156,7823,ext,ext_run3,True,250.381760,244.374405,-1.980812,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8171670,pred,17500,60,3007,,,,,,,...,175.0,226.104787,226.104787,226.104787,214.878510,-59.437653,116.877907,0.012385,NCPi0NotCoh,1g1p
8171671,pred,15709,25,1299,,,,,,,...,176.0,211.188389,211.188389,211.188389,18.133066,67.320091,508.359985,0.012385,NCPi0NotCoh,1g1p
8171672,pred,17400,130,6521,,,,,,,...,1.0,242.537306,242.537306,242.537306,29.714933,105.844437,935.194458,0.002387,Nue,1g1p
8171673,pred,18923,396,19821,,,,,,,...,3.0,165.797460,165.797460,165.797460,216.853500,-52.203907,175.701340,0.002724,Nue,1g1p


In [45]:
merged_gLEE_WC_comparison_df.query("run == 5762 and subrun == 114 and event == 5732")

Unnamed: 0,data_or_pred,run,subrun,event,category,WC_file,match_isFC,kine_reco_Enu,reco_showerKE,nc_delta_score,...,entry,reco_shower_energy_plane0,reco_shower_energy_plane1,reco_shower_energy_plane2,reco_vertex_x,reco_vertex_y,reco_vertex_z,simple_pot_weight,gLEE_file,gLEE_selection
3272712,data,5762,114,5732,data,open_data_run1,True,776.491272,424.827087,-6.11451,...,0.0,143.989547,143.989547,143.989547,113.383087,60.917236,541.001648,1.0,data,1g1p


In [58]:
either_df = merged_gLEE_WC_comparison_df.query("data_or_pred=='data' and (gLEE_selection=='1g0p' or gLEE_selection=='1g1p' or (match_isFC==1 and kine_reco_Enu>=0 and reco_showerKE>0 and nc_delta_score>2.61))")[["WC_file", "run", "subrun", "event", "match_isFC", "kine_reco_Enu", "reco_showerKE", "nc_delta_score", "WC_reco_num_protons", "WC_reco_num_other_tracks", "gLEE_selection"]]
either_df

Unnamed: 0,WC_file,run,subrun,event,match_isFC,kine_reco_Enu,reco_showerKE,nc_delta_score,WC_reco_num_protons,WC_reco_num_other_tracks,gLEE_selection
113488,open_data_run1,5203,89,4499,True,470.343384,223.639297,-3.658758,0.0,0.0,1g0p
400803,open_data_run1,5187,188,9430,True,270.433777,205.385468,-1.764994,0.0,0.0,1g0p
670651,open_data_run1,5767,167,8400,True,467.487122,214.691086,5.760055,0.0,1.0,
717542,open_data_run1,5385,43,2160,True,270.335663,251.09169,3.056633,0.0,0.0,
1433159,open_data_run1,5598,53,2663,True,338.355682,251.08075,6.017443,0.0,0.0,
1528420,open_data_run3,14822,15,781,True,395.404633,153.073013,3.52396,0.0,1.0,
2519916,open_data_run1,5598,36,1814,True,530.736145,361.00827,3.251768,1.0,0.0,
3226141,open_data_run1,5758,82,4118,True,431.071136,198.588287,2.86587,0.0,1.0,
3272712,open_data_run1,5762,114,5732,True,776.491272,424.827087,-6.11451,0.0,0.0,1g1p
3538191,open_data_run3,15249,94,4734,True,418.515961,191.70488,4.028068,1.0,0.0,


In [74]:
# run period, run, subrun, event, WC FC, WC generic, WC selected, gLEE selected
# WC and gLEE selected are in the for 1g_p_pi, or - if it's not selected

l = ""
for i, row in either_df.sort_values(by=["run"]).iterrows():
        
    if row["WC_file"] == "open_data_run1":
        l += "1 & "
    else:
        l += "3 & "
        
        
    l += f"{row['run']} & {row['subrun']} & {row['event']} & "
    
    if row["match_isFC"]:
        l += "Y & "
    else:
        l += "N & "
        
    if row["kine_reco_Enu"] > 0:
        l += "Y & "
    else:
        l += "N & "
        
    if row["reco_showerKE"] > 0 and row["nc_delta_score"] > 2.61:
        l += f"1g{int(row['WC_reco_num_protons'])}p{int(row['WC_reco_num_other_tracks'])}pi & "
    else:
        l += "- & "
        
    if row["gLEE_selection"] == row["gLEE_selection"]:
        l += f"{row['gLEE_selection']}0pi \\\\"
    else:
        l += "- \\\\"
        
    l += "\n\\hline\n"
        
print(l)

1 & 5187 & 188 & 9430 & Y & Y & - & 1g0p0pi \\
\hline
1 & 5203 & 89 & 4499 & Y & Y & - & 1g0p0pi \\
\hline
1 & 5203 & 77 & 3888 & Y & Y & 1g0p1pi & - \\
\hline
1 & 5271 & 11 & 598 & Y & Y & 1g0p0pi & - \\
\hline
1 & 5281 & 54 & 2707 & Y & N & - & 1g0p0pi \\
\hline
1 & 5326 & 55 & 2788 & Y & Y & 1g0p0pi & - \\
\hline
1 & 5385 & 23 & 1171 & Y & Y & 1g1p0pi & - \\
\hline
1 & 5385 & 43 & 2160 & Y & Y & 1g0p0pi & - \\
\hline
1 & 5418 & 72 & 3621 & Y & Y & 1g0p0pi & - \\
\hline
1 & 5506 & 30 & 1506 & Y & Y & 1g0p0pi & 1g0p0pi \\
\hline
1 & 5519 & 99 & 5000 & Y & Y & 1g0p0pi & - \\
\hline
1 & 5598 & 36 & 1814 & Y & Y & 1g1p0pi & - \\
\hline
1 & 5598 & 53 & 2663 & Y & Y & 1g0p0pi & - \\
\hline
1 & 5617 & 37 & 1872 & Y & Y & 1g0p0pi & - \\
\hline
1 & 5694 & 8 & 450 & N & N & - & 1g0p0pi \\
\hline
1 & 5705 & 57 & 2856 & Y & Y & 1g0p0pi & - \\
\hline
1 & 5729 & 85 & 4283 & Y & Y & 1g0p0pi & - \\
\hline
1 & 5758 & 82 & 4118 & Y & Y & 1g0p1pi & - \\
\hline
1 & 5762 & 114 & 5732 & Y & Y & - & 1g1p0p

In [None]:
#merged_gLEE_WC_comparison_df = pd.read_pickle("merged_gLEE_WC_comparison_df.pkl")
#merged_gLEE_WC_comparison_df(output)

In [None]:
# next steps:


# redo the saving and exporting whenever Mark gets me the right files







In [None]:
"""f_1g1p_data = uproot.open("gLEE_files/sbnfit_files/sbnfit_1g1pMar2020_v4_stage_6_Data5e19.root")["singlephoton"]

print(f_1g1p_data.keys())
#f_1g1p_NCPi0NotCoh["vertex_tree"].show()
#f_1g1p_NCPi0NotCoh["simple_tree"].show()

# SEEMS TO BE EMPTY??????"""

"""f_1g1p_data_vertex = f_1g1p_data["vertex_tree"].pandas.df(gLEE_vars, flatten=False)
f_1g1p_data_simple = f_1g1p_data["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False)

df_1g1p_data = pd.concat([f_1g1p_data_vertex, f_1g1p_data_simple], axis=1, sort=False)

del f_1g1p_data
del f_1g1p_data_vertex
del f_1g1p_data_simple

df_1g1p_data["gLEE_file"] = "data"

df_1g1p_data"""

In [None]:
"""f_1g0p_data = uproot.open("gLEE_files/sbnfit_files/sbnfit_1g0pMar2020_stage_4_Data5e19.root")["singlephoton"]

print(f_1g0p_data.keys())
#f_1g1p_NCPi0NotCoh["vertex_tree"].show()
#f_1g1p_NCPi0NotCoh["simple_tree"].show()

# SEEMS TO BE EMPTY??????"""

"""f_1g1p_data_vertex = f_1g1p_data["vertex_tree"].pandas.df(gLEE_vars, flatten=False)
f_1g1p_data_simple = f_1g1p_data["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False)

df_1g1p_data = pd.concat([f_1g1p_data_vertex, f_1g1p_data_simple], axis=1, sort=False)

del f_1g1p_data
del f_1g1p_data_vertex
del f_1g1p_data_simple

df_1g1p_data["gLEE_file"] = "data"

df_1g1p_data"""

In [None]:
"""duplicate_index_list = []

prev_set = set()
#prev_dic = {}
#prev_cat_dic = {}
#prev_en_dic = {}
for i in tqdm(range(WC_all_df.shape[0])):
    row = WC_all_df.iloc[i]
    #if row["kine_reco_Enu"] < 0: # only look for duplicates of generic selected events
    #    continue
    rse_num = row["run"] * 1000000000000 + row["subrun"] * 1000000 + row["event"]
    if rse_num in prev_set:
        duplicate_index_list.append(i)
        #print("duplicate!")
        #print(row["run"], row["subrun"], row["event"])
        #print(row["WC_file"], prev_dic[rse_num])
        #print(row["category"], prev_cat_dic[rse_num])
        #print(row["kine_reco_Enu"], prev_en_dic[rse_num])
    else:
        #prev_dic[rse_num] = row["WC_file"]
        #prev_cat_dic[rse_num] = row["category"]
        #prev_en_dic[rse_num] = row["kine_reco_Enu"]
        prev_set.add(rse_num)

print(duplicate_index_list)"""

