In [1]:
#import uproot
import pandas as pd
import numpy as np
import uproot3 as uproot
from tqdm import tqdm

print(uproot.__version__)

3.14.4


In [2]:
glee_file_location = "/data1/hagaman/glee_files/SBNfit_files/"
wc_file_location = "/data1/hagaman/xin_files/processed_checkout_rootfiles/" # training events removed


In [3]:
# the variables we want to extract from the glee files
glee_vars = [
    "run_number",
    "subrun_number",
    "event_number",
    "reco_shower_energy_plane0",
    "reco_shower_energy_plane1",
    "reco_shower_energy_plane2",
    "reco_shower_energy_max",
    "reco_vertex_x",
    "reco_vertex_y",
    "reco_vertex_z",
]

# the variables we want to extract from the WC files
wc_vars = [
    "data_or_pred",
    "run",
    "subrun",
    "event",
    "category",
    "WC_file",
    "match_isFC",
    "kine_reco_Enu",
    "reco_showerKE",
    "nc_delta_score",
    "WC_reco_num_protons",
    "WC_reco_num_other_tracks",
    "reco_nuvtxX",
    "reco_nuvtxY",
    "reco_nuvtxZ",
    "truth_vtxX",
    "truth_vtxY",
    "truth_vtxZ",
    "net_weight",
]

# the variables we want to extract from the WC files
wc_data_vars = [
    "data_or_pred",
    "run",
    "subrun",
    "event",
    "WC_file",
    "match_isFC",
    "kine_reco_Enu",
    "reco_showerKE",
    "nc_delta_score",
    "WC_reco_num_protons",
    "WC_reco_num_other_tracks",
    "reco_nuvtxX",
    "reco_nuvtxY",
    "reco_nuvtxZ",
]



In [4]:
# loading glee root files

glee_1g0p_location = glee_file_location + "1g0p/"
f_glee_1g0p_NCDeltaRadOverlaySM = uproot.open(glee_1g0p_location + "sbnfit_1g0pMar2020_stage_4_NCDeltaRadOverlaySM.root")["singlephoton"]
f_glee_1g0p_NCPi0Coh = uproot.open(glee_1g0p_location + "sbnfit_1g0pMar2020_stage_4_NCPi0Coh.root")["singlephoton"]
f_glee_1g0p_NCPi0NotCoh = uproot.open(glee_1g0p_location + "sbnfit_1g0pMar2020_stage_4_NCPi0NotCoh.root")["singlephoton"]
f_glee_1g0p_CC1Pi0 = uproot.open(glee_1g0p_location + "sbnfit_1g0pMar2020_stage_4_CC1Pi0.root")["singlephoton"]
f_glee_1g0p_NueOverlays = uproot.open(glee_1g0p_location + "sbnfit_1g0pMar2020_stage_4_NueOverlays.root")["singlephoton"]
f_glee_1g0p_OTPCExtra = uproot.open(glee_1g0p_location + "sbnfit_1g0pMar2020_stage_4_OTPCExtra.root")["singlephoton"]
f_glee_1g0p_BNBOtherExtra = uproot.open(glee_1g0p_location + "sbnfit_1g0pMar2020_stage_4_BNBOtherExtra.root")["singlephoton"]
f_glee_1g0p_Dirt = uproot.open(glee_1g0p_location + "sbnfit_1g0pMar2020_stage_4_Dirt.root")["singlephoton"]
f_glee_1g0p_BNBext = uproot.open(glee_1g0p_location + "sbnfit_1g0pMar2020_stage_4_BNBext.root")["singlephoton"]
f_glee_1g0p_data = uproot.open(glee_1g0p_location + "sbnfit_1g0pMar2020_RealRedoLiveUnblinding_stage_4_FinalSelection1g0p.root")["singlephoton"]

glee_1g1p_location = glee_file_location + "1g1p/"
f_glee_1g1p_NCDeltaRadOverlaySM = uproot.open(glee_1g1p_location + "sbnfit_1g1pMar2020_v4_stage_6_NCDeltaRadOverlaySM.root")["singlephoton"]
f_glee_1g1p_NCPi0Coh = uproot.open(glee_1g1p_location + "sbnfit_1g1pMar2020_v4_stage_6_NCPi0Coh.root")["singlephoton"]
f_glee_1g1p_NCPi0NotCoh = uproot.open(glee_1g1p_location + "sbnfit_1g1pMar2020_v4_stage_6_NCPi0NotCoh.root")["singlephoton"]
f_glee_1g1p_CC1Pi0 = uproot.open(glee_1g1p_location + "sbnfit_1g1pMar2020_v4_stage_6_CC1Pi0.root")["singlephoton"]
f_glee_1g1p_NueOverlays = uproot.open(glee_1g1p_location + "sbnfit_1g1pMar2020_v4_stage_6_NueOverlays.root")["singlephoton"]
f_glee_1g1p_OTPCExtra = uproot.open(glee_1g1p_location + "sbnfit_1g1pMar2020_v4_stage_6_OTPCExtra.root")["singlephoton"]
f_glee_1g1p_BNBOtherExtra = uproot.open(glee_1g1p_location + "sbnfit_1g1pMar2020_v4_stage_6_BNBOtherExtra.root")["singlephoton"]
f_glee_1g1p_Dirt = uproot.open(glee_1g1p_location + "sbnfit_1g1pMar2020_v4_stage_6_Dirt.root")["singlephoton"]
f_glee_1g1p_BNBext = uproot.open(glee_1g1p_location + "sbnfit_1g1pMar2020_v4_stage_6_BNBext.root")["singlephoton"]
f_glee_1g1p_data = uproot.open(glee_1g1p_location + "sbnfit_1g1pMar2020_v4_RealRedoLiveUnblinding_stage_6_FinalSelection1g1p.root")["singlephoton"]


In [5]:
# putting information from the glee root files into pandas dataframes


# 1g0p pred files

glee_1g0p_NCDeltaRadOverlaySM_df = pd.concat([
    f_glee_1g0p_NCDeltaRadOverlaySM["vertex_tree"].pandas.df(glee_vars, flatten=False),
    f_glee_1g0p_NCDeltaRadOverlaySM["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False),
    ], axis=1, sort=False)
del f_glee_1g0p_NCDeltaRadOverlaySM
glee_1g0p_NCDeltaRadOverlaySM_df["glee_file"] = "NCDeltaRadOverlaySM"
glee_1g0p_NCDeltaRadOverlaySM_df["glee_selection"] = "1g0p"

glee_1g0p_NCPi0Coh_df = pd.concat([
    f_glee_1g0p_NCPi0Coh["vertex_tree"].pandas.df(glee_vars, flatten=False),
    f_glee_1g0p_NCPi0Coh["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False),
    ], axis=1, sort=False)
del f_glee_1g0p_NCPi0Coh
glee_1g0p_NCPi0Coh_df["glee_file"] = "NCPi0Coh"
glee_1g0p_NCPi0Coh_df["glee_selection"] = "1g0p"

glee_1g0p_NCPi0NotCoh_df = pd.concat([
    f_glee_1g0p_NCPi0NotCoh["vertex_tree"].pandas.df(glee_vars, flatten=False),
    f_glee_1g0p_NCPi0NotCoh["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False),
    ], axis=1, sort=False)
del f_glee_1g0p_NCPi0NotCoh
glee_1g0p_NCPi0NotCoh_df["glee_file"] = "NCPi0NotCoh"
glee_1g0p_NCPi0NotCoh_df["glee_selection"] = "1g0p"

glee_1g0p_CC1Pi0_df = pd.concat([
    f_glee_1g0p_CC1Pi0["vertex_tree"].pandas.df(glee_vars, flatten=False),
    f_glee_1g0p_CC1Pi0["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False),
    ], axis=1, sort=False)
del f_glee_1g0p_CC1Pi0
glee_1g0p_CC1Pi0_df["glee_file"] = "CC1Pi0"
glee_1g0p_CC1Pi0_df["glee_selection"] = "1g0p"

glee_1g0p_NueOverlays_df = pd.concat([
    f_glee_1g0p_NueOverlays["vertex_tree"].pandas.df(glee_vars, flatten=False),
    f_glee_1g0p_NueOverlays["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False),
    ], axis=1, sort=False)
del f_glee_1g0p_NueOverlays
glee_1g0p_NueOverlays_df["glee_file"] = "NueOverlays"
glee_1g0p_NueOverlays_df["glee_selection"] = "1g0p"

glee_1g0p_OTPCExtra_df = pd.concat([
    f_glee_1g0p_OTPCExtra["vertex_tree"].pandas.df(glee_vars, flatten=False),
    f_glee_1g0p_OTPCExtra["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False),
    ], axis=1, sort=False)
del f_glee_1g0p_OTPCExtra
glee_1g0p_OTPCExtra_df["glee_file"] = "OTPCExtra"
glee_1g0p_OTPCExtra_df["glee_selection"] = "1g0p"

glee_1g0p_BNBOtherExtra_df = pd.concat([
    f_glee_1g0p_BNBOtherExtra["vertex_tree"].pandas.df(glee_vars, flatten=False),
    f_glee_1g0p_BNBOtherExtra["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False),
    ], axis=1, sort=False)
del f_glee_1g0p_BNBOtherExtra
glee_1g0p_BNBOtherExtra_df["glee_file"] = "BNBOtherExtra"
glee_1g0p_BNBOtherExtra_df["glee_selection"] = "1g0p"

glee_1g0p_Dirt_df = pd.concat([
    f_glee_1g0p_Dirt["vertex_tree"].pandas.df(glee_vars, flatten=False),
    f_glee_1g0p_Dirt["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False),
    ], axis=1, sort=False)
del f_glee_1g0p_Dirt
glee_1g0p_Dirt_df["glee_file"] = "Dirt"
glee_1g0p_Dirt_df["glee_selection"] = "1g0p"

glee_1g0p_BNBext_df = pd.concat([
    f_glee_1g0p_BNBext["vertex_tree"].pandas.df(glee_vars, flatten=False),
    f_glee_1g0p_BNBext["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False),
    ], axis=1, sort=False)
del f_glee_1g0p_BNBext
glee_1g0p_BNBext_df["glee_file"] = "BNBext"
glee_1g0p_BNBext_df["glee_selection"] = "1g0p"


# 1g1p pred files

glee_1g1p_NCDeltaRadOverlaySM_df = pd.concat([
    f_glee_1g1p_NCDeltaRadOverlaySM["vertex_tree"].pandas.df(glee_vars, flatten=False),
    f_glee_1g1p_NCDeltaRadOverlaySM["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False),
    ], axis=1, sort=False)
del f_glee_1g1p_NCDeltaRadOverlaySM
glee_1g1p_NCDeltaRadOverlaySM_df["glee_file"] = "NCDeltaRadOverlaySM"
glee_1g1p_NCDeltaRadOverlaySM_df["glee_selection"] = "1g1p"

glee_1g1p_NCPi0Coh_df = pd.concat([
    f_glee_1g1p_NCPi0Coh["vertex_tree"].pandas.df(glee_vars, flatten=False),
    f_glee_1g1p_NCPi0Coh["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False),
    ], axis=1, sort=False)
del f_glee_1g1p_NCPi0Coh
glee_1g1p_NCPi0Coh_df["glee_file"] = "NCPi0Coh"
glee_1g1p_NCPi0Coh_df["glee_selection"] = "1g1p"

glee_1g1p_NCPi0NotCoh_df = pd.concat([
    f_glee_1g1p_NCPi0NotCoh["vertex_tree"].pandas.df(glee_vars, flatten=False),
    f_glee_1g1p_NCPi0NotCoh["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False),
    ], axis=1, sort=False)
del f_glee_1g1p_NCPi0NotCoh
glee_1g1p_NCPi0NotCoh_df["glee_file"] = "NCPi0NotCoh"
glee_1g1p_NCPi0NotCoh_df["glee_selection"] = "1g1p"

glee_1g1p_CC1Pi0_df = pd.concat([
    f_glee_1g1p_CC1Pi0["vertex_tree"].pandas.df(glee_vars, flatten=False),
    f_glee_1g1p_CC1Pi0["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False),
    ], axis=1, sort=False)
del f_glee_1g1p_CC1Pi0
glee_1g1p_CC1Pi0_df["glee_file"] = "CC1Pi0"
glee_1g1p_CC1Pi0_df["glee_selection"] = "1g1p"

glee_1g1p_NueOverlays_df = pd.concat([
    f_glee_1g1p_NueOverlays["vertex_tree"].pandas.df(glee_vars, flatten=False),
    f_glee_1g1p_NueOverlays["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False),
    ], axis=1, sort=False)
del f_glee_1g1p_NueOverlays
glee_1g1p_NueOverlays_df["glee_file"] = "NueOverlays"
glee_1g1p_NueOverlays_df["glee_selection"] = "1g1p"

glee_1g1p_OTPCExtra_df = pd.concat([
    f_glee_1g1p_OTPCExtra["vertex_tree"].pandas.df(glee_vars, flatten=False),
    f_glee_1g1p_OTPCExtra["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False),
    ], axis=1, sort=False)
del f_glee_1g1p_OTPCExtra
glee_1g1p_OTPCExtra_df["glee_file"] = "OTPCExtra"
glee_1g1p_OTPCExtra_df["glee_selection"] = "1g1p"

glee_1g1p_BNBOtherExtra_df = pd.concat([
    f_glee_1g1p_BNBOtherExtra["vertex_tree"].pandas.df(glee_vars, flatten=False),
    f_glee_1g1p_BNBOtherExtra["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False),
    ], axis=1, sort=False)
del f_glee_1g1p_BNBOtherExtra
glee_1g1p_BNBOtherExtra_df["glee_file"] = "BNBOtherExtra"
glee_1g1p_BNBOtherExtra_df["glee_selection"] = "1g1p"

glee_1g1p_Dirt_df = pd.concat([
    f_glee_1g1p_Dirt["vertex_tree"].pandas.df(glee_vars, flatten=False),
    f_glee_1g1p_Dirt["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False),
    ], axis=1, sort=False)
del f_glee_1g1p_Dirt
glee_1g1p_Dirt_df["glee_file"] = "Dirt"
glee_1g1p_Dirt_df["glee_selection"] = "1g1p"

glee_1g1p_BNBext_df = pd.concat([
    f_glee_1g1p_BNBext["vertex_tree"].pandas.df(glee_vars, flatten=False),
    f_glee_1g1p_BNBext["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False),
    ], axis=1, sort=False)
del f_glee_1g1p_BNBext
glee_1g1p_BNBext_df["glee_file"] = "BNBext"
glee_1g1p_BNBext_df["glee_selection"] = "1g1p"




# data files

glee_1g0p_data_df = pd.concat([
    f_glee_1g0p_data["vertex_tree"].pandas.df(glee_vars, flatten=False),
    f_glee_1g0p_data["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False),
    ], axis=1, sort=False)
del f_glee_1g0p_data
glee_1g0p_data_df["glee_file"] = "data"
glee_1g0p_data_df["glee_selection"] = "1g0p"

glee_1g1p_data_df = pd.concat([
    f_glee_1g1p_data["vertex_tree"].pandas.df(glee_vars, flatten=False),
    f_glee_1g1p_data["simple_tree"].pandas.df(["simple_pot_weight"], flatten=False),
    ], axis=1, sort=False)
del f_glee_1g1p_data
glee_1g1p_data_df["glee_file"] = "data"
glee_1g1p_data_df["glee_selection"] = "1g1p"



In [6]:
glee_pred_df = pd.concat([
    glee_1g0p_NCDeltaRadOverlaySM_df,
    glee_1g0p_NCPi0Coh_df,
    glee_1g0p_NCPi0NotCoh_df,
    glee_1g0p_CC1Pi0_df,
    glee_1g0p_NueOverlays_df,
    glee_1g0p_OTPCExtra_df,
    glee_1g0p_BNBOtherExtra_df,
    glee_1g0p_Dirt_df,
    glee_1g0p_BNBext_df,
    glee_1g1p_NCDeltaRadOverlaySM_df,
    glee_1g1p_NCPi0Coh_df,
    glee_1g1p_NCPi0NotCoh_df,
    glee_1g1p_CC1Pi0_df,
    glee_1g1p_NueOverlays_df,
    glee_1g1p_OTPCExtra_df,
    glee_1g1p_BNBOtherExtra_df,
    glee_1g1p_Dirt_df,
    glee_1g1p_BNBext_df,
    ], sort=False)
glee_pred_df["data_or_pred"] = ["pred" for i in range(glee_pred_df.shape[0])]

glee_data_df = pd.concat([
    glee_1g0p_data_df,
    glee_1g1p_data_df,
    ], sort=False)
glee_data_df["data_or_pred"] = ["data" for i in range(glee_data_df.shape[0])]

glee_all_df = pd.concat([
        glee_pred_df,
        glee_data_df,
    ], sort=False)
glee_all_df.reset_index(inplace=True)

In [7]:
# changing one element lists to floats

reco_shower_energy_plane0_arr = []
reco_shower_energy_plane1_arr = []
reco_shower_energy_plane2_arr = []
reco_shower_energy_max_arr = []
reco_shower_energy_plane0_lists = glee_all_df["reco_shower_energy_plane0"].to_numpy()
reco_shower_energy_plane1_lists = glee_all_df["reco_shower_energy_plane1"].to_numpy()
reco_shower_energy_plane2_lists = glee_all_df["reco_shower_energy_plane2"].to_numpy()
reco_shower_energy_max_lists = glee_all_df["reco_shower_energy_max"].to_numpy()

for i in tqdm(range(glee_all_df.shape[0])):    
    reco_shower_energy_plane0_arr.append(reco_shower_energy_plane0_lists[i][0])
    reco_shower_energy_plane1_arr.append(reco_shower_energy_plane1_lists[i][0])
    reco_shower_energy_plane2_arr.append(reco_shower_energy_plane2_lists[i][0])
    reco_shower_energy_max_arr.append(reco_shower_energy_max_lists[i][0])

glee_all_df["reco_shower_energy_plane0"] = reco_shower_energy_plane0_arr
glee_all_df["reco_shower_energy_plane1"] = reco_shower_energy_plane1_arr
glee_all_df["reco_shower_energy_plane2"] = reco_shower_energy_plane2_arr
glee_all_df["reco_shower_energy_max"] = reco_shower_energy_max_arr


100%|██████████████████████████████████| 5716/5716 [00:00<00:00, 1395578.42it/s]


In [8]:
glee_all_df = glee_all_df.rename(columns={"run_number": "run", "subrun_number": "subrun", "event_number":"event"})

In [9]:
glee_all_df

Unnamed: 0,entry,run,subrun,event,reco_shower_energy_plane0,reco_shower_energy_plane1,reco_shower_energy_plane2,reco_shower_energy_max,reco_vertex_x,reco_vertex_y,reco_vertex_z,simple_pot_weight,glee_file,glee_selection,data_or_pred
0,0,6058,81,4079,288.261817,310.056054,309.773263,310.056054,167.029160,-61.018150,387.096191,0.000134,NCDeltaRadOverlaySM,1g0p,pred
1,1,6058,81,4100,118.919947,189.828308,220.959059,220.959059,103.585838,-76.903328,578.505981,0.000134,NCDeltaRadOverlaySM,1g0p,pred
2,2,6221,10,537,255.901477,291.085256,320.449278,320.449278,110.947716,28.835058,965.170654,0.000134,NCDeltaRadOverlaySM,1g0p,pred
3,3,6827,72,3605,223.807478,322.116360,418.597665,418.597665,127.180023,-49.800064,318.352081,0.000134,NCDeltaRadOverlaySM,1g0p,pred
4,4,6827,72,3635,220.771839,261.708669,346.037231,346.037231,218.050476,-41.677532,266.091003,0.000134,NCDeltaRadOverlaySM,1g0p,pred
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5711,11,15901,379,18996,86.929714,97.891710,95.735243,97.891710,188.837296,72.556511,936.094849,1.000000,data,1g1p,data
5712,12,14186,66,3336,18.964524,200.508582,174.618579,200.508582,200.712769,-18.161919,86.095505,1.000000,data,1g1p,data
5713,13,14736,179,8961,370.253497,240.394299,480.185831,480.185831,142.115280,-77.809120,923.357666,1.000000,data,1g1p,data
5714,14,15027,0,33,165.409022,185.473334,188.576298,188.576298,97.284668,-23.450846,523.709595,1.000000,data,1g1p,data


In [10]:
# check to see if any RSE numbers are shared between glee files, would be an issue when merging

prev_set = set()
for i, row in glee_all_df.iterrows():
    rse_num = row["run"] * 1000000000000 + row["subrun"] * 1000000 + row["event"]
    if rse_num in prev_set:
        print("duplicate!")
        print(row["run"], row["subrun"], row["event"], row["glee_file"])
    else:
        prev_set.add(rse_num)
        
"""# remove duplicate RSE nums, would be an issue when merging
glee_duplicate_row_list = glee_all_df[["run_number", "subrun_number", "event_number"]].duplicated().to_numpy()
glee_duplicate_indices = list(np.nonzero(glee_duplicate_row_list)[0])
print(glee_duplicate_indices)
glee_all_df.drop(glee_duplicate_indices, inplace=True)"""

'# remove duplicate RSE nums, would be an issue when merging\nglee_duplicate_row_list = glee_all_df[["run_number", "subrun_number", "event_number"]].duplicated().to_numpy()\nglee_duplicate_indices = list(np.nonzero(glee_duplicate_row_list)[0])\nprint(glee_duplicate_indices)\nglee_all_df.drop(glee_duplicate_indices, inplace=True)'

In [11]:
# variables to load from WC files

bdt_vars = [
    "nc_delta_score",
]

extra_variables = [
    "run",
    "subrun",
    "event",
    "nuvtx_diff",
    "showervtx_diff",
    "muonvtx_diff",
    "truth_isCC",
    "truth_vtxInside",
    "truth_nuPdg",
    "truth_nuEnergy",
    "truth_nuIntType",
    "truth_energyInside",
    "weight_spline",
    "weight_cv",
    "weight_lee",
    "event_type",
    "weight",
    "lowEweight"
]

kine_scalar_vars = [
    "kine_reco_add_energy",
    "kine_pio_mass",
    "kine_pio_flag",
    "kine_pio_vtx_dis",
    "kine_pio_energy_1",
    "kine_pio_theta_1",
    "kine_pio_phi_1",
    "kine_pio_dis_1",
    "kine_pio_energy_2",
    "kine_pio_theta_2",
    "kine_pio_phi_2",
    "kine_pio_dis_2",
    "kine_pio_angle"
]

kine_vector_vars = [
    "kine_energy_particle",
    "kine_energy_info",
    "kine_particle_type",
    "kine_energy_included",
]

eval_mc_variables = [
    "run",
    "subrun",
    "event",
    "flash_time",
    "weight_spline", # this and remaining only make sense for MC
    "weight_cv",
    "match_completeness_energy",
    "truth_nuEnergy",
    "truth_energyInside",
    "truth_electronInside",
    "truth_nuPdg",
    "truth_isCC",
    "truth_isFC",
    "truth_vtxInside",
    "truth_vtxX",
    "truth_vtxY",
    "truth_vtxZ",
    "truth_nuTime",
]

eval_data_variables = [
    "run",
    "subrun",
    "event",
    "flash_time",
]


pf_eval_mc_variables = [
    "truth_NprimPio",
    "truth_NCDelta",
    "nuvtx_diff",
    "showervtx_diff",
    "reco_showerKE",
    "truth_pio_energy_1",
    "truth_pio_energy_2",
    "reco_nuvtxX",
    "reco_nuvtxY",
    "reco_nuvtxZ",
    "reco_showervtxX",
    "reco_showervtxY",
    "reco_showervtxZ",
]

pf_eval_data_variables = [ # also use these for dirt
    "reco_showerKE",
    "reco_nuvtxX",
    "reco_nuvtxY",
    "reco_nuvtxZ",
    "reco_showervtxX",
    "reco_showervtxY",
    "reco_showervtxZ",
]

In [12]:
# loading WC NC Delta files

f_nc_delta_run1 = uproot.open(wc_file_location + "checkout_prodgenie_bnb_nc_delta_overlay_run1_PF.root")["wcpselection"]
#print(f_nc_delta_run1.keys())
#f_nc_delta_run1["T_KINEvars"].show()
f_nc_delta_run1_bdt = f_nc_delta_run1["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_nc_delta_run1_eval = f_nc_delta_run1["T_eval"].pandas.df(eval_mc_variables + ["match_isFC"], flatten=False)
f_nc_delta_run1_pfeval = f_nc_delta_run1["T_PFeval"].pandas.df(pf_eval_mc_variables, flatten=False)
f_nc_delta_run1_kine = f_nc_delta_run1["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_nc_delta_run1_pot = f_nc_delta_run1["T_pot"].pandas.df("pot_tor875good", flatten=False)
nc_delta_run1_file_POT = np.sum(f_nc_delta_run1_pot["pot_tor875good"].to_numpy())
nc_delta_run1_df = pd.concat([f_nc_delta_run1_bdt, f_nc_delta_run1_eval, f_nc_delta_run1_pfeval, f_nc_delta_run1_kine], axis=1, sort=False)
del f_nc_delta_run1
del f_nc_delta_run1_bdt
del f_nc_delta_run1_eval
del f_nc_delta_run1_pfeval
del f_nc_delta_run1_kine
del f_nc_delta_run1_pot
nc_delta_run1_df["isEXT"] = [0 for i in range(nc_delta_run1_df.shape[0])]
nc_delta_run1_df["isDirt"] = [0 for i in range(nc_delta_run1_df.shape[0])]
nc_delta_run1_df["WC_file"] = ["nc_delta_run1" for i in range(nc_delta_run1_df.shape[0])]
nc_delta_run1_df["run_num"] = [1 for i in range(nc_delta_run1_df.shape[0])]

print(nc_delta_run1_file_POT)
print(nc_delta_run1_df.shape[0])

f_nc_delta_run2 = uproot.open(wc_file_location + "checkout_prodgenie_bnb_nc_delta_overlay_run2_PF.root")["wcpselection"]
f_nc_delta_run2_bdt = f_nc_delta_run2["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_nc_delta_run2_eval = f_nc_delta_run2["T_eval"].pandas.df(eval_mc_variables + ["match_isFC"], flatten=False)
f_nc_delta_run2_pfeval = f_nc_delta_run2["T_PFeval"].pandas.df(pf_eval_mc_variables, flatten=False)
f_nc_delta_run2_kine = f_nc_delta_run2["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_nc_delta_run2_pot = f_nc_delta_run2["T_pot"].pandas.df("pot_tor875good", flatten=False)
nc_delta_run2_file_POT = np.sum(f_nc_delta_run2_pot["pot_tor875good"].to_numpy())
nc_delta_run2_df = pd.concat([f_nc_delta_run2_bdt, f_nc_delta_run2_eval, f_nc_delta_run2_pfeval, f_nc_delta_run2_kine], axis=1, sort=False)
del f_nc_delta_run2
del f_nc_delta_run2_bdt
del f_nc_delta_run2_eval
del f_nc_delta_run2_pfeval
del f_nc_delta_run2_kine
del f_nc_delta_run2_pot
nc_delta_run2_df["isEXT"] = [0 for i in range(nc_delta_run2_df.shape[0])]
nc_delta_run2_df["isDirt"] = [0 for i in range(nc_delta_run2_df.shape[0])]
nc_delta_run2_df["WC_file"] = ["nc_delta_run2" for i in range(nc_delta_run2_df.shape[0])]
nc_delta_run2_df["run_num"] = [2 for i in range(nc_delta_run2_df.shape[0])]

print(nc_delta_run2_file_POT)
print(nc_delta_run2_df.shape[0])

f_nc_delta_run3 = uproot.open(wc_file_location + "checkout_prodgenie_bnb_nc_delta_overlay_run3_PF.root")["wcpselection"]
f_nc_delta_run3_bdt = f_nc_delta_run3["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_nc_delta_run3_eval = f_nc_delta_run3["T_eval"].pandas.df(eval_mc_variables + ["match_isFC"], flatten=False)
f_nc_delta_run3_pfeval = f_nc_delta_run3["T_PFeval"].pandas.df(pf_eval_mc_variables, flatten=False)
f_nc_delta_run3_kine = f_nc_delta_run3["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_nc_delta_run3_pot = f_nc_delta_run3["T_pot"].pandas.df("pot_tor875good", flatten=False)
nc_delta_run3_file_POT = np.sum(f_nc_delta_run3_pot["pot_tor875good"].to_numpy())
nc_delta_run3_df = pd.concat([f_nc_delta_run3_bdt, f_nc_delta_run3_eval, f_nc_delta_run3_pfeval, f_nc_delta_run3_kine], axis=1, sort=False)
del f_nc_delta_run3
del f_nc_delta_run3_bdt
del f_nc_delta_run3_eval
del f_nc_delta_run3_pfeval
del f_nc_delta_run3_kine
del f_nc_delta_run3_pot
nc_delta_run3_df["isEXT"] = [0 for i in range(nc_delta_run3_df.shape[0])]
nc_delta_run3_df["isDirt"] = [0 for i in range(nc_delta_run3_df.shape[0])]
nc_delta_run3_df["WC_file"] = ["nc_delta_run3" for i in range(nc_delta_run3_df.shape[0])]
nc_delta_run3_df["run_num"] = [3 for i in range(nc_delta_run3_df.shape[0])]

print(nc_delta_run3_file_POT)
print(nc_delta_run3_df.shape[0])

nc_delta_df = pd.concat([nc_delta_run1_df, nc_delta_run2_df, nc_delta_run3_df], sort=False).query("truth_isCC==0 and truth_NCDelta==1")


2.1819002904629175e+23
47482
4.7398996193248585e+23
101199
5.14123404156446e+23
109843


In [13]:
# loading WC NC Pi0 files

f_ncpi0_run1 = uproot.open(wc_file_location + "checkout_prodgenie_nc_pi0_overlay_run1_PF.root")["wcpselection"]
f_ncpi0_run1_bdt = f_ncpi0_run1["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_ncpi0_run1_eval = f_ncpi0_run1["T_eval"].pandas.df(eval_mc_variables + ["match_isFC"], flatten=False)
f_ncpi0_run1_pfeval = f_ncpi0_run1["T_PFeval"].pandas.df(pf_eval_mc_variables, flatten=False)
f_ncpi0_run1_kine = f_ncpi0_run1["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_ncpi0_run1_pot = f_ncpi0_run1["T_pot"].pandas.df("pot_tor875good", flatten=False)
nc_pi0_run1_file_POT = np.sum(f_ncpi0_run1_pot["pot_tor875good"].to_numpy())
nc_pi0_run1_df = pd.concat([f_ncpi0_run1_bdt, f_ncpi0_run1_eval, f_ncpi0_run1_pfeval, f_ncpi0_run1_kine], axis=1, sort=False)
del f_ncpi0_run1
del f_ncpi0_run1_bdt
del f_ncpi0_run1_eval
del f_ncpi0_run1_pfeval
del f_ncpi0_run1_kine
del f_ncpi0_run1_pot
nc_pi0_run1_df["isEXT"] = [0 for i in range(nc_pi0_run1_df.shape[0])]
nc_pi0_run1_df["isDirt"] = [0 for i in range(nc_pi0_run1_df.shape[0])]
nc_pi0_run1_df["WC_file"] = ["NC_Pi0_run1" for i in range(nc_pi0_run1_df.shape[0])]
nc_pi0_run1_df["run_num"] = [1 for i in range(nc_pi0_run1_df.shape[0])]

f_ncpi0_run2 = uproot.open(wc_file_location + "checkout_prodgenie_nc_pi0_overlay_run2_PF.root")["wcpselection"]
f_ncpi0_run2_bdt = f_ncpi0_run2["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_ncpi0_run2_eval = f_ncpi0_run2["T_eval"].pandas.df(eval_mc_variables + ["match_isFC"], flatten=False)
f_ncpi0_run2_pfeval = f_ncpi0_run2["T_PFeval"].pandas.df(pf_eval_mc_variables, flatten=False)
f_ncpi0_run2_kine = f_ncpi0_run2["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_ncpi0_run2_pot = f_ncpi0_run2["T_pot"].pandas.df("pot_tor875good", flatten=False)
nc_pi0_run2_file_POT = np.sum(f_ncpi0_run2_pot["pot_tor875good"].to_numpy())
nc_pi0_run2_df = pd.concat([f_ncpi0_run2_bdt, f_ncpi0_run2_eval, f_ncpi0_run2_pfeval, f_ncpi0_run2_kine], axis=1, sort=False)
del f_ncpi0_run2
del f_ncpi0_run2_bdt
del f_ncpi0_run2_eval
del f_ncpi0_run2_pfeval
del f_ncpi0_run2_kine
del f_ncpi0_run2_pot
nc_pi0_run2_df["isEXT"] = [0 for i in range(nc_pi0_run2_df.shape[0])]
nc_pi0_run2_df["isDirt"] = [0 for i in range(nc_pi0_run2_df.shape[0])]
nc_pi0_run2_df["WC_file"] = ["NC_Pi0_run2" for i in range(nc_pi0_run2_df.shape[0])]
nc_pi0_run2_df["run_num"] = [2 for i in range(nc_pi0_run2_df.shape[0])]


f_ncpi0_run3 = uproot.open(wc_file_location + "checkout_prodgenie_nc_pi0_overlay_run3_PF.root")["wcpselection"]
f_ncpi0_run3_bdt = f_ncpi0_run3["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_ncpi0_run3_eval = f_ncpi0_run3["T_eval"].pandas.df(eval_mc_variables + ["match_isFC"], flatten=False)
f_ncpi0_run3_pfeval = f_ncpi0_run3["T_PFeval"].pandas.df(pf_eval_mc_variables, flatten=False)
f_ncpi0_run3_kine = f_ncpi0_run3["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_ncpi0_run3_pot = f_ncpi0_run3["T_pot"].pandas.df("pot_tor875good", flatten=False)
nc_pi0_run3_file_POT = np.sum(f_ncpi0_run3_pot["pot_tor875good"].to_numpy())
nc_pi0_run3_df = pd.concat([f_ncpi0_run3_bdt, f_ncpi0_run3_eval, f_ncpi0_run3_pfeval, f_ncpi0_run3_kine], axis=1, sort=False)
del f_ncpi0_run3
del f_ncpi0_run3_bdt
del f_ncpi0_run3_eval
del f_ncpi0_run3_pfeval
del f_ncpi0_run3_kine
del f_ncpi0_run3_pot
nc_pi0_run3_df["isEXT"] = [0 for i in range(nc_pi0_run3_df.shape[0])]
nc_pi0_run3_df["isDirt"] = [0 for i in range(nc_pi0_run3_df.shape[0])]
nc_pi0_run3_df["WC_file"] = ["NC_Pi0_run3" for i in range(nc_pi0_run3_df.shape[0])]
nc_pi0_run3_df["run_num"] = [3 for i in range(nc_pi0_run3_df.shape[0])]

print(nc_pi0_run1_file_POT)
print(nc_pi0_run1_df.shape[0])

print(nc_pi0_run3_file_POT)
print(nc_pi0_run3_df.shape[0])

nc_pi0_df = pd.concat([nc_pi0_run1_df, nc_pi0_run2_df, nc_pi0_run3_df], sort=False).query("truth_isCC==0 and truth_NprimPio>0 and not (truth_NCDelta==1)")

del nc_pi0_run1_df
del nc_pi0_run2_df
del nc_pi0_run3_df

1.224676151063279e+21
48005
1.2277865498371458e+21
47066


In [14]:
# loading WC nue overlay files

f_intrinsic_nue_run1 = uproot.open(wc_file_location + "checkout_prodgenie_bnb_intrinsic_nue_overlay_run1_PF.root")["wcpselection"]
f_intrinsic_nue_run1_bdt = f_intrinsic_nue_run1["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_intrinsic_nue_run1_eval = f_intrinsic_nue_run1["T_eval"].pandas.df(eval_mc_variables + ["match_isFC"], flatten=False)
f_intrinsic_nue_run1_pfeval = f_intrinsic_nue_run1["T_PFeval"].pandas.df(pf_eval_mc_variables, flatten=False)
f_intrinsic_nue_run1_kine = f_intrinsic_nue_run1["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_intrinsic_nue_run1_pot = f_intrinsic_nue_run1["T_pot"].pandas.df("pot_tor875good", flatten=False)
intrinsic_nue_run1_file_POT = np.sum(f_intrinsic_nue_run1_pot["pot_tor875good"].to_numpy())
intrinsic_nue_run1_df = pd.concat([f_intrinsic_nue_run1_bdt, f_intrinsic_nue_run1_eval, f_intrinsic_nue_run1_pfeval, f_intrinsic_nue_run1_kine], axis=1, sort=False)
del f_intrinsic_nue_run1
del f_intrinsic_nue_run1_bdt
del f_intrinsic_nue_run1_eval
del f_intrinsic_nue_run1_pfeval
del f_intrinsic_nue_run1_kine
del f_intrinsic_nue_run1_pot
intrinsic_nue_run1_df["isEXT"] = [0 for i in range(intrinsic_nue_run1_df.shape[0])]
intrinsic_nue_run1_df["isDirt"] = [0 for i in range(intrinsic_nue_run1_df.shape[0])]
intrinsic_nue_run1_df["WC_file"] = ["intrinsic_nue_run1" for i in range(intrinsic_nue_run1_df.shape[0])]
intrinsic_nue_run1_df["run_num"] = [1 for i in range(intrinsic_nue_run1_df.shape[0])]

print(intrinsic_nue_run1_file_POT)
print(intrinsic_nue_run1_df.shape[0])

f_intrinsic_nue_run2 = uproot.open(wc_file_location + "checkout_prodgenie_bnb_intrinsic_nue_overlay_run2_PF.root")["wcpselection"]
f_intrinsic_nue_run2_bdt = f_intrinsic_nue_run2["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_intrinsic_nue_run2_eval = f_intrinsic_nue_run2["T_eval"].pandas.df(eval_mc_variables + ["match_isFC"], flatten=False)
f_intrinsic_nue_run2_pfeval = f_intrinsic_nue_run2["T_PFeval"].pandas.df(pf_eval_mc_variables, flatten=False)
f_intrinsic_nue_run2_kine = f_intrinsic_nue_run2["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_intrinsic_nue_run2_pot = f_intrinsic_nue_run2["T_pot"].pandas.df("pot_tor875good", flatten=False)
intrinsic_nue_run2_file_POT = np.sum(f_intrinsic_nue_run2_pot["pot_tor875good"].to_numpy())
intrinsic_nue_run2_df = pd.concat([f_intrinsic_nue_run2_bdt, f_intrinsic_nue_run2_eval, f_intrinsic_nue_run2_pfeval, f_intrinsic_nue_run2_kine], axis=1, sort=False)
del f_intrinsic_nue_run2
del f_intrinsic_nue_run2_bdt
del f_intrinsic_nue_run2_eval
del f_intrinsic_nue_run2_pfeval
del f_intrinsic_nue_run2_kine
del f_intrinsic_nue_run2_pot
intrinsic_nue_run2_df["isEXT"] = [0 for i in range(intrinsic_nue_run2_df.shape[0])]
intrinsic_nue_run2_df["isDirt"] = [0 for i in range(intrinsic_nue_run2_df.shape[0])]
intrinsic_nue_run2_df["WC_file"] = ["intrinsic_nue_run2" for i in range(intrinsic_nue_run2_df.shape[0])]
intrinsic_nue_run2_df["run_num"] = [2 for i in range(intrinsic_nue_run2_df.shape[0])]

print(intrinsic_nue_run2_file_POT)
print(intrinsic_nue_run2_df.shape[0])

f_intrinsic_nue_run3 = uproot.open(wc_file_location + "checkout_prodgenie_bnb_intrinsic_nue_overlay_run3_PF.root")["wcpselection"]
f_intrinsic_nue_run3_bdt = f_intrinsic_nue_run3["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_intrinsic_nue_run3_eval = f_intrinsic_nue_run3["T_eval"].pandas.df(eval_mc_variables + ["match_isFC"], flatten=False)
f_intrinsic_nue_run3_pfeval = f_intrinsic_nue_run3["T_PFeval"].pandas.df(pf_eval_mc_variables, flatten=False)
f_intrinsic_nue_run3_kine = f_intrinsic_nue_run3["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_intrinsic_nue_run3_pot = f_intrinsic_nue_run3["T_pot"].pandas.df("pot_tor875good", flatten=False)
intrinsic_nue_run3_file_POT = np.sum(f_intrinsic_nue_run3_pot["pot_tor875good"].to_numpy())
intrinsic_nue_run3_df = pd.concat([f_intrinsic_nue_run3_bdt, f_intrinsic_nue_run3_eval, f_intrinsic_nue_run3_pfeval, f_intrinsic_nue_run3_kine], axis=1, sort=False)
del f_intrinsic_nue_run3
del f_intrinsic_nue_run3_bdt
del f_intrinsic_nue_run3_eval
del f_intrinsic_nue_run3_pfeval
del f_intrinsic_nue_run3_kine
del f_intrinsic_nue_run3_pot
intrinsic_nue_run3_df["isEXT"] = [0 for i in range(intrinsic_nue_run3_df.shape[0])]
intrinsic_nue_run3_df["isDirt"] = [0 for i in range(intrinsic_nue_run3_df.shape[0])]
intrinsic_nue_run3_df["WC_file"] = ["intrinsic_nue_run3" for i in range(intrinsic_nue_run3_df.shape[0])]
intrinsic_nue_run3_df["run_num"] = [3 for i in range(intrinsic_nue_run3_df.shape[0])]

print(intrinsic_nue_run3_file_POT)
print(intrinsic_nue_run3_df.shape[0])

intrinsic_nue_df = pd.concat([intrinsic_nue_run1_df, intrinsic_nue_run2_df, intrinsic_nue_run3_df], sort=False).query(
    "truth_isCC==1 and abs(truth_nuPdg)==12")

del intrinsic_nue_run1_df
del intrinsic_nue_run2_df
del intrinsic_nue_run3_df


3.931094470783121e+22
68206
1.2460289446544576e+23
209160
8.821478522260567e+22
148962


In [15]:
# loading WC nu overlay files

f_nu_overlay_run1 = uproot.open(wc_file_location + "checkout_prodgenie_bnb_nu_overlay_run1_PF.root")["wcpselection"]
f_nu_overlay_run1_bdt = f_nu_overlay_run1["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_nu_overlay_run1_eval = f_nu_overlay_run1["T_eval"].pandas.df(eval_mc_variables + ["match_isFC"], flatten=False)
f_nu_overlay_run1_pfeval = f_nu_overlay_run1["T_PFeval"].pandas.df(pf_eval_mc_variables, flatten=False)
f_nu_overlay_run1_kine = f_nu_overlay_run1["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_nu_overlay_run1_pot = f_nu_overlay_run1["T_pot"].pandas.df("pot_tor875good", flatten=False)
nu_overlay_run1_POT = np.sum(f_nu_overlay_run1_pot["pot_tor875good"].to_numpy())
nu_overlay_run1_df = pd.concat([f_nu_overlay_run1_bdt, f_nu_overlay_run1_eval, f_nu_overlay_run1_pfeval, f_nu_overlay_run1_kine], axis=1, sort=False)
del f_nu_overlay_run1
del f_nu_overlay_run1_bdt
del f_nu_overlay_run1_eval
del f_nu_overlay_run1_pfeval
del f_nu_overlay_run1_kine
del f_nu_overlay_run1_pot
nu_overlay_run1_df["isEXT"] = [0 for i in range(nu_overlay_run1_df.shape[0])]
nu_overlay_run1_df["isDirt"] = [0 for i in range(nu_overlay_run1_df.shape[0])]
nu_overlay_run1_df["WC_file"] = ["nu_overlay_run1" for i in range(nu_overlay_run1_df.shape[0])]
nu_overlay_run1_df["run_num"] = [1 for i in range(nu_overlay_run1_df.shape[0])]

f_nu_overlay_run2 = uproot.open(wc_file_location + "checkout_prodgenie_bnb_nu_overlay_run2_PF.root")["wcpselection"]
f_nu_overlay_run2_bdt = f_nu_overlay_run2["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_nu_overlay_run2_eval = f_nu_overlay_run2["T_eval"].pandas.df(eval_mc_variables + ["match_isFC"], flatten=False)
f_nu_overlay_run2_pfeval = f_nu_overlay_run2["T_PFeval"].pandas.df(pf_eval_mc_variables, flatten=False)
f_nu_overlay_run2_kine = f_nu_overlay_run2["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_nu_overlay_run2_pot = f_nu_overlay_run2["T_pot"].pandas.df("pot_tor875good", flatten=False)
nu_overlay_run2_POT = np.sum(f_nu_overlay_run2_pot["pot_tor875good"].to_numpy())
nu_overlay_run2_df = pd.concat([f_nu_overlay_run2_bdt, f_nu_overlay_run2_eval, f_nu_overlay_run2_pfeval, f_nu_overlay_run2_kine], axis=1, sort=False)
del f_nu_overlay_run2
del f_nu_overlay_run2_bdt
del f_nu_overlay_run2_eval
del f_nu_overlay_run2_pfeval
del f_nu_overlay_run2_kine
del f_nu_overlay_run2_pot
nu_overlay_run2_df["isEXT"] = [0 for i in range(nu_overlay_run2_df.shape[0])]
nu_overlay_run2_df["isDirt"] = [0 for i in range(nu_overlay_run2_df.shape[0])]
nu_overlay_run2_df["WC_file"] = ["nu_overlay_run2" for i in range(nu_overlay_run2_df.shape[0])]
nu_overlay_run2_df["run_num"] = [2 for i in range(nu_overlay_run2_df.shape[0])]

f_nu_overlay_run3 = uproot.open(wc_file_location + "checkout_prodgenie_bnb_nu_overlay_run3_PF.root")["wcpselection"]
f_nu_overlay_run3_bdt = f_nu_overlay_run3["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_nu_overlay_run3_eval = f_nu_overlay_run3["T_eval"].pandas.df(eval_mc_variables + ["match_isFC"], flatten=False)
f_nu_overlay_run3_pfeval = f_nu_overlay_run3["T_PFeval"].pandas.df(pf_eval_mc_variables, flatten=False)
f_nu_overlay_run3_kine = f_nu_overlay_run3["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_nu_overlay_run3_pot = f_nu_overlay_run3["T_pot"].pandas.df("pot_tor875good", flatten=False)
nu_overlay_run3_POT = np.sum(f_nu_overlay_run3_pot["pot_tor875good"].to_numpy())
nu_overlay_run3_df = pd.concat([f_nu_overlay_run3_bdt, f_nu_overlay_run3_eval, f_nu_overlay_run3_pfeval, f_nu_overlay_run3_kine], axis=1, sort=False)
del f_nu_overlay_run3
del f_nu_overlay_run3_bdt
del f_nu_overlay_run3_eval
del f_nu_overlay_run3_pfeval
del f_nu_overlay_run3_kine
del f_nu_overlay_run3_pot
nu_overlay_run3_df["isEXT"] = [0 for i in range(nu_overlay_run3_df.shape[0])]
nu_overlay_run3_df["isDirt"] = [0 for i in range(nu_overlay_run3_df.shape[0])]
nu_overlay_run3_df["WC_file"] = ["nu_overlay_run3" for i in range(nu_overlay_run3_df.shape[0])]
nu_overlay_run3_df["run_num"] = [3 for i in range(nu_overlay_run3_df.shape[0])]

overlay_df = pd.concat([nu_overlay_run1_df, nu_overlay_run2_df, nu_overlay_run3_df], sort=False).query(
    "not (truth_isCC==0 and truth_NCDelta==1) and not (truth_isCC==0 and truth_NprimPio>0 and not (truth_NCDelta==1)) and not (truth_isCC==1 and abs(truth_nuPdg)==12)")
del nu_overlay_run1_df
del nu_overlay_run2_df
del nu_overlay_run3_df

In [16]:
# loading WC EXT files

f_ext_run1 = uproot.open(wc_file_location + "wcp_data_extbnb_run1_mcc9_v08_00_00_53_checkout.root")["wcpselection"]
f_ext_run1_bdt = f_ext_run1["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_ext_run1_eval = f_ext_run1["T_eval"].pandas.df(eval_data_variables + ["match_isFC"], flatten=False)
f_ext_run1_pfeval = f_ext_run1["T_PFeval"].pandas.df(pf_eval_data_variables, flatten=False)
f_ext_run1_kine = f_ext_run1["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_ext_run1_pot = f_ext_run1["T_pot"].pandas.df("pot_tor875good", flatten=False)
ext_run1_POT = np.sum(f_ext_run1_pot["pot_tor875good"].to_numpy())
ext_run1_df = pd.concat([f_ext_run1_bdt, f_ext_run1_eval, f_ext_run1_pfeval, f_ext_run1_kine], axis=1, sort=False)
del f_ext_run1
del f_ext_run1_bdt
del f_ext_run1_eval
del f_ext_run1_pfeval
del f_ext_run1_kine
del f_ext_run1_pot
ext_run1_df["isEXT"] = [1 for i in range(ext_run1_df.shape[0])]
ext_run1_df["isDirt"] = [0 for i in range(ext_run1_df.shape[0])]
ext_run1_df["WC_file"] = ["ext_run1" for i in range(ext_run1_df.shape[0])]
ext_run1_df["run_num"] = [1 for i in range(ext_run1_df.shape[0])]

f_ext_run2 = uproot.open(wc_file_location + "wcp_data_extbnb_run2_mcc9_v08_00_00_53_checkout.root")["wcpselection"]
f_ext_run2_bdt = f_ext_run2["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_ext_run2_eval = f_ext_run2["T_eval"].pandas.df(eval_data_variables + ["match_isFC"], flatten=False)
f_ext_run2_pfeval = f_ext_run2["T_PFeval"].pandas.df(pf_eval_data_variables, flatten=False)
f_ext_run2_kine = f_ext_run2["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_ext_run2_pot = f_ext_run2["T_pot"].pandas.df("pot_tor875good", flatten=False)
ext_run2_POT = np.sum(f_ext_run2_pot["pot_tor875good"].to_numpy())
ext_run2_df = pd.concat([f_ext_run2_bdt, f_ext_run2_eval, f_ext_run2_pfeval, f_ext_run2_kine], axis=1, sort=False)
del f_ext_run2
del f_ext_run2_bdt
del f_ext_run2_eval
del f_ext_run2_pfeval
del f_ext_run2_kine
del f_ext_run2_pot
ext_run2_df["isEXT"] = [1 for i in range(ext_run2_df.shape[0])]
ext_run2_df["isDirt"] = [0 for i in range(ext_run2_df.shape[0])]
ext_run2_df["WC_file"] = ["ext_run2" for i in range(ext_run2_df.shape[0])]
ext_run2_df["run_num"] = [2 for i in range(ext_run2_df.shape[0])]

f_ext_run3 = uproot.open(wc_file_location + "wcp_data_extbnb_run3_mcc9_v08_00_00_53_checkout.root")["wcpselection"]
f_ext_run3_bdt = f_ext_run3["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_ext_run3_eval = f_ext_run3["T_eval"].pandas.df(eval_data_variables + ["match_isFC"], flatten=False)
f_ext_run3_pfeval = f_ext_run3["T_PFeval"].pandas.df(pf_eval_data_variables, flatten=False)
f_ext_run3_kine = f_ext_run3["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_ext_run3_pot = f_ext_run3["T_pot"].pandas.df("pot_tor875good", flatten=False)
ext_run3_POT = np.sum(f_ext_run3_pot["pot_tor875good"].to_numpy())
ext_run3_df = pd.concat([f_ext_run3_bdt, f_ext_run3_eval, f_ext_run3_pfeval, f_ext_run3_kine], axis=1, sort=False)
del f_ext_run3
del f_ext_run3_bdt
del f_ext_run3_eval
del f_ext_run3_pfeval
del f_ext_run3_kine
del f_ext_run3_pot
ext_run3_df["isEXT"] = [1 for i in range(ext_run3_df.shape[0])]
ext_run3_df["isDirt"] = [0 for i in range(ext_run3_df.shape[0])]
ext_run3_df["WC_file"] = ["ext_run3" for i in range(ext_run3_df.shape[0])]
ext_run3_df["run_num"] = [3 for i in range(ext_run3_df.shape[0])]

ext_df = pd.concat([ext_run1_df, ext_run2_df, ext_run3_df], sort=False)
del ext_run1_df
del ext_run2_df
del ext_run3_df

In [17]:
# loading WC dirt files

f_dirt_run1 = uproot.open(wc_file_location + "checkout_prodgenie_dirt_overlay_run1_all.root")["wcpselection"]
f_dirt_run1_bdt = f_dirt_run1["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_dirt_run1_eval = f_dirt_run1["T_eval"].pandas.df(eval_data_variables + ["match_isFC"], flatten=False)
f_dirt_run1_pfeval = f_dirt_run1["T_PFeval"].pandas.df(pf_eval_data_variables, flatten=False)
f_dirt_run1_kine = f_dirt_run1["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_dirt_run1_pot = f_dirt_run1["T_pot"].pandas.df("pot_tor875good", flatten=False)
dirt_run1_POT = np.sum(f_dirt_run1_pot["pot_tor875good"].to_numpy())
dirt_run1_df = pd.concat([f_dirt_run1_bdt, f_dirt_run1_eval, f_dirt_run1_pfeval, f_dirt_run1_kine], axis=1, sort=False)
del f_dirt_run1
del f_dirt_run1_bdt
del f_dirt_run1_eval
del f_dirt_run1_pfeval
del f_dirt_run1_kine
del f_dirt_run1_pot
dirt_run1_df["isEXT"] = [0 for i in range(dirt_run1_df.shape[0])]
dirt_run1_df["isDirt"] = [1 for i in range(dirt_run1_df.shape[0])]
dirt_run1_df["WC_file"] = ["dirt_run1" for i in range(dirt_run1_df.shape[0])]
dirt_run1_df["run_num"] = [1 for i in range(dirt_run1_df.shape[0])]

f_dirt_run2 = uproot.open(wc_file_location + "checkout_prodgenie_dirt_overlay_run2_all.root")["wcpselection"]
f_dirt_run2_bdt = f_dirt_run2["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_dirt_run2_eval = f_dirt_run2["T_eval"].pandas.df(eval_data_variables + ["match_isFC"], flatten=False)
f_dirt_run2_pfeval = f_dirt_run2["T_PFeval"].pandas.df(pf_eval_data_variables, flatten=False)
f_dirt_run2_kine = f_dirt_run2["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_dirt_run2_pot = f_dirt_run2["T_pot"].pandas.df("pot_tor875good", flatten=False)
dirt_run2_POT = np.sum(f_dirt_run2_pot["pot_tor875good"].to_numpy())
dirt_run2_df = pd.concat([f_dirt_run2_bdt, f_dirt_run2_eval, f_dirt_run2_pfeval, f_dirt_run2_kine], axis=1, sort=False)
del f_dirt_run2
del f_dirt_run2_bdt
del f_dirt_run2_eval
del f_dirt_run2_pfeval
del f_dirt_run2_kine
del f_dirt_run2_pot
dirt_run2_df["isEXT"] = [0 for i in range(dirt_run2_df.shape[0])]
dirt_run2_df["isDirt"] = [1 for i in range(dirt_run2_df.shape[0])]
dirt_run2_df["WC_file"] = ["dirt_run2" for i in range(dirt_run2_df.shape[0])]
dirt_run2_df["run_num"] = [2 for i in range(dirt_run2_df.shape[0])]

f_dirt_run3 = uproot.open(wc_file_location + "checkout_prodgenie_dirt_overlay_run3_all.root")["wcpselection"]
f_dirt_run3_bdt = f_dirt_run3["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_dirt_run3_eval = f_dirt_run3["T_eval"].pandas.df(eval_data_variables + ["match_isFC"], flatten=False)
f_dirt_run3_pfeval = f_dirt_run3["T_PFeval"].pandas.df(pf_eval_data_variables, flatten=False)
f_dirt_run3_kine = f_dirt_run3["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
f_dirt_run3_pot = f_dirt_run3["T_pot"].pandas.df("pot_tor875good", flatten=False)
dirt_run3_POT = np.sum(f_dirt_run3_pot["pot_tor875good"].to_numpy())
dirt_run3_df = pd.concat([f_dirt_run3_bdt, f_dirt_run3_eval, f_dirt_run3_pfeval, f_dirt_run3_kine], axis=1, sort=False)
del f_dirt_run3
del f_dirt_run3_bdt
del f_dirt_run3_eval
del f_dirt_run3_pfeval
del f_dirt_run3_kine
del f_dirt_run3_pot
dirt_run3_df["isEXT"] = [0 for i in range(dirt_run3_df.shape[0])]
dirt_run3_df["isDirt"] = [1 for i in range(dirt_run3_df.shape[0])]
dirt_run3_df["WC_file"] = ["dirt_run3" for i in range(dirt_run3_df.shape[0])]
dirt_run3_df["run_num"] = [3 for i in range(dirt_run3_df.shape[0])]

dirt_df = pd.concat([dirt_run1_df, dirt_run2_df, dirt_run3_df], sort=False)
del dirt_run1_df
del dirt_run2_df
del dirt_run3_df

In [18]:
# full data

f_data_run1 = uproot.open(wc_file_location + "checkout_data_bnb_run1_all.root")["wcpselection"]
f_data_run1_bdt = f_data_run1["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_data_run1_eval = f_data_run1["T_eval"].pandas.df(eval_data_variables + ["match_isFC"], flatten=False)
f_data_run1_pfeval = f_data_run1["T_PFeval"].pandas.df(pf_eval_data_variables, flatten=False)
f_data_run1_kine = f_data_run1["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
data_run1_df = pd.concat([f_data_run1_bdt, f_data_run1_eval, f_data_run1_pfeval, f_data_run1_kine], axis=1, sort=False)
del f_data_run1
del f_data_run1_bdt
del f_data_run1_eval
del f_data_run1_pfeval
del f_data_run1_kine
data_run1_df["isEXT"] = [0 for i in range(data_run1_df.shape[0])]
data_run1_df["isDirt"] = [0 for i in range(data_run1_df.shape[0])]
data_run1_df["WC_file"] = ["data_run1" for i in range(data_run1_df.shape[0])]
data_run1_df["category"] = ["data" for i in range(data_run1_df.shape[0])]
data_run1_df["run_num"] = [1 for i in range(data_run1_df.shape[0])]

f_data_run2 = uproot.open(wc_file_location + "checkout_data_bnb_run2_all.root")["wcpselection"]
f_data_run2_bdt = f_data_run2["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_data_run2_eval = f_data_run2["T_eval"].pandas.df(eval_data_variables + ["match_isFC"], flatten=False)
f_data_run2_pfeval = f_data_run2["T_PFeval"].pandas.df(pf_eval_data_variables, flatten=False)
f_data_run2_kine = f_data_run2["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
data_run2_df = pd.concat([f_data_run2_bdt, f_data_run2_eval, f_data_run2_pfeval, f_data_run2_kine], axis=1, sort=False)
del f_data_run2
del f_data_run2_bdt
del f_data_run2_eval
del f_data_run2_pfeval
del f_data_run2_kine
data_run2_df["isEXT"] = [0 for i in range(data_run2_df.shape[0])]
data_run2_df["isDirt"] = [0 for i in range(data_run2_df.shape[0])]
data_run2_df["WC_file"] = ["data_run2" for i in range(data_run2_df.shape[0])]
data_run2_df["category"] = ["data" for i in range(data_run2_df.shape[0])]
data_run2_df["run_num"] = [2 for i in range(data_run2_df.shape[0])]

f_data_run3 = uproot.open(wc_file_location + "checkout_data_bnb_run3_all.root")["wcpselection"]
f_data_run3_bdt = f_data_run3["T_BDTvars"].pandas.df(bdt_vars, flatten=False)
f_data_run3_eval = f_data_run3["T_eval"].pandas.df(eval_data_variables + ["match_isFC"], flatten=False)
f_data_run3_pfeval = f_data_run3["T_PFeval"].pandas.df(pf_eval_data_variables, flatten=False)
f_data_run3_kine = f_data_run3["T_KINEvars"].pandas.df(kine_scalar_vars + kine_vector_vars + ["kine_reco_Enu"], flatten=False)
data_run3_df = pd.concat([f_data_run3_bdt, f_data_run3_eval, f_data_run3_pfeval, f_data_run3_kine], axis=1, sort=False)
del f_data_run3
del f_data_run3_bdt
del f_data_run3_eval
del f_data_run3_pfeval
del f_data_run3_kine
data_run3_df["isEXT"] = [0 for i in range(data_run3_df.shape[0])]
data_run3_df["isDirt"] = [0 for i in range(data_run3_df.shape[0])]
data_run3_df["WC_file"] = ["data_run3" for i in range(data_run3_df.shape[0])]
data_run3_df["category"] = ["data" for i in range(data_run3_df.shape[0])]
data_run3_df["run_num"] = [3 for i in range(data_run3_df.shape[0])]


data_all_df = pd.concat([data_run1_df, data_run2_df, data_run3_df], sort=False)

del data_run1_df
del data_run2_df
del data_run3_df

In [19]:
# combining prediction files

all_df = pd.concat([overlay_df, ext_df, dirt_df, nc_delta_df, intrinsic_nue_df, nc_pi0_df], sort=False)

del overlay_df
del ext_df
del dirt_df
del nc_delta_df
del intrinsic_nue_df
del nc_pi0_df

In [20]:
normalizing_POT_run1 = 1.423e20
normalizing_POT_run2 = 2.541e20
normalizing_POT_run3 = 2.405e20

weight_cv = all_df["weight_cv"].to_numpy()
weight_spline = all_df["weight_spline"].to_numpy()
is_ext = all_df["isEXT"].to_numpy()
is_dirt = all_df["isDirt"].to_numpy()
is_nc_delta = all_df["truth_NCDelta"].to_numpy() # should give 0 for data (truth_NCDelta==NaN)
is_CC = all_df["truth_isCC"].to_numpy() # should give 0 for data (truth_isCC==NaN)
num_pi0 = all_df["truth_NprimPio"].to_numpy() # should give 0 for data (truth_NprimPio==NaN)
truth_nuPdgs = all_df["truth_nuPdg"].to_numpy()


run_nums = all_df["run_num"].to_numpy()
WC_file_str = all_df["WC_file"].to_numpy()
net_weights = []
for i in tqdm(range(len(weight_cv))):
    weight_temp = weight_cv[i] * weight_spline[i]
    if weight_temp <= 0. or weight_temp > 30. or np.isnan(weight_temp): # something went wrong with the saved weights
        weight_temp = 1.
    if run_nums[i] == 1:
        if is_ext[i]:
            net_weights.append(normalizing_POT_run1 / ext_run1_POT)
        elif is_dirt[i]:
            net_weights.append(weight_temp * normalizing_POT_run1 / dirt_run1_POT)
        elif is_nc_delta[i]:
            net_weights.append(weight_temp * normalizing_POT_run1 / nc_delta_run1_file_POT)
        elif not is_CC[i] and num_pi0[i] > 0:
            net_weights.append(weight_temp * normalizing_POT_run1 / nc_pi0_run1_file_POT)
        elif is_CC[i] and abs(truth_nuPdgs[i]) == 12:
            net_weights.append(weight_temp * normalizing_POT_run1 / intrinsic_nue_run1_file_POT)
        else:
            net_weights.append(weight_temp * normalizing_POT_run1 / nu_overlay_run1_POT)
    elif run_nums[i] == 2:
        if is_ext[i]:
            net_weights.append(normalizing_POT_run2 / ext_run2_POT)
        elif is_dirt[i]:
            net_weights.append(weight_temp * normalizing_POT_run2 / dirt_run2_POT)
        elif is_nc_delta[i]:
            net_weights.append(weight_temp * normalizing_POT_run2 / nc_delta_run2_file_POT)
        elif not is_CC[i] and num_pi0[i] > 0:
            net_weights.append(weight_temp * normalizing_POT_run2 / nc_pi0_run2_file_POT)
        elif is_CC[i] and abs(truth_nuPdgs[i]) == 12:
            net_weights.append(weight_temp * normalizing_POT_run2 / intrinsic_nue_run2_file_POT)
        else:
            net_weights.append(weight_temp * normalizing_POT_run2 / nu_overlay_run2_POT)
    elif run_nums[i] == 3:
        if is_ext[i]:
            net_weights.append(normalizing_POT_run3 / ext_run3_POT)
        elif is_dirt[i]:
            net_weights.append(weight_temp * normalizing_POT_run3 / dirt_run3_POT)
        elif is_nc_delta[i]:
            net_weights.append(weight_temp * normalizing_POT_run3 / nc_delta_run3_file_POT)
        elif not is_CC[i] and num_pi0[i] > 0:
            net_weights.append(weight_temp * normalizing_POT_run3 / nc_pi0_run3_file_POT)
        elif is_CC[i] and abs(truth_nuPdgs[i]) == 12:
            net_weights.append(weight_temp * normalizing_POT_run3 / intrinsic_nue_run3_file_POT)
        else:
            net_weights.append(weight_temp * normalizing_POT_run3 / nu_overlay_run3_POT)

all_df["net_weight"] = net_weights

100%|█████████████████████████████| 8119423/8119423 [00:37<00:00, 219287.09it/s]


In [21]:
em_charge_scale = 0.95

uncorrected_reco_showerKE = all_df["reco_showerKE"].to_numpy()
all_df["reco_showerKE"] = uncorrected_reco_showerKE * 1000.

uncorrected_reco_showerKE = data_all_df["reco_showerKE"].to_numpy()
data_all_df["reco_showerKE"] = em_charge_scale * uncorrected_reco_showerKE * 1000.

In [22]:
# adding WC reco proton num

proton_nums = []
track_nums = []
energy_lists = all_df["kine_energy_particle"].to_numpy()
pdg_lists = all_df["kine_particle_type"].to_numpy()
for i in tqdm(range(all_df.shape[0])):
    proton_num = 0
    track_num = 0
    energy_list = energy_lists[i]
    pdg_list = pdg_lists[i]
    for i in range(len(energy_list)):
        if abs(pdg_list[i]) == 2212 and energy_list[i] > 35.:
            proton_num += 1
        if abs(pdg_list[i]) == 13 or abs(pdg_list[i]) == 211 and energy_list[i] > 10.: # see N_tracks at https://github.com/BNLIF/wcp-uboone-bdt/blob/main/inc/WCPleeANA/cuts.h
            track_num += 1
    proton_nums.append(proton_num)
    track_nums.append(track_num)
all_df["WC_reco_num_protons"] = proton_nums
all_df["WC_reco_num_other_tracks"] = track_nums

proton_nums = []
track_nums = []
energy_lists = data_all_df["kine_energy_particle"].to_numpy()
pdg_lists = data_all_df["kine_particle_type"].to_numpy()
for i in tqdm(range(data_all_df.shape[0])):
    proton_num = 0
    track_num = 0
    energy_list = energy_lists[i]
    pdg_list = pdg_lists[i]
    for i in range(len(energy_list)):
        if abs(pdg_list[i]) == 2212 and energy_list[i] > 35.:
            proton_num += 1
        if abs(pdg_list[i]) == 13 or abs(pdg_list[i]) == 211 and energy_list[i] > 10.: # see N_tracks at https://github.com/BNLIF/wcp-uboone-bdt/blob/main/inc/WCPleeANA/cuts.h
            track_num += 1
    proton_nums.append(proton_num)
    track_nums.append(track_num)
data_all_df["WC_reco_num_protons"] = proton_nums
data_all_df["WC_reco_num_other_tracks"] = track_nums

100%|█████████████████████████████| 8119423/8119423 [00:42<00:00, 189293.64it/s]
100%|█████████████████████████████| 2421328/2421328 [00:06<00:00, 349008.33it/s]


In [23]:
# adding WC truth category information

categories = ["NC Delta Radiative", "NC 1 Pi0", "numuCC 1 Pi0", "nueCC", "numuCC other", "NC other", "outFV", "badmatch/cosmic", "dirt", "ext"]

queries = [
    "not (isDirt==1) and not (isEXT==1) and match_completeness_energy/truth_energyInside>=0.1 and truth_vtxInside==1 and truth_isCC==0 and truth_NCDelta==1",
    "not (isDirt==1) and not (isEXT==1) and match_completeness_energy/truth_energyInside>=0.1 and truth_vtxInside==1 and truth_isCC==0 and truth_NprimPio==1 and not (truth_NCDelta==1)",
    "not (isDirt==1) and not (isEXT==1) and match_completeness_energy/truth_energyInside>=0.1 and truth_vtxInside==1 and truth_isCC==1 and abs(truth_nuPdg)==14 and truth_NprimPio==1",
    "not (isDirt==1) and not (isEXT==1) and match_completeness_energy/truth_energyInside>=0.1 and truth_vtxInside==1 and truth_isCC==1 and abs(truth_nuPdg)==12",
    "not (isDirt==1) and not (isEXT==1) and match_completeness_energy/truth_energyInside>=0.1 and truth_vtxInside==1 and truth_isCC==1 and abs(truth_nuPdg)==14 and truth_NprimPio!=1",
    "not (isDirt==1) and not (isEXT==1) and match_completeness_energy/truth_energyInside>=0.1 and truth_vtxInside==1 and truth_isCC==0 and truth_NprimPio!=1 and not (truth_NCDelta==1)",
    "not (isDirt==1) and not (isEXT==1) and match_completeness_energy/truth_energyInside>=0.1 and not (truth_vtxInside==1)",
    "not (isDirt==1) and not (isEXT==1) and not (match_completeness_energy/truth_energyInside>=0.1)",
    "isDirt==1",
    "isEXT==1",
]

dfs_with_categories = []
for i in tqdm(range(len(categories))):
    category = categories[i]
    query = queries[i]
    cat_df = all_df.query(query)
    cat_df["category"] = category
    dfs_with_categories.append(cat_df)
    
del all_df
all_df = pd.concat(dfs_with_categories, sort=False)

del cat_df
del dfs_with_categories

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|███████████████████████████████████████████| 10/10 [00:05<00:00,  1.94it/s]


In [24]:
all_df["data_or_pred"] = ["pred" for i in range(all_df.shape[0])]
data_all_df["data_or_pred"] = ["data" for i in range(data_all_df.shape[0])]

In [25]:
# putting them in order so that we don't remove the only generic selected event when we remove duplicates
all_df.sort_values(by=["kine_reco_Enu"], inplace=True, ascending=False)
data_all_df.sort_values(by=["kine_reco_Enu"], inplace=True, ascending=False)

all_df.reset_index(inplace=True)
data_all_df.reset_index(inplace=True)

In [26]:
print("duplicates still included:")
print(all_df.shape[0])
print(data_all_df.shape[0])

duplicates still included:
8119423
2421328


In [27]:
# remove duplicate RSE nums, would be an issue when merging

data_duplicate_row_list = data_all_df[["run", "subrun", "event"]].duplicated().to_numpy()
pred_duplicate_row_list = all_df[["run", "subrun", "event"]].duplicated().to_numpy()

data_duplicate_indices = list(np.nonzero(data_duplicate_row_list)[0])
pred_duplicate_indices = list(np.nonzero(pred_duplicate_row_list)[0])
data_duplicate_row_list = data_all_df[["run", "subrun", "event"]].duplicated().to_numpy()
pred_duplicate_row_list = all_df[["run", "subrun", "event"]].duplicated().to_numpy()

data_duplicate_indices = list(np.nonzero(data_duplicate_row_list)[0])
pred_duplicate_indices = list(np.nonzero(pred_duplicate_row_list)[0])

data_all_df.drop(data_duplicate_indices, inplace=True)
all_df.drop(pred_duplicate_indices, inplace=True)

In [28]:
# throwing away excess information, combining data and MC files, randomizing order
WC_all_df = pd.concat([all_df[wc_vars], data_all_df[wc_data_vars]], sort=False).sample(frac=1)

In [29]:
print("duplicates removed:")
print(all_df.shape[0])
print(data_all_df.shape[0])

print("\ncombined:")
print(WC_all_df.shape[0])

duplicates removed:
8087438
2421328

combined:
10508766


In [30]:
np.sum(WC_all_df[["run", "subrun", "event"]].duplicated().to_numpy())

0

In [31]:
merged_glee_WC_comparison_df = WC_all_df.merge(glee_all_df, how="outer", on=["data_or_pred", "run", "subrun", "event"])
merged_glee_WC_comparison_df.to_pickle("merged_glee_WC_comparison_df_full_v1.pkl")
print("saved!")

saved!


In [32]:
merged_glee_WC_comparison_df.columns

Index(['data_or_pred', 'run', 'subrun', 'event', 'category', 'WC_file',
       'match_isFC', 'kine_reco_Enu', 'reco_showerKE', 'nc_delta_score',
       'WC_reco_num_protons', 'WC_reco_num_other_tracks', 'reco_nuvtxX',
       'reco_nuvtxY', 'reco_nuvtxZ', 'truth_vtxX', 'truth_vtxY', 'truth_vtxZ',
       'net_weight', 'entry', 'reco_shower_energy_plane0',
       'reco_shower_energy_plane1', 'reco_shower_energy_plane2',
       'reco_shower_energy_max', 'reco_vertex_x', 'reco_vertex_y',
       'reco_vertex_z', 'simple_pot_weight', 'glee_file', 'glee_selection'],
      dtype='object')

In [33]:
merged_glee_WC_comparison_df

Unnamed: 0,data_or_pred,run,subrun,event,category,WC_file,match_isFC,kine_reco_Enu,reco_showerKE,nc_delta_score,...,reco_shower_energy_plane0,reco_shower_energy_plane1,reco_shower_energy_plane2,reco_shower_energy_max,reco_vertex_x,reco_vertex_y,reco_vertex_z,simple_pot_weight,glee_file,glee_selection
0,pred,18032,62,3123,ext,ext_run3,False,-1.000000,-1000.000000,-2.275953,...,,,,,,,,,,
1,pred,17560,93,4689,numuCC other,nu_overlay_run3,False,357.782501,114.468925,-5.453149,...,,,,,,,,,,
2,pred,9545,231,11576,NC 1 Pi0,NC_Pi0_run2,True,326.510406,108.675423,-1.706928,...,,,,,,,,,,
3,pred,12364,37,1861,ext,ext_run2,False,-1.000000,-1000.000000,-2.275953,...,,,,,,,,,,
4,pred,7492,8,417,ext,ext_run1,True,-1.000000,-1000.000000,-2.275953,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10510531,data,5519,99,5000,,,,,,,...,135.678651,142.248800,164.402147,164.402147,227.585846,69.191933,504.145142,1.0,data,1g0p
10510532,data,8750,344,17205,,,,,,,...,89.911065,206.786793,266.836514,266.836514,59.691727,79.927483,294.536957,1.0,data,1g0p
10510533,data,5762,114,5732,,,,,,,...,143.989547,186.020242,186.856392,186.856392,113.383087,60.917236,541.001648,1.0,data,1g1p
10510534,data,5634,22,1149,,,,,,,...,136.021075,125.121449,146.504544,146.504544,153.665298,-8.323478,1006.099976,1.0,data,1g1p


In [34]:
"""import warnings
warnings.filterwarnings('ignore')

# evaluate overlap fractions
# (could loosen the WC truth orthogonality cuts to up some of these fractions)

total_pred_glee = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g0p' and glee_file=='NCDeltaRadOverlaySM'")["simple_pot_weight"].to_numpy())
total_pred_glee_in_a_wc_file = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g0p' and glee_file=='NCDeltaRadOverlaySM' and WC_file==WC_file")["simple_pot_weight"].to_numpy())
print("fraction of glee 1g0p NCDeltaRadOverlaySM events in WC files:", np.round(total_pred_glee_in_a_wc_file, 4), "/", np.round(total_pred_glee, 4), "=", np.round(total_pred_glee_in_a_wc_file / total_pred_glee, 4))

total_pred_glee = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g0p' and glee_file=='NCPi0Coh'")["simple_pot_weight"].to_numpy())
total_pred_glee_in_a_wc_file = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g0p' and glee_file=='NCPi0Coh' and WC_file==WC_file")["simple_pot_weight"].to_numpy())
print("fraction of glee 1g0p NCPi0Coh events in WC files:", np.round(total_pred_glee_in_a_wc_file, 4), "/", np.round(total_pred_glee, 4), "=", np.round(total_pred_glee_in_a_wc_file / total_pred_glee, 4))

total_pred_glee = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g0p' and glee_file=='NCPi0NotCoh'")["simple_pot_weight"].to_numpy())
total_pred_glee_in_a_wc_file = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g0p' and glee_file=='NCPi0NotCoh' and WC_file==WC_file")["simple_pot_weight"].to_numpy())
print("fraction of glee 1g0p NCPi0NotCoh events in WC files:", np.round(total_pred_glee_in_a_wc_file, 4), "/", np.round(total_pred_glee, 4), "=", np.round(total_pred_glee_in_a_wc_file / total_pred_glee, 4))

total_pred_glee = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g0p' and glee_file=='CC1Pi0'")["simple_pot_weight"].to_numpy())
total_pred_glee_in_a_wc_file = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g0p' and glee_file=='CC1Pi0' and WC_file==WC_file")["simple_pot_weight"].to_numpy())
print("fraction of glee 1g0p CC1Pi0 events in WC files:", np.round(total_pred_glee_in_a_wc_file, 4), "/", np.round(total_pred_glee, 4), "=", np.round(total_pred_glee_in_a_wc_file / total_pred_glee, 4))

total_pred_glee = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g0p' and glee_file=='NueOverlays'")["simple_pot_weight"].to_numpy())
total_pred_glee_in_a_wc_file = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g0p' and glee_file=='NueOverlays' and WC_file==WC_file")["simple_pot_weight"].to_numpy())
print("fraction of glee 1g0p NueOverlays events in WC files:", np.round(total_pred_glee_in_a_wc_file, 4), "/", np.round(total_pred_glee, 4), "=", np.round(total_pred_glee_in_a_wc_file / total_pred_glee, 4))

total_pred_glee = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g0p' and glee_file=='OTPCExtra'")["simple_pot_weight"].to_numpy())
total_pred_glee_in_a_wc_file = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g0p' and glee_file=='OTPCExtra' and WC_file==WC_file")["simple_pot_weight"].to_numpy())
print("fraction of glee 1g0p OTPCExtra events in WC files:", np.round(total_pred_glee_in_a_wc_file, 4), "/", np.round(total_pred_glee, 4), "=", np.round(total_pred_glee_in_a_wc_file / total_pred_glee, 4))

total_pred_glee = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g0p' and glee_file=='BNBOtherExtra'")["simple_pot_weight"].to_numpy())
total_pred_glee_in_a_wc_file = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g0p' and glee_file=='BNBOtherExtra' and WC_file==WC_file")["simple_pot_weight"].to_numpy())
print("fraction of glee 1g0p BNBOtherExtra events in WC files:", np.round(total_pred_glee_in_a_wc_file, 4), "/", np.round(total_pred_glee, 4), "=", np.round(total_pred_glee_in_a_wc_file / total_pred_glee, 4))

total_pred_glee = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g0p' and glee_file=='Dirt'")["simple_pot_weight"].to_numpy())
total_pred_glee_in_a_wc_file = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g0p' and glee_file=='Dirt' and WC_file==WC_file")["simple_pot_weight"].to_numpy())
print("fraction of glee 1g0p Dirt events in WC files:", np.round(total_pred_glee_in_a_wc_file, 4), "/", np.round(total_pred_glee, 4), "=", np.round(total_pred_glee_in_a_wc_file / total_pred_glee, 4))

total_pred_glee = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g0p' and glee_file=='BNBext'")["simple_pot_weight"].to_numpy())
total_pred_glee_in_a_wc_file = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g0p' and glee_file=='BNBext' and WC_file==WC_file")["simple_pot_weight"].to_numpy())
print("fraction of glee 1g0p BNBext events in WC files:", np.round(total_pred_glee_in_a_wc_file, 4), "/", np.round(total_pred_glee, 4), "=", np.round(total_pred_glee_in_a_wc_file / total_pred_glee, 4))

total_pred_glee = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g0p' and glee_file=='data'")["simple_pot_weight"].to_numpy())
total_pred_glee_in_a_wc_file = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g0p' and glee_file=='data' and WC_file==WC_file")["simple_pot_weight"].to_numpy())
print("fraction of glee 1g0p data events in WC files:", np.round(total_pred_glee_in_a_wc_file, 4), "/", np.round(total_pred_glee, 4), "=", np.round(total_pred_glee_in_a_wc_file / total_pred_glee, 4))

print("\n\n")

total_pred_glee = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g1p' and glee_file=='NCDeltaRadOverlaySM'")["simple_pot_weight"].to_numpy())
total_pred_glee_in_a_wc_file = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g1p' and glee_file=='NCDeltaRadOverlaySM' and WC_file==WC_file")["simple_pot_weight"].to_numpy())
print("fraction of glee 1g1p NCDeltaRadOverlaySM events in WC files:", np.round(total_pred_glee_in_a_wc_file, 4), "/", np.round(total_pred_glee, 4), "=", np.round(total_pred_glee_in_a_wc_file / total_pred_glee, 4))

total_pred_glee = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g1p' and glee_file=='NCPi0Coh'")["simple_pot_weight"].to_numpy())
total_pred_glee_in_a_wc_file = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g1p' and glee_file=='NCPi0Coh' and WC_file==WC_file")["simple_pot_weight"].to_numpy())
print("fraction of glee 1g1p NCPi0Coh events in WC files:", np.round(total_pred_glee_in_a_wc_file, 4), "/", np.round(total_pred_glee, 4), "=", np.round(total_pred_glee_in_a_wc_file / total_pred_glee, 4))

total_pred_glee = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g1p' and glee_file=='NCPi0NotCoh'")["simple_pot_weight"].to_numpy())
total_pred_glee_in_a_wc_file = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g1p' and glee_file=='NCPi0NotCoh' and WC_file==WC_file")["simple_pot_weight"].to_numpy())
print("fraction of glee 1g1p NCPi0NotCoh events in WC files:", np.round(total_pred_glee_in_a_wc_file, 4), "/", np.round(total_pred_glee, 4), "=", np.round(total_pred_glee_in_a_wc_file / total_pred_glee, 4))

total_pred_glee = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g1p' and glee_file=='CC1Pi0'")["simple_pot_weight"].to_numpy())
total_pred_glee_in_a_wc_file = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g1p' and glee_file=='CC1Pi0' and WC_file==WC_file")["simple_pot_weight"].to_numpy())
print("fraction of glee 1g1p CC1Pi0 events in WC files:", np.round(total_pred_glee_in_a_wc_file, 4), "/", np.round(total_pred_glee, 4), "=", np.round(total_pred_glee_in_a_wc_file / total_pred_glee, 4))

total_pred_glee = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g1p' and glee_file=='NueOverlays'")["simple_pot_weight"].to_numpy())
total_pred_glee_in_a_wc_file = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g1p' and glee_file=='NueOverlays' and WC_file==WC_file")["simple_pot_weight"].to_numpy())
print("fraction of glee 1g1p NueOverlays events in WC files:", np.round(total_pred_glee_in_a_wc_file, 4), "/", np.round(total_pred_glee, 4), "=", np.round(total_pred_glee_in_a_wc_file / total_pred_glee, 4))

total_pred_glee = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g1p' and glee_file=='OTPCExtra'")["simple_pot_weight"].to_numpy())
total_pred_glee_in_a_wc_file = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g1p' and glee_file=='OTPCExtra' and WC_file==WC_file")["simple_pot_weight"].to_numpy())
print("fraction of glee 1g1p OTPCExtra events in WC files:", np.round(total_pred_glee_in_a_wc_file, 4), "/", np.round(total_pred_glee, 4), "=", np.round(total_pred_glee_in_a_wc_file / total_pred_glee, 4))

total_pred_glee = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g1p' and glee_file=='BNBOtherExtra'")["simple_pot_weight"].to_numpy())
total_pred_glee_in_a_wc_file = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g1p' and glee_file=='BNBOtherExtra' and WC_file==WC_file")["simple_pot_weight"].to_numpy())
print("fraction of glee 1g1p BNBOtherExtra events in WC files:", np.round(total_pred_glee_in_a_wc_file, 4), "/", np.round(total_pred_glee, 4), "=", np.round(total_pred_glee_in_a_wc_file / total_pred_glee, 4))

total_pred_glee = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g1p' and glee_file=='Dirt'")["simple_pot_weight"].to_numpy())
total_pred_glee_in_a_wc_file = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g1p' and glee_file=='Dirt' and WC_file==WC_file")["simple_pot_weight"].to_numpy())
print("fraction of glee 1g1p Dirt events in WC files:", np.round(total_pred_glee_in_a_wc_file, 4), "/", np.round(total_pred_glee, 4), "=", np.round(total_pred_glee_in_a_wc_file / total_pred_glee, 4))

total_pred_glee = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g1p' and glee_file=='BNBext'")["simple_pot_weight"].to_numpy())
total_pred_glee_in_a_wc_file = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g1p' and glee_file=='BNBext' and WC_file==WC_file")["simple_pot_weight"].to_numpy())
print("fraction of glee 1g1p BNBext events in WC files:", np.round(total_pred_glee_in_a_wc_file, 4), "/", np.round(total_pred_glee, 4), "=", np.round(total_pred_glee_in_a_wc_file / total_pred_glee, 4))

total_pred_glee = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g1p' and glee_file=='data'")["simple_pot_weight"].to_numpy())
total_pred_glee_in_a_wc_file = np.sum(merged_glee_WC_comparison_df.query(
    "glee_selection=='1g1p' and glee_file=='data' and WC_file==WC_file")["simple_pot_weight"].to_numpy())
print("fraction of glee 1g1p data events in WC files:", np.round(total_pred_glee_in_a_wc_file, 4), "/", np.round(total_pred_glee, 4), "=", np.round(total_pred_glee_in_a_wc_file / total_pred_glee, 4))
"""



