In [None]:
import numpy as np
import uproot as uproot
import uproot3 as uproot3
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd

from tqdm.notebook import tqdm

import pickle


# Loading Events

In [None]:
# Branch names requested from each tree of the "wcpselection" directory.

# Read from T_BDTvars.
bdt_vars = ["nue_score", "numu_score", "numu_cc_flag"]

# Read from T_eval.
eval_vars = [
    "run", "subrun", "event",
    "truth_nuEnergy", "truth_nuPdg", "truth_isCC", "truth_vtxInside",
    "match_isFC", "match_completeness_energy", "truth_energyInside",
    "weight_cv", "weight_spline",
]

# Reduced T_eval list (no truth/weight branches); not referenced by the
# loading code in this cell.
eval_data_vars = ["match_isFC"]

# Read from T_KINEvars.
kine_vars = ["kine_reco_Enu"]

# Read from T_PFeval.
pf_vars = [
    "reco_muonMomentum", "truth_muonMomentum",
    "truth_nuIntType", "truth_nuScatType",
]

# Reduced T_PFeval list (reco only); not referenced by the loading code in
# this cell.
pf_data_vars = ["reco_muonMomentum"]


# NOTE(review): absolute, user-specific path; consider making this configurable.
loc = "/Users/leehagaman/data/from_london/"


def load_overlay_run(run_label, data_dir=loc):
    """Load one nu-overlay ROOT file into a single DataFrame.

    Parameters
    ----------
    run_label : str
        Run tag such as "run1"; the file opened is
        ``checkout_prodgenie_bnb_nu_overlay_<run_label>.root`` in ``data_dir``.
    data_dir : str
        Directory prefix (must end with a path separator).

    Returns
    -------
    (pandas.DataFrame, float)
        The column-wise concatenation of the T_BDTvars, T_eval, T_KINEvars and
        T_PFeval branches listed in bdt_vars / eval_vars / kine_vars / pf_vars,
        with an extra "file" column set to ``nu_overlay_<run_label>``, and the
        sum of ``pot_tor875good`` over the T_pot tree.
    """
    trees = uproot3.open(data_dir + f"checkout_prodgenie_bnb_nu_overlay_{run_label}.root")["wcpselection"]

    # flatten=False keeps one row per event, so the four frames line up row by row.
    per_tree_dfs = [
        trees["T_BDTvars"].pandas.df(bdt_vars, flatten=False),
        trees["T_eval"].pandas.df(eval_vars, flatten=False),
        trees["T_KINEvars"].pandas.df(kine_vars, flatten=False),
        trees["T_PFeval"].pandas.df(pf_vars, flatten=False),
    ]
    pot = np.sum(trees["T_pot"].pandas.df("pot_tor875good", flatten=False)["pot_tor875good"].to_numpy())

    run_df = pd.concat(per_tree_dfs, axis=1, sort=False)
    run_df["file"] = f"nu_overlay_{run_label}"
    return run_df, pot


# The file handle and per-tree frames are local to the helper, so they are
# released as soon as each call returns.
nu_overlay_run1_df, nu_overlay_run1_pot = load_overlay_run("run1")
nu_overlay_run2_df, nu_overlay_run2_pot = load_overlay_run("run2")
nu_overlay_run3_df, nu_overlay_run3_pot = load_overlay_run("run3")

print(nu_overlay_run1_df.shape)
print(nu_overlay_run2_df.shape)
print(nu_overlay_run3_df.shape)

# Stack the three runs; the per-run index values are kept as-is, so the
# combined index may contain repeated labels.
nu_overlay_df = pd.concat(
    [nu_overlay_run1_df, nu_overlay_run2_df, nu_overlay_run3_df],
    sort=False,
)

all_df = nu_overlay_df


In [None]:
muon_mass = 105.66  # same unit as the *1000-scaled energies below (MeV if the ntuple stores GeV — TODO confirm)

# truth_muonMomentum[0..2] are treated as the momentum 3-vector and [3] as the
# energy, matching how reco_muonMomentum is used elsewhere in this notebook.
all_df["truth_muonMomentum_3"] = all_df["truth_muonMomentum[3]"].to_numpy()
all_df["true_muon_energy"] = all_df["truth_muonMomentum_3"].to_numpy()*1000.
all_df["true_muon_KE"] = all_df["true_muon_energy"].to_numpy() - muon_mass

# muon_energy = sqrt(muon_momentum^2 + muon_mass^2)
# muon_KE = muon_energy - muon_mass = sqrt(muon_momentum^2 + muon_mass^2) - muon_mass
# sqrt(muon_momentum^2 + muon_mass^2) = muon_KE + muon_mass
# muon_momentum = sqrt((muon_KE + muon_mass)**2 - muon_mass**2)
# muon_momentum = sqrt(muon_KE**2 + 2*muon_KE*muon_mass)

all_df["true_muon_momentum"] = np.sqrt(all_df["true_muon_KE"]**2 + 2*all_df["true_muon_KE"] * muon_mass)

# cos(theta) = p_z / |p|, built from the 3-vector so that units cancel and the
# result stays within [-1, 1]. (The previous version divided the energy
# component by the momentum magnitude, in mismatched units.)
# Assumes theta is measured from the z axis, as in the reco cos(theta) — TODO confirm.
true_muon_px = all_df["truth_muonMomentum[0]"].to_numpy()
true_muon_py = all_df["truth_muonMomentum[1]"].to_numpy()
true_muon_pz = all_df["truth_muonMomentum[2]"].to_numpy()
with np.errstate(divide="ignore", invalid="ignore"):
    # rows with a zero 3-momentum (no true muon) yield NaN rather than a warning storm
    all_df["true_muon_costheta"] = true_muon_pz / np.sqrt(true_muon_px**2 + true_muon_py**2 + true_muon_pz**2)

# Q^2 = - (p_nu - p_mu)^2
#     = -p_nu^2 + 2 p_nu * p_mu - p_mu^2
#     = -m_nu^2 + 2 E_nu * E_mu - 2 * P_nu * P_mu * cos(theta) - m_mu^2
#     = 2 E_nu * E_mu - 2 * E_nu * P_mu * cos(theta) - m_mu^2
#     = 2 * E_nu * (E_mu - P_mu * cos(theta)) - m_mu^2

# The last term is the muon mass squared, as in the derivation above (it was
# previously the kinetic energy squared). truth_nuEnergy is assumed to be in
# the same unit as true_muon_energy — TODO confirm.
true_Q2_unscaled = (
    2 * all_df["truth_nuEnergy"]
    * (all_df["true_muon_energy"] - all_df["true_muon_momentum"] * all_df["true_muon_costheta"])
    - muon_mass**2
)

# divide by 1000^2 to undo the two *1000 energy factors
all_df["true_Q2"] = true_Q2_unscaled / 1000.**2


In [None]:
# Reco muon cos(theta) and momentum magnitude, computed for all events at once
# instead of in a per-row Python loop.
reco_muon_mass = 105.66  # same value and unit as the constants previously inlined here

# Cast to float64 so the arithmetic does not depend on the stored branch precision.
reco_muonmomentum_x = all_df["reco_muonMomentum[0]"].to_numpy(dtype=float)
reco_muonmomentum_y = all_df["reco_muonMomentum[1]"].to_numpy(dtype=float)
reco_muonmomentum_z = all_df["reco_muonMomentum[2]"].to_numpy(dtype=float)
reco_muonmomentum_t = all_df["reco_muonMomentum[3]"].to_numpy(dtype=float)

# Events whose reco energy is below the muon mass get the sentinel -1 for both
# quantities (surprising that this happens for positive values, but some
# events do). Written as ~(t < m) so NaN energies take the "compute" branch,
# exactly as the original `if t < m: ... else: ...` did.
has_reco_muon = ~(reco_muonmomentum_t < reco_muon_mass / 1000.)

costheta_vals = np.full(len(reco_muonmomentum_t), -1.0)
muonmomentum_vals = np.full(len(reco_muonmomentum_t), -1.0)

with np.errstate(divide="ignore", invalid="ignore"):
    momentum_norm = np.sqrt(
        reco_muonmomentum_x[has_reco_muon]**2
        + reco_muonmomentum_y[has_reco_muon]**2
        + reco_muonmomentum_z[has_reco_muon]**2
    )
    costheta_vals[has_reco_muon] = reco_muonmomentum_z[has_reco_muon] / momentum_norm

    # |p| = sqrt(KE^2 + 2 * KE * m), with KE = E - m after the *1000 unit scaling
    muon_KE = reco_muonmomentum_t[has_reco_muon] * 1000. - reco_muon_mass
    muonmomentum_vals[has_reco_muon] = np.sqrt(muon_KE**2 + 2 * muon_KE * reco_muon_mass)

all_df["reco_costheta"] = costheta_vals
all_df["reco_muon_momentum"] = muonmomentum_vals


In [None]:
# Data POT for run 1, 2, 3 (same order as the overlay files).
data_pots = [
    1.42319e+20,
    2.5413e+20,
    2.40466e+20
]

# (data POT, overlay POT) for each value of the "file" column.
pots_by_file = {
    "nu_overlay_run1": (data_pots[0], nu_overlay_run1_pot),
    "nu_overlay_run2": (data_pots[1], nu_overlay_run2_pot),
    "nu_overlay_run3": (data_pots[2], nu_overlay_run3_pot),
}

files = all_df["file"].to_numpy()

# Fail loudly on an unexpected file label; the per-row loop used to skip such
# rows silently, which only surfaced later as a length mismatch.
unknown_files = set(pd.unique(files)) - set(pots_by_file)
if unknown_files:
    raise ValueError(f"no POT normalisation defined for file label(s): {sorted(unknown_files)}")

weight_cv_vals = all_df["weight_cv"].to_numpy(dtype=float)
weight_spline_vals = all_df["weight_spline"].to_numpy(dtype=float)

# Replace CV weights outside (0, 30) -- including NaN, for which both
# comparisons are False -- with 1.
clipped_weight_cv = np.where((weight_cv_vals > 0) & (weight_cv_vals < 30), weight_cv_vals, 1.0)

data_pot_per_event = all_df["file"].map({name: pots[0] for name, pots in pots_by_file.items()}).to_numpy(dtype=float)
overlay_pot_per_event = all_df["file"].map({name: pots[1] for name, pots in pots_by_file.items()}).to_numpy(dtype=float)

# net weight = CV weight * spline weight * (data POT / overlay POT of that run)
net_weight_vals = clipped_weight_cv * weight_spline_vals * data_pot_per_event / overlay_pot_per_event

all_df["net_weight"] = net_weight_vals


In [None]:
# Event selection: cuts on numu_cc_flag, numu_score and nue_score, plus a
# positive reco muon momentum (which drops the -1 sentinel rows).
selection_query = "numu_cc_flag >= 0 and numu_score > 0.9 and nue_score < 7 and reco_muon_momentum>0"
sel_df = all_df.query(selection_query)


# Interaction Type Plots

In [None]:
# Earlier attempt, based on truth_nuIntType and
# https://internal.dunescience.org/doxygen/MCNeutrino_8h_source.html
# Something is weird with that variable: there are a lot of 1000 events in the
# file, so it is kept here for reference only.
#
#   true_qe_query = "1001 <= truth_nuIntType <= 1002 or truth_nuIntType == 1095"
#   true_res_query = "1003 <= truth_nuIntType <= 1090"
#   true_dis_query = "1091 <= truth_nuIntType <= 1092"
#   true_coh_query = "1096 <= truth_nuIntType <= 1097"
#   true_mec_query = "truth_nuIntType == 1100"
#   true_other_query = f"not ({true_qe_query} or {true_res_query} or {true_dis_query} or {true_coh_query} or {true_mec_query})"

# Current approach: truth_nuScatType, read as the GENIE scattering type.
# https://internal.dunescience.org/doxygen/classgenie_1_1ScatteringType.html
# Enumerators in the order the reference lists them (with their string names):
#   kScUnknown ("Uknown to GENIE"), kScQuasiElastic ("QES"),
#   kScSingleKaon ("1Kaon"), kScDeepInelastic ("DIS"), kScResonant ("RES"),
#   kScCoherentProduction ("COH"), kScDiffractive ("DFR"),
#   kScNuElectronElastic ("NuEEL"), kScInverseMuDecay ("IMD"),
#   kScAMNuGamma ("AMNuGamma"), kScMEC ("MEC"), kScCoherentElastic ("CEvNS"),
#   kScInverseBetaDecay ("IBD"), kScGlashowResonance ("GLR"),
#   kScIMDAnnihilation ("IMDAnh"), kScDarkMatterElastic ("DMEL"),
#   kScDarkMatterDeepInelastic ("DMDIS"), kScDarkMatterElectron ("DME"),
#   default ("Unknown")
# The codes below assume that list is numbered from 0 — TODO confirm.

true_qe_query = "truth_nuScatType == 1"
true_dis_query = "truth_nuScatType == 3"
true_res_query = "truth_nuScatType == 4"
true_coh_query = "truth_nuScatType == 5"
true_mec_query = "truth_nuScatType == 10"
# everything that is not one of the five categories above
true_other_query = f"not ({true_qe_query} or {true_res_query} or {true_dis_query} or {true_coh_query} or {true_mec_query})"

# Selected events split by true interaction type (insertion order is the
# stacking order used in the plots).
interaction_type_queries = {
    "QE": true_qe_query,
    "RES": true_res_query,
    "DIS": true_dis_query,
    "COH": true_coh_query,
    "MEC": true_mec_query,
    "OTHER": true_other_query,
}

sel_dfs = {
    type_name: sel_df.query(type_query)
    for type_name, type_query in interaction_type_queries.items()
}


In [None]:
# Weighted event counts in the flattened 4D reco binning, per interaction type.
# Flattening order (slowest to fastest): containment (FC, PC) x reco Enu x
# reco cos(theta) x reco muon momentum  ->  2 * 4 * 9 * 16 = 1152 bins.

# Every Enu / cos(theta) bin is (lo, hi]. Generating the cuts from edge lists
# closes the gaps the hand-written cuts had at kine_reco_Enu == 1050 and 1570
# (those values were excluded from both neighbouring bins).
reco_enu_edges = [200, 705, 1050, 1570, 4000]
reco_costheta_edges = [-1, -0.5, 0., 0.27, 0.45, 0.62, 0.76, 0.86, 0.94, 1.]
reco_momentum_edges = [i*100 for i in range(16)] + [1e9]  # fifteen bins from 0 to 1500 plus an overflow

counts = {}

for int_type, selected_df in sel_dfs.items():

    per_slice_counts = []

    for is_fc in (1, 0):  # FC first, then PC
        containment_df = selected_df.query(f"match_isFC=={is_fc}")

        for enu_lo, enu_hi in zip(reco_enu_edges[:-1], reco_enu_edges[1:]):
            Enu_df = containment_df.query(f"{enu_lo} < kine_reco_Enu <= {enu_hi}")

            for costheta_lo, costheta_hi in zip(reco_costheta_edges[:-1], reco_costheta_edges[1:]):
                theta_df = Enu_df.query(f"{costheta_lo} < reco_costheta <= {costheta_hi}")

                slice_counts, _ = np.histogram(
                    theta_df["reco_muon_momentum"].to_numpy(),
                    weights=theta_df["net_weight"].to_numpy(),
                    bins=reco_momentum_edges,
                )
                per_slice_counts.append(slice_counts)

    # one flat array of 1152 weighted counts per interaction type
    counts[int_type] = np.concatenate(per_slice_counts)


In [None]:
# Stacked histogram of weighted counts per flattened reco bin, split by
# interaction type. The counts are already binned, so each bin centre is
# "filled" once with the bin content as its weight.
bins = np.linspace(0, 1152, 1153)
bin_centers = 0.5 * (bins[:-1] + bins[1:])

interaction_type_labels = ["QE", "RES", "DIS", "COH", "MEC", "OTHER"]

interaction_type_stack_list = [counts[type_name] for type_name in interaction_type_labels]

# one copy of the bin centres per stacked component
stack_centers = [bin_centers for _ in interaction_type_labels]

plt.rcParams.update({'font.size': 16})

fig, ax = plt.subplots(figsize=(15, 6))
ax.hist(
    stack_centers,
    weights=interaction_type_stack_list,
    label=interaction_type_labels,
    stacked=True,
    bins=bins,
)
ax.legend()
ax.set_xlabel("reco bin number")
ax.set_ylabel("counts")
ax.set_xlim(0, 1152)
for extension in ("png", "pdf"):
    fig.savefig(f"plots/ccqe_vs_nonccqe.{extension}")


In [None]:
# Same stacked plot, but each reco bin is normalised to its total so the
# components show the fractional interaction-type composition.
total_counts = counts["QE"] + counts["RES"] + counts["DIS"] + counts["COH"] + counts["MEC"] + counts["OTHER"]

# Bins with zero total give 0/0 = NaN; those fractions are set to 0.
interaction_type_frac_counts = [
    np.nan_to_num(counts[type_name] / total_counts, nan=0)
    for type_name in ["QE", "RES", "DIS", "COH", "MEC", "OTHER"]
]

fig, ax = plt.subplots(figsize=(15, 6))
n, bins, patches = ax.hist(
    stack_centers,
    weights=interaction_type_frac_counts,
    label=interaction_type_labels,
    stacked=True,
    bins=bins,
)
ax.legend()
ax.set_xlabel("reco bin number")
ax.set_ylabel("fraction")
ax.set_xlim(0, 1152)
ax.set_ylim(0, 1)
for extension in ("png", "pdf"):
    fig.savefig(f"plots/ccqe_vs_nonccqe_frac.{extension}")


In [None]:
# Fraction plot restricted to reco bins with a total predicted count of at
# least 10; the surviving bins are renumbered consecutively.
low_total_count_indices = np.where(total_counts < 10)[0]

# The previous loop zipped over the global `stack_centers` without using it,
# which silently tied the number of components to an unrelated variable.
interaction_type_low_removed_frac_counts = [
    np.delete(interaction_type_frac_count, low_total_count_indices)
    for interaction_type_frac_count in interaction_type_frac_counts
]

num_low_removed_bins = len(interaction_type_low_removed_frac_counts[0])

interaction_type_low_removed_bins = np.linspace(0, num_low_removed_bins, num_low_removed_bins + 1)
interaction_type_low_removed_bin_centers = (interaction_type_low_removed_bins[:-1] + interaction_type_low_removed_bins[1:]) / 2
interaction_type_low_removed_stack_centers = [interaction_type_low_removed_bin_centers] * len(interaction_type_labels)

plt.figure(figsize=(15, 6))
n, bins, patches = plt.hist(interaction_type_low_removed_stack_centers, 
                            weights=interaction_type_low_removed_frac_counts, 
                            label=interaction_type_labels, 
                            stacked=True, 
                            bins=interaction_type_low_removed_bins)
plt.legend()
plt.xlabel("reco bin number, removing bins with low predicted counts")
plt.ylabel("fraction")
plt.xlim(0, interaction_type_low_removed_bins[-1])
plt.ylim(0, 1)
plt.savefig("plots/low_removed_ccqe_vs_nonccqe_frac.png")
plt.savefig("plots/low_removed_ccqe_vs_nonccqe_frac.pdf")


In [None]:
# Re-order the surviving reco bins by increasing QE fraction and redraw the
# stacked fraction plot.
qe_fractions = interaction_type_low_removed_frac_counts[0]  # first component is QE

interaction_type_sort_indices = np.argsort(qe_fractions)

# apply the same permutation to every component so the stacks stay aligned
interaction_type_sorted_frac_counts = [
    component_fractions[interaction_type_sort_indices]
    for component_fractions in interaction_type_low_removed_frac_counts
]

num_bins = len(interaction_type_sorted_frac_counts[0])
interaction_type_sorted_bins = np.linspace(0, num_bins, num_bins + 1)
interaction_type_sorted_bin_centers = 0.5 * (interaction_type_sorted_bins[:-1] + interaction_type_sorted_bins[1:])
interaction_type_sorted_stack_centers = [interaction_type_sorted_bin_centers] * len(interaction_type_labels)

fig, ax = plt.subplots(figsize=(15, 6))
n, bins, patches = ax.hist(
    interaction_type_sorted_stack_centers,
    weights=interaction_type_sorted_frac_counts,
    label=interaction_type_labels,
    stacked=True,
    bins=interaction_type_sorted_bins,
)
ax.legend()
ax.set_xlabel("reco bin number, sorted by QE fraction, no low-pred bins")
ax.set_ylabel("fraction")
ax.set_xlim(0, interaction_type_sorted_bins[-1])
ax.set_ylim(0, 1)
for extension in ("png", "pdf"):
    fig.savefig(f"plots/sorted_ccqe_vs_nonccqe_frac.{extension}")



# True Q^2 Plots

In [None]:
# Keep selected events that have a true muon (positive true muon energy) with
# less energy than the neutrino. The truth_nuEnergy > true_muon_energy cut
# removes a tiny number of weird events, probably with two neutrinos in the
# same simulation?
true_numuCC_sel_df = sel_df.query("truth_nuEnergy > true_muon_energy and true_muon_energy > 0") 

true_Q2_vals = true_numuCC_sel_df["true_Q2"].to_numpy()

# Q^2 slice edges: 0, the (unweighted) quartiles of true Q^2, and a very large
# upper bound.
quartile_edges = np.quantile(true_Q2_vals, [0.25, 0.5, 0.75])
cutoffs = [0] + list(quartile_edges) + [1e9]
print(cutoffs)

print(np.min(true_Q2_vals))
print(np.max(true_Q2_vals))
plt.hist(true_Q2_vals, bins=100)
plt.xlabel("True Q^2")
plt.ylabel("Counts")
plt.title("True Q^2 for selected true numu CC events")
plt.show()


In [None]:
# Weighted counts in the flattened 4D reco binning for several truth-level
# subsets (all events, the four Q^2 slices, QE, non-QE), plus the median true
# Q^2 of the "all" subset in every reco bin.
q2_sel_dfs = {}

q2_sel_dfs["all"] = true_numuCC_sel_df
q2_sel_dfs["1"] = true_numuCC_sel_df.query(f"{cutoffs[0]} < true_Q2 <= {cutoffs[1]}")
q2_sel_dfs["2"] = true_numuCC_sel_df.query(f"{cutoffs[1]} < true_Q2 <= {cutoffs[2]}")
q2_sel_dfs["3"] = true_numuCC_sel_df.query(f"{cutoffs[2]} < true_Q2 <= {cutoffs[3]}")
q2_sel_dfs["4"] = true_numuCC_sel_df.query(f"{cutoffs[3]} < true_Q2")
q2_sel_dfs["QE"] = true_numuCC_sel_df.query(true_qe_query)
q2_sel_dfs["non-QE"] = true_numuCC_sel_df.query(f"not ({true_qe_query})")

# Every Enu / cos(theta) bin is (lo, hi]. Generating the cuts from edge lists
# closes the gaps the hand-written cuts had at kine_reco_Enu == 1050 and 1570.
reco_enu_edges = [200, 705, 1050, 1570, 4000]
reco_costheta_edges = [-1, -0.5, 0., 0.27, 0.45, 0.62, 0.76, 0.86, 0.94, 1.]
reco_momentum_edges = [i*100 for i in range(16)] + [1e9]  # fifteen bins from 0 to 1500 plus an overflow
num_momentum_bins = len(reco_momentum_edges) - 1

median_q2_by_bin = []

q2_counts = {}

for quantile_str, selected_df in q2_sel_dfs.items():

    per_slice_counts = []

    for is_fc in (1, 0):  # FC first, then PC
        containment_df = selected_df.query(f"match_isFC=={is_fc}")

        for enu_lo, enu_hi in zip(reco_enu_edges[:-1], reco_enu_edges[1:]):
            Enu_df = containment_df.query(f"{enu_lo} < kine_reco_Enu <= {enu_hi}")

            for costheta_lo, costheta_hi in zip(reco_costheta_edges[:-1], reco_costheta_edges[1:]):
                theta_df = Enu_df.query(f"{costheta_lo} < reco_costheta <= {costheta_hi}")

                slice_momenta = theta_df["reco_muon_momentum"].to_numpy()
                slice_counts, _ = np.histogram(
                    slice_momenta,
                    weights=theta_df["net_weight"].to_numpy(),
                    bins=reco_momentum_edges,
                )
                per_slice_counts.append(slice_counts)

                if quantile_str == "all":
                    # Median true Q^2 per momentum bin, using the SAME edges as
                    # the histogram above. The last bin used to be taken as
                    # [1500, 1600) while the counts used [1500, 1e9], so the
                    # median and the count of the overflow bin described
                    # different event sets.
                    slice_q2 = theta_df["true_Q2"].to_numpy()
                    momentum_bin_index = np.digitize(slice_momenta, reco_momentum_edges) - 1
                    for momentum_bin in range(num_momentum_bins):
                        q2_in_bin = slice_q2[momentum_bin_index == momentum_bin]
                        # empty bins get NaN explicitly rather than via np.median([])
                        median_q2_by_bin.append(np.median(q2_in_bin) if len(q2_in_bin) > 0 else np.nan)

    q2_counts[quantile_str] = np.concatenate(per_slice_counts)


In [None]:
# Stacked histogram of weighted counts per flattened reco bin, split into the
# four true-Q^2 slices.
bins = np.linspace(0, 1152, 1153)
bin_centers = 0.5 * (bins[:-1] + bins[1:])

q2_labels = [
    f"$Q^2$ < {cutoffs[1]:.2f}",
    f"{cutoffs[1]:.2f} < $Q^2$ < {cutoffs[2]:.2f}",
    f"{cutoffs[2]:.2f} < $Q^2$ < {cutoffs[3]:.2f}",
    f"{cutoffs[3]:.2f} < $Q^2$",
]

q2_stack_list = [q2_counts[slice_key] for slice_key in ("1", "2", "3", "4")]

# one copy of the bin centres per stacked component
stack_centers = [bin_centers for _ in q2_stack_list]

plt.rcParams.update({'font.size': 16})

fig, ax = plt.subplots(figsize=(15, 6))
ax.hist(stack_centers, weights=q2_stack_list, label=q2_labels, stacked=True, bins=bins)
ax.legend()
ax.set_xlabel("reco bin number")
ax.set_ylabel("counts")
ax.set_xlim(0, 1152)
for extension in ("png", "pdf"):
    fig.savefig(f"plots/quantiles.{extension}")


In [None]:
# Per-bin fractions of the four Q^2 slices (and of QE) relative to the summed
# slice counts.
total_counts = q2_counts["1"] + q2_counts["2"] + q2_counts["3"] + q2_counts["4"]

# Left un-sanitised: bins with zero total give NaN/inf here.
q2_QE_frac_count = q2_counts["QE"] / total_counts

# For the stacked plot, 0/0 = NaN fractions are set to 0.
q2_frac_counts = [
    np.nan_to_num(q2_counts[slice_key] / total_counts, nan=0)
    for slice_key in ("1", "2", "3", "4")
]

fig, ax = plt.subplots(figsize=(15, 6))
n, bins, patches = ax.hist(stack_centers, weights=q2_frac_counts, label=q2_labels, stacked=True, bins=bins)
ax.legend(loc="upper right")
ax.set_xlabel("reco bin number")
ax.set_ylabel("fraction")
ax.set_xlim(0, 1152)
ax.set_ylim(0, 1)
for extension in ("png", "pdf"):
    fig.savefig(f"plots/quantiles_frac.{extension}")


In [None]:
# Q^2-slice fraction plot restricted to reco bins with a total predicted count
# of at least 10; the same bins are dropped from the per-bin median Q^2 and QE
# fraction so all three stay aligned.
q2_low_total_count_indices = np.where(total_counts < 10)[0]

# The previous loop zipped over the global `stack_centers` without using it,
# which silently tied the number of components to an unrelated variable.
q2_low_removed_frac_counts = [
    np.delete(q2_frac_count, q2_low_total_count_indices)
    for q2_frac_count in q2_frac_counts
]
q2_low_removed_median_q2_by_bin = np.delete(median_q2_by_bin, q2_low_total_count_indices)

q2_low_removed_QE_frac_count = np.delete(q2_QE_frac_count, q2_low_total_count_indices)

num_low_removed_bins = len(q2_low_removed_frac_counts[0])

q2_low_removed_bins = np.linspace(0, num_low_removed_bins, num_low_removed_bins + 1)
q2_low_removed_bin_centers = (q2_low_removed_bins[:-1] + q2_low_removed_bins[1:]) / 2
q2_low_removed_stack_centers = [q2_low_removed_bin_centers] * len(q2_labels)

plt.figure(figsize=(15, 6))
n, bins, patches = plt.hist(q2_low_removed_stack_centers, 
                            weights=q2_low_removed_frac_counts, 
                            label=q2_labels, 
                            stacked=True, 
                            bins=q2_low_removed_bins)
plt.legend()
plt.xlabel("reco bin number, removing bins with low predicted counts")
plt.ylabel("fraction")
plt.xlim(0, q2_low_removed_bins[-1])
plt.ylim(0, 1)
plt.savefig("plots/low_removed_quantiles_frac.png")
plt.savefig("plots/low_removed_quantiles_frac.pdf")



In [None]:
# Re-order the surviving reco bins by increasing median true Q^2 and redraw
# the stacked Q^2-slice fraction plot.
q2_sort_indices = np.argsort(q2_low_removed_median_q2_by_bin)

# apply the same permutation to every slice so the stacks stay aligned
q2_sorted_frac_counts = [
    slice_fractions[q2_sort_indices] for slice_fractions in q2_low_removed_frac_counts
]

num_bins = len(q2_sorted_frac_counts[0])
q2_sorted_bins = np.linspace(0, num_bins, num_bins + 1)
q2_sorted_bin_centers = 0.5 * (q2_sorted_bins[:-1] + q2_sorted_bins[1:])
q2_sorted_stack_centers = [q2_sorted_bin_centers] * len(q2_labels)

fig, ax = plt.subplots(figsize=(15, 6))
n, bins, patches = ax.hist(
    q2_sorted_stack_centers,
    weights=q2_sorted_frac_counts,
    label=q2_labels,
    stacked=True,
    bins=q2_sorted_bins,
)
ax.legend()
ax.set_xlabel("reco bin number, sorted by median $Q^2$")
ax.set_ylabel("fraction")
ax.set_xlim(0, q2_sorted_bins[-1])
ax.set_ylim(0, 1)
for extension in ("png", "pdf"):
    fig.savefig(f"plots/sorted_quantiles_frac.{extension}")


# 2D Q^2 vs QE Fraction

In [None]:
# One point per surviving reco bin: median true Q^2 against QE fraction.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    q2_low_removed_median_q2_by_bin,
    q2_low_removed_QE_frac_count,
    c="k",
    s=1,
)
ax.set_xlabel("Median $Q^2$")
ax.set_ylabel("QE Fraction")
ax.set_title("4D Reco Bins, low-stat removed")
for extension in ("png", "pdf"):
    fig.savefig(f"plots/q2_vs_qe_frac_low_stat_removed.{extension}")


# Four Panel True Q^2

In [None]:
# four panel plot of ccqe vs nonccqe by true Q^2
# One panel per true-Q^2 slice; each panel is the stacked weighted count per
# flattened reco bin, split by interaction type.

# Every Enu / cos(theta) bin is (lo, hi]. Generating the cuts from edge lists
# closes the gaps the hand-written cuts had at kine_reco_Enu == 1050 and 1570.
reco_enu_edges = [200, 705, 1050, 1570, 4000]
reco_costheta_edges = [-1, -0.5, 0., 0.27, 0.45, 0.62, 0.76, 0.86, 0.94, 1.]
reco_momentum_edges = [i*100 for i in range(16)] + [1e9]  # fifteen bins from 0 to 1500 plus an overflow

labels = ["QE", "RES", "DIS", "COH", "MEC", "OTHER"]

bins = np.linspace(0, 1152, 1153)
bin_centers = (bins[:-1] + bins[1:]) / 2
stack_centers = [bin_centers] * len(labels)

# make 4x1 subplots
fig, axs = plt.subplots(4, 1, figsize=(15, 15))

for Q_bin in range(4):

    ax = axs[Q_bin]

    qbin_sel_df = true_numuCC_sel_df.query(f"{cutoffs[Q_bin]} < true_Q2 <= {cutoffs[Q_bin+1]}")

    # NOTE: this rebinds the notebook-level `sel_dfs` and `counts` to the
    # current Q^2 slice, as the original cell did.
    sel_dfs = {
        "QE": qbin_sel_df.query(true_qe_query),
        "RES": qbin_sel_df.query(true_res_query),
        "DIS": qbin_sel_df.query(true_dis_query),
        "COH": qbin_sel_df.query(true_coh_query),
        "MEC": qbin_sel_df.query(true_mec_query),
        "OTHER": qbin_sel_df.query(true_other_query),
    }

    counts = {}

    for int_type, selected_df in sel_dfs.items():

        per_slice_counts = []

        for is_fc in (1, 0):  # FC first, then PC
            containment_df = selected_df.query(f"match_isFC=={is_fc}")

            for enu_lo, enu_hi in zip(reco_enu_edges[:-1], reco_enu_edges[1:]):
                Enu_df = containment_df.query(f"{enu_lo} < kine_reco_Enu <= {enu_hi}")

                for costheta_lo, costheta_hi in zip(reco_costheta_edges[:-1], reco_costheta_edges[1:]):
                    theta_df = Enu_df.query(f"{costheta_lo} < reco_costheta <= {costheta_hi}")

                    slice_counts, _ = np.histogram(
                        theta_df["reco_muon_momentum"].to_numpy(),
                        weights=theta_df["net_weight"].to_numpy(),
                        bins=reco_momentum_edges,
                    )
                    per_slice_counts.append(slice_counts)

        counts[int_type] = np.concatenate(per_slice_counts)

    stack_list = [counts[type_name] for type_name in labels]

    ax.hist(stack_centers, weights=stack_list, label=labels, stacked=True, bins=bins)
    if Q_bin == 0:
        ax.legend()  # legend on the top panel only
    if Q_bin == 3:
        ax.set_xlabel("reco bin number")  # x label on the bottom panel only
    ax.set_ylabel("counts")
    ax.set_xlim(0, 1152)

    # The first and last slices are labelled as open-ended.
    if Q_bin == 0:
        slice_label = f"$Q^2$ < {cutoffs[Q_bin+1]:.2f}"
    elif Q_bin == 3:
        slice_label = f"{cutoffs[Q_bin]:.2f} < $Q^2$"
    else:
        slice_label = f"{cutoffs[Q_bin]:.2f} < $Q^2$ < {cutoffs[Q_bin+1]:.2f}"
    ax.text(0.1, 0.8, slice_label, transform=ax.transAxes, fontsize=16)

plt.savefig("plots/q2_slices.png")
plt.savefig("plots/q2_slices.pdf")


In [None]:
# four panel plot of ccqe vs nonccqe by true Q^2
# One panel per true-Q^2 slice; each panel shows the interaction-type
# fractions per reco bin, keeping only bins with a slice total of at least 10
# and ordering them by increasing QE fraction.

# Every Enu / cos(theta) bin is (lo, hi]. Generating the cuts from edge lists
# closes the gaps the hand-written cuts had at kine_reco_Enu == 1050 and 1570.
reco_enu_edges = [200, 705, 1050, 1570, 4000]
reco_costheta_edges = [-1, -0.5, 0., 0.27, 0.45, 0.62, 0.76, 0.86, 0.94, 1.]
reco_momentum_edges = [i*100 for i in range(16)] + [1e9]  # fifteen bins from 0 to 1500 plus an overflow

labels = ["QE", "RES", "DIS", "COH", "MEC", "OTHER"]

bins = np.linspace(0, 1152, 1153)
bin_centers = (bins[:-1] + bins[1:]) / 2
stack_centers = [bin_centers] * len(labels)

# make 4x1 subplots
fig, axs = plt.subplots(4, 1, figsize=(15, 15))

for Q_bin in range(4):

    ax = axs[Q_bin]

    qbin_sel_df = true_numuCC_sel_df.query(f"{cutoffs[Q_bin]} < true_Q2 <= {cutoffs[Q_bin+1]}")

    # NOTE: this rebinds the notebook-level `sel_dfs`, `counts` and
    # `total_counts` to the current Q^2 slice, as the original cell did.
    sel_dfs = {
        "QE": qbin_sel_df.query(true_qe_query),
        "RES": qbin_sel_df.query(true_res_query),
        "DIS": qbin_sel_df.query(true_dis_query),
        "COH": qbin_sel_df.query(true_coh_query),
        "MEC": qbin_sel_df.query(true_mec_query),
        "OTHER": qbin_sel_df.query(true_other_query),
    }

    counts = {}

    for int_type, selected_df in sel_dfs.items():

        per_slice_counts = []

        for is_fc in (1, 0):  # FC first, then PC
            containment_df = selected_df.query(f"match_isFC=={is_fc}")

            for enu_lo, enu_hi in zip(reco_enu_edges[:-1], reco_enu_edges[1:]):
                Enu_df = containment_df.query(f"{enu_lo} < kine_reco_Enu <= {enu_hi}")

                for costheta_lo, costheta_hi in zip(reco_costheta_edges[:-1], reco_costheta_edges[1:]):
                    theta_df = Enu_df.query(f"{costheta_lo} < reco_costheta <= {costheta_hi}")

                    slice_counts, _ = np.histogram(
                        theta_df["reco_muon_momentum"].to_numpy(),
                        weights=theta_df["net_weight"].to_numpy(),
                        bins=reco_momentum_edges,
                    )
                    per_slice_counts.append(slice_counts)

        counts[int_type] = np.concatenate(per_slice_counts)

    total_counts = counts["QE"] + counts["RES"] + counts["DIS"] + counts["COH"] + counts["MEC"] + counts["OTHER"]

    # Bins with zero total give 0/0 = NaN; those fractions are set to 0.
    frac_counts = [np.nan_to_num(counts[type_name] / total_counts, nan=0) for type_name in labels]

    # Drop bins with fewer than 10 predicted events in this Q^2 slice.
    low_total_count_indices = np.where(total_counts < 10)[0]
    low_removed_frac_counts = [
        np.delete(frac_count, low_total_count_indices) for frac_count in frac_counts
    ]

    num_low_removed_bins = len(low_removed_frac_counts[0])

    low_removed_bins = np.linspace(0, num_low_removed_bins, num_low_removed_bins + 1)
    low_removed_bin_centers = (low_removed_bins[:-1] + low_removed_bins[1:]) / 2
    low_removed_stack_centers = [low_removed_bin_centers] * len(labels)

    # Order the surviving bins by increasing QE fraction (first component).
    qe_fractions = low_removed_frac_counts[0]
    sort_indices = np.argsort(qe_fractions)
    sorted_frac_counts = [frac_count[sort_indices] for frac_count in low_removed_frac_counts]
    num_bins = len(sorted_frac_counts[0])
    sorted_bins = np.linspace(0, num_bins, num_bins + 1)
    sorted_bin_centers = (sorted_bins[:-1] + sorted_bins[1:]) / 2
    sorted_stack_centers = [sorted_bin_centers] * len(labels)

    ax.hist(sorted_stack_centers, weights=sorted_frac_counts, label=labels, stacked=True, bins=sorted_bins)
    if Q_bin == 0:
        ax.legend()  # legend on the top panel only
    if Q_bin == 3:
        ax.set_xlabel("reco bin number, sorted by QE fraction, no low-pred bins")

    ax.set_ylabel("fraction")
    # Upper limit is the last bin edge (= number of bins). It used to be
    # len(low_removed_bins), i.e. the number of EDGES, which left one empty
    # bin-width of padding on the right.
    ax.set_xlim(0, sorted_bins[-1])
    ax.set_ylim(0, 1)

    # The first and last slices are labelled as open-ended.
    if Q_bin == 0:
        slice_label = f"$Q^2$ < {cutoffs[Q_bin+1]:.2f}"
    elif Q_bin == 3:
        slice_label = f"{cutoffs[Q_bin]:.2f} < $Q^2$"
    else:
        slice_label = f"{cutoffs[Q_bin]:.2f} < $Q^2$ < {cutoffs[Q_bin+1]:.2f}"
    ax.text(0.6, 0.1, slice_label, transform=ax.transAxes, fontsize=16)

plt.savefig("plots/q2_slices_sorted_by_qe_frac.png")
plt.savefig("plots/q2_slices_sorted_by_qe_frac.pdf")
