In [None]:
import numpy as np
import uproot as uproot
import uproot3 as uproot3
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd

from tqdm.notebook import tqdm

import pickle

# NuWro From Fake Data

In [None]:
# Axial-mass (M_A) value that the NuWro fake-data sample was generated with; the plots
# below label this sample "M_A=1.03". Presumably in GeV -- TODO confirm.
# NOTE(review): this constant is not referenced by any of the cells visible here.
nuwro_fake_data_true_MA = 1.03

In [None]:
# Branches to read from each tree of the "wcpselection" directory of the
# checkout_fakedata_nuwro_run*.root files.
bdt_vars = [
    "nue_score",
    "numu_score",
    "numu_cc_flag"
]

eval_vars = [
    "run",
    "subrun",
    "event",
    "match_isFC",

    # these aren't present in the fake data file
    #"truth_nuEnergy",
    #"truth_nuPdg",
    #"truth_isCC",
    #"truth_vtxInside",
    #"match_completeness_energy",
    #"truth_energyInside",
    #"weight_cv",
    #"weight_spline",
]

eval_data_vars = [
    "match_isFC",
]

kine_vars = [
    "kine_reco_Enu",
]

pf_vars = [
    "reco_muonMomentum",
    #"truth_muonMomentum",
]

pf_data_vars = [
    "reco_muonMomentum",
]

loc = "/Users/leehagaman/data/processed_checkout_rootfiles/"


def _load_wc_nuwro_fake_run(run_number):
    """Load one NuWro fake-data run file into a single DataFrame.

    Reads the branch lists defined above from the T_BDTvars, T_eval, T_KINEvars and
    T_PFeval trees, joins them column-wise (the trees are assumed to be row-aligned --
    TODO confirm), and tags every row with a "file" label.

    Parameters
    ----------
    run_number : int
        Run number used to build the file name ``checkout_fakedata_nuwro_run{N}.root``.

    Returns
    -------
    (pandas.DataFrame, float)
        The per-event table and the sum of ``pot_tor875good`` over the T_pot tree.
    """
    wcp_dir = uproot3.open(loc + f"checkout_fakedata_nuwro_run{run_number}.root")["wcpselection"]

    tree_branches = [
        ("T_BDTvars", bdt_vars),
        ("T_eval", eval_vars),
        ("T_KINEvars", kine_vars),
        ("T_PFeval", pf_vars),
    ]
    per_tree_dfs = [wcp_dir[tree_name].pandas.df(branches, flatten=False)
                    for tree_name, branches in tree_branches]

    run_pot = np.sum(wcp_dir["T_pot"].pandas.df("pot_tor875good", flatten=False)["pot_tor875good"].to_numpy())

    run_df = pd.concat(per_tree_dfs, axis=1, sort=False)
    run_df["file"] = f"nuwro_fake_run{run_number}"
    return run_df, run_pot


nuwro_fake_run1_df, nuwro_fake_run1_vars_pot = _load_wc_nuwro_fake_run(1) # run 1 is numuCC only
nuwro_fake_run2_df, nuwro_fake_run2_vars_pot = _load_wc_nuwro_fake_run(2)
nuwro_fake_run3_df, nuwro_fake_run3_vars_pot = _load_wc_nuwro_fake_run(3)

# Totals before any truth matching; a later cell uses these to rescale the POT by the
# fraction of events that survive the merge with the truth table.
total_wc_nuwro_fake_pot = nuwro_fake_run1_vars_pot + nuwro_fake_run2_vars_pot + nuwro_fake_run3_vars_pot
num_wc_nuwro_fake_events = len(nuwro_fake_run1_df) + len(nuwro_fake_run2_df) + len(nuwro_fake_run3_df)

wc_nuwro_fake_df = pd.concat([nuwro_fake_run1_df, nuwro_fake_run2_df, nuwro_fake_run3_df], axis=0, sort=False)

wc_nuwro_fake_df


In [None]:
# Open the Run 1 PeLEE NuWro overlay ntuple and show the contents of its
# nuselection/NeutrinoSelectionFilter tree, to see which branches are available.
# The next cell reads branches from this same `f`.
f = uproot.open(f"/Users/leehagaman/data/pelee_files/high_stat_prodgenie_bnb_nu_overlay_DetVar_Run1_NuWro_reco2_reco2.root")["nuselection"]["NeutrinoSelectionFilter"]
f.items()

In [None]:
# Compare the two fiducial-volume branches: print, for each, the sum of the branch
# divided by the number of entries (i.e. the fraction of flagged entries if the
# branch is a 0/1 flag).
for fiducial_branch in ("isVtxInFiducial", "truthFiducial"):
    fiducial_flags = f[fiducial_branch].array()
    print(np.sum(fiducial_flags) / len(fiducial_flags))

In [None]:
# Build a truth-level table from the PeLEE NuWro overlay ntuples (runs 1-3).
# It is keyed by (run, subrun, event) so it can be merged onto the Wire-Cell fake-data table.

# Charged-lepton rest masses in MeV.
MUON_MASS_MEV = 105.6583755
ELECTRON_MASS_MEV = 0.51099895069

# One DataFrame per run, concatenated once after the loop (avoids re-copying the
# growing table on every iteration).
pelee_truth_frames = []

for r in [1,2,3]:

    f = uproot.open(f"/Users/leehagaman/data/pelee_files/high_stat_prodgenie_bnb_nu_overlay_DetVar_Run{r}_NuWro_reco2_reco2.root")["nuselection"]["NeutrinoSelectionFilter"]

    run = f["run"].array()
    subrun = f["sub"].array()
    event = f["evt"].array()
    nu_pdg = f["nu_pdg"].array()
    true_NC = f["ccnc"].array()

    # https://internal.dunescience.org/doxygen/classgenie_1_1ScatteringType.html, equivalent to truth_nuScatType in WC files
    # different from truth_nuIntType, which comes from https://internal.dunescience.org/doxygen/MCNeutrino_8h_source.html but doesn't totally make sense with the values we see
    interaction = f["interaction"].array()

    # Energies are scaled by 1000 so they are in the same unit as the lepton masses
    # above (MeV); later cells divide by 1e3 again when labelling axes in GeV.
    true_nu_energy = f["nu_e"].array() * 1000.
    true_lep_energy = f["lep_e"].array() * 1000.
    true_lep_theta = f["theta"].array()
    true_inFV = f["isVtxInFiducial"].array()
    true_nu_vtx_x = f["true_nu_vtx_x"].array()
    true_nu_vtx_y = f["true_nu_vtx_y"].array()
    true_nu_vtx_z = f["true_nu_vtx_z"].array()

    print(f"{'creating' if r == 1 else 'appending'} df with {len(run)} entries")
    pelee_truth_frames.append(pd.DataFrame({"run": run, "subrun": subrun, "event": event,
                                            "pl_true_nu_pdg": nu_pdg, "pl_true_NC": true_NC,
                                            "pl_true_nu_energy": true_nu_energy, "pl_true_lep_energy": true_lep_energy, "pl_true_lep_theta": true_lep_theta,
                                            "pl_interaction_type": interaction, "pl_inFV": true_inFV,
                                            "pl_true_nu_vtx_x": true_nu_vtx_x, "pl_true_nu_vtx_y": true_nu_vtx_y, "pl_true_nu_vtx_z": true_nu_vtx_z}))

# Same rows and (per-run, non-unique) index as concatenating run by run.
pelee_nuWro_truth_df = pd.concat(pelee_truth_frames)

# Q^2 = - (p_nu - p_mu)^2
#     = -p_nu^2 + 2 p_nu * p_mu - p_mu^2
#     = -m_nu^2 + 2 E_nu * E_mu - 2 * P_nu * P_mu * cos(theta) - m_mu^2
#     = 2 E_nu * E_mu - 2 * E_nu * P_mu * cos(theta) - m_mu^2
#     = 2 * E_nu * (E_mu - P_mu * cos(theta)) - m_mu^2

# Outgoing-lepton mass. "pl_true_nu_pdg" holds the NEUTRINO pdg code (12 = nu_e,
# 14 = nu_mu; the numuCC selection later requires == 14), so it can never equal the
# charged-lepton codes 11/13. Comparing against 11/13 therefore gave mass 0 for every
# event. Pick the mass from the neutrino flavour instead, and only for charged-current
# events (pl_true_NC == 0); for neutral-current events the outgoing lepton is a
# neutrino, so the mass stays 0.
# NOTE(review): this changes pl_true_lep_KE / pl_true_lep_momentum / pl_true_Q2 for CC
# events; anything tuned against the old massless values should be re-checked.
abs_nu_pdg = np.abs(pelee_nuWro_truth_df["pl_true_nu_pdg"].to_numpy())
is_true_cc = pelee_nuWro_truth_df["pl_true_NC"].to_numpy() == 0
true_lep_mass = np.zeros(len(pelee_nuWro_truth_df))
true_lep_mass[is_true_cc & (abs_nu_pdg == 14)] = MUON_MASS_MEV
true_lep_mass[is_true_cc & (abs_nu_pdg == 12)] = ELECTRON_MASS_MEV

pelee_nuWro_truth_df["pl_true_lep_mass"] = true_lep_mass
pelee_nuWro_truth_df["pl_true_lep_KE"] = pelee_nuWro_truth_df["pl_true_lep_energy"] - pelee_nuWro_truth_df["pl_true_lep_mass"]
# Rows whose stored lepton energy is below the assigned mass give NaN here (and in Q^2).
pelee_nuWro_truth_df["pl_true_lep_momentum"] = np.sqrt(pelee_nuWro_truth_df["pl_true_lep_energy"]**2 - pelee_nuWro_truth_df["pl_true_lep_mass"]**2)
pelee_nuWro_truth_df["pl_true_lep_costheta"] = np.cos(pelee_nuWro_truth_df["pl_true_lep_theta"])


pelee_nuWro_truth_df["pl_true_Q2"] = 2 * pelee_nuWro_truth_df["pl_true_nu_energy"] * (pelee_nuWro_truth_df["pl_true_lep_energy"] - pelee_nuWro_truth_df["pl_true_lep_momentum"] * pelee_nuWro_truth_df["pl_true_lep_costheta"]) - pelee_nuWro_truth_df["pl_true_lep_mass"]**2

pelee_nuWro_truth_df


In [None]:
# Attach the PeLEE truth columns to the Wire-Cell fake-data events. The inner join on
# (run, subrun, event) keeps only events present in both tables.
event_id_columns = ["run", "subrun", "event"]
nuwro_fake_df = wc_nuwro_fake_df.merge(pelee_nuWro_truth_df, how="inner", on=event_id_columns)

# Scale the POT by the number of merged rows relative to the original event count.
num_matched_events = len(nuwro_fake_df)
nuwro_fake_pot = total_wc_nuwro_fake_pot * num_matched_events / num_wc_nuwro_fake_events
print("reduced pot by factor of ", num_matched_events / num_wc_nuwro_fake_events)
nuwro_fake_df


In [None]:
# Keep true nu_mu (pdg 14) charged-current (NC flag 0) events whose true vertex is
# flagged as inside the fiducial volume.
nuwro_numuCC_fake_df = nuwro_fake_df.loc[
    (nuwro_fake_df["pl_true_nu_pdg"] == 14)
    & (nuwro_fake_df["pl_true_NC"] == 0)
    & (nuwro_fake_df["pl_inFV"] == 1)
]

In [None]:
# Estimate the volume as the bounding box of the selected events' true vertices:
# (max - min) of the vertex coordinate along each axis.
vtx_extent = {}
for axis in ("x", "y", "z"):
    vtx_coord = nuwro_numuCC_fake_df[f"pl_true_nu_vtx_{axis}"]
    vtx_extent[axis] = vtx_coord.max() - vtx_coord.min()
x_width, y_width, z_width = vtx_extent["x"], vtx_extent["y"], vtx_extent["z"]

detector_volume = x_width * y_width * z_width
print("detector volume: ", detector_volume, "cm^3")

# https://microboone.fnal.gov/wp-content/uploads/MICROBOONE-NOTE-1054-PUB.pdf
density = 1.3954 # g/cm^3
avogadro = 6.022e23 # 1/mol
n_nucleons = 40
m_mol = 39.95 # g/mol

# (mass in the volume / molar mass) * Avogadro * n_nucleons.
num_targets = density * detector_volume * avogadro * n_nucleons / m_mol
print("number of targets: ", num_targets)


In [None]:
# True Q^2 and neutrino energy of the selected fake-data events, rescaled by 1e6 and 1e3
# (the axis labels below give GeV^2 and GeV). np.nan_to_num replaces NaN entries
# by 0, so any such event is counted in the lowest bin.
fake_1030_q2_vals = np.nan_to_num(nuwro_numuCC_fake_df["pl_true_Q2"] / 1e6)
fake_1030_Enu_vals = np.nan_to_num(nuwro_numuCC_fake_df["pl_true_nu_energy"] / 1e3)

# Binning shared with the later generator cells: 30 bins over [0, 3] on both axes.
# (An earlier, wider binning used to be assigned here and immediately overwritten.)
q2_bins = np.linspace(0, 3, 31)
Enu_bins = np.linspace(0, 3, 31)

# Draw the same 2D histogram twice: linear colour scale (norm=None is matplotlib's
# default), then logarithmic. n_fake_1030 ends up holding the bin counts, which do not
# depend on the colour normalisation.
for color_norm in (None, mpl.colors.LogNorm()):
    plt.figure()
    n_fake_1030, _, _, _ = plt.hist2d(fake_1030_q2_vals, fake_1030_Enu_vals, bins=[q2_bins, Enu_bins], norm=color_norm)
    plt.colorbar()
    plt.xlabel("True $Q^2$ (GeV$^2$)")
    plt.ylabel("True Neutrino Energy (GeV)")
    plt.title("NuWro Fake Data M_A=1.03")
    plt.show()

# NuWro From XS Generator

In [None]:
# from /exp/uboone/data/users/bbogart/ma/BuildEventGenerators/jobcards/samples
f = uproot.open("generator_files/samples/NuWroCard_CC_Ar_uBFlux_1030.flat.root")
#f = uproot.open("generator_files/samples/NuWroCard_CC_alt_uBFlux_1030.flat.root") # this is on carbon rather than argon, similar XS per nucleon
#f = uproot.open("generator_files/samples/NuWroCard_CC_Ar_high_uBFlux_1030.flat.root") # this uses more events to simulate the inclusive XS, basically the same as above

# NOTE: despite the "gen_1000" prefix, these variables hold the M_A = 1.03 ("1030")
# generator sample opened above. Each branch is read once and reused for every plot.
gen_tree = f["FlatTree_VARS"]
gen_1000_q2_vals = np.array(gen_tree["Q2"].array())
gen_1000_Enu_vals = np.array(gen_tree["Enu_true"].array())

plt.figure()
n_gen_1000, _, _, _ = plt.hist2d(gen_1000_q2_vals, gen_1000_Enu_vals, bins=[q2_bins, Enu_bins], norm=mpl.colors.LogNorm())
plt.colorbar()
plt.xlabel("True $Q^2$ (GeV$^2$)")
plt.ylabel("True Neutrino Energy (GeV)")
plt.title("NuWro Generator M_A=1.03")
plt.show()

# Fake-data counterpart in the same binning, for side-by-side comparison
# (NaN entries are replaced by 0 before histogramming).
fake_1030_q2_vals = np.nan_to_num(nuwro_numuCC_fake_df["pl_true_Q2"] / 1e6)
fake_1030_Enu_vals = np.nan_to_num(nuwro_numuCC_fake_df["pl_true_nu_energy"] / 1e3)

plt.figure()
n_fake_1030, _, _, _ = plt.hist2d(fake_1030_q2_vals, fake_1030_Enu_vals, bins=[q2_bins, Enu_bins], norm=mpl.colors.LogNorm())
plt.colorbar()
plt.xlabel("True $Q^2$ (GeV$^2$)")
plt.ylabel("True Neutrino Energy (GeV)")
plt.title("NuWro Fake Data M_A=1.03")
plt.show()

# From here on fake_1030_Enu_vals is the raw column (no /1e3 and no NaN replacement);
# it is divided by 1000 where it is plotted.
fake_1030_Enu_vals = nuwro_numuCC_fake_df["pl_true_nu_energy"]

integrated_flux = 7.2e-10 # nu per POT per cm^2
scale_factor = gen_tree["fScaleFactor"].array()[0]
print("scale factor:", scale_factor)

# Equivalent POT of the generator sample, from the first entry of fScaleFactor.
gen_1000_pot = 1 / (num_targets * integrated_flux * scale_factor)
print("generator M_A=1.03 POT: ", gen_1000_pot)

fudge_factor = 1.1031934721265315 # from comparison of generator and fake data at M_A=1.03, not sure where this comes from
#fudge_factor = 1 # TEMPORARY

# One constant weight per generator event: scales the sample to the fake-data POT.
gen_1000_weights = np.full(len(gen_1000_Enu_vals), (nuwro_fake_pot / gen_1000_pot) / fudge_factor)

bins = np.linspace(0, 10, 51)

# Unweighted neutrino-energy spectra.
plt.figure()
plt.hist(gen_1000_Enu_vals, bins=bins, histtype="step", label=f"Generator, M_A=1.03 ({gen_1000_pot:.2e} POT)")
plt.hist(fake_1030_Enu_vals/1000, bins=bins, histtype="step", label=f"Fake Data, M_A=1.03 ({nuwro_fake_pot:.2e} POT)")
plt.legend()
plt.xlabel("True Neutrino Energy (GeV)")
plt.ylabel("Number of Events")
plt.show()

# Same spectra with the generator scaled to the fake-data POT, on a log axis.
plt.figure()
plt.hist(gen_1000_Enu_vals, weights=gen_1000_weights, bins=bins, histtype="step", label="Generator, M_A=1.03")
plt.hist(fake_1030_Enu_vals/1000, bins=bins, histtype="step", label="Fake Data, M_A=1.03")
plt.legend()
plt.yscale("log")
plt.xlabel("True Neutrino Energy (GeV)")
plt.ylabel(f"Number of Events (scaled to {nuwro_fake_pot:.2e} POT)")
plt.show()

# Q^2 comparison; the fake-data values here are not NaN-replaced.
fake_1030_q2_vals = nuwro_numuCC_fake_df["pl_true_Q2"] / 1e6

plt.figure()
plt.hist(gen_1000_q2_vals, weights=gen_1000_weights, bins=q2_bins, histtype="step", label="Generator, M_A=1.03")
plt.hist(fake_1030_q2_vals, bins=q2_bins, histtype="step", label="Fake Data, M_A=1.03")
plt.legend()
plt.xlabel("True $Q^2$ (GeV$^2$)")
plt.ylabel(f"Number of Events (scaled to {nuwro_fake_pot:.2e} POT)")
plt.show()

print("ratio of all events, generator / fake data: ", np.sum(gen_1000_weights) / len(fake_1030_Enu_vals))


In [None]:
plt.rcParams.update({'font.size': 8})

# Maps M_A value -> {'ratios': 2D array (Q^2 bin, E_nu bin), 'q2_bins', 'Enu_bins'},
# where each ratio is (POT-scaled generator counts) / (fake-data counts, n_fake_1030).
reweighting_ratios = {}

fig, axes = plt.subplots(2, 5, figsize=(13, 5))
axes = axes.flatten()

for idx, M_A_1000 in enumerate([700, 800, 900, 1000, 1030, 1100, 1200, 1300, 1400, 1500]):
    # Read each branch once and close the file as soon as the arrays are in memory.
    with uproot.open(f"generator_files/samples/NuWroCard_CC_Ar_uBFlux_{M_A_1000:04d}.flat.root") as gen_file:
        gen_tree = gen_file["FlatTree_VARS"]
        q2_vals = np.array(gen_tree["Q2"].array())
        Enu_vals = np.array(gen_tree["Enu_true"].array())
        scale_factor = gen_tree["fScaleFactor"].array()[0]

    print("M_A: ", M_A_1000/1000, "number of events: ", len(q2_vals), "scale factor: ", scale_factor)

    # Constant per-event weight scaling this generator sample to the fake-data POT.
    gen_pot = 1 / (num_targets * integrated_flux * scale_factor)
    gen_weights = np.full(len(Enu_vals), (nuwro_fake_pot / gen_pot) / fudge_factor)

    n_gen, _, _ = np.histogram2d(q2_vals, Enu_vals, weights=gen_weights, bins=[q2_bins, Enu_bins])

    with np.errstate(divide='ignore', invalid='ignore'):
        ratios = n_gen / n_fake_1030
    # Bins without a usable ratio get the neutral weight 1:
    #   0/0 -> NaN, x/0 -> inf (np.nan_to_num would otherwise turn inf into ~1.8e308),
    #   and 0/x -> 0 (handled on the next line).
    ratios = np.nan_to_num(ratios, nan=1, posinf=1, neginf=1)
    ratios[ratios == 0] = 1

    reweighting_ratios[M_A_1000/1000] = {
        'ratios': ratios,
        'q2_bins': q2_bins,
        'Enu_bins': Enu_bins
    }

    # ratios is indexed [Q^2 bin, E_nu bin]; transpose so Q^2 runs along the x axis.
    im = axes[idx].pcolormesh(q2_bins, Enu_bins, ratios.T, vmin=0, vmax=2, cmap="coolwarm")

    # Only show x labels for bottom row (indices 5-9)
    if idx >= 5:
        axes[idx].set_xlabel("True $Q^2$ (GeV$^2$)")
    else:
        axes[idx].set_xticklabels([])

    # Only show y labels for leftmost column (indices 0 and 5)
    if idx % 5 == 0:
        axes[idx].set_ylabel("True Neutrino Energy (GeV)")
    else:
        axes[idx].set_yticklabels([])

    axes[idx].set_title(f"$M_A$ = {M_A_1000 / 1000:.2f}")

# Shared colour bar in its own axes to the right of the panel grid (all panels use the
# same vmin/vmax, so the last mesh is representative).
fig.subplots_adjust(right=0.9)
cbar_ax = fig.add_axes([0.92, 0.15, 0.02, 0.7])
fig.colorbar(im, cax=cbar_ax)

plt.suptitle("Ratio: Generator / Fake Data ($M_A$ = 1.03)", fontsize=12)
plt.show()

# Separate handle name so the pickle file does not shadow the uproot handle `f`.
with open('nuwro_reweighting_ratios.pkl', 'wb') as ratios_file:
    pickle.dump(reweighting_ratios, ratios_file)