In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import simweights
import pickle
import os, sys
import re
import numpy as np
import matplotlib as mat
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.colors as colors
import matplotlib.gridspec as gridspec
import pandas as pd
import tables
import h5py
import math
from scipy.stats import mstats
import matplotlib as mpl
import matplotlib.font_manager as font_manager


In [5]:
sys.path.append("/data/user/tvaneede/GlobalFit/reco_processing/notebooks/weighting")
from weights import *
from utils import *
from selections import selection_mask
from fonts import *
from plot_utils import *

In [6]:
# Make the reco_processing directory importable
sys.path.append("/data/user/tvaneede/GlobalFit/reco_processing")

# The datasets module exposes one attribute per reconstruction version
from datasets import datasets

# Reconstruction versions to compare
reco_versions = ["evtgen_v1_rec_v2", "spice_tau_reco"]

# Look every requested version up by attribute name
simulation_datasets = {
    reco_version: getattr(datasets, reco_version)
    for reco_version in reco_versions
}

In [None]:
# Livetime in years; every weight in this notebook is scaled by the value in seconds below
livetime_yr = 11.687
# years -> seconds, using 365.25 days per year
livetime_s  = livetime_yr * 365.25 * 24 * 3600 # 11.687 year

In [8]:
# Output directory for the plots of this notebook
main_plotting_path = "/data/user/tvaneede/GlobalFit/reco_processing/notebooks/compare_spice_ftp/output"
# Create the directory (and missing parents) without going through a shell;
# exist_ok=True keeps the cell re-runnable.
os.makedirs(main_plotting_path, exist_ok=True)

0

In [9]:
# weight functions
# Pickle file with the MCEq flux splines
spline_file = '/data/ana/Diffuse/NNMFit/MCEq_splines/v1.2.1/MCEq_splines_PRI-Gaisser-H4a_INT-SIBYLL23c_allfluxes.pickle'

# The spline keys are "<component>_<particle>" for these six particle labels
_flux_particles = ('antinumu', 'numu', 'antinue', 'nue', 'antinutau', 'nutau')

# conventional
flux_keys_conv = [f"conv_{particle}" for particle in _flux_particles]
spline_object_conv = SplineHandler(spline_file, flux_keys_conv)
conv_flux = spline_object_conv.return_weight

def generator_conv(pdgid, energy, cos_zen):
    """Forward the three named arguments to ``conv_flux`` (conventional component)."""
    # NOTE(review): the explicit argument names are presumably what the weighter
    # inspects to pick its input columns -- confirm before renaming them.
    return conv_flux(pdgid, energy, cos_zen)

# prompt
flux_keys_pr = [f"pr_{particle}" for particle in _flux_particles]
spline_object_pr = SplineHandler(spline_file, flux_keys_pr)
pr_flux = spline_object_pr.return_weight

def generator_pr(pdgid, energy, cos_zen):
    """Forward the three named arguments to ``pr_flux`` (prompt component)."""
    return pr_flux(pdgid, energy, cos_zen)

# astro
gamma_astro = 2.87       # spectral index of the power law
per_flavor_norm = 2.12   # normalisation, multiplied by 1e-18 inside the flux model
def AstroFluxModel(pdgid, energy, cos_zen):
    """Single power-law flux evaluated at ``energy``, pivoting at 1e5.

    ``pdgid`` and ``cos_zen`` are accepted but not used: the returned flux
    depends on ``energy`` only. Works for scalars and numpy arrays alike.
    """
    # NOTE(review): the factor 0.5 presumably splits a nu+nubar normalisation
    # evenly between particle and antiparticle -- confirm.
    normalisation = 0.5*(per_flavor_norm*1e-18)
    return normalisation*(energy/1e5)**-gamma_astro

In [10]:
def open_datasets( simulation_dataset, keys_to_merge ):
    """Open the HDF files of one dataset, build and merge weighters, attach weights.

    Parameters
    ----------
    simulation_dataset : dict
        Maps a sample name to a dict providing ``'hdf_file_path'`` and
        ``'nfiles'``. The dict is modified in place: every sample gains
        ``'hdf_file'`` (a read-only ``pd.HDFStore``), ``'weighter'`` and the
        three weight arrays, and the merged samples are added as new keys.
        Because of this, calling the function a second time on the same dict
        fails on the merged keys (they have no ``'hdf_file_path'``).
    keys_to_merge : dict
        Maps the name of a new, merged sample to the list of existing sample
        names whose weighters are summed. Entries are processed in order, so
        a later entry may reference a key created by an earlier one.

    Returns
    -------
    dict
        The same ``simulation_dataset`` object that was passed in.

    Notes
    -----
    The opened ``pd.HDFStore`` handles are left open on purpose; they are
    stored under ``'hdf_file'`` for later use by the caller.
    """

    # open the files
    for key, sample in simulation_dataset.items():
        print(f"----- Extracting files for {key}")
        sample['hdf_file'] = pd.HDFStore(sample['hdf_file_path'],'r')
        sample['weighter'] = simweights.NuGenWeighter( sample['hdf_file'], nfiles=sample['nfiles'])

    # merging files
    for new_key, component_keys in keys_to_merge.items():
        print(f"----- Creating new key {new_key}")
        merged = simulation_dataset[new_key] = {'variables': {}, 'weighter': None}

        for key in component_keys:

            print(f"Using {key}")
            component = simulation_dataset[key]['weighter']
            # Identity check: "== None" would be routed through the weighter's
            # own comparison operator.
            if merged['weighter'] is None:
                merged['weighter'] = component
            else:
                # Explicit "a = a + b" builds a new object, so the first
                # component's weighter can never be modified through the alias.
                merged['weighter'] = merged['weighter'] + component

    # calculate weights: one array per flux model, scaled by the livetime in seconds
    flux_models = {
        'weights_astro': AstroFluxModel,
        'weights_conv': generator_conv,
        'weights_pr': generator_pr,
    }
    for sample in simulation_dataset.values():
        for weight_name, flux_model in flux_models.items():
            sample[weight_name] = sample["weighter"].get_weights(flux_model) * livetime_s

    return simulation_dataset




In [11]:
def _merge_plan(sub_samples):
    """Build the merge plan of one dataset.

    Each flavor is assembled from ``<flavor>_<sub_sample>`` for every entry of
    ``sub_samples`` (in the given order); ``NuAll`` combines the three flavors.
    """
    flavors = ("NuE", "NuMu", "NuTau")
    plan = {flavor: [f"{flavor}_{tag}" for tag in sub_samples] for flavor in flavors}
    plan["NuAll"] = list(flavors)
    return plan


# One merge plan per reconstruction version; each call returns fresh objects
keys_to_merge = {
    "evtgen_v1_rec_v2": _merge_plan(["midE", "highE"]),
    "v2": _merge_plan(["midE", "highE"]),
    "spice_tau_reco": _merge_plan(["midE1", "highE1", "midE2", "highE2"]),
}


In [10]:
# Open, merge and weight each selected dataset; the result is stored back under the same key
for key in simulation_datasets:
    simulation_datasets[key] = open_datasets(
        simulation_dataset=simulation_datasets[key],
        keys_to_merge=keys_to_merge[key],
    )

----- Extracting files for NuTau_midE
----- Extracting files for NuTau_highE
----- Extracting files for NuE_midE
----- Extracting files for NuE_highE
----- Extracting files for NuMu_midE
----- Extracting files for NuMu_highE
----- Creating new key NuE
Using NuE_midE
Using NuE_highE
----- Creating new key NuMu
Using NuMu_midE
Using NuMu_highE
----- Creating new key NuTau
Using NuTau_midE
Using NuTau_highE
----- Creating new key NuAll
Using NuE
Using NuMu
Using NuTau
----- Extracting files for NuTau_midE1
----- Extracting files for NuTau_highE1
----- Extracting files for NuTau_midE2
----- Extracting files for NuTau_highE2
----- Extracting files for NuE_midE1
----- Extracting files for NuE_highE1
----- Extracting files for NuE_midE2
----- Extracting files for NuE_highE2
----- Extracting files for NuMu_midE1
----- Extracting files for NuMu_highE1
----- Extracting files for NuMu_midE2
----- Extracting files for NuMu_highE2
----- Creating new key NuE
Using NuE_midE1
Using NuE_highE1
Using Nu

In [11]:
def _rate_with_error(weights):
    """Format the summed weights and sqrt(sum of squared weights) as 'rate ± error'."""
    total = np.sum(weights)
    uncertainty = np.sqrt(np.sum(weights**2))
    return f"{total:.2f} ± {uncertainty:.2f}"


astro_flavors = ['NuE', "NuMu", "NuTau"]
data = {}

for key, simulation_dataset in simulation_datasets.items():

    # Astrophysical expectation, one column per flavor
    channel_data = {
        f"astro_{flavor}": _rate_with_error(
            simulation_dataset[flavor]["weighter"].get_weights(AstroFluxModel) * livetime_s
        )
        for flavor in astro_flavors
    }

    # Conventional and prompt expectations use the all-flavor weighter
    all_flavor_weighter = simulation_dataset["NuAll"]["weighter"]
    channel_data["conv"] = _rate_with_error(all_flavor_weighter.get_weights(generator_conv) * livetime_s)
    channel_data["prompt"] = _rate_with_error(all_flavor_weighter.get_weights(generator_pr) * livetime_s)

    data[key] = channel_data

# One row per dataset, columns in a fixed order
df = pd.DataFrame.from_dict(data, orient='index')
columns_order = [f"astro_{flavor}" for flavor in astro_flavors] + ["conv", "prompt"]
df = df[columns_order]

# Display as string table
print(df.to_string())

                     astro_NuE    astro_NuMu   astro_NuTau          conv        prompt
evtgen_v1_rec_v2  56.20 ± 0.54  14.74 ± 0.22  33.91 ± 0.39  32.36 ± 0.97  12.47 ± 0.10
spice_tau_reco    56.77 ± 0.56  20.42 ± 0.22  34.89 ± 0.43  38.77 ± 0.95  13.41 ± 0.11


In [12]:
for key, simulation_dataset in simulation_datasets.items():

    # Astrophysical rate of every sub-sample that feeds the merged "NuMu" key
    channel_data = {}
    for flavor in keys_to_merge[key]["NuMu"]:
        weights = simulation_dataset[flavor]["weighter"].get_weights(AstroFluxModel) * livetime_s
        rate, error = np.sum(weights), np.sqrt(np.sum(weights**2))
        channel_data[f"astro_{flavor}"] = f"{rate:.2f} ± {error:.2f}"

    # One single-row table per dataset
    data = {key: channel_data}
    df = pd.DataFrame.from_dict(data, orient='index')

    # Display as string table
    print(df.to_string())

                 astro_NuMu_midE astro_NuMu_highE
evtgen_v1_rec_v2    13.92 ± 0.21      0.82 ± 0.04
               astro_NuMu_midE1 astro_NuMu_highE1 astro_NuMu_midE2 astro_NuMu_highE2
spice_tau_reco     19.28 ± 0.41       1.22 ± 0.04     19.18 ± 0.27       1.21 ± 0.03


The evtgen_v1_rec_v2 NuMu rates are only about 72% (midE) and 67% (highE) of the spice_tau_reco rates. Let's see if v2 of the reco is fine.

In [13]:
import importlib
from datasets import datasets

# Re-import the datasets module so that its dictionaries are rebuilt from scratch
importlib.reload(datasets)

# Reconstruction versions to compare
reco_versions = ["v2", "spice_tau_reco"]

# Look every requested version up by attribute name
simulation_datasets = {
    reco_version: getattr(datasets, reco_version)
    for reco_version in reco_versions
}

In [14]:
# Open, merge and weight each selected dataset; the result is stored back under the same key
for key in simulation_datasets:
    simulation_datasets[key] = open_datasets(
        simulation_dataset=simulation_datasets[key],
        keys_to_merge=keys_to_merge[key],
    )

----- Extracting files for NuTau_midE
----- Extracting files for NuTau_highE
----- Extracting files for NuE_midE
----- Extracting files for NuE_highE
----- Extracting files for NuMu_midE
----- Extracting files for NuMu_highE
----- Creating new key NuE
Using NuE_midE
Using NuE_highE
----- Creating new key NuMu
Using NuMu_midE
Using NuMu_highE
----- Creating new key NuTau
Using NuTau_midE
Using NuTau_highE
----- Creating new key NuAll
Using NuE
Using NuMu
Using NuTau
----- Extracting files for NuTau_midE1
----- Extracting files for NuTau_highE1
----- Extracting files for NuTau_midE2
----- Extracting files for NuTau_highE2
----- Extracting files for NuE_midE1
----- Extracting files for NuE_highE1
----- Extracting files for NuE_midE2
----- Extracting files for NuE_highE2
----- Extracting files for NuMu_midE1
----- Extracting files for NuMu_highE1
----- Extracting files for NuMu_midE2
----- Extracting files for NuMu_highE2
----- Creating new key NuE
Using NuE_midE1
Using NuE_highE1
Using Nu

In [15]:
for key, simulation_dataset in simulation_datasets.items():

    # Astrophysical weights of each NuMu sub-sample of this dataset
    sub_sample_weights = {
        flavor: simulation_dataset[flavor]["weighter"].get_weights(AstroFluxModel) * livetime_s
        for flavor in keys_to_merge[key]["NuMu"]
    }

    # Rate = sum of weights, error = sqrt(sum of squared weights)
    channel_data = {
        f"astro_{flavor}": f"{np.sum(weights):.2f} ± {np.sqrt(np.sum(weights**2)):.2f}"
        for flavor, weights in sub_sample_weights.items()
    }

    # One single-row table per dataset
    data = {key: channel_data}
    df = pd.DataFrame.from_dict(data, orient='index')

    # Display as string table
    print(df.to_string())

   astro_NuMu_midE astro_NuMu_highE
v2    13.92 ± 0.21      0.82 ± 0.04
               astro_NuMu_midE1 astro_NuMu_highE1 astro_NuMu_midE2 astro_NuMu_highE2
spice_tau_reco     19.28 ± 0.41       1.22 ± 0.04     19.18 ± 0.27       1.21 ± 0.03


Also missing! Let's take a look at ftp_l3casc and do a cut myself

In [16]:
import importlib
from datasets import datasets

# Re-import the datasets module so that its dictionaries are rebuilt from scratch
importlib.reload(datasets)

# Cascade-level samples to compare
reco_versions = ["ftp_l3casc", "spice_l3casc"]

# Look every requested version up by attribute name
simulation_datasets = {
    reco_version: getattr(datasets, reco_version)
    for reco_version in reco_versions
}

In [17]:
# Both cascade-level datasets use the same four sub-samples per flavor
l3casc_sub_samples = ["midE1", "highE1", "midE2", "highE2"]
l3casc_flavors = ['NuE', "NuMu", "NuTau"]

for dataset_name in ("ftp_l3casc", "spice_l3casc"):
    merge_plan = {
        flavor: [f"{flavor}_{tag}" for tag in l3casc_sub_samples]
        for flavor in l3casc_flavors
    }
    merge_plan["NuAll"] = list(l3casc_flavors)
    keys_to_merge[dataset_name] = merge_plan

# Open, merge and weight each selected dataset
for key in simulation_datasets:
    simulation_datasets[key] = open_datasets(
        simulation_dataset=simulation_datasets[key],
        keys_to_merge=keys_to_merge[key],
    )

----- Extracting files for NuTau_midE1
----- Extracting files for NuTau_midE2
----- Extracting files for NuTau_highE1
----- Extracting files for NuTau_highE2
----- Extracting files for NuE_midE1
----- Extracting files for NuE_midE2
----- Extracting files for NuE_highE1
----- Extracting files for NuE_highE2
----- Extracting files for NuMu_midE1
----- Extracting files for NuMu_midE2
----- Extracting files for NuMu_highE1
----- Extracting files for NuMu_highE2
----- Creating new key NuE
Using NuE_midE1
Using NuE_highE1
Using NuE_midE2
Using NuE_highE2
----- Creating new key NuMu
Using NuMu_midE1
Using NuMu_highE1
Using NuMu_midE2
Using NuMu_highE2
----- Creating new key NuTau
Using NuTau_midE1
Using NuTau_highE1
Using NuTau_midE2
Using NuTau_highE2
----- Creating new key NuAll
Using NuE
Using NuMu
Using NuTau
----- Extracting files for NuTau_midE1
----- Extracting files for NuTau_highE1
----- Extracting files for NuTau_midE2
----- Extracting files for NuTau_highE2
----- Extracting files f

In [18]:
key = "ftp_l3casc"
simulation_dataset = simulation_datasets[key]

channel_data = {}          # all events
channel_data_masked = {}   # events with HESE_CausalQTot > 6000

for flavor in keys_to_merge[key]["NuMu"]:
    sample = simulation_dataset[flavor]
    column = f"astro_{flavor}"

    weights = sample["weighter"].get_weights(AstroFluxModel) * livetime_s
    channel_data[column] = f"{np.sum(weights):.2f} ± {np.sqrt(np.sum(weights**2)):.2f}"

    # Apply the charge cut by hand
    mask = sample["hdf_file"]["HESE_CausalQTot"].value > 6000
    weights_masked = weights[mask]
    channel_data_masked[column] = f"{np.sum(weights_masked):.2f} ± {np.sqrt(np.sum(weights_masked**2)):.2f}"

data = {key: channel_data, f"{key}_masked": channel_data_masked}

# Create DataFrame
df = pd.DataFrame.from_dict(data, orient='index')

# Display as string table
print(df.to_string())

                  astro_NuMu_midE1 astro_NuMu_highE1 astro_NuMu_midE2 astro_NuMu_highE2
ftp_l3casc           890.37 ± 1.65       8.84 ± 0.05    894.52 ± 1.70       8.86 ± 0.06
ftp_l3casc_masked     13.94 ± 0.13       0.83 ± 0.01     14.18 ± 0.14       0.84 ± 0.02


The same number of events seems to be missing. Let's check just the 0000000-0000999 files.

In [19]:
def _astro_rate_before_and_after_cut(file_path, nfiles, qtot_cut=6000):
    """Return the astrophysical rate of one HDF file, without and with a charge cut.

    Parameters
    ----------
    file_path : str
        HDF file to weight.
    nfiles : int
        Number of files passed on to ``simweights.NuGenWeighter``.
    qtot_cut : float, optional
        Events are kept by the cut when ``HESE_CausalQTot`` is above this value.

    Returns
    -------
    tuple
        ``(rate, rate_masked)``: sum of all weights and sum of the weights
        passing the cut, both scaled by ``livetime_s``.
    """
    # The store stays open while the weights and the charge column are read
    # and is closed again afterwards, so no file handle is leaked.
    with pd.HDFStore(file_path, 'r') as hdf:
        weighter = simweights.NuGenWeighter(hdf, nfiles=nfiles)
        weights = weighter.get_weights(AstroFluxModel) * livetime_s
        mask = hdf["HESE_CausalQTot"].value > qtot_cut
        return np.sum(weights), np.sum(weights[mask])


file_path_NuMu_midE = "/data/user/tvaneede/GlobalFit/reco_processing/hdf/output/ftp_l3casc/NuMu_22645_0000000-0000999.h5"
nfiles_NuMu_midE = 1000
rate_NuMu_midE, rate_masked_NuMu_midE = _astro_rate_before_and_after_cut(file_path_NuMu_midE, nfiles_NuMu_midE)

print("rate_NuMu_midE", rate_NuMu_midE, rate_masked_NuMu_midE)

file_path_NuMu_highE = "/data/user/tvaneede/GlobalFit/reco_processing/hdf/output/ftp_l3casc/NuMu_22644_0000000-0000999.h5"
nfiles_NuMu_highE = 1000
rate_NuMu_highE, rate_masked_NuMu_highE = _astro_rate_before_and_after_cut(file_path_NuMu_highE, nfiles_NuMu_highE)

print("rate_NuMu_highE", rate_NuMu_highE, rate_masked_NuMu_highE)


rate_NuMu_midE 888.3088102238869 14.876935899724039
rate_NuMu_highE 8.933702850310775 0.9000591852292844


I am really starting to believe that we actually have fewer muon neutrinos at HESE level for the ftp-v3 simulations. Let's make an hdf of the spice files at cascade level to see if there is a difference there as well.

Turns out one reco file was corrupted, so the hdf of that group of files was broken, see tools/find_error_in_log.py

Missing jobs: 0

Error jobs: 1
{'NuMu_22043_0000000-0000999': {'LOGDIR': '/scratch/tvaneede/reco/hdf_taupede_tianlu/spice_l3casc/hdf_dag_spice_l3casc/logs', 'JOBID': 'NuMu_22043_0000000-0000999', 'INPATH': '/data/sim/IceCube/2020/filtered/level3/cascade/neutrino-generator/22043/0000000-0000999', 'OUTFILE': '/data/user/tvaneede/GlobalFit/reco_processing/hdf/output/spice_l3casc/NuMu_22043_0000000-0000999.h5'}}

In [20]:
key = "spice_l3casc"
simulation_dataset = simulation_datasets[key]

channel_data = {}          # all events
channel_data_masked = {}   # events with HESE_CausalQTot > 6000

for flavor in ["NuMu_midE1","NuMu_highE1","NuMu_midE2", "NuMu_highE2"]: # "NuMu_midE1" was/is corrupt
    sample = simulation_dataset[flavor]
    column = f"astro_{flavor}"

    weights = sample["weighter"].get_weights(AstroFluxModel) * livetime_s
    channel_data[column] = f"{np.sum(weights):.2f} ± {np.sqrt(np.sum(weights**2)):.2f}"

    # Apply the charge cut by hand
    mask = sample["hdf_file"]["HESE_CausalQTot"].value > 6000
    weights_masked = weights[mask]
    channel_data_masked[column] = f"{np.sum(weights_masked):.2f} ± {np.sqrt(np.sum(weights_masked**2)):.2f}"

data = {key: channel_data, f"{key}_masked": channel_data_masked}

# Create DataFrame
df = pd.DataFrame.from_dict(data, orient='index')

# Display as string table
print(df.to_string())

                    astro_NuMu_midE1 astro_NuMu_highE1 astro_NuMu_midE2 astro_NuMu_highE2
spice_l3casc           917.54 ± 4.53       9.31 ± 0.13    919.78 ± 2.97       9.19 ± 0.08
spice_l3casc_masked     14.01 ± 0.36       0.81 ± 0.04     13.77 ± 0.23       0.82 ± 0.02


Wow!! It turns out, if I do the cut myself on spice, I get the same number. How did Neha get higher values? Probably due to her definitions in 
https://github.com/icecube/wg-diffuse/blob/2023_GlobalFit_Flavor/Ternary_Classifier/segments/VHESelfVeto.py

I made new datasets in spice_l3casc_qtot that contains both CausalQTot calculations. Let's see what happens.

In [24]:
import importlib
from datasets import datasets

# Re-import the datasets module so that its dictionaries are rebuilt from scratch
importlib.reload(datasets)

# Cascade-level samples to compare
reco_versions = ["ftp_l3casc", "spice_l3casc_qtot"]

# Look every requested version up by attribute name
simulation_datasets = {
    reco_version: getattr(datasets, reco_version)
    for reco_version in reco_versions
}

In [25]:
# Both datasets use the same four sub-samples per flavor
qtot_sub_samples = ["midE1", "highE1", "midE2", "highE2"]
qtot_flavors = ['NuE', "NuMu", "NuTau"]

for dataset_name in ("ftp_l3casc", "spice_l3casc_qtot"):
    merge_plan = {
        flavor: [f"{flavor}_{tag}" for tag in qtot_sub_samples]
        for flavor in qtot_flavors
    }
    merge_plan["NuAll"] = list(qtot_flavors)
    keys_to_merge[dataset_name] = merge_plan

# Open, merge and weight each selected dataset
for key in simulation_datasets:
    simulation_datasets[key] = open_datasets(
        simulation_dataset=simulation_datasets[key],
        keys_to_merge=keys_to_merge[key],
    )

----- Extracting files for NuTau_midE1
----- Extracting files for NuTau_midE2
----- Extracting files for NuTau_highE1
----- Extracting files for NuTau_highE2
----- Extracting files for NuE_midE1
----- Extracting files for NuE_midE2
----- Extracting files for NuE_highE1
----- Extracting files for NuE_highE2
----- Extracting files for NuMu_midE1
----- Extracting files for NuMu_midE2
----- Extracting files for NuMu_highE1
----- Extracting files for NuMu_highE2
----- Creating new key NuE
Using NuE_midE1
Using NuE_highE1
Using NuE_midE2
Using NuE_highE2
----- Creating new key NuMu
Using NuMu_midE1
Using NuMu_highE1
Using NuMu_midE2
Using NuMu_highE2
----- Creating new key NuTau
Using NuTau_midE1
Using NuTau_highE1
Using NuTau_midE2
Using NuTau_highE2
----- Creating new key NuAll
Using NuE
Using NuMu
Using NuTau
----- Extracting files for NuTau_midE1
----- Extracting files for NuTau_highE1
----- Extracting files for NuTau_midE2
----- Extracting files for NuTau_highE2
----- Extracting files f

In [28]:
key = "spice_l3casc_qtot"
simulation_dataset = simulation_datasets[key]

channel_data = {}               # all events
channel_data_masked = {}        # HESE_CausalQTot > 6000
channel_data_masked_neha = {}   # CausalQTot > 6000 and VHESelfVeto == False

for flavor in ["NuMu_midE1","NuMu_highE1","NuMu_midE2", "NuMu_highE2"]: # "NuMu_midE1" was/is corrupt
    sample = simulation_dataset[flavor]
    hdf_file = sample["hdf_file"]
    column = f"astro_{flavor}"

    weights = sample["weighter"].get_weights(AstroFluxModel) * livetime_s
    channel_data[column] = f"{np.sum(weights):.2f} ± {np.sqrt(np.sum(weights**2)):.2f}"

    # First selection: cut on the HESE_CausalQTot column only
    hese_mask = hdf_file["HESE_CausalQTot"].value > 6000
    hese_weights = weights[hese_mask]
    channel_data_masked[column] = f"{np.sum(hese_weights):.2f} ± {np.sqrt(np.sum(hese_weights**2)):.2f}"

    # Second selection: cut on the CausalQTot column and require VHESelfVeto to be False
    neha_mask = (hdf_file["CausalQTot"].value > 6000) & (hdf_file["VHESelfVeto"].value == False)
    neha_weights = weights[neha_mask]
    channel_data_masked_neha[column] = f"{np.sum(neha_weights):.2f} ± {np.sqrt(np.sum(neha_weights**2)):.2f}"

data = {
    key: channel_data,
    f"{key}_masked": channel_data_masked,
    f"{key}_masked_neha": channel_data_masked_neha,
}

# Create DataFrame
df = pd.DataFrame.from_dict(data, orient='index')

# Display as string table
print(df.to_string())

                              astro_NuMu_midE1 astro_NuMu_highE1 astro_NuMu_midE2 astro_NuMu_highE2
spice_l3casc_qtot                917.54 ± 4.53       9.31 ± 0.13    919.78 ± 2.97       9.19 ± 0.08
spice_l3casc_qtot_masked          14.01 ± 0.36       0.81 ± 0.04     13.77 ± 0.23       0.82 ± 0.02
spice_l3casc_qtot_masked_neha     14.01 ± 0.36       0.81 ± 0.04     13.77 ± 0.23       0.82 ± 0.02


Interesting. It seems that both definitions get the exact same events. Why does spice_tau_reco have more events?

Let's take a look at one file and select some events.

In [3]:
# Cascade-level file: count events before and after both selections
file_path_l3 = "/data/user/tvaneede/GlobalFit/reco_processing/hdf/output/spice_l3casc_nehaqtot/NuMu_22043_0000000-0000999.h5"
hdf_l3 = pd.HDFStore(file_path_l3,'r')
header_l3 = hdf_l3["I3EventHeader"]
total_l3 = len(header_l3)

mask_l3 = hdf_l3["HESE_CausalQTot"].value > 6000
masked_l3 = len(header_l3[mask_l3])

mask_l3_neha = (hdf_l3["CausalQTot"].value > 6000) & (hdf_l3["VHESelfVeto"].value == False)
masked_l3_neha = len(header_l3[mask_l3_neha])

print("total_events", total_l3, "masked_events", masked_l3, masked_l3_neha)

# Same file group in spice_tau_reco: count events before and after the CausalQTot/VHESelfVeto selection
file_path_reco = "/data/user/tvaneede/GlobalFit/reco_processing/hdf/output/spice_tau_reco/NuMu_22043_0000000-0000999.h5"
hdf_reco = pd.HDFStore(file_path_reco,'r')
header_reco = hdf_reco["I3EventHeader"]
total_reco = len(header_reco)

mask_reco_neha = (hdf_reco["CausalQTot"].value > 6000) & (hdf_reco["VHESelfVeto"].value == False)
masked_reco_neha = len(header_reco[mask_reco_neha])

print("total_events", total_reco, "masked_events", masked_reco_neha)



total_events 101608 masked_events 4590 4590
total_events 7128 masked_events 7128


In [5]:
# Extract masked headers
header_l3_masked = hdf_l3["I3EventHeader"][mask_l3_neha][["Run", "Event"]]
header_reco_masked = hdf_reco["I3EventHeader"][mask_reco_neha][["Run", "Event"]]

# Convert to sets of (Run, Event) pairs
l3_events = set(zip(header_l3_masked["Run"], header_l3_masked["Event"]))
reco_events = set(zip(header_reco_masked["Run"], header_reco_masked["Event"]))

only_in_reco = reco_events - l3_events
print(f"Number of events only in reco after mask: {len(only_in_reco)}")

# Create boolean mask for those events
reco_header = hdf_reco["I3EventHeader"][mask_reco_neha]
only_in_reco_mask = reco_header.apply(lambda row: (row["Run"], row["Event"]) in only_in_reco, axis=1)
reco_only_events = reco_header[only_in_reco_mask]

reco_only_events[reco_only_events["Event"] < 100]

Number of events only in reco after mask: 2538


Unnamed: 0,Run,Event,SubEvent,SubEventStream,exists,time_start_utc_daq,time_start_mjd,time_end_utc_daq,time_end_mjd
141,2204300670,89,0,0,1,130612472805802469,59000.171844,130612472806023139,59000.171844
226,2204300612,99,0,0,1,130612472805775229,59000.171844,130612472806005409,59000.171844
274,2204300304,72,0,0,1,130612472805729339,59000.171844,130612472805933649,59000.171844
284,2204300574,13,0,0,1,130612472805725099,59000.171844,130612472805939449,59000.171844
616,2204300282,57,0,0,1,130612472805790819,59000.171844,130612472806004239,59000.171844
777,2204300667,83,0,0,1,130612472805704679,59000.171844,130612472805944849,59000.171844
1147,2204300424,32,0,0,1,130612472805716559,59000.171844,130612472805945729,59000.171844
1419,2204300455,1,0,0,1,130612472805719919,59000.171844,130612472805937009,59000.171844
1441,2204300342,81,0,0,1,130612472805710669,59000.171844,130612472805946249,59000.171844
1463,2204300535,67,0,0,1,130612472810244899,59000.171844,130612472810460059,59000.171844


I have one event I will take a look at:
1419  2204300455      1         0               0       1  130612472805719919   

dataio-shovel /data/sim/IceCube/2020/filtered/level3/cascade/neutrino-generator/22043/0000000-0000999/Level3_NuMu_NuGenCCNC.022043.000455.i3.zst 
I don't see this event there.

dataio-shovel /data/ana/Diffuse/GlobalFit_Flavor/taupede/SnowStorm/RecowithBfr/Baseline/22043/0000000-0000999/Reco_NuMu_000455_out.i3.bz2 
Here I find it:
I3EventHeader [I3EventHeader]:
[ I3EventHeader:
        StartTime: 2020-05-31 04:07:27.280,571,991,9 UTC
         EndTime : 2020-05-31 04:07:27.280,593,700,9 UTC
           RunID : 2204300455
        SubrunID : 4294967295
         EventID : 1
      SubEventID : 0
  SubEventStream : InIceSplit
]
with CausalQTot 6839.98

I do find it in the muon files.
dataio-shovel /data/sim/IceCube/2020/filtered/level3/muon/neutrino-generator/22043/0000000-0000999/Level3_NuMu_NuGenCCNC.022043.000455.i3.zst 

I3EventHeader [I3EventHeader]:
[ I3EventHeader:
        StartTime: 2020-05-31 04:07:27.280,565,991,9 UTC
         EndTime : 2020-05-31 04:07:27.280,597,699,9 UTC
           RunID : 2204300455
        SubrunID : 4294967295
         EventID : 1
      SubEventID : 0
  SubEventStream : Final
]

Wait a second: should I actually be starting from the cascade-level reco files at all?

I should not have! I should start from the l2 files. From v3 onwards I do that.

In [12]:
import importlib
from datasets import datasets

# Re-import the datasets module so that its dictionaries are rebuilt from scratch
importlib.reload(datasets)

# Reconstruction versions to compare
reco_versions = ["spice_tau_reco", "v3"]

# Look every requested version up by attribute name
simulation_datasets = {
    reco_version: getattr(datasets, reco_version)
    for reco_version in reco_versions
}

In [13]:
# Sub-samples that make up each flavor, per dataset
sub_samples_per_dataset = {
    "v3": ["midE", "highE"],
    "spice_tau_reco": ["midE1", "highE1", "midE2", "highE2"],
}
merge_flavors = ['NuE', "NuMu", "NuTau"]

for dataset_name, sub_samples in sub_samples_per_dataset.items():
    merge_plan = {
        flavor: [f"{flavor}_{tag}" for tag in sub_samples]
        for flavor in merge_flavors
    }
    merge_plan["NuAll"] = list(merge_flavors)
    keys_to_merge[dataset_name] = merge_plan

# Open, merge and weight each selected dataset
for key in simulation_datasets:
    simulation_datasets[key] = open_datasets(
        simulation_dataset=simulation_datasets[key],
        keys_to_merge=keys_to_merge[key],
    )

----- Extracting files for NuTau_midE1
----- Extracting files for NuTau_highE1
----- Extracting files for NuTau_midE2
----- Extracting files for NuTau_highE2
----- Extracting files for NuE_midE1
----- Extracting files for NuE_highE1
----- Extracting files for NuE_midE2
----- Extracting files for NuE_highE2
----- Extracting files for NuMu_midE1
----- Extracting files for NuMu_highE1
----- Extracting files for NuMu_midE2
----- Extracting files for NuMu_highE2
----- Creating new key NuE
Using NuE_midE1
Using NuE_highE1
Using NuE_midE2
Using NuE_highE2
----- Creating new key NuMu
Using NuMu_midE1
Using NuMu_highE1
Using NuMu_midE2
Using NuMu_highE2
----- Creating new key NuTau
Using NuTau_midE1
Using NuTau_highE1
Using NuTau_midE2
Using NuTau_highE2
----- Creating new key NuAll
Using NuE
Using NuMu
Using NuTau
----- Extracting files for NuTau_midE
----- Extracting files for NuTau_highE
----- Extracting files for NuE_midE
----- Extracting files for NuE_highE
----- Extracting files for NuMu_

In [14]:
astro_flavors = ['NuE', "NuMu", "NuTau"]
# Atmospheric components: column name -> flux model, both evaluated on the all-flavor sample
atmospheric_models = {"conv": generator_conv, "prompt": generator_pr}

data = {}

for key, simulation_dataset in simulation_datasets.items():

    channel_data = {}

    # Astrophysical expectation, one column per flavor
    for flavor in astro_flavors:
        weights = simulation_dataset[flavor]["weighter"].get_weights(AstroFluxModel) * livetime_s
        channel_data[f"astro_{flavor}"] = f"{np.sum(weights):.2f} ± {np.sqrt(np.sum(weights**2)):.2f}"

    # Conventional, then prompt
    all_flavor_weighter = simulation_dataset["NuAll"]["weighter"]
    for column, flux_model in atmospheric_models.items():
        weights = all_flavor_weighter.get_weights(flux_model) * livetime_s
        channel_data[column] = f"{np.sum(weights):.2f} ± {np.sqrt(np.sum(weights**2)):.2f}"

    data[key] = channel_data

# One row per dataset, columns in a fixed order
df = pd.DataFrame.from_dict(data, orient='index')
columns_order = [f"astro_{flavor}" for flavor in astro_flavors] + list(atmospheric_models)
df = df[columns_order]

# Display as string table
print(df.to_string())

                   astro_NuE    astro_NuMu   astro_NuTau          conv        prompt
spice_tau_reco  56.77 ± 0.56  20.42 ± 0.22  34.89 ± 0.43  38.77 ± 0.95  13.41 ± 0.11
v3              56.81 ± 2.48  12.05 ± 0.88  34.89 ± 1.87  26.54 ± 3.13  12.24 ± 0.47


I am still missing muons! Let's check:
- the scratch log, to see if I actually reconstructed the level2 files: 22645 YES, 22644 YES
- whether there are errors in the reco path: YES, some, but only the gulliver problem.
Let's see if I actually have extra events with respect to v2, which was based on l3 cascade.

In [15]:
import importlib
from datasets import datasets

# Re-import the datasets module so that its dictionaries are rebuilt from scratch
importlib.reload(datasets)

# Reconstruction versions to compare
reco_versions = ["v2", "v3"]

# Look every requested version up by attribute name
simulation_datasets = {
    reco_version: getattr(datasets, reco_version)
    for reco_version in reco_versions
}

In [17]:
# v2 and v3 share the same layout: a midE and a highE sub-sample per flavor
merge_flavors = ['NuE', "NuMu", "NuTau"]

for dataset_name in ("v2", "v3"):
    merge_plan = {
        flavor: [f"{flavor}_midE", f"{flavor}_highE"]
        for flavor in merge_flavors
    }
    merge_plan["NuAll"] = list(merge_flavors)
    keys_to_merge[dataset_name] = merge_plan

# Open, merge and weight each selected dataset
for key in simulation_datasets:
    simulation_datasets[key] = open_datasets(
        simulation_dataset=simulation_datasets[key],
        keys_to_merge=keys_to_merge[key],
    )

----- Extracting files for NuTau_midE
----- Extracting files for NuTau_highE
----- Extracting files for NuE_midE
----- Extracting files for NuE_highE
----- Extracting files for NuMu_midE
----- Extracting files for NuMu_highE
----- Creating new key NuE
Using NuE_midE
Using NuE_highE
----- Creating new key NuMu
Using NuMu_midE
Using NuMu_highE
----- Creating new key NuTau
Using NuTau_midE
Using NuTau_highE
----- Creating new key NuAll
Using NuE
Using NuMu
Using NuTau
----- Extracting files for NuTau_midE
----- Extracting files for NuTau_highE
----- Extracting files for NuE_midE
----- Extracting files for NuE_highE
----- Extracting files for NuMu_midE
----- Extracting files for NuMu_highE
----- Creating new key NuE
Using NuE_midE
Using NuE_highE
----- Creating new key NuMu
Using NuMu_midE
Using NuMu_highE
----- Creating new key NuTau
Using NuTau_midE
Using NuTau_highE
----- Creating new key NuAll
Using NuE
Using NuMu
Using NuTau


In [34]:
# Same file group processed with v2 and with v3
file_path_v2 = "/data/user/tvaneede/GlobalFit/reco_processing/hdf/output/v2/NuMu_22645_0000000-0000999.h5"
file_path_v3 = "/data/user/tvaneede/GlobalFit/reco_processing/hdf/output/v3/NuMu_22645_0000000-0000999.h5"
hdf_v2 = pd.HDFStore(file_path_v2,'r')
hdf_v3 = pd.HDFStore(file_path_v3,'r')

# Read each event header table once
header_v2 = hdf_v2["I3EventHeader"]
header_v3 = hdf_v3["I3EventHeader"]

# Restrict the comparison to a single run
run_mask_v2 = header_v2["Run"] == 2264500000
run_mask_v3 = header_v3["Run"] == 2264500000
header_v2_masked = header_v2[run_mask_v2][["Run", "Event"]]
header_v3_masked = header_v3[run_mask_v3][["Run", "Event"]]

# A (Run, Event) pair is used to identify an event in both files
v2_events = set(zip(header_v2_masked["Run"], header_v2_masked["Event"]))
v3_events = set(zip(header_v3_masked["Run"], header_v3_masked["Event"]))

only_in_v2 = v2_events.difference(v3_events)
only_in_v3 = v3_events.difference(v2_events)

print(f"Number of events only in v2: {len(only_in_v2)}")
print(only_in_v2)
print(header_v2_masked)

print(f"Number of events only in v3: {len(only_in_v3)}")
print(only_in_v3)
print(header_v3_masked)


Number of events only in v2: 3
{(2264500000, 1382), (2264500000, 3880), (2264500000, 47)}
             Run  Event
5442  2264500000     47
5443  2264500000   1382
5444  2264500000   3880
Number of events only in v3: 0
set()
Empty DataFrame
Columns: [Run, Event]
Index: []


Lets check event 47:
/data/sim/IceCube/2023/filtered/level2/neutrino-generator/22645/0000000-0000999/Level2_NuMu_NuGenCCNC.022645.000000.i3.zst
Found event in frame 45, already has HESE_VHESelfVeto == false && HESE_CausalQTot == 22676.2

Now in v2:
/data/user/tvaneede/GlobalFit/reco_processing/output/v2/22645/0000000-0000999/Reco_NuMu_NuGenCCNC.022645.000000.i3.zst_out.i3.bz2
I find the event in frame 7 with HESE_VHESelfVeto == false && HESE_CausalQTot == 22676.2.

Now in v3:
/data/user/tvaneede/GlobalFit/reco_processing/output/v3/22645/0000000-0000999/Reco_NuMu_NuGenCCNC.022645.000000.i3.zst_out.i3.bz2
I also see the event!!

Is there an error in the making of the hdf? NO
Found it: the hdf maker was still configured with a limit on the number of frames to process. I reran the hdf maker, let's see:

In [36]:
import importlib
from datasets import datasets

# Pick up any edits made to the datasets module since it was first imported.
importlib.reload(datasets)

# Reco versions to compare; the order here is the order they are opened in.
reco_versions = ["spice_tau_reco", "v3"]

# Look up the dataset definition of each requested reco version by attribute name.
simulation_datasets = {}
for reco_version in reco_versions:
    simulation_datasets[reco_version] = getattr(datasets, reco_version)

# Per-flavor merge maps: each flavor key combines its energy sub-samples, and
# "NuAll" combines the three merged flavor keys (so it must come last).
_flavors = ["NuE", "NuMu", "NuTau"]

keys_to_merge["v3"] = {
    **{flv: [f"{flv}_midE", f"{flv}_highE"] for flv in _flavors},
    "NuAll": list(_flavors),
}

keys_to_merge["spice_tau_reco"] = {
    **{
        flv: [f"{flv}_midE1", f"{flv}_highE1", f"{flv}_midE2", f"{flv}_highE2"]
        for flv in _flavors
    },
    "NuAll": list(_flavors),
}

# Replace every dataset definition with its opened (and merged) counterpart.
for key in list(simulation_datasets):
    simulation_datasets[key] = open_datasets(simulation_datasets[key], keys_to_merge[key])

----- Extracting files for NuTau_midE1
----- Extracting files for NuTau_highE1
----- Extracting files for NuTau_midE2
----- Extracting files for NuTau_highE2
----- Extracting files for NuE_midE1
----- Extracting files for NuE_highE1
----- Extracting files for NuE_midE2
----- Extracting files for NuE_highE2
----- Extracting files for NuMu_midE1
----- Extracting files for NuMu_highE1
----- Extracting files for NuMu_midE2
----- Extracting files for NuMu_highE2
----- Creating new key NuE
Using NuE_midE1
Using NuE_highE1
Using NuE_midE2
Using NuE_highE2
----- Creating new key NuMu
Using NuMu_midE1
Using NuMu_highE1
Using NuMu_midE2
Using NuMu_highE2
----- Creating new key NuTau
Using NuTau_midE1
Using NuTau_highE1
Using NuTau_midE2
Using NuTau_highE2
----- Creating new key NuAll
Using NuE
Using NuMu
Using NuTau
----- Extracting files for NuTau_midE
----- Extracting files for NuTau_highE
----- Extracting files for NuE_midE
----- Extracting files for NuE_highE
----- Extracting files for NuMu_

In [37]:
def _format_rate(weighter, flux):
    """Weight the sample with `flux`, scale by `livetime_s`, and return "sum ± error".

    The error is the square root of the sum of squared weights.
    """
    event_weights = weighter.get_weights(flux) * livetime_s
    total = np.sum(event_weights)
    uncertainty = np.sqrt(np.sum(event_weights**2))
    return f"{total:.2f} ± {uncertainty:.2f}"


astro_flavors = ['NuE', "NuMu", "NuTau"]

# One row per reco version, one formatted "rate ± error" entry per flux component.
data = {}
for key, simulation_dataset in simulation_datasets.items():
    # Astrophysical component, evaluated separately for each flavor
    channel_data = {
        f"astro_{flavor}": _format_rate(simulation_dataset[flavor]["weighter"], AstroFluxModel)
        for flavor in astro_flavors
    }

    # Conventional and prompt components are evaluated on the all-flavor sample
    all_flavor_weighter = simulation_dataset["NuAll"]["weighter"]
    channel_data["conv"] = _format_rate(all_flavor_weighter, generator_conv)
    channel_data["prompt"] = _format_rate(all_flavor_weighter, generator_pr)

    data[key] = channel_data

# Build the table with a fixed column order and show it as plain text
columns_order = [f"astro_{flavor}" for flavor in astro_flavors] + ["conv", "prompt"]
df = pd.DataFrame.from_dict(data, orient='index')[columns_order]
print(df.to_string())

                   astro_NuE    astro_NuMu   astro_NuTau          conv        prompt
spice_tau_reco  56.77 ± 0.56  20.42 ± 0.22  34.89 ± 0.43  38.77 ± 0.95  13.41 ± 0.11
v3              56.81 ± 2.48  21.97 ± 1.15  37.11 ± 1.92  41.04 ± 3.85  13.81 ± 0.49


Problem solved! I think the remaining differences between the two versions now come from statistics (note the larger uncertainties on the v3 numbers).