In [15]:
import os, sys
import matplotlib.pyplot as plt
import h5py
import pandas as pd
import numpy as np


In [27]:
# Tau-reco benchmark datasets, keyed by the benchmark dataset id (the id that
# appears in the per-dataset .hdf5 usage file name).
#   true_dataset : id used to build the /data/sim/... directories whose file
#                  sizes are measured
#   nfiles       : multiplier applied to the per-file averages in the summary
# The commented-out "low" rows are left out of this first estimate.
datasets_tau_reco = {
    reco_id: {"flavor": flavor, "energy": energy, "true_dataset": true_id, "nfiles": nfiles}
    for reco_id, flavor, energy, true_id, nfiles in (
        # (23436, "NuMu", "low", 22646, 8000),
        (23435, "NuMu", "mid", 22645, 5000),
        (23434, "NuMu", "high", 22644, 15000),
        (23433, "NuTau", "high", 22635, 20000),
        (23432, "NuTau", "mid", 22634, 4000),
        # (23431, "NuTau", "low", 22633, 1000),
        # (23430, "NuE", "low", 22614, 1000),
        (23429, "NuE", "mid", 22613, 4000),
        (23428, "NuE", "high", 22612, 20000),
    )
}

# Level4-6 cascade processing datasets, keyed by their own dataset id.
# "true_dataset" uses the same ids as the tau-reco catalog above.
datasets_level4_6 = {
    cascade_id: {"flavor": flavor, "energy": energy, "true_dataset": true_id}
    for cascade_id, flavor, energy, true_id in (
        (23155, "NuMu", "low", 22646),
        (23154, "NuMu", "mid", 22645),
        (23153, "NuMu", "high", 22644),
        (23152, "NuTau", "high", 22635),
        (23151, "NuTau", "mid", 22634),
        (23150, "NuTau", "low", 22633),
        (23149, "NuE", "low", 22614),
        (23148, "NuE", "mid", 22613),
        (23147, "NuE", "high", 22612),
    )
}

In [28]:
# open datasets
# Read the usage table of every catalogued dataset and store it under the
# entry's "df" key. Each table lives in "<directory>/<dataset_id>.hdf5" under
# the HDF5 key "/<dataset_id>".
# The result is written straight into the catalog entry; no module-level `df`
# alias is bound here, so `df` cannot silently carry a stale frame into the
# summary cells that reuse that name.
for usage_catalog, usage_directory in (
    (datasets_tau_reco, "/data/user/tvaneede/GlobalFit/reco_processing/notebooks/benchmark_tau_reco_iceprod/requirements/data"),
    (datasets_level4_6, "/data/user/tvaneede/GlobalFit/SnowStorm_systematics/iceprod_req_harvest/data"),
):
    for dataset_id, catalog_entry in usage_catalog.items():
        catalog_entry["df"] = pd.read_hdf(f"{usage_directory}/{dataset_id}.hdf5", key=f'/{dataset_id}')


In [29]:
# Task names by task index, one list per processing configuration; the position
# in each list is the task index.
# NOTE(review): not referenced by any visible cell; presumably a lookup for
# labelling the integer `task` index level of the usage tables — confirm.
task_dict = {
    "low": dict(enumerate([
        "Level7_Cascade_cascade",
        "Level8_Cascade_cascade",
    ])),
    "mid_high": dict(enumerate([
        "Filter_HESE+Taupede",
        "EvtGen_HESE",
        "Level7_Cascade_cascade",
        "Level8_Cascade_cascade",
    ])),
    "level4_6": dict(enumerate([
        "Level4_Cascade",
        "Level5_Cascade_cascade",
        "Level5_Cascade_muon",
        "Level5_Cascade_hybrid",
        "Level6_Cascade_cascade",
        "Level6_Cascade_muon",
        "Level6_Cascade_hybrid",
    ])),
}

In [30]:
def extract_mean_usage( variable, dataset_id, df, energy ):
    """Average one usage column per task and add the per-task averages up.

    Parameters
    ----------
    variable : str
        Column of ``df`` to average (the notebook passes ``"time_used"``).
    dataset_id : int
        Value of the ``dataset`` index level to select.
    df : pandas.DataFrame
        Usage table whose index has the levels ``dataset`` and ``task``.
    energy : str
        Decides which task averages are multiplied by 3 (the original author's
        note: "3 iterations of taupede"):
        ``"cascade"`` -> none; ``"mid"`` / ``"high"`` -> tasks 0 and 2;
        any other value (e.g. ``"low"``) -> task 0.

    Returns
    -------
    dict
        ``{"total": sum of the scaled averages, 0: ..., 1: ..., ...}``. The
        integer keys are positions in the list of non-negative task ids found
        in ``df``, not the task ids themselves. A task id that has no rows for
        ``dataset_id`` averages to NaN, which also makes ``"total"`` NaN.
    """
    dataset_level = df.index.get_level_values('dataset')
    task_level = df.index.get_level_values('task')

    # Task ids come from the whole frame; negative ids are dropped.
    task_ids = task_level.unique()
    task_ids = task_ids[task_ids >= 0]

    # Task ids whose average is counted three times.
    if energy == "cascade":
        tripled_tasks = ()
    elif energy in ("mid", "high"):
        tripled_tasks = (0, 2)
    else:
        tripled_tasks = (0,)

    usage = {"total" : 0}
    for position, task_id in enumerate(task_ids):
        selected = (dataset_level == dataset_id) & (task_level == task_id)
        task_mean = df.loc[selected, variable].mean()
        if task_id in tripled_tasks:
            task_mean *= 3
        usage[position] = task_mean
        usage["total"] += task_mean
    return usage

In [31]:
def obtain_file_size( dataset_id ):
    """Average on-disk file size (GB, bytes / 1e9) for a tau-reco dataset.

    Looks ``dataset_id`` up in the module-level ``datasets_tau_reco`` and
    measures every entry of the ``0000000-0000999`` directory of its
    ``true_dataset``:

    * position 0: the level8 cascade directory (always);
    * position 1: the HESE evtgen directory, only when the entry's energy
      string does not contain ``"low"``.

    Returns
    -------
    dict
        ``{"total": sum of the per-directory averages, 0: ..., 1: ...}``.
    """
    entry = datasets_tau_reco[dataset_id]
    true_id = entry["true_dataset"]

    directories = [f"/data/sim/IceCube/2023/filtered/level8/cascade/neutrino-generator/cascade/{true_id}/0000000-0000999/"]
    if "low" not in entry["energy"]:
        directories.append(f"/data/sim/IceCube/2023/filtered/HESE/neutrino-generator/evtgen/{true_id}/0000000-0000999/")

    result = {"total" : 0}
    for position, directory in enumerate(directories):
        byte_counts = [
            os.path.getsize(os.path.join(directory, entry_name))
            for entry_name in os.listdir(directory)
        ]
        mean_gb = np.mean(byte_counts) / 1e9  # in GB
        result[position] = mean_gb
        result["total"] += mean_gb

    return result

In [32]:
# Build one summary row per tau-reco dataset: per-file averages scaled by the
# number of files, truncated to whole numbers.
rows = []
for dataset_id, info in datasets_tau_reco.items():
    df = info["df"]

    mean_cpu_hours = extract_mean_usage( "time_used", dataset_id, df, info["energy"] )
    average_total_file_size = obtain_file_size( dataset_id )["total"]

    rows.append({
        "dataset": info["true_dataset"],
        "type": f'{info["flavor"]}_{info["energy"]}',
        "nfiles": info["nfiles"],
        # author's note: "for bright + deepcore/bright"
        "cpu_hours": int(info["nfiles"]*mean_cpu_hours["total"]),
        # author's note: "I will only save evtgen output".
        # NOTE(review): the value below uses the "total" over every directory
        # measured by obtain_file_size — confirm that this is what is wanted.
        "Space (GB)": int(info["nfiles"]*average_total_file_size),
    })

# List of row dicts -> table
df = pd.DataFrame(rows)

# Closing "Total" row: column sums of the numeric columns
sum_row = {
    "dataset": "Total",
    "type": "",
    **{column: df[column].sum() for column in ("nfiles", "cpu_hours", "Space (GB)")},
}

df = pd.concat([df, pd.DataFrame([sum_row])], ignore_index=True)

print(df)


  dataset        type  nfiles  cpu_hours  Space (GB)
0   22645    NuMu_mid    5000      29514         113
1   22644   NuMu_high   15000      56434         240
2   22635  NuTau_high   20000      81340         518
3   22634   NuTau_mid    4000      36264         190
4   22613     NuE_mid    4000      66124         342
5   22612    NuE_high   20000      86576         506
6   Total               68000     356252        1909


In [33]:
# Extra datasets to estimate, keyed by their own id (stored again as "true_dataset").
# Each row: (dataset id, flavor, energy, njobs, nfiles, tau-reco id, level4-6 id).
# "similar_dataset" is [tau-reco benchmark id, level4-6 id]: the already-processed
# datasets whose usage tables serve as stand-ins for this one.
datasets_extra = {
    extra_id: {
        "flavor": flavor,
        "energy": energy,
        "true_dataset": extra_id,
        "njobs": njobs,
        "nfiles": nfiles,
        "similar_dataset": [tau_reco_id, level4_6_id],
    }
    for extra_id, flavor, energy, njobs, nfiles, tau_reco_id, level4_6_id in (
        (22672, "NuMu", "low", 8000, 7218, 23436, 23155),
        (22671, "NuMu", "mid", 5000, 4687, 23435, 23154),
        (22670, "NuMu", "high", 15000, 9688, 23434, 23153),
        (22668, "NuTau", "high", 20000, 16563, 23433, 23152),
        (22667, "NuTau", "mid", 4000, 3763, 23432, 23151),
        (22666, "NuTau", "low", 1000, 989, 23431, 23150),
        (22665, "NuE", "low", 1000, 989, 23430, 23149),
        (22664, "NuE", "mid", 4000, 3747, 23429, 23148),
        (22663, "NuE", "high", 20000, 19693, 23428, 23147),
    )
}

In [34]:
def obtain_cascade_file_size( dataset_id, max_files=None ):
    """Average on-disk file size (GB, bytes / 1e9) per cascade processing level.

    Seven ``0000000-0000999`` directories are measured for ``dataset_id``, in
    this order: level4, then level5 cascade / hybrid / muon, then level6
    cascade / hybrid / muon.

    Parameters
    ----------
    dataset_id : int
        Dataset id used in the directory names.
    max_files : int or None, optional
        When given, only the first ``max_files`` entries of each directory (in
        sorted name order) are measured. ``None`` (default) measures every
        entry. The earlier version ended its loop with ``if j > 20: continue``,
        which had no effect, so it always measured every entry; the default
        keeps that behavior and the cap is now opt-in.

    Returns
    -------
    dict
        ``{"total": sum of the seven averages, 0: ..., ..., 6: ...}``, keyed by
        position in the directory order above. An empty directory averages to
        NaN (numpy behavior), which propagates into ``"total"``.

    Raises
    ------
    ValueError
        If ``max_files`` is given but smaller than 1.
    """
    if max_files is not None and max_files < 1:
        raise ValueError(f"max_files must be a positive integer or None, got {max_files!r}")

    root = "/data/sim/IceCube/2023/filtered"
    file_paths = [f"{root}/level4/cascade/neutrino-generator/{dataset_id}/0000000-0000999/"]
    file_paths += [
        f"{root}/{level}/cascade/neutrino-generator/{stream}/{dataset_id}/0000000-0000999/"
        for level in ("level5", "level6")
        for stream in ("cascade", "hybrid", "muon")
    ]

    result = {"total" : 0}

    for i, file_path in enumerate(file_paths):

        # os.listdir returns entries in arbitrary order; sorting makes a capped
        # sample reproducible from run to run.
        fnames = sorted(os.listdir(file_path))
        if max_files is not None:
            fnames = fnames[:max_files]

        # Every listed entry is measured (sub-directories, if any, included).
        sizes = [os.path.getsize(os.path.join(file_path, fname)) for fname in fnames]

        avg_size = np.mean(sizes) / 1e9  # in GB
        result[i] = avg_size

        result["total"] += avg_size

    return result

In [35]:
# Full tau-reco benchmark catalog, including the "low" energy datasets.
# This rebinds `datasets_tau_reco`; every entry starts without a "df" key.
# Each row: (benchmark dataset id, flavor, energy, true_dataset, nfiles).
datasets_tau_reco = {
    reco_id: {"flavor": flavor, "energy": energy, "true_dataset": true_id, "nfiles": nfiles}
    for reco_id, flavor, energy, true_id, nfiles in (
        (23436, "NuMu", "low", 22646, 8000),
        (23435, "NuMu", "mid", 22645, 5000),
        (23434, "NuMu", "high", 22644, 15000),
        (23433, "NuTau", "high", 22635, 20000),
        (23432, "NuTau", "mid", 22634, 4000),
        (23431, "NuTau", "low", 22633, 1000),
        (23430, "NuE", "low", 22614, 1000),
        (23429, "NuE", "mid", 22613, 4000),
        (23428, "NuE", "high", 22612, 20000),
    )
}

# open datasets
# Read each dataset's usage table ("<dataset_id>.hdf5", HDF5 key "/<dataset_id>")
# and store it under the entry's "df" key. The frame is written straight into
# the catalog entry; no module-level `df` alias is bound, so `df` cannot carry
# a stale frame into the summary cells that reuse that name.
for dataset_id, catalog_entry in datasets_tau_reco.items():
    catalog_entry["df"] = pd.read_hdf(
        f"/data/user/tvaneede/GlobalFit/reco_processing/notebooks/benchmark_tau_reco_iceprod/requirements/data/{dataset_id}.hdf5",
        key=f'/{dataset_id}',
    )

In [36]:
# lets do the extra datasets:
# Estimate CPU and disk needs of each extra dataset from its two stand-in
# datasets (info["similar_dataset"] = [tau-reco id, level4-6 id]):
#   * cascade (level4-6) processing is counted for every dataset;
#   * HESE / tau-reco processing is counted only when the energy string does
#     not contain "low".
# Per-file averages are scaled by info["nfiles"] and truncated to integers.
rows = []
for dataset_id, info in datasets_extra.items():

    dataset_id_HESE = info["similar_dataset"][0]
    dataset_id_cascade = info["similar_dataset"][1]
    # Cascade file sizes are measured under the stand-in's "true_dataset"
    # directories, not under the extra dataset's own id.
    dataset_id_true = datasets_tau_reco[dataset_id_HESE]["true_dataset"]
    dataset_energy = info["energy"]

    # cascade processing
    mean_cpu_hours_cascade = extract_mean_usage(
        "time_used", dataset_id_cascade, datasets_level4_6[dataset_id_cascade]["df"], "cascade"
    )["total"]
    average_total_file_size_cascade = obtain_cascade_file_size( dataset_id_true )["total"]

    # HESE processing: skipped for "low" energy, where it contributes nothing
    # to the totals, so the usage aggregation and the directory scan are not
    # run just to be thrown away. Adding 0.0 leaves the cascade value unchanged.
    if "low" not in dataset_energy:
        mean_cpu_hours_HESE = extract_mean_usage(
            "time_used", dataset_id_HESE, datasets_tau_reco[dataset_id_HESE]["df"], dataset_energy
        )["total"]
        average_total_file_size_HESE = obtain_file_size( dataset_id_HESE )["total"]
    else:
        mean_cpu_hours_HESE = 0.0
        average_total_file_size_HESE = 0.0

    # total
    total_mean_cpu_hours = int(info["nfiles"] * (mean_cpu_hours_HESE + mean_cpu_hours_cascade))
    total_average_file_size = int(info["nfiles"] * (average_total_file_size_HESE + average_total_file_size_cascade))

    # Per-component columns (HESE vs. cascade) can be added here from the
    # *_HESE / *_cascade values above if a breakdown is needed.
    rows.append({
        "dataset": info["true_dataset"],
        "type": f'{info["flavor"]}_{info["energy"]}',
        "nfiles": info["nfiles"],
        "cpu_hours": total_mean_cpu_hours,
        "Space (GB)": total_average_file_size,
    })

# Convert list of dicts → DataFrame
df = pd.DataFrame(rows)

# Add a final row with sums
sum_row = {
    "dataset": "Total",
    "type": "",
    "nfiles": df["nfiles"].sum(),
    "cpu_hours": df["cpu_hours"].sum(),
    "Space (GB)": df["Space (GB)"].sum()
}

df = pd.concat([df, pd.DataFrame([sum_row])], ignore_index=True)

print(df)

  dataset        type  nfiles  cpu_hours  Space (GB)
0   22672    NuMu_low    7218      24831         687
1   22671    NuMu_mid    4687      38551         544
2   22670   NuMu_high    9688      50557         377
3   22668  NuTau_high   16563      94238        1140
4   22667   NuTau_mid    3763      44626         706
5   22666   NuTau_low     989       6558         234
6   22665     NuE_low     989      11166         431
7   22664     NuE_mid    3747      75498        1162
8   22663    NuE_high   19693     117573        1380
9   Total               67337     463598        6661
