In [1]:
import os, sys
import matplotlib.pyplot as plt
import h5py
import pandas as pd
import numpy as np


In [6]:
# Snowstorm benchmark datasets, keyed by dataset id.
# Each entry records the neutrino flavor, the energy-range label and "nfiles",
# the factor by which per-file averages are scaled in the summary cell.
datasets_snowstorm = {
    dataset_id: {"flavor": flavor, "energy": energy, "nfiles": nfiles}
    for dataset_id, flavor, energy, nfiles in (
        (23460, "NuTau", "high",   22000),
        (23459, "NuTau", "mid",    4000),
        (23458, "NuTau", "low",    1100),

        (23457, "NuE",   "high",   24000),
        (23456, "NuE",   "mid",    4200),
        (23455, "NuE",   "low",    900),

        (23454, "NuMu",  "high",   18000),
        (23453, "NuMu",  "mid",    5400),
        (23452, "NuMu",  "low",    8100),
        (23451, "NuMu",  "lowlow", 4000),
    )
}

In [7]:
# Read the benchmark table of every dataset from its HDF5 file and attach it
# to the dataset's entry under the "df" key.
for dataset_id in datasets_snowstorm:
    hdf_path = f"/data/user/tvaneede/GlobalFit/reco_processing/notebooks/benchmark_snowstorm_iceprod/requirements/data/{dataset_id}.hdf5"
    df = pd.read_hdf(hdf_path, key=f'/{dataset_id}')
    datasets_snowstorm[dataset_id]["df"] = df


In [16]:
# Task id -> task name, per energy label.
# "high" holds the complete chain (task ids 0..15); the other labels are
# derived from it so the shared names are written only once.
task_dict = {
    "high" : {
        0  : "NuGen+CORSIKA+Polyplopia+MuonProp",
        1  : "PhotonProp",
        2  : "Detector+L1+L2",
        3  : "Filter_HESE+Taupede",
        4  : "EvtGen_HESE",
        5  : "FinalLevel_DiffuseNuMu",
        6  : "Level3_Cascade",
        7  : "Level4_Cascade",
        8  : "Level5_Cascade_cascade",
        9  : "Level5_Cascade_muon",
        10 : "Level5_Cascade_hybrid",
        11 : "Level6_Cascade_cascade",
        12 : "Level6_Cascade_muon",
        13 : "Level6_Cascade_hybrid",
        14 : "Level7_Cascade_cascade",
        15 : "Level8_Cascade_cascade",
    },
}

# "mid" lists exactly the same tasks as "high" (independent copy).
task_dict["mid"] = dict(task_dict["high"])

# "low" lacks task ids 3, 4, 14 and 15; all remaining ids keep their names.
task_dict["low"] = {
    task_id: task_name
    for task_id, task_name in task_dict["high"].items()
    if task_id not in (3, 4, 14, 15)
}

# "lowlow" lists exactly the same tasks as "low" (independent copy).
task_dict["lowlow"] = dict(task_dict["low"])


In [23]:
def extract_mean_usage( variable, dataset_id, df, energy ):
    """Average ``variable`` per task for one dataset and sum the averages.

    Parameters
    ----------
    variable : str
        Name of the column of ``df`` to average.
    dataset_id : int
        Value of the 'dataset' index level whose rows are selected.
    df : pandas.DataFrame
        Frame whose index has levels named 'dataset' and 'task'.
    energy : str
        Energy label of the dataset. For "mid" and "high" the means of
        task ids 3 and 14 are multiplied by 3.

    Returns
    -------
    dict
        ``{"total": <sum of the per-task means>, 0: <mean>, 1: <mean>, ...}``.
        The integer keys are positions in the order in which the tasks first
        appear for this dataset; they are NOT task ids (the two differ as soon
        as the task ids are not contiguous from 0). If no row belongs to
        ``dataset_id`` the result is ``{"total": 0}``. A task whose values are
        all NaN still yields a NaN mean, which propagates into "total".
    """

    # Index levels are loop-invariant, so look them up once.
    dataset_level = df.index.get_level_values('dataset')
    task_level = df.index.get_level_values('task')
    in_dataset = dataset_level == dataset_id

    # Only consider tasks that actually occur for this dataset. Taking the
    # tasks of the whole frame would produce an empty selection (NaN mean, and
    # therefore a NaN total) for any task that exists only in another dataset.
    tasks = task_level[in_dataset].unique()
    tasks = tasks[tasks >= 0]  # just in case

    # Tasks 3 and 14 are counted three times for "mid"/"high" datasets; the
    # original note attributes this to 3 iterations of taupede.
    # NOTE(review): task 3 is "Filter_HESE+Taupede" in task_dict, task 14 is
    # "Level7_Cascade_cascade" - confirm the factor is intended for both.
    triple_repeated_tasks = energy in ("mid", "high")

    result = {"total" : 0}

    for i, task in enumerate(tasks):
        mean = df.loc[in_dataset & (task_level == task), variable].mean()

        if triple_repeated_tasks and task in (3, 14):
            mean *= 3

        result[i] = mean
        result["total"] += mean
    return result

In [18]:
def obtain_file_size( dataset_id ):
    """Average file size (in GB) per output directory of a dataset, plus their sum.

    For every directory listed below, the sizes of the regular files directly
    inside it are averaged. Only the "0000000-0000999" subdirectory of each
    output location is inspected.

    Parameters
    ----------
    dataset_id : int
        Dataset id; inserted into the directory paths and used to look up the
        dataset's energy label in the module-level ``datasets_snowstorm``.

    Returns
    -------
    dict
        ``{"total": <sum of the averages>, 0: <avg>, 1: <avg>, ...}`` where the
        integer keys follow the order of ``file_paths``. For datasets whose
        energy label does not contain "low", the HESE evtgen directory is
        appended as the last entry.

    Raises
    ------
    ValueError
        If a directory contains no regular files (the average would otherwise
        be NaN and silently invalidate the total).
    OSError
        If a directory does not exist or cannot be read.
    """

    file_paths = [f"/data/sim/IceCube/2023/generated/neutrino-generator/{dataset_id}/0000000-0000999/",
                  f"/data/sim/IceCube/2023/filtered/level2/neutrino-generator/{dataset_id}/0000000-0000999/",
                  f"/data/sim/IceCube/2023/filtered/level3/cascade/neutrino-generator/{dataset_id}/0000000-0000999/",
                  f"/data/sim/IceCube/2023/filtered/level4/cascade/neutrino-generator/{dataset_id}/0000000-0000999/",

                  f"/data/sim/IceCube/2023/filtered/level5/cascade/neutrino-generator/cascade/{dataset_id}/0000000-0000999/",
                  f"/data/sim/IceCube/2023/filtered/level5/cascade/neutrino-generator/hybrid/{dataset_id}/0000000-0000999/",
                  f"/data/sim/IceCube/2023/filtered/level5/cascade/neutrino-generator/muon/{dataset_id}/0000000-0000999/",

                  f"/data/sim/IceCube/2023/filtered/level6/cascade/neutrino-generator/cascade/{dataset_id}/0000000-0000999/",
                  f"/data/sim/IceCube/2023/filtered/level6/cascade/neutrino-generator/hybrid/{dataset_id}/0000000-0000999/",
                  f"/data/sim/IceCube/2023/filtered/level6/cascade/neutrino-generator/muon/{dataset_id}/0000000-0000999/",

                  f"/data/sim/IceCube/2023/filtered/level8/cascade/neutrino-generator/cascade/{dataset_id}/0000000-0000999/"]

    # The substring test matches both the "low" and the "lowlow" labels.
    if "low" not in datasets_snowstorm[dataset_id]["energy"]:
        file_paths += [f"/data/sim/IceCube/2023/filtered/HESE/neutrino-generator/evtgen/{dataset_id}/0000000-0000999/" ]

    result = {"total" : 0}

    for i, file_path in enumerate(file_paths):

        # Count regular files only: a nested directory is not a data file and
        # its own size would distort the average.
        with os.scandir(file_path) as entries:
            sizes = [entry.stat().st_size for entry in entries if entry.is_file()]

        if not sizes:
            raise ValueError(f"no files found in {file_path}")

        avg_size = np.mean(sizes) / 1e9  # in GB
        result[i] = avg_size
        result["total"] += avg_size

    return result

In [24]:
# Build one summary record per dataset: per-file averages scaled by "nfiles"
rows = []
for dataset_id, info in datasets_snowstorm.items():

    df = info["df"]
    energy = info["energy"]
    nfiles = info["nfiles"]

    # Per-file averages: summed per-task mean of "time_used" and summed
    # average file size (GB) over the dataset's output directories.
    mean_cpu_hours = extract_mean_usage( "time_used",dataset_id, df, energy )
    average_total_file_size = obtain_file_size( dataset_id )["total"]

    # Author's notes on the two scaled columns:
    #   cpu_hours  - for bright + deepcore/bright
    #   Space (GB) - I will only save evtgen output
    row = {
        "dataset": dataset_id,
        "type": f'{info["flavor"]}_{info["energy"]}',
        "nfiles": nfiles,
        "cpu_hours": int(nfiles*mean_cpu_hours["total"]),
        "Space (GB)": int(nfiles*average_total_file_size),
    }
    rows.append(row)

# List of records -> DataFrame (from here on `df` is the summary table)
df = pd.DataFrame(rows)

# Closing row holding the column sums
sum_row = {"dataset": "Total", "type": ""}
for column in ("nfiles", "cpu_hours", "Space (GB)"):
    sum_row[column] = df[column].sum()

df = pd.concat([df, pd.DataFrame([sum_row])], ignore_index=True)

print(df)


   dataset         type  nfiles  cpu_hours  Space (GB)
0    23460   NuTau_high   22000      98488        2830
1    23459    NuTau_mid    4000      41395         979
2    23458    NuTau_low    1100      22359         616
3    23457     NuE_high   24000     106311        3263
4    23456      NuE_mid    4200      73399        1514
5    23455      NuE_low     900      28297         690
6    23454    NuMu_high   18000      91281        2824
7    23453     NuMu_mid    5400      40019        1286
8    23452     NuMu_low    8100      44984        3579
9    23451  NuMu_lowlow    4000       5648         657
10   Total                91700     552181       18238
