In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from glob import glob
import os,sys
import itertools
from copy import deepcopy

In [2]:
# Scan to analyse: its label and the directory that holds all scan outputs.
name = "900steps_100fits"
work_path = "/data/user/tvaneede/GlobalFit/reco_processing/bdt/training/optimize_training/optimize_cuts/output/"
# Directory of this particular scan (work_path already ends with a slash).
folder_path = work_path + "/" + name

In [5]:
import os
import pandas as pd

def _split_model_folder(folder_name):
    """Split a folder name of the form ``mcd-<cfg>_flux-<flux>_feat-<features>``.

    Missing parts degrade gracefully: without ``_flux-`` the whole folder name
    is used as the model-config name and the other fields are empty; without
    ``_feat-`` the remainder is used as the flux name and the feature list is
    empty.
    """
    head, flux_sep, rest = folder_name.partition("_flux-")
    # Keep only the text after the last "-" of the leading part (drops "mcd-").
    mcd_part = head.split("-")[-1] if flux_sep else folder_name
    flux_part, _, feat_part = rest.partition("_feat-")
    return {
        "name": folder_name,
        "model_configs_name": mcd_part,
        "flux_model_name": flux_part,
        "features_list_name": feat_part,
    }


# Get all folders in that directory
folders = [f for f in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, f))]

rows = [_split_model_folder(f) for f in folders]

# Build the frame once, after all rows are collected. Explicit columns keep
# the cell working (with an empty table) when no folders are found.
df = pd.DataFrame(
    rows,
    columns=["name", "model_configs_name", "flux_model_name", "features_list_name"],
)
df = df.sort_values("name").reset_index(drop=True)
# Use the same directory the folders were listed from, so the paths stay
# correct even if work_path is rebound by a later cell.
df["model_path"] = df["name"].apply(lambda n: os.path.join(folder_path, n))

# The flux model does not matter here, so only the HESE runs are kept.
flux_mask = df["flux_model_name"] == "hese"
df = df[flux_mask]

In [6]:
def obtain_optimal_bdt_score( model_path ):
    """Return the scan row with the smallest summed variance ratio for one model.

    Reads ``<model_path>/optimization.parquet``, adds the column
    ``variance_ratio_sum`` (``variance_nue_ratio + variance_nutau_ratio``) and
    returns a copy of the row where that sum is minimal.

    Parameters
    ----------
    model_path : str
        Directory of a single model; a trailing "/" is tolerated.

    Returns
    -------
    pandas.Series
        The optimal row, extended with a ``model_name`` entry holding the last
        component of ``model_path``.
    """
    # Strip trailing slashes first, otherwise the last path component is "".
    model_name = model_path.rstrip("/").split("/")[-1]
    scan = pd.read_parquet(f"{model_path}/optimization.parquet")
    scan["variance_ratio_sum"] = scan["variance_nue_ratio"] + scan["variance_nutau_ratio"]
    optimal_row = scan.loc[scan["variance_ratio_sum"].idxmin()].copy()
    optimal_row["model_name"] = model_name
    return optimal_row

In [7]:
# Find the optimal scan row of every model folder and attach it to the summary table.
optimal_df = (
    df["model_path"]
    .dropna()
    .apply(obtain_optimal_bdt_score)
    .apply(pd.Series)
)
df = df.join(optimal_df, how="left")

In [11]:
# Columns shown in the overview, and the models ranked from best to worst.
variables = [
    "features_list_name",
    "model_configs_name",
    "cut_bdt1",
    "cut_bdt2",
    "variance_ratio_sum",
    "n_cascade",
    "n_double",
    "n_track",
]
df_sorted = df.sort_values(by="variance_ratio_sum")
df_sorted.loc[:, variables]

Unnamed: 0,features_list_name,model_configs_name,cut_bdt1,cut_bdt2,variance_ratio_sum,n_cascade,n_double,n_track
33,11features_plus_rloglmilli_econf_evtgen,simpletopology,0.366667,0.5,0.832659,235700.0,52827.0,81920.0
26,11features_plus_rloglmilli_econf_evtgen,flavor,0.366667,0.433333,0.833246,235348.0,56601.0,78498.0
27,13features,flavor,0.333333,0.5,0.838998,235187.0,54300.0,80960.0
32,11features_ibr_plus_rloglmilli_econf_evtgen,simpletopology,0.533333,0.5,0.842834,238803.0,52191.0,79453.0
20,11features_ibr_plus_rloglmilli_econf_evtgen,flavor,0.433333,0.466667,0.844398,236796.0,54742.0,78909.0
19,11features_ibr_idc_plus_rloglmilli_econf_evtgen,flavor,0.5,0.533333,0.84875,237257.0,50859.0,82331.0
25,11features_plus_rloglmilli,flavor,0.5,0.466667,0.848842,236969.0,56208.0,77270.0
21,11features_plus_econf,flavor,0.466667,0.433333,0.851067,236146.0,55689.0,78612.0
31,11features_ibr_idc_plus_rloglmilli_econf_evtgen,simpletopology,0.433333,0.466667,0.853128,237524.0,54203.0,78720.0
22,11features_plus_evtgen,flavor,0.433333,0.4,0.854953,236388.0,59403.0,74656.0


In [12]:
# with 900 steps: compare the model configs for a single feature list
feature_list_mask = df["features_list_name"].eq("11features_plus_rloglmilli_econf_evtgen")
df.loc[
    feature_list_mask,
    [
        "features_list_name",
        "model_configs_name",
        "cut_bdt1",
        "cut_bdt2",
        "variance_ratio_sum",
        "n_cascade",
        "n_double",
        "n_track",
    ],
]

Unnamed: 0,features_list_name,model_configs_name,cut_bdt1,cut_bdt2,variance_ratio_sum,n_cascade,n_double,n_track
26,11features_plus_rloglmilli_econf_evtgen,flavor,0.366667,0.433333,0.833246,235348.0,56601.0,78498.0
33,11features_plus_rloglmilli_econf_evtgen,simpletopology,0.366667,0.5,0.832659,235700.0,52827.0,81920.0


Now let's study the binning analysis.

In [13]:
# Make the optimize_cuts directory importable so the local `bins` module can be found.
sys.path.append("/data/user/tvaneede/GlobalFit/reco_processing/bdt/training/optimize_training/optimize_cuts")
from bins import bins_settings

In [15]:
# Output directory of the binning scan (100 steps, 100 fits); note that this rebinds work_path.
work_path = "/data/user/tvaneede/GlobalFit/reco_processing/bdt/training/optimize_training/optimize_cuts/output/optimize_binning_100steps_100fits/"

# One row per binning setting; model config, flux model and feature list are held fixed.
df_bins = pd.DataFrame(
    list(
        itertools.product(
            ["simpletopology"],
            ["hese"],
            ["11features_plus_rloglmilli_econf_evtgen"],
            bins_settings.keys(),
        )
    ),
    columns=[
        "model_configs_name",
        "flux_model_name",
        "features_list_name",
        "bins_setting_name",
    ],
)

# Folder name of each run, assembled from the four settings.
df_bins["name"] = [
    f"mcd-{mcd}_flux-{flux}_feat-{feat}_bins-{bins}"
    for mcd, flux, feat, bins in df_bins[
        ["model_configs_name", "flux_model_name", "features_list_name", "bins_setting_name"]
    ].itertuples(index=False, name=None)
]

df_bins["model_path"] = [os.path.join(work_path, folder) for folder in df_bins["name"]]

# Look up the optimal scan row per run and attach it to the table.
optimal_df = (
    df_bins["model_path"]
    .dropna()
    .apply(obtain_optimal_bdt_score)
    .apply(pd.Series)
)
df_bins = df_bins.join(optimal_df, how="left")

In [16]:
# Optimal cuts and event counts per binning setting (unsorted).
df_bins.loc[
    :,
    ["bins_setting_name", "cut_bdt1", "cut_bdt2", "variance_ratio_sum", "n_cascade", "n_double", "n_track"],
]

Unnamed: 0,bins_setting_name,cut_bdt1,cut_bdt2,variance_ratio_sum,n_cascade,n_double,n_track
0,13logE_10logL,0.4,0.5,0.833108,236101.0,52686.0,81660.0
1,10logE_10logL,0.4,0.5,0.833271,236101.0,52686.0,81660.0
2,13logE_4bdt1_4bdt2,0.5,0.5,0.841044,237240.0,52292.0,80915.0
3,8logE_5bdt1_5bdt2,0.5,0.5,0.8418,237240.0,52292.0,80915.0
4,5logE_5bdt1_5bdt2,0.5,0.5,0.841806,237240.0,52292.0,80915.0
5,13logE_10bdtprod,0.3,0.4,0.820572,235265.0,58649.0,76533.0
6,10logE_10bdtprod,0.3,0.4,0.821243,235265.0,58649.0,76533.0
7,10logE_20bdtprod,0.3,0.4,0.819841,235265.0,58649.0,76533.0


In [13]:
# Optimal cuts and event counts per model config.
df.loc[
    :,
    ["model_configs_name", "cut_bdt1", "cut_bdt2", "variance_ratio_sum", "n_cascade", "n_double", "n_track"],
]

Unnamed: 0,model_configs_name,cut_bdt1,cut_bdt2,variance_ratio_sum,n_cascade,n_double,n_track
0,simpletopology,0.366667,0.5,0.832659,235700.0,52827.0,81920.0


In [16]:
# Binning settings ranked by summed variance ratio, best first.
df_sorted = df_bins.sort_values(by="variance_ratio_sum")
df_sorted.loc[
    :,
    ["bins_setting_name", "cut_bdt1", "cut_bdt2", "variance_ratio_sum", "n_cascade", "n_double", "n_track"],
]

Unnamed: 0,bins_setting_name,cut_bdt1,cut_bdt2,variance_ratio_sum,n_cascade,n_double,n_track
7,10logE_20bdtprod,0.3,0.4,0.819841,235265.0,58649.0,76533.0
5,13logE_10bdtprod,0.3,0.4,0.820572,235265.0,58649.0,76533.0
6,10logE_10bdtprod,0.3,0.4,0.821243,235265.0,58649.0,76533.0
0,13logE_10logL,0.4,0.5,0.833108,236101.0,52686.0,81660.0
1,10logE_10logL,0.4,0.5,0.833271,236101.0,52686.0,81660.0
2,13logE_4bdt1_4bdt2,0.5,0.5,0.841044,237240.0,52292.0,80915.0
3,8logE_5bdt1_5bdt2,0.5,0.5,0.8418,237240.0,52292.0,80915.0
4,5logE_5bdt1_5bdt2,0.5,0.5,0.841806,237240.0,52292.0,80915.0


In [20]:
# Binning settings ranked by summed variance ratio, best first.
df_sorted = df_bins.sort_values("variance_ratio_sum", ascending=True)

# The rate columns follow the pattern rate_<topology>_<component>.
topologies = ["tracks", "cascades", "doubles"]
components = ["NuE", "NuMu", "NuTau", "conv"]
cols = ["bins_setting_name", "variance_ratio_sum"] + [
    f"rate_{topology}_{component}"
    for topology in topologies
    for component in components
]

# Use a two-level header (topology, component) rather than stripping the
# prefixes: stripping leaves three columns each called NuE/NuMu/NuTau/conv,
# which are ambiguous to read and to index.
out = df_sorted[cols].copy()
out.columns = pd.MultiIndex.from_tuples(
    [("", "bins_setting_name"), ("", "variance_ratio_sum")]
    + [(topology, component) for topology in topologies for component in components]
)

# Round float/complex columns for display; leave all other columns untouched.
out = out.apply(
    lambda s: s.round(3) if s.dtype.kind in "fc" else s
)

out

Unnamed: 0,bins_setting_name,variance_ratio_sum,NuE,NuMu,NuTau,conv,NuE.1,NuMu.1,NuTau.1,conv.1,NuE.2,NuMu.2,NuTau.2,conv.2
7,10logE_20bdtprod,0.82,0.154,14.111,1.806,10.777,57.975,6.645,28.164,14.782,0.114,0.358,4.379,0.123
5,13logE_10bdtprod,0.821,0.154,14.111,1.806,10.777,57.975,6.645,28.164,14.782,0.114,0.358,4.379,0.123
6,10logE_10bdtprod,0.821,0.154,14.111,1.806,10.777,57.975,6.645,28.164,14.782,0.114,0.358,4.379,0.123
0,13logE_10logL,0.833,0.147,14.031,1.795,10.459,58.023,6.871,28.292,15.124,0.072,0.212,4.261,0.1
1,10logE_10logL,0.833,0.147,14.031,1.795,10.459,58.023,6.871,28.292,15.124,0.072,0.212,4.261,0.1
2,13logE_4bdt1_4bdt2,0.841,0.095,13.796,1.688,10.147,58.089,7.109,28.455,15.438,0.058,0.209,4.205,0.098
3,8logE_5bdt1_5bdt2,0.842,0.095,13.796,1.688,10.147,58.089,7.109,28.455,15.438,0.058,0.209,4.205,0.098
4,5logE_5bdt1_5bdt2,0.842,0.095,13.796,1.688,10.147,58.089,7.109,28.455,15.438,0.058,0.209,4.205,0.098


Checking all parameters again with the new BDT-product (bdtprod) binning.

In [17]:
# Make the optimize_training directory importable for the three local modules below.
sys.path.append("/data/user/tvaneede/GlobalFit/reco_processing/bdt/training/optimize_training")
from features_list_dict import features_list_dict
from flux_model_dict import flux_model_dict
from model_configs_dict import model_configs_dict

In [20]:
# Output directory of the full 900-step scan with the bdtprod binning; this rebinds work_path.
work_path = "/data/user/tvaneede/GlobalFit/reco_processing/bdt/training/optimize_training/optimize_cuts/output/optimize_binning_900steps_100fits_complete/"

# One row per feature list; model config, flux model and binning are held fixed.
df_bins = pd.DataFrame(
    list(
        itertools.product(
            ["simpletopology"],
            ["hese"],
            features_list_dict.keys(),
            ["13logE_10bdtprod"],
        )
    ),
    columns=[
        "model_configs_name",
        "flux_model_name",
        "features_list_name",
        "bins_setting_name",
    ],
)

# Folder name of each run, assembled from the four settings.
df_bins["name"] = [
    f"mcd-{mcd}_flux-{flux}_feat-{feat}_bins-{bins}"
    for mcd, flux, feat, bins in df_bins[
        ["model_configs_name", "flux_model_name", "features_list_name", "bins_setting_name"]
    ].itertuples(index=False, name=None)
]

df_bins["model_path"] = [os.path.join(work_path, folder) for folder in df_bins["name"]]

# Look up the optimal scan row per run, attach it, and rank the feature lists.
optimal_df = (
    df_bins["model_path"]
    .dropna()
    .apply(obtain_optimal_bdt_score)
    .apply(pd.Series)
)
df_bins = df_bins.join(optimal_df, how="left")
df_sorted = df_bins.sort_values(by="variance_ratio_sum")
df_sorted.loc[
    :,
    ["features_list_name", "cut_bdt1", "cut_bdt2", "variance_ratio_sum", "n_cascade", "n_double", "n_track"],
]

Unnamed: 0,features_list_name,cut_bdt1,cut_bdt2,variance_ratio_sum,n_cascade,n_double,n_track
10,11features_plus_rloglmilli_econf_evtgen,0.333333,0.366667,0.819249,235822.0,60591.0,74034.0
13,11features_plus_rloglmilli_econf,0.133333,0.266667,0.823573,232477.0,71270.0,66700.0
0,13features,0.166667,0.4,0.82688,233417.0,61971.0,75059.0
4,11features_plus_econf,0.2,0.266667,0.831099,234013.0,70011.0,66423.0
11,11features_ibr_plus_rloglmilli_econf_evtgen,0.533333,0.4,0.831209,239022.0,57665.0,73760.0
7,11features_plus_rloglmilli,0.366667,0.4,0.839482,236534.0,61556.0,72357.0
12,11features_ibr_idc_plus_rloglmilli_econf_evtgen,0.2,0.3,0.839564,234124.0,67508.0,68815.0
5,11features_plus_millirlogl,0.233333,0.366667,0.841525,234998.0,65767.0,69682.0
3,11features_plus_evtgen,0.333333,0.333333,0.844582,236079.0,66059.0,68309.0
6,11features_plus_milliE,0.166667,0.233333,0.846688,233403.0,80400.0,56644.0
