## Notebook for ML training 

In [2]:
import utils 
import ml_framework.data_fetcher  
import json
import json
import gzip

###  1) Extract dataset metadata info

Sample metadata management utilities that give quick access to the desired information.
Each analysis can organise its samples in a similar way, with specific categories and parameters such as Rucio DIDs, n_files, n_events, MC campaign, AMI tags, full sample size...

All of this is saved in a gzip-compressed JSON dictionary file.

In [3]:
# Read the gzip-compressed JSON metadata file and parse it into `dataset_info`.
fname = "ntuple_production/file_metadata.json.gz"
with gzip.open(fname, mode="rt", encoding="utf-8") as metadata_file:
    dataset_info = json.load(metadata_file)

Custom utilities to list sample categories with TopCP outputs

In [4]:
# Per-category summary of the metadata (sample counts and ntuple output size).
samples_with_ntuples = utils.check_for_ntuples(dataset_info)

In [5]:
# Pretty-print the per-category summary as indented JSON.
summary_json = json.dumps(samples_with_ntuples, indent=4)
print(summary_json)

{
    "Wt": {
        "samples": 12,
        "samples_with_ntuples": 12,
        "total_output_GB": 208.24
    },
    "Wt_DS": {
        "samples": 12,
        "samples_with_ntuples": 12,
        "total_output_GB": 165.76
    },
    "Wt_H7": {
        "samples": 12,
        "samples_with_ntuples": 6,
        "total_output_GB": 82.79
    },
    "data": {
        "samples": 7,
        "samples_with_ntuples": 7,
        "total_output_GB": 6.68
    },
    "diboson": {
        "samples": 108,
        "samples_with_ntuples": 108,
        "total_output_GB": 915.4
    },
    "rare_top": {
        "samples": 18,
        "samples_with_ntuples": 6,
        "total_output_GB": 96.63
    },
    "st_schan": {
        "samples": 12,
        "samples_with_ntuples": 12,
        "total_output_GB": 30.69
    },
    "st_tchan": {
        "samples": 12,
        "samples_with_ntuples": 12,
        "total_output_GB": 121.12
    },
    "ttV": {
        "samples": 48,
        "samples_with_ntuples": 48,
       

In [6]:
# Sum the per-category output sizes (stored in GB) and report the total in TB.
# The unit label is "TB" (terabytes): "Tb" would read as terabits.
total_gb = sum(category["total_output_GB"] for category in samples_with_ntuples.values())
print(f"Total size of Ntuples: {total_gb/1_000:.1f} TB")

Total size of Ntuples:  21.3 Tb


Ntuple list builder to retrieve dids of each ntuple container

In [7]:
def build_ntuple_list(metadata: dict, categories: list[str] | str):
    if isinstance(categories, str):
        categories = [categories]
    containers={}
    
    for category in categories:
        dataset=metadata.get(category, {})
    
        for container, info in dataset.items():
            if info["output"] is None:
                continue
            did, _, campaign = utils.dsid_rtag_campaign(container)
            containers[f"{category}_{did}_{campaign}"] = "user.alheld:"+info["output"][:-1]

    return containers

Get the ntuple containers for the data samples

In [8]:
# Collect the ntuple containers of the "data" category only.
data_containers = build_ntuple_list(dataset_info, categories="data")

In [9]:
# Display the key -> container mapping built for the "data" category.
data_containers

{'data_data_data15': 'user.alheld:user.alheld.periodAllYear.physics_Main.DAOD_PHYSLITE.grp15_v01_p6697.IC-v1_output',
 'data_data_data16': 'user.alheld:user.alheld.periodAllYear.physics_Main.DAOD_PHYSLITE.grp16_v01_p6697.IC-v1_output',
 'data_data_data17': 'user.alheld:user.alheld.periodAllYear.physics_Main.DAOD_PHYSLITE.grp17_v01_p6697.IC-v1_output',
 'data_data_data18': 'user.alheld:user.alheld.periodAllYear.physics_Main.DAOD_PHYSLITE.grp18_v01_p6697.IC-v1_output',
 'data_data_data22': 'user.alheld:user.alheld.periodAllYear.physics_Main.DAOD_PHYSLITE.grp22_v02_p6700.IC-v1_output',
 'data_data_data23': 'user.alheld:user.alheld.periodAllYear.physics_Main.DAOD_PHYSLITE.grp23_v01_p6700.IC-v1_output',
 'data_data_data24': 'user.alheld:user.alheld.periodAllYear.physics_Main.DAOD_PHYSLITE.grp24_v01_p6700.IC-v1_output'}

In [36]:
# Collect the ntuple containers of the nominal ttbar and W+jets categories.
ntuples = build_ntuple_list(
    dataset_info,
    categories=["ttbar_nom", "wjets"],
)

In [44]:
# Combined ntuple size (in GB) of the W+jets and nominal ttbar categories.
background_categories = ["wjets", "ttbar_nom"]
size_bkg = sum(
    samples_with_ntuples[name]["total_output_GB"] for name in background_categories
)
print(f"ttbar_nominal and W+jets are split into {len(ntuples)} datasets for a total of {size_bkg/1_000:.1f} TB of TopCP Ntuples")

ttbar_nominal and W+jets are split into 63 datasets for a total of 6.7 TB of TopCP Ntuples


#### Get NTuples for TTbar nominal events

### 2) Use ServiceX to check the file structure w/o pulling data
To prepare your ServiceX query, you can check which branches are available in the targeted samples.
Note: this interface is still to be improved.

In [47]:
from servicex_analysis_utils import get_structure

# Use one W+jets container as a single-sample example.
did = ntuples["wjets_700785_mc23e"]

# Request the structure of that sample and take the content of the entry
# stored under its DID.
array_structure = get_structure(did, array_out=True)
record = array_structure[did].content
# Report the actual number of fields instead of a hard-coded count.
print(f"{type(record)} contains {len(record.fields)} fields: {record.fields}")

Output()

<class 'awkward.types.recordtype.RecordType'> contains 3 fields: ['EventLoop_FileExecuted', 'EventLoop_JobStats', 'reco']


In [48]:
# Get the branches of the "reco" tree. The field is selected by name so the
# lookup does not depend on the order in which the fields are listed.
reco_branches = record.content("reco").fields

print(f"Total of {len(reco_branches)} branches in {did}\n")

# Every branch whose name contains "jet_pt"; only the first five are printed.
pt_variations = [branch for branch in reco_branches if "jet_pt" in branch]
print(
    f"{len(pt_variations)} available jet_pt variables: \n",
    "\n ".join(pt_variations[:5]),
)


Total of 3517 branches in user.alheld:user.alheld.700785.Sh.DAOD_PHYSLITE.e8514_s4369_r16083_p6697.IC-v1_output

117 available jet_pt variables: 
 jet_pt_NOSYS
 jet_pt_JET_BJES_Response__1up
 jet_pt_JET_BJES_Response__1down
 jet_pt_JET_EffectiveNP_Detector1__1up
 jet_pt_JET_EffectiveNP_Detector1__1down


### 3) Build query and fetch samples with the `data_fetcher` module

In [49]:
import ml_framework.data_fetcher as fetcher

#### 3.a) Define event-level cuts on available branches:

- At least 4 jets with pT > 25 GeV

- At least one jet in the [-2.2, 2.2] eta region

- At least 2 electrons with no pT cutoff

- at least 2 electrons in the [-2.2, 2.2] eta region

- At least 2 btagged jets with the 77% WP (GN2)

In [50]:
# Build the list of awkward mask expressions on fields named after the branches.
# Each entry implements one of the documented event-level requirements; the jet
# pT threshold and the electron eta multiplicity are aligned with that list.
cuts = [
    # At least 4 jets with pT > 25 GeV (threshold presumably in MeV -- TODO confirm).
    "(num(jet_pt_NOSYS[jet_pt_NOSYS > 25000]) > 3)",
    # At least one jet with |eta| < 2.2.
    "(num(jet_eta[abs(jet_eta) < 2.2]) > 0)",
    # At least 2 jets passing the GN2v01 77% b-tagging selection flag.
    "(num(jet_GN2v01_FixedCutBEff_77_select[jet_GN2v01_FixedCutBEff_77_select > 0]) > 1)",
    # At least 2 electrons, with no pT requirement.
    "(num(el_pt_NOSYS) > 1)",
    # At least 2 electrons with |eta| < 2.2.
    "(num(el_eta[abs(el_eta) < 2.2]) > 1)",
]

#### 3.b) Define the branch selection to be dumped by ServiceX workers in the result files

Branches that are read by uproot in the backend with `filter_name`

In [51]:
# Branches to keep in the result files, grouped by physics object.
branches = (
    # jets
    ["jet_pt_NOSYS", "jet_eta", "jet_phi", "jet_GN2v01_FixedCutBEff_77_select"]
    # electrons
    + ["el_pt_NOSYS", "el_eta", "el_phi"]
    # missing transverse momentum
    + ["met_met_NOSYS", "met_phi_NOSYS", "met_significance_NOSYS", "met_sumet_NOSYS"]
)

#### 3.c) Configure the transformation request 
- tree_name to open
- samples to get (list of samples allowed)
- number of files per sample
- output folder for .parquet result files

In [58]:
# Configure the transformation request: one request per ntuple container.
# The dict views are materialised as lists so the configuration holds concrete,
# indexable sequences rather than live views tied to `ntuples`.
config = fetcher.RunConfig(
    tree_name="reco",
    dataset=list(ntuples.values()),
    output_folder="./ml_framework/data/",
    request_name=list(ntuples.keys()),
    join_result_parquet=False,
)

#### 3.d) Construct the ServiceX query object
With `ServiceXQuery`, the full FuncADL query is constructed and the ServiceX request specification is taken care of for you

In [59]:
# Bundle the branch selection, the event cuts and the run configuration into
# a single query object.
query = fetcher.ServiceXQuery(
    selection=branches,
    cuts=cuts,
    config=config,
)

#### 3.e) Send the query to the ServiceX transformers

Run `deliver` and pass kwargs through to the ServiceX call.

In [60]:
import time


In [61]:
# Time the deliver call with a monotonic clock: perf_counter is intended for
# measuring elapsed intervals and is unaffected by system clock adjustments.
start = time.perf_counter()
f = query.deliver(ignore_local_cache=True)
end = time.perf_counter()

Output()

In [64]:
# Report how many samples were requested and the elapsed time in minutes.
n_samples = len(config.dataset)
elapsed_minutes = (end - start) / 60
print(f"Transformed and downloaded {n_samples} samples in {elapsed_minutes:.1f} minutes")

Transformed and downloaded 63 samples in 4.3 minutes


In [None]:
# NOTE(review): this only references the attribute; unless `write_to_parquet`
# is a property, nothing is written here. If the intent is to write the
# results, it probably needs to be called -- TODO confirm the signature in
# ml_framework.data_fetcher.
query.write_to_parquet

### 4) "Separate notebook" to load the result files etc etc ...

In [None]:
# List the files in the output folder with human-readable sizes.
!ls -lh ./ml_framework/data

In [None]:
import awkward as ak

In [None]:
# Load one result file from the configured output folder.
# NOTE(review): the query above was built from the ttbar_nom / wjets containers,
# so a "data_data_data22" file may not exist from this run -- TODO confirm the
# file name.
result = ak.from_parquet(f"{config.output_folder}data_data_data22.parquet")

In [None]:
# Show which fields are available in the loaded result array.
result.fields

In [None]:
# Mean of the MET significance. The field name matches the branch requested in
# the selection list ("met_significance_NOSYS"); no "met_sig" branch was requested.
# NOTE(review): assumes the fetcher keeps branch names unchanged -- TODO confirm.
ak.mean(result["met_significance_NOSYS"])

# to do 
- fix utils issues
- build trainer NN 