## Notebook for ML training 

In [1]:
import utils 
import ml_framework.data_fetcher  
import json
import gzip

###  1) Extract dataset metadata info

In [2]:
# Load the gzip-compressed JSON file that holds the ntuple production metadata.
fname = "ntuple_production/file_metadata.json.gz"
with gzip.open(fname, mode="rt", encoding="utf-8") as metadata_file:
    dataset_info = json.load(metadata_file)

#### Get rucio containers for signal and background

In [3]:
def _signal_label(container):
    """Build the "<mass>_<campaign>" key for a signal container name."""
    mass = utils.hplus_signal_mass(container)
    _, _, campaign = utils.dsid_rtag_campaign(container)
    return f"{mass}_{campaign}"


def _dsid_label(container):
    """Build the "<did>_<campaign>" key for a background container name.

    ``did`` is the first element returned by ``utils.dsid_rtag_campaign``.
    """
    did, _, campaign = utils.dsid_rtag_campaign(container)
    return f"{did}_{campaign}"


def _collect_containers(info, sample_key, label_for):
    """Summarise all containers listed under ``info[sample_key]``.

    Parameters
    ----------
    info : dict
        Mapping of sample name -> {container name -> metadata dict}.
    sample_key : str
        Sample to summarise; a missing key yields an empty summary.
    label_for : callable
        Maps a container name to the key used in the returned dict.

    Returns
    -------
    tuple
        ``(containers, total_events)`` where ``containers`` maps each label to
        ``{'DSID', 'Events', 'Ntuples'}`` and ``total_events`` is the sum of the
        ``nevts_input`` entries (0 when an entry is missing).
    """
    containers = {}
    total_events = 0
    for container, metadata in info.get(sample_key, {}).items():
        evts = metadata.get("nevts_input", 0)
        total_events += evts
        # Containers that map to the same label overwrite each other here,
        # while their events are still counted in the total.
        containers[label_for(container)] = {
            'DSID': container,
            'Events': evts,
            'Ntuples': metadata.get("output", 0),
        }
    return containers, total_events


# Signal is keyed by mass and campaign; backgrounds by the first element of
# utils.dsid_rtag_campaign and campaign.
signal_containers, total_sig = _collect_containers(dataset_info, "Hplus_cb", _signal_label)
wjets_containers, total_wjets = _collect_containers(dataset_info, "wjets", _dsid_label)
ttbar_containers, total_ttbar = _collect_containers(dataset_info, "ttbar_nom", _dsid_label)

print(f"H+ -> cb signal in {len(signal_containers)} DSIDs for {total_sig:.0e} total events")
print(f"wjets in {len(wjets_containers)} DSIDs for {total_wjets:.0e} total events")
print(f"ttbar_nom in {len(ttbar_containers)} DSIDs for {total_ttbar:.0e} total events")

H+ -> cb signal in 40 DSIDs for 9e+06 total events
wjets in 54 DSIDs for 7e+09 total events
ttbar_nom in 9 DSIDs for 2e+09 total events


#### Get NTuples for TTbar nominal events

In [4]:
# Prefix each ttbar container's "Ntuples" entry with "user.alheld:".
# The last character of the metadata value is dropped: current fix for how the
# output name is stored in the metadata.
ntuples = [
    "user.alheld:" + entry["Ntuples"][:-1]
    for entry in ttbar_containers.values()
]

In [5]:
# One ntuple name per line, each indented by a single space.
print("Rucio DSIDs ntuples for ttbar nominal: \n", "\n ".join(ntuples))

Rucio DSIDs ntuples for ttbar nominal: 
 user.alheld:user.alheld.410470.PhPy8EG.DAOD_PHYSLITE.e6337_s3681_r13144_r13146_p6697.IC-v1_output
 user.alheld:user.alheld.410470.PhPy8EG.DAOD_PHYSLITE.e6337_s3681_r13145_r13146_p6697.IC-v1_output
 user.alheld:user.alheld.410470.PhPy8EG.DAOD_PHYSLITE.e6337_s3681_r13167_r13146_p6697.IC-v1_output
 user.alheld:user.alheld.601229.PhPy8EG.DAOD_PHYSLITE.e8514_s4159_r15530_p6697.IC-v1_output
 user.alheld:user.alheld.601229.PhPy8EG.DAOD_PHYSLITE.e8514_s4162_r15540_p6697.IC-v1_output
 user.alheld:user.alheld.601229.PhPy8EG.DAOD_PHYSLITE.e8514_s4369_r16083_p6697.IC-v1_output
 user.alheld:user.alheld.601230.PhPy8EG.DAOD_PHYSLITE.e8514_s4159_r15530_p6697.IC-v1_output
 user.alheld:user.alheld.601230.PhPy8EG.DAOD_PHYSLITE.e8514_s4162_r15540_p6697.IC-v1_output
 user.alheld:user.alheld.601230.PhPy8EG.DAOD_PHYSLITE.e8514_s4369_r16083_p6697.IC-v1_output


### 2) Use ServiceX to check the file structure w/o pulling data
To prepare your ServiceX query, you can check which branches are available in the targeted samples.
Note: this interface is still to be improved.

In [6]:
from servicex_analysis_utils import get_structure

# Single-sample example: inspect only the first ttbar ntuple.
array_structure = get_structure(ntuples[0], array_out=True)

record = array_structure[ntuples[0]].content
# Report the actual number of fields rather than a hard-coded count, so the
# message stays correct for samples with a different layout.
print(f"{type(record)} contains {len(record.fields)} fields: {record.fields}")

Output()

<class 'awkward.types.recordtype.RecordType'> contains 3 fields: ['EventLoop_FileExecuted', 'EventLoop_JobStats', 'reco']


In [7]:
# Get the branches of the "reco" TTree. The tree is looked up by field name
# instead of by a fixed position, so a different field order in the file does
# not silently select the wrong record.
reco_branches = record.contents[record.fields.index("reco")].fields

print(f"Total of {len(reco_branches)} branches in {ntuples[0]}\n")

# All branches whose name contains "jet_pt"; only the first five are printed.
pt_variations = [branch for branch in reco_branches if "jet_pt" in branch]
print(
    f"{len(pt_variations)} available jet_pt variables: \n",
    "\n ".join(pt_variations[:5]),
)


Total of 3876 branches in user.alheld:user.alheld.410470.PhPy8EG.DAOD_PHYSLITE.e6337_s3681_r13144_r13146_p6697.IC-v1_output

125 available jet_pt variables: 
 jet_pt_NOSYS
 jet_pt_JET_BJES_Response__1up
 jet_pt_JET_BJES_Response__1down
 jet_pt_JET_EffectiveNP_Detector1__1up
 jet_pt_JET_EffectiveNP_Detector1__1down


In [8]:
# Branches whose name contains the GN2v01 fixed-cut 77 working-point tag;
# only the first five are printed.
btag = [
    branch
    for branch in reco_branches
    if "GN2v01_FixedCutBEff_77" in branch
]
print(f"{len(btag)} available btag selections @ 77 WP: \n", "\n ".join(btag[:5]))

146 available btag selections @ 77 WP: 
 jet_GN2v01_FixedCutBEff_77_select
 jet_select_GN2v01_FixedCutBEff_77_NOSYS
 jet_select_GN2v01_FixedCutBEff_77_EG_RESOLUTION_AF3__1down
 jet_select_GN2v01_FixedCutBEff_77_EG_RESOLUTION_AF3__1up
 jet_select_GN2v01_FixedCutBEff_77_EG_RESOLUTION_ALL__1down


### 3) Build query and fetch samples with the `data_fetcher` module

In [9]:
import ml_framework.data_fetcher as fetcher

#### 3.a) Define event-level cuts on available branches:

- At least 4 jets with pT > 25 GeV and at least one jet in the barrel (|eta| < 2.2)

- At least 2 electrons with no pT cutoff

In [10]:
# Event-level cuts: each function takes one event record and returns the
# result of a comparison on a per-event count.
# NOTE(review): documented with comments rather than docstrings so that every
# function body stays a single return statement — presumably these callables
# are translated into the FuncADL query by fetcher.ServiceXQuery; confirm
# before adding statements to the bodies.

# Require more than 3 (i.e. at least 4) entries of jet_pt_NOSYS above 25_000.
# This is the 25 GeV threshold from the text above — presumably the branch is
# stored in MeV; confirm.
def jet_pt(evt):
    return evt["jet_pt_NOSYS"].Where(lambda pt: pt > 25_000).Count() > 3

# Require at least one entry of jet_eta with |eta| < 2.2.
def jet_eta(evt):
    return evt["jet_eta"].Where(lambda eta: abs(eta) < 2.2).Count() > 0

# Require more than one (i.e. at least 2) entries of el_pt_NOSYS; no threshold
# is applied to the values themselves.
def el_count(evt):
    return evt["el_pt_NOSYS"].Count()>1

# Cut functions in the order they are handed to fetcher.ServiceXQuery.
cuts = [jet_pt, jet_eta, el_count]

#### 3.b) Define the branch selection to be dumped by ServiceX workers in the result files

- Branch labels in the final files
- Branches to be saved with or without a final transformation

In [11]:
# Output branch selection: returns a dict mapping each output label to the
# expression that fills it.
#   "jet_pt": jet_pt_NOSYS with every entry divided by 1000.0
#             (presumably MeV -> GeV; confirm the input units)
#   "btag":   jet_GN2v01_FixedCutBEff_77_select, taken without transformation
# NOTE(review): kept as comments (no docstring) so the body remains a single
# return statement — presumably required for the FuncADL translation; confirm.
def branch_select(evt):
    return {
        "jet_pt": evt["jet_pt_NOSYS"].Select(lambda pt: pt / 1000.0),
        "btag": evt["jet_GN2v01_FixedCutBEff_77_select"],
    }

#### 3.c) Configure the transformation request 
- tree_name to open
- samples to get (list of samples allowed)
- number of files per sample
- output folder for .parquet result files

In [12]:
# Settings of the transformation request handed to the fetcher.
config = fetcher.RunConfig(
    request_name="TTbar_nom",
    dataset=ntuples[0],                    # single sample: the first ttbar ntuple
    tree_name="reco",                      # tree to open in each file
    files_per_sample=2,                    # number of files per sample
    output_folder="./ml_framework/data/",  # folder for the .parquet result files
)

#### 3.d) Construct the ServiceX query object
Using `ServiceXQuery`, the full FuncADL query is constructed and the ServiceX request specification is taken care of for you.

In [13]:
# Combine the event cuts, the branch selection and the run configuration into a single query object.
query = fetcher.ServiceXQuery(cuts, branch_select, config)

#### 3.e) Send the query to the ServiceX transformers

Run `deliver`; any kwargs are passed through to the ServiceX call.

In [14]:
# Submit the query. ignore_local_cache=True is a keyword argument for the ServiceX call
# (presumably forces a fresh request instead of reusing locally cached results — confirm).
query.deliver(ignore_local_cache=True)

Output()

ReturnValueException: Exception occurred while making ServiceX request.
Traceback (most recent call last):
  File "/Users/acordeir/Documents/IRIS-HEP/IntegChallenge/integration-challenge/atlas/.pixi/envs/default/lib/python3.13/site-packages/servicex/query_core.py", line 730, in as_files_async
    return await self.submit_and_download(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        signed_urls_only=False, expandable_progress=progress
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "/Users/acordeir/Documents/IRIS-HEP/IntegChallenge/integration-challenge/atlas/.pixi/envs/default/lib/python3.13/site-packages/servicex/query_core.py", line 400, in submit_and_download
    _ = await monitor_task  # raise exception, if it is there
        ^^^^^^^^^^^^^^^^^^
  File "/Users/acordeir/Documents/IRIS-HEP/IntegChallenge/integration-challenge/atlas/.pixi/envs/default/lib/python3.13/site-packages/servicex/query_core.py", line 515, in transform_status_listener
    raise ServiceXException(err_str)
servicex.query_core.ServiceXException: Request "TTbar_nom" was canceled


### 4) "Separate notebook" to load the result files etc etc ...

In [None]:
# List the contents of the folder used as output_folder in the run configuration.
!ls ./ml_framework/data

In [None]:
import awkward as ak

In [None]:
# Result file of the first ttbar ntuple inside the output folder.
parquet_path = (
    "./ml_framework/data/"
    "user.alheld:user.alheld.410470.PhPy8EG.DAOD_PHYSLITE.e6337_s3681_r13144_r13146_p6697.IC-v1_output.parquet"
)
result = ak.from_parquet(parquet_path)

In [None]:
# Show the field names of the loaded array; presumably the labels defined in branch_select ("jet_pt", "btag") — confirm.
result.fields

In [None]:
# Display the "btag" field of the loaded array.
result["btag"]

In [None]:
# Show the output folder stored on the run configuration.
# NOTE(review): relies on `config` from the RunConfig cell, so it will not run in a truly separate notebook.
config.output_folder

# to do 
- Fix name request length / file name
- add btag cut
- fix utils issues 
- test multi sample
- test 20tb ntuples 