In [1]:
from collections import defaultdict

import numpy as np
import matplotlib.pylab as plt
import pandas as pd

# scikit-hep
import awkward as ak
import uproot
import vector
vector.register_awkward()
import correctionlib

Matplotlib is building the font cache; this may take a moment.


In [None]:
from tqdm import tqdm

def _branch_type(values):
    """Return the type descriptor handed to ``mktree`` for one branch's values."""
    if isinstance(values, ak.Array):
        return values.type
    # Plain NumPy arrays carry their element type in ``dtype``.
    return np.dtype(values.dtype)


def process_with_selection(input_file, output_file, tree_name, selection_function, chunk_size=100000):
    """
    Process a ROOT file by applying a selection function on chunks of data
    and saving the filtered results to a new file.

    The output file is created lazily, on the first chunk in which at least
    one event passes the selection; if nothing ever passes, no output file is
    written. The output branch layout is fixed from that first passing chunk,
    and later chunks are written restricted to those branches.

    Parameters:
    -----------
    input_file : str
        Input specification, passed unchanged to ``uproot.iterate``. It must
        therefore already include the tree path (e.g. ``"file.root:Events"``);
        ``tree_name`` is NOT used for reading.
    output_file : str
        Path to output ROOT file
    tree_name : str
        Name of the TTree created in the output file
    selection_function : callable
        Function that takes a chunk of arrays and returns a boolean mask
        (one entry per event) for selection
    chunk_size : int
        Number of events to process per chunk (``step_size`` of ``uproot.iterate``)

    Raises:
    -------
    Any exception raised by ``uproot`` or by ``selection_function`` propagates
    to the caller; the output file, if already opened, is closed first.
    """
    output = None
    output_tree = None
    branch_types = {}

    try:
        chunks = uproot.iterate(input_file, step_size=chunk_size)
        for arrays in tqdm(chunks, desc="Processing chunks", unit="chunk"):
            mask = selection_function(arrays)

            # Skip if no events pass selection
            if not np.any(mask):
                continue

            filtered_arrays = {branch: arrays[branch][mask] for branch in arrays.fields}

            # First chunk with surviving events: create the output file and tree.
            # Every filtered branch is non-empty here (the mask has at least one
            # True entry), so its type can always be read from the data itself.
            if output is None:
                output = uproot.recreate(output_file)
                branch_types = {
                    branch: _branch_type(values)
                    for branch, values in filtered_arrays.items()
                }
                output_tree = output.mktree(tree_name, branch_types)

            # Only write branches that exist in the output tree; a later chunk
            # may expose branches the first passing chunk did not have.
            output_tree.extend(
                {
                    branch: filtered_arrays[branch]
                    for branch in branch_types
                    if branch in filtered_arrays
                }
            )
    finally:
        # Close the output file even when a chunk fails part-way through,
        # so the file handle is not leaked.
        if output is not None:
            output.close()


# Example usage
def my_selection(arrays: ak.Array, is_data: bool = False) -> ak.Array:
    """Example selection: events firing ``HLT_TkMu50`` with ``PuppiMET_pt`` above 50.

    Parameters
    ----------
    arrays : ak.Array
        One chunk of events; must expose the ``HLT_TkMu50`` and ``PuppiMET_pt``
        branches.
    is_data : bool
        When True, the result is additionally AND-ed with the mask returned by
        ``build_lumi_mask`` for the certification JSON file.

    Returns
    -------
    ak.Array
        Boolean mask with one entry per event in ``arrays``.
    """
    idx = arrays["HLT_TkMu50"] & (arrays["PuppiMET_pt"] > 50)
    if is_data:
        # The lumi mask has to be built from the chunk being filtered so that
        # it lines up entry-by-entry with ``idx``.
        # NOTE(review): assumes build_lumi_mask only indexes its second argument
        # by branch name (run / luminosity block) — confirm against its definition.
        mask = build_lumi_mask('Cert_271036-284044_13TeV_Legacy2016_Collisions16_JSON.txt', arrays, verbose=False)
        idx = idx & mask
    return idx


def my_mc_selection(arrays: ak.Array) -> ak.Array:
    """Run ``my_selection`` on a chunk in simulation mode (``is_data=False``)."""
    selected = my_selection(arrays, is_data=False)
    return selected


def my_data_selection(arrays: ak.Array) -> ak.Array:
    """Run ``my_selection`` on a chunk in collision-data mode (``is_data=True``)."""
    selected = my_selection(arrays, is_data=True)
    return selected



# Skim the Run2016H SingleMuon sample into data-filtered.root.
# The ":Events" suffix on the input selects the TTree to read; the separate
# "Events" argument names the TTree created in the output file.
# NOTE(review): this is a collision-data sample, yet the MC selection (no
# luminosity mask) is applied — confirm whether my_data_selection is intended.
process_with_selection(
    "root://eospublic.cern.ch//eos/opendata/cms/Run2016H/SingleMuon/NANOAOD/UL2016_MiniAODv2_NanoAODv9-v1/*/*.root:Events",
    "data-filtered.root",
    "Events",
    my_mc_selection,
    chunk_size=100000,
)


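# NOTE: glob.glob only expands local filesystem paths, so the root:// patterns
# below would all come back as empty lists. Pass the wildcard URL directly to
# uproot.iterate instead, as the call above does.
# import glob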

# datasets = {
#     "signal": glob.glob("root://eospublic.cern.ch//eos/opendata/cms/mc/RunIISummer20UL16NanoAODv9/ZprimeToTT_M2000_W20_TuneCP2_PSweights_13TeV-madgraph-pythiaMLM-pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v1/*/*.root"),
#     "tt_semilep": glob.glob("root://eospublic.cern.ch//eos/opendata/cms/mc/RunIISummer20UL16NanoAODv9/TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v1/*/*.root"),
#     "tt_had": glob.glob("root://eospublic.cern.ch//eos/opendata/cms/mc/RunIISummer20UL16NanoAODv9/TTToHadronic_TuneCP5_13TeV-powheg-pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v1/*/*.root"),
#     "tt_lep": glob.glob("root://eospublic.cern.ch//eos/opendata/cms/mc/RunIISummer20UL16NanoAODv9/TTTo2L2Nu_TuneCP5_13TeV-powheg-pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v1/*/*.root"),
#     "Wjets": glob.glob("root://eospublic.cern.ch//eos/opendata/cms/mc/RunIISummer20UL16NanoAODv9/WJetsToLNu_TuneCP5_13TeV-madgraphMLM-pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v1/*/*.root"),
#     "data": glob.glob("root://eospublic.cern.ch//eos/opendata/cms/Run2016H/SingleMuon/NANOAOD/UL2016_MiniAODv2_NanoAODv9-v1/*/*.root")
# }

# print(datasets)

# # Apply the processing with your selection
# for name in ["signal", "tt_semilep", "tt_had", "tt_lep", "Wjets"]:
#     file_path = datasets[name]
#     process_with_selection(
#         file_path,
#         f"{name}-filtered.root",
#         "Events",
#         my_mc_selection
#     )


Could not determine total entries: File did not open properly: [ERROR] Server responded with an error: [3011] Unable to open file /eos/opendata/cms/Run2016H/SingleMuon/NANOAOD/UL2016_MiniAODv2_NanoAODv9-v1/*/*.root; No such file or directory



Processing chunks: 0it [00:00, ?it/s]


KeyInFileError: not found: 'Events'

    Available keys: 'Electron_pt', 'Electron_eta', 'Electron_dz', 'Electron_mass', 'Electron_r9', 'Electron_dxy', 'Electron_hoe', 'Electron_phi', 'Electron_sieie', 'Electron_sip3d', 'Electron_ip3d', 'Electron_cutBased', 'Electron_jetIdx'...

in file root://eospublic.cern.ch:1094//eos/opendata/cms/Run2016H/SingleMuon/NANOAOD/UL2016_MiniAODv2_NanoAODv9-v1/120000/61FC1E38-F75C-6B44-AD19-A9894155874E.root
in object /Events;1