In [1]:
import os, sys, uproot
import numpy as np
import awkward
from tqdm import tqdm
import pandas as pd

In [10]:
work_dir='/grid_mnt/vol__vol_U__u/llr/cms/hakimi/WZ_analysis'
data_dir='/data_CMS/cms/hakimi/WZ_analysis/samples/WLLJJ_WToLNu_EWK_TuneCP5_13TeV_madgraph-madspin-pythia8/RunIIAutumn18NanoAODv6-Nano25Oct2019_102X_upgrade2018_realistic_v20-v1/NANOAODSIM/'
files=os.listdir(data_dir)
input_files=[f for f in files if ".root" in f]

In [3]:
def extract_scalar_data(events, branches, entrystop=None, progressbar=False):
    data = {}

    data["event"] = events.array("event", entrystop=entrystop)

    for br in tqdm(branches, disable=not progressbar):
        data[br] = events.array(br, entrystop=entrystop).flatten()

    return pd.DataFrame(data)

In [4]:
def extract_vector_data(events, branches, entrystop=10, progressbar=False):
    def get(branch, flat=True):
        a = events.array(branch, entrystop=entrystop)
        if flat:
            return a.flatten()
        else:
            return a

    if len(branches) == 0:
        return {}

    first_branch_jagged = get(branches[0], flat=False)
    first_branch_flat = first_branch_jagged.flatten()

    event_jagged = get("event") + awkward.JaggedArray(
        first_branch_jagged.starts, first_branch_jagged.stops, np.zeros(len(first_branch_flat), dtype=np.int)
    )

    data = {}

    data["event"] = event_jagged.flatten()

    data[branches[0]] = first_branch_flat

    for br in tqdm(branches, disable=not progressbar):
        if br == branches[0]:
            continue

        if br in events:
            data[br] = get(br)
        else:
            print('Warning! Branch "' + br + '" not found in input file and skipped.')

    return pd.DataFrame(data)


In [5]:
def nanoaod_to_parquet(input_files, out_dir, entrystop=None, input_prefix="", progressbar=False):
    def save_parquet(df, name):
        df.to_parquet(os.path.join(out_dir, name + ".parquet.gzip"), compression="gzip", index=False)

    out_dir = os.path.expanduser(out_dir)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    n_input_files = len(input_files)

    for i, input_file in enumerate(input_files):

        log_prefix = ""
        if i > 0:
            log_prefix = f"[{i+1}/{n_input_files}] "

        def log(s):
            print(log_prefix + s)

        log("Opening input file " + input_file)
        root_file = uproot.open(input_prefix + input_file)

        events = root_file["Events"]

        branches = [br.decode("ascii") for br in events.keys()]

        n_events = len(events[branches[0]])

        prefixes = list(set([b.split("_")[0] for b in branches]))

        vector_groups_present = [p for p in prefixes if "n" + p in events]

        scalar_branches = ["n" + s for s in vector_groups_present]

        for br in branches:
            if br == "event":
                continue
            if not br.split("_")[0] in vector_groups_present:
                scalar_branches.append(br)

        basename = os.path.basename(input_file)[:-5]

        log("Loading DataFrame for scalar branches")
        df_scalar = extract_scalar_data(events, scalar_branches, entrystop=entrystop, progressbar=progressbar)
        log("Saving DataFrame parquet Scalar")
        save_parquet(df_scalar, basename + "_Scalar")
        del df_scalar

        processed_branches = scalar_branches + ["event"]

        for group in vector_groups_present:
            log("Loading DataFrame for object group " + group)
            filtered_branches = list(filter(lambda br: br == group or br.startswith(group + "_"), branches))
            df = extract_vector_data(events, filtered_branches, entrystop=entrystop, progressbar=progressbar)
            log("Saving DataFrame parquet " + group)
            save_parquet(df, basename + "_" + group)
            processed_branches += filtered_branches
            del df

        # make sure we considered all the branches
        assert [b for b in branches if not b in processed_branches] == []



In [6]:
filename=data_dir+'/'+input_files[0]
root_file=uproot.open(filename)

In [7]:
events=root_file['Events']

branches=[br.decode('utf8') for br in events.keys()]

n_events=len(events[branches[0]])
print(n_events)

prefixes = list(set([b.split("_")[0] for b in branches]))

vector_groups_present = [p for p in prefixes if "n" + p in events]

scalar_branches = ["n" + s for s in vector_groups_present]

260400


In [8]:
out_dir=data_dir+'/parquet'
input_prefix=data_dir

In [11]:
nanoaod_to_parquet(input_files, out_dir, entrystop=None, input_prefix=data_dir, progressbar=True)

Opening input file ACD761A6-DBF4-2A49-A80D-E34FE198A9C4.root


  1%|          | 8/1144 [00:00<00:15, 73.86it/s]

Loading DataFrame for scalar branches


100%|██████████| 1144/1144 [00:11<00:00, 98.56it/s] 


Saving DataFrame parquet Scalar
Loading DataFrame for object group PSWeight


100%|██████████| 1/1 [00:00<00:00, 2414.68it/s]


Saving DataFrame parquet PSWeight
Loading DataFrame for object group GenPart


100%|██████████| 8/8 [00:04<00:00,  2.25it/s]


Saving DataFrame parquet GenPart
Loading DataFrame for object group GenDressedLepton


100%|██████████| 6/6 [00:00<00:00, 12.70it/s]


Saving DataFrame parquet GenDressedLepton
Loading DataFrame for object group LHEPart


100%|██████████| 5/5 [00:00<00:00,  5.33it/s]


Saving DataFrame parquet LHEPart


  0%|          | 0/42 [00:00<?, ?it/s]

Loading DataFrame for object group FatJet


100%|██████████| 42/42 [00:03<00:00, 14.80it/s]


Saving DataFrame parquet FatJet
Loading DataFrame for object group GenJetAK8


100%|██████████| 6/6 [00:00<00:00, 11.35it/s]


Saving DataFrame parquet GenJetAK8
Loading DataFrame for object group IsoTrack


100%|██████████| 14/14 [00:00<00:00, 19.73it/s]


Saving DataFrame parquet IsoTrack


  0%|          | 0/14 [00:00<?, ?it/s]

Loading DataFrame for object group SubJet


100%|██████████| 14/14 [00:01<00:00, 13.46it/s]


Saving DataFrame parquet SubJet
Loading DataFrame for object group Tau


100%|██████████| 46/46 [00:03<00:00, 15.37it/s]


Saving DataFrame parquet Tau
Loading DataFrame for object group LHEPdfWeight


100%|██████████| 1/1 [00:00<00:00, 6523.02it/s]


Saving DataFrame parquet LHEPdfWeight
Loading DataFrame for object group LHEScaleWeight


100%|██████████| 1/1 [00:00<00:00, 2478.90it/s]


Saving DataFrame parquet LHEScaleWeight


  0%|          | 0/51 [00:00<?, ?it/s]

Loading DataFrame for object group Muon


100%|██████████| 51/51 [00:03<00:00, 18.90it/s]


Saving DataFrame parquet Muon
Loading DataFrame for object group SoftActivityJet


100%|██████████| 3/3 [00:00<00:00,  6.25it/s]


Saving DataFrame parquet SoftActivityJet


  0%|          | 0/61 [00:00<?, ?it/s]

Loading DataFrame for object group Electron


100%|██████████| 61/61 [00:04<00:00, 14.62it/s]


Saving DataFrame parquet Electron
Loading DataFrame for object group CorrT1METJet


100%|██████████| 5/5 [00:01<00:00,  3.77it/s]


Saving DataFrame parquet CorrT1METJet
Loading DataFrame for object group Jet


100%|██████████| 36/36 [00:06<00:00,  5.67it/s]


Saving DataFrame parquet Jet
Loading DataFrame for object group Photon


100%|██████████| 30/30 [00:02<00:00, 16.73it/s]


Saving DataFrame parquet Photon


100%|██████████| 1/1 [00:00<00:00, 2880.70it/s]

Loading DataFrame for object group LHEReweightingWeight
Saving DataFrame parquet LHEReweightingWeight
Loading DataFrame for object group GenJet



100%|██████████| 6/6 [00:01<00:00,  5.02it/s]


Saving DataFrame parquet GenJet
Loading DataFrame for object group SV


100%|██████████| 14/14 [00:01<00:00, 12.97it/s]


Saving DataFrame parquet SV


  0%|          | 0/7 [00:00<?, ?it/s]

Loading DataFrame for object group GenVisTau


100%|██████████| 7/7 [00:00<00:00, 20.42it/s]


Saving DataFrame parquet GenVisTau
Loading DataFrame for object group TrigObj


100%|██████████| 10/10 [00:02<00:00,  5.63it/s]


Saving DataFrame parquet TrigObj


  0%|          | 0/6 [00:00<?, ?it/s]

Loading DataFrame for object group FsrPhoton


100%|██████████| 6/6 [00:00<00:00, 33.33it/s]


Saving DataFrame parquet FsrPhoton
Loading DataFrame for object group OtherPV


100%|██████████| 1/1 [00:00<00:00, 7598.38it/s]


Saving DataFrame parquet OtherPV
Loading DataFrame for object group SubGenJetAK8


100%|██████████| 4/4 [00:00<00:00,  7.07it/s]


Saving DataFrame parquet SubGenJetAK8
[2/3] Opening input file 5B67AFF3-5CDC-8F4B-9FE6-AF31B1CD248C.root


  1%|▏         | 17/1144 [00:00<00:06, 165.35it/s]

[2/3] Loading DataFrame for scalar branches


100%|██████████| 1144/1144 [00:05<00:00, 209.08it/s]


[2/3] Saving DataFrame parquet Scalar


100%|██████████| 1/1 [00:00<00:00, 6990.51it/s]

[2/3] Loading DataFrame for object group PSWeight





[2/3] Saving DataFrame parquet PSWeight
[2/3] Loading DataFrame for object group GenPart


100%|██████████| 8/8 [00:02<00:00,  4.80it/s]


[2/3] Saving DataFrame parquet GenPart


  0%|          | 0/6 [00:00<?, ?it/s]

[2/3] Loading DataFrame for object group GenDressedLepton


100%|██████████| 6/6 [00:00<00:00, 27.96it/s]


[2/3] Saving DataFrame parquet GenDressedLepton
[2/3] Loading DataFrame for object group LHEPart


100%|██████████| 5/5 [00:00<00:00, 11.19it/s]


[2/3] Saving DataFrame parquet LHEPart


 10%|▉         | 4/42 [00:00<00:01, 35.56it/s]

[2/3] Loading DataFrame for object group FatJet


100%|██████████| 42/42 [00:01<00:00, 27.58it/s]


[2/3] Saving DataFrame parquet FatJet


  0%|          | 0/6 [00:00<?, ?it/s]

[2/3] Loading DataFrame for object group GenJetAK8


100%|██████████| 6/6 [00:00<00:00, 27.00it/s]


[2/3] Saving DataFrame parquet GenJetAK8


 29%|██▊       | 4/14 [00:00<00:00, 39.96it/s]

[2/3] Loading DataFrame for object group IsoTrack


100%|██████████| 14/14 [00:00<00:00, 40.29it/s]


[2/3] Saving DataFrame parquet IsoTrack


 29%|██▊       | 4/14 [00:00<00:00, 38.51it/s]

[2/3] Loading DataFrame for object group SubJet


100%|██████████| 14/14 [00:00<00:00, 33.41it/s]


[2/3] Saving DataFrame parquet SubJet


  0%|          | 0/46 [00:00<?, ?it/s]

[2/3] Loading DataFrame for object group Tau


100%|██████████| 46/46 [00:01<00:00, 30.10it/s]


[2/3] Saving DataFrame parquet Tau
[2/3] Loading DataFrame for object group LHEPdfWeight


100%|██████████| 1/1 [00:00<00:00, 2908.67it/s]


[2/3] Saving DataFrame parquet LHEPdfWeight
[2/3] Loading DataFrame for object group LHEScaleWeight


100%|██████████| 1/1 [00:00<00:00, 3480.75it/s]


[2/3] Saving DataFrame parquet LHEScaleWeight


  0%|          | 0/51 [00:00<?, ?it/s]

[2/3] Loading DataFrame for object group Muon


100%|██████████| 51/51 [00:01<00:00, 31.51it/s]


[2/3] Saving DataFrame parquet Muon
[2/3] Loading DataFrame for object group SoftActivityJet


100%|██████████| 3/3 [00:00<00:00, 13.40it/s]


[2/3] Saving DataFrame parquet SoftActivityJet


  0%|          | 0/61 [00:00<?, ?it/s]

[2/3] Loading DataFrame for object group Electron


100%|██████████| 61/61 [00:02<00:00, 29.47it/s]


[2/3] Saving DataFrame parquet Electron


  0%|          | 0/5 [00:00<?, ?it/s]

[2/3] Loading DataFrame for object group CorrT1METJet


100%|██████████| 5/5 [00:00<00:00,  8.99it/s]


[2/3] Saving DataFrame parquet CorrT1METJet


  0%|          | 0/36 [00:00<?, ?it/s]

[2/3] Loading DataFrame for object group Jet


100%|██████████| 36/36 [00:03<00:00, 11.32it/s]


[2/3] Saving DataFrame parquet Jet


  0%|          | 0/30 [00:00<?, ?it/s]

[2/3] Loading DataFrame for object group Photon


100%|██████████| 30/30 [00:00<00:00, 30.88it/s]


[2/3] Saving DataFrame parquet Photon


100%|██████████| 1/1 [00:00<00:00, 2659.67it/s]

[2/3] Loading DataFrame for object group LHEReweightingWeight
[2/3] Saving DataFrame parquet LHEReweightingWeight
[2/3] Loading DataFrame for object group GenJet



100%|██████████| 6/6 [00:00<00:00, 10.83it/s]


[2/3] Saving DataFrame parquet GenJet


  0%|          | 0/14 [00:00<?, ?it/s]

[2/3] Loading DataFrame for object group SV


100%|██████████| 14/14 [00:00<00:00, 27.24it/s]


[2/3] Saving DataFrame parquet SV


  0%|          | 0/7 [00:00<?, ?it/s]

[2/3] Loading DataFrame for object group GenVisTau


100%|██████████| 7/7 [00:00<00:00, 37.23it/s]


[2/3] Saving DataFrame parquet GenVisTau
[2/3] Loading DataFrame for object group TrigObj


100%|██████████| 10/10 [00:01<00:00,  9.91it/s]


[2/3] Saving DataFrame parquet TrigObj


100%|██████████| 6/6 [00:00<00:00, 59.89it/s]

[2/3] Loading DataFrame for object group FsrPhoton
[2/3] Saving DataFrame parquet FsrPhoton



100%|██████████| 1/1 [00:00<00:00, 8112.77it/s]


[2/3] Loading DataFrame for object group OtherPV
[2/3] Saving DataFrame parquet OtherPV


  0%|          | 0/4 [00:00<?, ?it/s]

[2/3] Loading DataFrame for object group SubGenJetAK8


100%|██████████| 4/4 [00:00<00:00, 16.49it/s]


[2/3] Saving DataFrame parquet SubGenJetAK8
[3/3] Opening input file 5AB48A52-FD72-1849-B6A7-88F510896145.root


  0%|          | 3/1144 [00:00<00:43, 26.17it/s]

[3/3] Loading DataFrame for scalar branches


100%|██████████| 1144/1144 [00:28<00:00, 40.62it/s]


[3/3] Saving DataFrame parquet Scalar
[3/3] Loading DataFrame for object group PSWeight


100%|██████████| 1/1 [00:00<00:00, 7463.17it/s]


[3/3] Saving DataFrame parquet PSWeight
[3/3] Loading DataFrame for object group GenPart


100%|██████████| 8/8 [00:10<00:00,  1.03s/it]


[3/3] Saving DataFrame parquet GenPart
[3/3] Loading DataFrame for object group GenDressedLepton


100%|██████████| 6/6 [00:01<00:00,  5.51it/s]


[3/3] Saving DataFrame parquet GenDressedLepton
[3/3] Loading DataFrame for object group LHEPart


100%|██████████| 5/5 [00:02<00:00,  2.33it/s]


[3/3] Saving DataFrame parquet LHEPart
[3/3] Loading DataFrame for object group FatJet


100%|██████████| 42/42 [00:07<00:00,  7.06it/s]


[3/3] Saving DataFrame parquet FatJet
[3/3] Loading DataFrame for object group GenJetAK8


100%|██████████| 6/6 [00:01<00:00,  5.67it/s]


[3/3] Saving DataFrame parquet GenJetAK8
[3/3] Loading DataFrame for object group IsoTrack


100%|██████████| 14/14 [00:01<00:00,  8.47it/s]


[3/3] Saving DataFrame parquet IsoTrack
[3/3] Loading DataFrame for object group SubJet


100%|██████████| 14/14 [00:02<00:00,  6.94it/s]


[3/3] Saving DataFrame parquet SubJet
[3/3] Loading DataFrame for object group Tau


100%|██████████| 46/46 [00:09<00:00,  6.48it/s]


[3/3] Saving DataFrame parquet Tau
[3/3] Loading DataFrame for object group LHEPdfWeight


100%|██████████| 1/1 [00:00<00:00, 2490.68it/s]


[3/3] Saving DataFrame parquet LHEPdfWeight
[3/3] Loading DataFrame for object group LHEScaleWeight


100%|██████████| 1/1 [00:00<00:00, 2587.48it/s]


[3/3] Saving DataFrame parquet LHEScaleWeight
[3/3] Loading DataFrame for object group Muon


100%|██████████| 51/51 [00:08<00:00,  7.71it/s]


[3/3] Saving DataFrame parquet Muon
[3/3] Loading DataFrame for object group SoftActivityJet


100%|██████████| 3/3 [00:01<00:00,  2.85it/s]


[3/3] Saving DataFrame parquet SoftActivityJet
[3/3] Loading DataFrame for object group Electron


100%|██████████| 61/61 [00:10<00:00,  7.97it/s]


[3/3] Saving DataFrame parquet Electron
[3/3] Loading DataFrame for object group CorrT1METJet


100%|██████████| 5/5 [00:02<00:00,  1.72it/s]


[3/3] Saving DataFrame parquet CorrT1METJet
[3/3] Loading DataFrame for object group Jet


100%|██████████| 36/36 [00:15<00:00,  5.25it/s]


[3/3] Saving DataFrame parquet Jet
[3/3] Loading DataFrame for object group Photon


100%|██████████| 30/30 [00:05<00:00,  7.83it/s]


[3/3] Saving DataFrame parquet Photon
[3/3] Loading DataFrame for object group LHEReweightingWeight


100%|██████████| 1/1 [00:00<00:00, 6990.51it/s]


[3/3] Saving DataFrame parquet LHEReweightingWeight
[3/3] Loading DataFrame for object group GenJet


100%|██████████| 6/6 [00:02<00:00,  2.64it/s]


[3/3] Saving DataFrame parquet GenJet
[3/3] Loading DataFrame for object group SV


100%|██████████| 14/14 [00:02<00:00,  4.55it/s]


[3/3] Saving DataFrame parquet SV
[3/3] Loading DataFrame for object group GenVisTau


100%|██████████| 7/7 [00:00<00:00,  8.06it/s]


[3/3] Saving DataFrame parquet GenVisTau
[3/3] Loading DataFrame for object group TrigObj


100%|██████████| 10/10 [00:05<00:00,  2.43it/s]


[3/3] Saving DataFrame parquet TrigObj
[3/3] Loading DataFrame for object group FsrPhoton


100%|██████████| 6/6 [00:00<00:00, 12.16it/s]


[3/3] Saving DataFrame parquet FsrPhoton
[3/3] Loading DataFrame for object group OtherPV


100%|██████████| 1/1 [00:00<00:00, 4369.07it/s]


[3/3] Saving DataFrame parquet OtherPV
[3/3] Loading DataFrame for object group SubGenJetAK8


100%|██████████| 4/4 [00:01<00:00,  3.04it/s]


[3/3] Saving DataFrame parquet SubGenJetAK8
