In [None]:
import os, sys, uproot
import numpy as np
import awkward
from tqdm import tqdm
import pandas as pd

In [20]:
work_dir='/grid_mnt/vol__vol_U__u/llr/cms/hakimi/WZ_analysis'
data_dir='/data_CMS/cms/hakimi/WZ_analysis/samples/WZTo3LNu_TuneCUETP8M1_13TeV-powheg-pythia8/RunIISummer16NanoAODv6-PUMoriond17_Nano25Oct2019_102X_mcRun2_asymptotic_v7_ext1-v1/NANOAODSIM/'
input_files=os.listdir(data_dir)
print(input_files)

['E037C3F4-35A9-2043-9B69-F9E87C026901.root', 'DC4746E0-ABE0-BB46-84D1-5E544CD88101.root', 'D293F7E8-8B3B-1344-83A0-ED51D54198E0.root', 'C91964CB-5E00-A245-B1E8-F1BA4547E113.root', 'A6702E73-CFE1-6D40-99F2-F815D928814B.root', '83B9A4A8-293F-BF4B-9D1C-4A8905BBE50A.root', '7CCF75DB-411A-F44F-B2A0-34A82E4FE31A.root', '79029903-D90C-5F4D-BBF3-C76D4026CBDC.root', '67477F26-322E-704F-9F70-4A3EF57D93BA.root', '668D2A08-0842-B64C-979C-DE2FC179A80A.root', '62BE2E52-9CEF-1D4C-8D18-650C874FCB89.root', '384CFBD6-83A4-FA4C-9681-35E8A7F17066.root', '31622C40-2B3E-7646-AA6A-AF25CC483FA1.root', '1959EF33-8FC0-1041-B77A-1B6657FCA9D1.root', 'parquet']


In [22]:
def extract_scalar_data(events, branches, entrystop=None, progressbar=False):
    data = {}

    data["event"] = events.array("event", entrystop=entrystop)

    for br in tqdm(branches, disable=not progressbar):
        data[br] = events.array(br, entrystop=entrystop).flatten()

    return pd.DataFrame(data)

In [16]:
def extract_vector_data(events, branches, entrystop=10, progressbar=False):
    def get(branch, flat=True):
        a = events.array(branch, entrystop=entrystop)
        if flat:
            return a.flatten()
        else:
            return a

    if len(branches) == 0:
        return {}

    first_branch_jagged = get(branches[0], flat=False)
    first_branch_flat = first_branch_jagged.flatten()

    event_jagged = get("event") + awkward.JaggedArray(
        first_branch_jagged.starts, first_branch_jagged.stops, np.zeros(len(first_branch_flat), dtype=np.int)
    )

    data = {}

    data["event"] = event_jagged.flatten()

    data[branches[0]] = first_branch_flat

    for br in tqdm(branches, disable=not progressbar):
        if br == branches[0]:
            continue

        if br in events:
            data[br] = get(br)
        else:
            print('Warning! Branch "' + br + '" not found in input file and skipped.')

    return pd.DataFrame(data)


In [17]:
def nanoaod_to_parquet(input_files, out_dir, entrystop=None, input_prefix="", progressbar=False):
    def save_parquet(df, name):
        df.to_parquet(os.path.join(out_dir, name + ".parquet.gzip"), compression="gzip", index=False)

    out_dir = os.path.expanduser(out_dir)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    n_input_files = len(input_files)

    for i, input_file in enumerate(input_files):

        log_prefix = ""
        if i > 0:
            log_prefix = f"[{i+1}/{n_input_files}] "

        def log(s):
            print(log_prefix + s)

        log("Opening input file " + input_file)
        root_file = uproot.open(input_prefix + input_file)

        events = root_file["Events"]

        branches = [br.decode("ascii") for br in events.keys()]

        n_events = len(events[branches[0]])

        prefixes = list(set([b.split("_")[0] for b in branches]))

        vector_groups_present = [p for p in prefixes if "n" + p in events]

        scalar_branches = ["n" + s for s in vector_groups_present]

        for br in branches:
            if br == "event":
                continue
            if not br.split("_")[0] in vector_groups_present:
                scalar_branches.append(br)

        basename = os.path.basename(input_file)[:-5]

        log("Loading DataFrame for scalar branches")
        df_scalar = extract_scalar_data(events, scalar_branches, entrystop=entrystop, progressbar=progressbar)
        log("Saving DataFrame parquet Scalar")
        save_parquet(df_scalar, basename + "_Scalar")
        del df_scalar

        processed_branches = scalar_branches + ["event"]

        for group in vector_groups_present:
            log("Loading DataFrame for object group " + group)
            filtered_branches = list(filter(lambda br: br == group or br.startswith(group + "_"), branches))
            df = extract_vector_data(events, filtered_branches, entrystop=entrystop, progressbar=progressbar)
            log("Saving DataFrame parquet " + group)
            save_parquet(df, basename + "_" + group)
            processed_branches += filtered_branches
            del df

        # make sure we considered all the branches
        assert [b for b in branches if not b in processed_branches] == []



In [10]:
filename=data_dir+'/'+input_files[0]
root_file=uproot.open(filename)

In [11]:
events=root_file['Events']

branches=[br.decode('utf8') for br in events.keys()]

n_events=len(events[branches[0]])
print(n_events)

prefixes = list(set([b.split("_")[0] for b in branches]))

vector_groups_present = [p for p in prefixes if "n" + p in events]

scalar_branches = ["n" + s for s in vector_groups_present]

1438010


In [12]:
branches

['run',
 'luminosityBlock',
 'event',
 'HTXS_Higgs_pt',
 'HTXS_Higgs_y',
 'HTXS_stage1_1_cat_pTjet25GeV',
 'HTXS_stage1_1_cat_pTjet30GeV',
 'HTXS_stage1_1_fine_cat_pTjet25GeV',
 'HTXS_stage1_1_fine_cat_pTjet30GeV',
 'HTXS_stage_0',
 'HTXS_stage_1_pTjet25',
 'HTXS_stage_1_pTjet30',
 'HTXS_njets25',
 'HTXS_njets30',
 'btagWeight_CSVV2',
 'btagWeight_CMVA',
 'CaloMET_phi',
 'CaloMET_pt',
 'CaloMET_sumEt',
 'ChsMET_phi',
 'ChsMET_pt',
 'ChsMET_sumEt',
 'nCorrT1METJet',
 'CorrT1METJet_area',
 'CorrT1METJet_eta',
 'CorrT1METJet_muonSubtrFactor',
 'CorrT1METJet_phi',
 'CorrT1METJet_rawPt',
 'nElectron',
 'Electron_deltaEtaSC',
 'Electron_dr03EcalRecHitSumEt',
 'Electron_dr03HcalDepth1TowerSumEt',
 'Electron_dr03TkSumPt',
 'Electron_dr03TkSumPtHEEP',
 'Electron_dxy',
 'Electron_dxyErr',
 'Electron_dz',
 'Electron_dzErr',
 'Electron_eCorr',
 'Electron_eInvMinusPInv',
 'Electron_energyErr',
 'Electron_eta',
 'Electron_hoe',
 'Electron_ip3d',
 'Electron_jetPtRelv2',
 'Electron_jetRelIso',
 'Elect

In [15]:
out_dir=data_dir+'/parquet'
input_prefix=data_dir

In [None]:
nanoaod_to_parquet(input_files, out_dir, entrystop=None, input_prefix=data_dir, progressbar=True)

Opening input file E037C3F4-35A9-2043-9B69-F9E87C026901.root


  0%|          | 0/1038 [00:00<?, ?it/s]

Loading DataFrame for scalar branches


100%|██████████| 1038/1038 [00:45<00:00, 22.88it/s]


Saving DataFrame parquet Scalar
Loading DataFrame for object group GenDressedLepton


100%|██████████| 6/6 [00:01<00:00,  3.68it/s]


Saving DataFrame parquet GenDressedLepton
Loading DataFrame for object group LHEReweightingWeight


100%|██████████| 1/1 [00:00<00:00, 2629.66it/s]


Saving DataFrame parquet LHEReweightingWeight
Loading DataFrame for object group Jet


100%|██████████| 36/36 [00:19<00:00,  3.29it/s]


Saving DataFrame parquet Jet
Loading DataFrame for object group SubGenJetAK8


100%|██████████| 4/4 [00:00<00:00,  5.00it/s]


Saving DataFrame parquet SubGenJetAK8
Loading DataFrame for object group SoftActivityJet


100%|██████████| 3/3 [00:02<00:00,  1.30it/s]


Saving DataFrame parquet SoftActivityJet
Loading DataFrame for object group GenVisTau


100%|██████████| 7/7 [00:01<00:00,  5.07it/s]


Saving DataFrame parquet GenVisTau
Loading DataFrame for object group LHEPart


100%|██████████| 5/5 [00:04<00:00,  1.23it/s]


Saving DataFrame parquet LHEPart
Loading DataFrame for object group SV


100%|██████████| 14/14 [00:03<00:00,  4.14it/s]


Saving DataFrame parquet SV
Loading DataFrame for object group PSWeight


100%|██████████| 1/1 [00:00<00:00, 5983.32it/s]


Saving DataFrame parquet PSWeight
Loading DataFrame for object group LHEPdfWeight


100%|██████████| 1/1 [00:00<00:00, 2642.91it/s]


Saving DataFrame parquet LHEPdfWeight
Loading DataFrame for object group FatJet


100%|██████████| 42/42 [00:06<00:00,  7.17it/s]


Saving DataFrame parquet FatJet
Loading DataFrame for object group Electron


100%|██████████| 71/71 [00:20<00:00,  4.61it/s]


Saving DataFrame parquet Electron
Loading DataFrame for object group GenPart


100%|██████████| 8/8 [00:18<00:00,  1.79s/it]


Saving DataFrame parquet GenPart
Loading DataFrame for object group Tau


100%|██████████| 46/46 [00:14<00:00,  3.78it/s]


Saving DataFrame parquet Tau
Loading DataFrame for object group SubJet


100%|██████████| 14/14 [00:02<00:00,  6.75it/s]


Saving DataFrame parquet SubJet
Loading DataFrame for object group IsoTrack


100%|██████████| 14/14 [00:02<00:00,  6.14it/s]


Saving DataFrame parquet IsoTrack
Loading DataFrame for object group Photon


100%|██████████| 32/32 [00:08<00:00,  4.42it/s]


Saving DataFrame parquet Photon
Loading DataFrame for object group GenJetAK8


100%|██████████| 6/6 [00:00<00:00,  5.93it/s]


Saving DataFrame parquet GenJetAK8
Loading DataFrame for object group LHEScaleWeight


100%|██████████| 1/1 [00:00<00:00, 2510.06it/s]


Saving DataFrame parquet LHEScaleWeight
Loading DataFrame for object group TrigObj


100%|██████████| 10/10 [00:06<00:00,  2.27it/s]


Saving DataFrame parquet TrigObj
Loading DataFrame for object group GenJet


100%|██████████| 6/6 [00:03<00:00,  1.83it/s]


Saving DataFrame parquet GenJet
Loading DataFrame for object group Muon


100%|██████████| 50/50 [00:13<00:00,  4.49it/s]


Saving DataFrame parquet Muon
Loading DataFrame for object group CorrT1METJet


100%|██████████| 5/5 [00:04<00:00,  1.08it/s]


Saving DataFrame parquet CorrT1METJet
Loading DataFrame for object group OtherPV


100%|██████████| 1/1 [00:00<00:00, 2011.66it/s]


Saving DataFrame parquet OtherPV
Loading DataFrame for object group FsrPhoton


100%|██████████| 6/6 [00:00<00:00,  8.03it/s]


Saving DataFrame parquet FsrPhoton
[2/15] Opening input file DC4746E0-ABE0-BB46-84D1-5E544CD88101.root


  0%|          | 0/1038 [00:00<?, ?it/s]

[2/15] Loading DataFrame for scalar branches


100%|██████████| 1038/1038 [00:52<00:00, 19.70it/s]


[2/15] Saving DataFrame parquet Scalar


In [24]:
import pyarrow
