In [1]:
import os
from collections import namedtuple
import pandas
import numpy

# Record describing one dataset directory, as built by expand_dataset_info():
#   path           - the directory that was scanned
#   root_basenames - sorted name parts found before the first "_" of each entry
#   data_frames    - sorted name parts found between the first "_" and the first "."
#   suffix         - the file extension, including its leading "."
DatasetFileInfo = namedtuple("DatasetFileInfo", ["path", "root_basenames", "data_frames", "suffix"])


In [2]:
def expand_dataset_info(path):
    """Scan a dataset directory and summarise the files it contains.

    Every entry is expected to be named ``<root>_<data frame><suffix>``: the
    text before the first ``_`` is the root basename, the text between that
    ``_`` and the first ``.`` is the data-frame name, and everything from the
    first ``.`` onwards is the suffix.

    Entries that cannot follow this pattern are ignored: hidden entries
    (names starting with ``.``, e.g. ``.ipynb_checkpoints``), names without
    an ``_`` separator, and names whose root or data-frame part is empty.

    Parameters
    ----------
    path : str
        Directory to scan (passed to ``os.listdir``).

    Returns
    -------
    DatasetFileInfo
        ``(path, sorted root basenames, sorted data-frame names, suffix)``.
        For a directory without any matching entry both lists are empty and
        the suffix is ``""``.
    """
    root_basenames = set()
    data_frames = set()
    # Default keeps the function usable on an empty directory instead of
    # failing on an unbound local.
    suffix = ""

    # os.listdir order is arbitrary; sorting makes the reported suffix
    # deterministic when entries do not all share the same extension.
    for name in sorted(os.listdir(path)):
        if name.startswith("."):
            continue
        root, separator, remainder = name.partition("_")
        if not separator:
            continue
        frame, dot, extension = remainder.partition(".")
        if not root or not frame:
            continue
        root_basenames.add(root)
        data_frames.add(frame)
        # "dot" is empty for extension-less entries, which yields an empty
        # suffix rather than a dangling ".".
        suffix = dot + extension

    return DatasetFileInfo(path, sorted(root_basenames), sorted(data_frames), suffix)


In [3]:
def is_first_in_event(event):
    """Flag the entries that start a new run of identical event values.

    Parameters
    ----------
    event : array-like, 1-D
        Event identifier of every row, in row order. Lists, pandas Series
        and numpy arrays are accepted; the input is converted with
        ``numpy.asarray`` so that the comparison is purely positional.

    Returns
    -------
    numpy.ndarray
        Integer array with the same length as ``event``: 1 where the value
        differs from its predecessor (the first entry is always 1), 0
        otherwise. An empty input gives an empty array.
    """
    event = numpy.asarray(event)
    if event.size == 0:
        # Without this guard the leading 1 would produce a length-1 result
        # for a length-0 input.
        return numpy.zeros(0, dtype=int)
    return numpy.concatenate([[1], event[1:] != event[:-1]])


In [4]:
def indices_in_event(event):
    """Number the rows inside each run of identical event values.

    The first row of every run gets 0, the next one 1, and so on; the
    counter restarts whenever ``is_first_in_event`` flags a row.

    Parameters
    ----------
    event : numpy.ndarray, 1-D
        Event identifier of every row, in row order.

    Returns
    -------
    numpy.ndarray
        Integer array of the same length holding each row's position
        within its run.
    """
    positions = numpy.arange(len(event))
    # Keep the row position where a run starts and 0 elsewhere; the running
    # maximum then carries the start position of the current run forward.
    run_starts = numpy.where(is_first_in_event(event), positions, 0)
    return positions - numpy.maximum.accumulate(run_starts)


In [5]:
class ParquetSingleFileHandler(object):
    """Dict-like, on-demand reader for the data frames of one root file.

    Parameters
    ----------
    file_template : str
        Path containing one ``{}`` placeholder; it is filled with the
        data-frame name (via ``str.format``) to obtain the parquet path.
    data_frames : sequence of str
        Names of the data frames that may be requested.
    """

    def __init__(self, file_template, data_frames):
        self._file_template = file_template
        self._data_frames = data_frames

    def keys(self):
        """Return a copy of the available data-frame names."""
        return self._data_frames[:]

    def __repr__(self):
        return "<ParquetSingleFileHandler for " + self._file_template + ">"

    def __contains__(self, key):
        return key in self._data_frames

    def __getitem__(self, key):
        """Read the data frame called ``key`` from its parquet file.

        The ``"Scalar"`` frame is indexed by ``event``. Every other frame
        gets an extra column named after the frame, holding the position of
        each row within its event (see ``indices_in_event``), and is indexed
        by ``(event, <frame name>)`` with ``event`` cast to ``int``.

        Raises
        ------
        ValueError
            If ``key`` is not one of the known data-frame names.
        """
        if key not in self:
            # str() keeps the intended ValueError even for non-string keys,
            # where plain concatenation would raise TypeError instead.
            raise ValueError("Dataset does not contain data frame named " + str(key) + ".")
        df = pandas.read_parquet(self._file_template.format(key))
        if key == "Scalar":
            return df.set_index("event")
        # The per-event counter must be derived from the stored row order,
        # so it is computed before the frame is re-indexed.
        df[key] = indices_in_event(df["event"].values)
        df["event"] = df["event"].astype(int)
        return df.set_index(["event", key])



In [6]:
class ParquetFilesHandler(object):
    """Sequence-like view over all root files of a parquet dataset directory.

    The directory layout is discovered once, at construction time, through
    ``expand_dataset_info``. Indexing with an integer gives a
    ``ParquetSingleFileHandler`` for that root file; indexing with an
    ``(integer, data-frame name)`` pair gives the data frame itself.
    """

    def __init__(self, path):
        dataset_path, basenames, frame_names, suffix = expand_dataset_info(path)
        self._path = dataset_path
        self._root_basenames = basenames
        self._data_frames = frame_names
        self._suffix = suffix

    def __len__(self):
        """Number of root files in the dataset."""
        return len(self._root_basenames)

    def keys(self):
        """Return a copy of the data-frame names available in every file."""
        return self._data_frames[:]

    def __contains__(self, key):
        return key in self._data_frames

    def __getitem__(self, key):
        # Anything with a length is treated as an (index, data-frame name)
        # pair; everything else is taken as a bare file index.
        if hasattr(key, "__len__"):
            index, frame_name = key
        else:
            index, frame_name = key, None

        basename = self._root_basenames[index]
        template = os.path.join(self._path, basename + "_{}" + self._suffix)
        handler = ParquetSingleFileHandler(template, self._data_frames)

        if not frame_name:
            return handler
        return handler[frame_name]

    def __iter__(self):
        """Yield the single-file handler of every root file, in order."""
        total = len(self)
        for position in range(total):
            yield self[position]



In [7]:
def open(path):
    """Create a ParquetFilesHandler for the dataset directory ``path``.

    NOTE(review): this name shadows Python's built-in ``open`` for every
    cell executed after this one; use ``io.open`` or ``builtins.open`` when
    a regular file has to be opened.
    """
    handler = ParquetFilesHandler(path)
    return handler



In [8]:
def loc(df, selected_events, columns=None):
    """Select the rows of ``df`` that belong to the given events.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by event number, either directly (single index) or
        as one level of a MultiIndex. For a MultiIndex the level named
        ``"event"`` is used, falling back to the first level when no level
        carries that name.
    selected_events : array-like
        Event numbers to keep.
    columns : label or list of labels, optional
        Columns to return; all columns when omitted.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        The matching rows, as produced by ``df.loc``.
    """
    index = df.index
    if getattr(index, "nlevels", 1) > 1:
        # The raw values of a MultiIndex are tuples and would never compare
        # equal to an event number, so match on the event level only.
        level = "event" if "event" in index.names else 0
        events = index.get_level_values(level).values
    else:
        events = index.values

    # numpy.isin supersedes numpy.in1d, which is deprecated in NumPy 2.
    mask = numpy.isin(events, selected_events)
    if columns is None:
        return df.loc[mask]
    return df.loc[mask, columns]

In [9]:
# Directory holding the parquet files of the sample to analyse.
# NOTE(review): absolute, site-specific path - adjust it before running elsewhere.
data_dir='/data_CMS/cms/hakimi/WZ_analysis/samples/WZTo3LNu_TuneCUETP8M1_13TeV-powheg-pythia8/RunIISummer16NanoAODv6-PUMoriond17_Nano25Oct2019_102X_mcRun2_asymptotic_v7_ext1-v1/NANOAODSIM/parquet/'


In [10]:
# `open` is the helper defined in this notebook (it shadows the built-in) and
# returns a ParquetFilesHandler for the directory.
dataset = open(data_dir)

In [11]:
# Show the data-frame names found in the dataset directory.
# NOTE(review): this reads a private attribute; dataset.keys() returns a copy of the same list.
dataset._data_frames

['CorrT1METJet',
 'Electron',
 'FatJet',
 'FsrPhoton',
 'GenDressedLepton',
 'GenJet',
 'GenJetAK8',
 'GenPart',
 'GenVisTau',
 'IsoTrack',
 'Jet',
 'LHEPart',
 'LHEPdfWeight',
 'LHEReweightingWeight',
 'LHEScaleWeight',
 'Muon',
 'OtherPV',
 'PSWeight',
 'Photon',
 'SV',
 'Scalar',
 'SoftActivityJet',
 'SubGenJetAK8',
 'SubJet',
 'Tau',
 'TrigObj']

In [13]:
%%time
# Read the "Electron" data frame of every file in the dataset, one frame per
# file, showing a progress bar while loading.
from tqdm import tqdm
file_range= range(len(dataset))
df_elec=[dataset[i]["Electron"] for i in tqdm(file_range)]

100%|██████████| 14/14 [00:06<00:00,  2.01it/s]

CPU times: user 1min 2s, sys: 12.4 s, total: 1min 15s
Wall time: 6.89 s





In [14]:
# Preview the first rows of the Electron frame read from the first file.
df_elec[0].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Electron_deltaEtaSC,Electron_dr03EcalRecHitSumEt,Electron_dr03HcalDepth1TowerSumEt,Electron_dr03TkSumPt,Electron_dr03TkSumPtHEEP,Electron_dxy,Electron_dxyErr,Electron_dz,Electron_dzErr,Electron_eCorr,...,Electron_mvaFall17V2noIso_WP80,Electron_mvaFall17V2noIso_WP90,Electron_mvaFall17V2noIso_WPL,Electron_mvaSpring16GP_WP80,Electron_mvaSpring16GP_WP90,Electron_mvaSpring16HZZ_WPL,Electron_seedGain,Electron_genPartIdx,Electron_genPartFlav,Electron_cleanmask
event,Electron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
3,0,0.068176,0.0,0.0,0.0,0.0,0.001885,0.003662,-0.004566,0.004578,1.0,...,False,False,False,False,False,False,12,-1,0,1
4,0,-0.038269,0.0,0.0,0.0,0.0,-0.008774,0.003845,0.000833,0.004822,1.002267,...,True,True,True,False,True,True,12,19,1,1
4,1,-0.036163,0.0,0.0,0.0,0.0,-0.001481,0.002777,-0.000108,0.005005,1.008639,...,True,True,True,True,True,True,12,23,1,1
5,0,0.06897,0.0,0.0,0.0,0.0,-0.004002,0.002502,0.011337,0.004333,0.986106,...,True,True,True,True,True,True,12,21,1,1
2,0,-0.039215,1.277344,0.957031,0.0,0.0,0.006199,0.002991,-0.009575,0.003967,0.999152,...,True,True,True,True,True,True,12,23,1,1


Unnamed: 0_level_0,Unnamed: 1_level_0,Electron_deltaEtaSC,Electron_dr03EcalRecHitSumEt,Electron_dr03HcalDepth1TowerSumEt,Electron_dr03TkSumPt,Electron_dr03TkSumPtHEEP,Electron_dxy,Electron_dxyErr,Electron_dz,Electron_dzErr,Electron_eCorr,...,Electron_mvaFall17V2noIso_WP80,Electron_mvaFall17V2noIso_WP90,Electron_mvaFall17V2noIso_WPL,Electron_mvaSpring16GP_WP80,Electron_mvaSpring16GP_WP90,Electron_mvaSpring16HZZ_WPL,Electron_seedGain,Electron_genPartIdx,Electron_genPartFlav,Electron_cleanmask
event,Electron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2255191,0,-0.01281,0.0,0.0,0.0,0.0,-0.028183,0.011108,-5.207031,0.011841,1.01657,...,True,True,True,True,True,True,12,14,1,1
2255191,1,0.000936,0.0,0.0,0.0,0.0,-0.010521,0.032715,-5.210938,0.055664,1.040309,...,False,True,True,True,True,True,12,15,1,1
2255181,0,0.038605,0.0,0.0,0.0,0.0,0.000984,0.002686,0.005398,0.002686,1.007097,...,False,False,True,False,False,True,12,22,1,1
2255181,1,0.028885,0.0,0.0,0.0,0.0,-0.004784,0.007263,-0.001058,0.007812,0.972478,...,True,True,True,True,True,True,12,24,1,1
2255189,0,0.071106,1.699219,0.0,0.0,0.0,-0.000106,0.001465,-0.001097,0.001862,1.022241,...,True,True,True,True,True,True,12,17,1,1
