In [None]:
import os

In [None]:
import uproot, awkward
import numpy as np
import glob
import bcolz
import time
import os
import cupy
import shutil

In [None]:
# Collect the input ROOT files. glob returns paths in arbitrary (filesystem) order,
# so sort them: later slices such as fns[:5] then select the same files on every run.
# Note: without recursive=True the "**" component behaves like "*" and matches
# exactly one directory level.
fns = sorted(glob.glob("/nvmedata/store/data/Run2017E/SingleMuon/NANOAOD/Nano14Dec2018-v1/**/*.root"))

In [None]:
# Open one specific ROOT file by its full path.
# NOTE(review): `dd` is rebound inside the file loop of the following cell, so this
# handle looks like leftover interactive exploration — confirm before relying on it.
dd = uproot.open("/nvmedata/store/data/Run2017E/SingleMuon/NANOAOD/Nano14Dec2018-v1/10000/FCD50135-3590-2245-BA35-EFE64788BD9A.root")

In [None]:
# Branch names read from the "Events" object of every input file.
ks = ["Jet_pt", "Jet_eta", "Jet_phi", "Jet_mass"]

# Per-branch list of arrays, one entry per file; only the first five files are read.
arrs2 = {}
for fn in fns[:5]:
    dd = uproot.open(fn)
    arrs = dd.get("Events").arrays(ks)
    for k in ks:
        # The returned mapping is indexed with the ascii-encoded branch name.
        arrs2.setdefault(k, []).append(arrs[bytes(k, 'ascii')])

# Collapse each per-file list into a single concatenated jagged array.
for k in ks:
    arrs2[k] = awkward.array.jagged.JaggedArray.concatenate(arrs2[k])

In [None]:
# Configure bcolz's Blosc thread count to 16 for the benchmarks below.
# NOTE(review): 16 presumably matches the core count of the benchmark machine — confirm.
bcolz.blosc_set_nthreads(16)

In [None]:
class JaggedArrayStruct:
    """Struct-of-arrays container for one jagged collection.

    Holds an ``offsets`` array plus four flat content arrays (``pt``, ``eta``,
    ``phi``, ``mass``) and knows how to write them to / read them from three
    on-disk formats: ``.npz``, raw ``np.memmap`` files and bcolz carrays.
    All ``load_*`` methods pass the loaded data through ``cupy.array``.
    """

    # (attribute name, on-disk dtype) pairs. The order matches the positional
    # arguments of __init__, which the load_* methods rely on.
    attr_names_dtypes = [("offsets", np.int64), ("pt", np.float32), ("eta", np.float32), ("phi", np.float32), ("mass", np.float32)]

    def __init__(self, offsets, pt, eta, phi, mass):
        """Store the five arrays as attributes without copying or converting them."""
        self.offsets = offsets
        self.pt = pt
        self.eta = eta
        self.phi = phi
        self.mass = mass

    def size_mb(self):
        """Return the total size of all attribute arrays in MiB.

        Uses ``nbytes`` (bytes occupied by the array data) rather than ``size``
        (number of elements), so that dividing by a wall time yields a real
        MB/s figure.
        """
        total_bytes = 0
        for attr, dtype in JaggedArrayStruct.attr_names_dtypes:
            total_bytes += getattr(self, attr).nbytes
        return total_bytes/1024.0/1024.0

    def save_npz(self, fn):
        """Write all arrays into a single ``.npz`` archive at exactly the path ``fn``."""
        with open(fn, "wb") as fi:
            # Pass the open handle, not the path: given a path, np.savez would
            # open the file a second time and append ".npz" when it is missing,
            # leaving an empty file at `fn` that load_npz cannot read.
            np.savez(fi, offsets=self.offsets, pt=self.pt, eta=self.eta, phi=self.phi, mass=self.mass)

    def save_memmap(self, fn):
        """Write every array to its own raw file named ``<fn>.<attr>.mmap``.

        The data is stored with the dtype declared in ``attr_names_dtypes``
        because ``load_memmap`` interprets the raw bytes with that same dtype.
        """
        for attr, dtype in self.attr_names_dtypes:
            arr = getattr(self, attr)
            m = np.memmap(fn + ".{0}.mmap".format(attr), dtype=dtype, mode='w+',
                shape=(len(arr),)
            )
            m[:] = arr[:]
            # Push the data to disk before returning so callers timing this
            # method measure the actual write.
            m.flush()
            del m

    def save_bcolz(self, fn, clevel, cname):
        """Write every array as a bcolz carray under the directory ``fn``.

        Args:
            fn: output directory; each attribute goes to ``<fn>/<attr>``.
            clevel: compression level passed to ``bcolz.cparams``.
            cname: compressor name passed to ``bcolz.cparams``.
        """
        # The carrays are opened with mode='w', so an existing directory is fine.
        os.makedirs(fn, exist_ok=True)
        for attr, dtype in self.attr_names_dtypes:
            arr = getattr(self, attr)
            arr_bcolz = bcolz.carray(arr, rootdir="{0}/{1}".format(fn, attr), mode='w',
                expectedlen=len(arr), cparams=bcolz.cparams(clevel=clevel, cname=cname)
            )
            # Make sure buffered chunks reach the on-disk carray.
            arr_bcolz.flush()

    @staticmethod
    def load_bcolz(fn):
        """Open the carrays under directory ``fn`` and return a new struct of cupy arrays."""
        arrs = []
        for attr, dtype in JaggedArrayStruct.attr_names_dtypes:
            a = bcolz.open(rootdir="{0}/{1}".format(fn, attr))
            arrs += [cupy.array(a)]
        return JaggedArrayStruct(*arrs)

    @staticmethod
    def load_memmap(fn):
        """Map the ``<fn>.<attr>.mmap`` files read-only and return a new struct of cupy arrays."""
        arrs = []
        for attr, dtype in JaggedArrayStruct.attr_names_dtypes:
            # The raw file carries no metadata; the declared dtype defines its layout.
            m = np.memmap(fn + ".{0}.mmap".format(attr), dtype=dtype, mode='r')
            arr = cupy.array(m)
            arrs += [arr]
        return JaggedArrayStruct(*arrs)

    @staticmethod
    def load_npz(fn):
        """Read the ``.npz`` archive at ``fn`` and return a new struct of cupy arrays."""
        with open(fn, "rb") as fi:
            dd = np.load(fi)
            arrs = []
            for attr, dtype in JaggedArrayStruct.attr_names_dtypes:
                arrs += [cupy.array(dd[attr])]
            return JaggedArrayStruct(*arrs)

In [None]:
# Build the struct from the concatenated jet arrays: the offsets are taken from
# Jet_pt, and each field contributes its flat content array.
s = JaggedArrayStruct(
    offsets=arrs2["Jet_pt"].offsets,
    pt=arrs2["Jet_pt"].content,
    eta=arrs2["Jet_eta"].content,
    phi=arrs2["Jet_phi"].content,
    mass=arrs2["Jet_mass"].content,
)

In [None]:
import os
def get_size(start_path = '.'):
    """Return the summed size in bytes of every file found under ``start_path``.

    The tree is traversed with ``os.walk`` and each file is measured with
    ``os.path.getsize``; a tree without files yields 0.
    """
    return sum(
        os.path.getsize(os.path.join(folder, name))
        for folder, _, names in os.walk(start_path)
        for name in names
    )

In [None]:
# Benchmark bcolz: three save/load round trips of `s` to the directory `fn`.
fn = "/optane/jpata/test.bcolz"
for i in range(3):

    # Remove the previous output BEFORE starting the clock so that the cleanup
    # is not counted as save time.
    if os.path.isdir(fn):
        shutil.rmtree(fn,)

    t0 = time.time()
    s.save_bcolz(fn, 5, "blosclz")
    t1 = time.time()
    dt = t1 - t0

    # Report the on-disk size once, after the timer has stopped, so the
    # directory walk does not distort the first measurement.
    if i==0:
        print("size on disk: ", get_size(fn)/1024.0/1024.0)

    print("save bcolz {0:.2f} MB/s".format(s.size_mb()/dt))

    t0 = time.time()

    s2 = JaggedArrayStruct.load_bcolz(fn)

    t1 = time.time()
    dt = t1 - t0
    print("load bcolz {0:.2f} MB/s".format(s.size_mb()/dt))

In [None]:
# Benchmark np.memmap: three save/load round trips of `s` using the file prefix `fn`.
fn = "/nvmedata/test_mmap/test"

# save_memmap only creates files, not directories: make sure the target
# directory exists so the cell also works on a fresh machine.
out_dir = os.path.dirname(fn)
os.makedirs(out_dir, exist_ok=True)

for i in range(3):

    t0 = time.time()
    s.save_memmap(fn)

    t1 = time.time()
    dt = t1 - t0

    # Size is reported once, outside the timed region.
    if i==0:
        print("size on disk: ", get_size(out_dir)/1024.0/1024.0)

    print("save memmap {0:.2f} MB/s".format(s.size_mb()/dt))


    t0 = time.time()

    JaggedArrayStruct.load_memmap(fn)

    t1 = time.time()
    dt = t1 - t0
    print("load memmap {0:.2f} MB/s".format(s.size_mb()/dt))


In [None]:
# Time a single call to s.save_npz and print s.size_mb() divided by the elapsed seconds.
t0 = time.time()
s.save_npz("test.npz")
t1 = time.time()

dt = t1 - t0
rate = s.size_mb()/dt
print("{0:.2f} MB/s".format(rate))

In [None]:
# Time a single call to JaggedArrayStruct.load_npz and print s.size_mb() divided
# by the elapsed seconds; the loaded struct itself is discarded.
t0 = time.time()
JaggedArrayStruct.load_npz("test.npz")
t1 = time.time()

dt = t1 - t0
rate = s.size_mb()/dt
print("{0:.2f} MB/s".format(rate))