In [1]:
import setGPU
import numba, cupy
import uproot
import numpy as np
from numba import cuda

setGPU: Setting GPU to: 0


In [2]:
# Shell escape: print the driver's view of the installed GPUs (memory use, utilisation).
!nvidia-smi

Fri Oct 11 15:46:04 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX TIT...  On   | 00000000:0C:00.0 Off |                  N/A |
| 22%   36C    P8    16W / 250W |      1MiB / 12212MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce GTX TIT...  On   | 00000000:0D:00.0 Off |                  N/A |
| 22%   34C    P8    15W / 250W |      1MiB / 12212MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                            

In [3]:
# Open the input ROOT file with uproot.
# NOTE(review): absolute, user-specific path — the notebook only runs where this
# file exists; consider a configurable data directory.
fi = uproot.open("/storage/user/jpata/opendata_files/TTJets_SemiLeptMGDecays-merged/1.root")

In [4]:
# Look up the object stored under "aod2nanoaod/Events" in the opened file
# (presumably the per-event tree — confirm against the file layout).
tt = fi.get("aod2nanoaod/Events")

In [5]:
# Read the "Jet_pt" branch into memory. Later cells access it through its flat
# `.content` values and the `.offsets` boundaries that delimit each event.
pt = tt.array("Jet_pt")

In [65]:
# Inspect the outer shape of the array that was just read.
pt.shape

(2249911,)

In [66]:
def sumpt_event(data, offsets, ret):
    """Sum a flat array event by event (pure-Python baseline).

    Event ``e`` owns the half-open slice ``data[offsets[e]:offsets[e + 1]]``;
    its sum is written to ``ret[e]``.

    Parameters
    ----------
    data : indexable sequence of numbers
        Flat values of all events, concatenated.
    offsets : sequence of int
        Slice boundaries; ``len(offsets) - 1`` events are processed.
    ret : array-like
        Output buffer with at least ``len(offsets) - 1`` entries. It must
        accept ``ret[:] = 0`` (e.g. a NumPy array) and is overwritten in place.

    Returns
    -------
    The same ``ret`` object, for convenience.
    """
    # Clear the whole buffer first so repeated calls do not accumulate.
    ret[:] = 0
    n_events = len(offsets) - 1
    for event in range(n_events):
        lo, hi = offsets[event], offsets[event + 1]
        for k in range(lo, hi):
            ret[event] += data[k]
    return ret

# Output buffer: one float64 slot per event (len(offsets) - 1 events).
sum1 = np.zeros(len(pt.offsets)-1, dtype=np.float64)
# Time the pure-Python baseline on the flat content and its event offsets.
%timeit sumpt_event(pt.content, pt.offsets, sum1)

12.3 s ± 1.62 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [67]:
# Time the array's own sum() method, 3 loops per run, as a reference point
# (presumably the vectorised per-event sum — confirm it matches sumpt_event).
%timeit -n3 pt.sum()

87.9 ms ± 295 µs per loop (mean ± std. dev. of 7 runs, 3 loops each)


In [62]:
@numba.njit(parallel=True)
def sumpt_event(data, offsets, ret):
    """Per-event sum of a flat array, compiled by Numba with a parallel outer loop.

    Event ``e`` owns ``data[offsets[e]:offsets[e + 1]]``; its sum is stored in
    ``ret[e]``. ``ret`` is zeroed, filled in place and also returned.

    NOTE(review): this definition reuses the name ``sumpt_event`` of the
    pure-Python version defined in an earlier cell, so whichever cell was
    executed last decides which implementation a later call uses.
    """
    ret[:] = 0
    n_events = len(offsets) - 1
    # Each iteration writes only its own ret slot, so events are independent
    # and the outer loop can be distributed with prange.
    for event in numba.prange(n_events):
        lo = offsets[event]
        hi = offsets[event + 1]
        for k in range(lo, hi):
            ret[event] += data[k]
    return ret

In [68]:
# Output buffer for the Numba run: one float64 slot per event.
sum2 = np.zeros(len(pt.offsets)-1, dtype=np.float64)
# NOTE(review): the execution counts show the Numba definition ran as In [62],
# the pure-Python definition of the same name as In [66], and this cell as
# In [68]. The recorded ~12 s therefore most likely measures the pure-Python
# function, not the Numba one — re-run top to bottom on a fresh kernel to confirm.
# Also note that the first call of an njit function includes compilation time.
%timeit -n3 sumpt_event(pt.content, pt.offsets, sum2)

12 s ± 4.87 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


In [57]:
@cuda.jit
def sumpt_event_cudakernel(data, offsets, ret):
    """CUDA kernel: write the sum of each event's slice of ``data`` into ``ret``.

    Event ``e`` owns ``data[offsets[e]:offsets[e + 1]]``. The kernel returns
    nothing; results are stored in ``ret[e]``, which is reset to 0.0 before
    accumulation. Only the first ``offsets.shape[0] - 1`` entries of ``ret``
    are touched.
    """
    # Grid-stride loop: this thread starts at its global index and advances by
    # the total number of launched threads, so any launch size covers all events.
    first = cuda.grid(1)
    step = cuda.gridsize(1)
    n_events = offsets.shape[0] - 1

    for event in range(first, n_events, step):
        ret[event] = 0.0
        lo = offsets[event]
        hi = offsets[event + 1]
        # One thread sums one whole event sequentially.
        for k in range(lo, hi):
            ret[event] += data[k]

In [69]:
# Buffers for the CUDA kernel, created as cupy arrays.
# NOTE(review): the output here is float32 while the CPU buffers (sum1, sum2)
# are float64, so results may differ in the last digits — compare with a tolerance.
sum3 = cupy.zeros(len(pt.offsets)-1, dtype=cupy.float32)
# Flat values converted to float32.
d1 = cupy.array(pt.content, dtype=cupy.float32)
# Event boundaries converted to uint64.
d2 = cupy.array(pt.offsets, dtype=cupy.uint64)

In [70]:
%%timeit
# Launch configuration [128, 1024]: 128 blocks of 1024 threads each.
sumpt_event_cudakernel[128,1024](d1, d2, sum3)
# Wait for the kernel to finish so the measurement covers execution, not just the launch.
cuda.synchronize()

2.18 ms ± 14.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
