In [1]:
import awkward as ak
import cupy as cp
import cudf
import numpy as np
import akimbo.cudf
import subprocess

def gpu_mem():
    """Print the memory figure ``nvidia-smi`` lists for a Python process.

    Runs ``nvidia-smi`` directly (no shell), keeps only the report lines that
    contain ``"py"`` and prints the second-to-last whitespace-separated token
    of what remains. If fewer than two tokens survive the filter (no matching
    process line), a short notice is printed instead of raising.

    The match is a plain substring test, so any line mentioning "py" counts;
    with several matching processes the value printed comes from the last one.
    """
    report = subprocess.check_output(["nvidia-smi"]).decode(errors="replace")
    # Same selection the former `grep py` stage made, done in-process so that
    # "no match" is an ordinary case rather than a non-zero exit status.
    tokens = [
        token
        for line in report.splitlines()
        if "py" in line
        for token in line.split()
    ]
    if len(tokens) < 2:
        print("no python process found in nvidia-smi output")
        return
    print(tokens[-2])

ak.__version__, akimbo.__version__

('2.6.7', '2024.8.1.dev17+gff760f4.d20240812')

In [2]:
# Location of the demo parquet file. This is a machine-specific absolute
# path; change it here to run the notebook elsewhere.
PARQUET_PATH = "/floppy/code/awkward/s.parquet"

df = cudf.read_parquet(PARQUET_PATH)
gpu_mem()

170MiB


In [3]:
df.dtypes

a    list
dtype: object

In [4]:
df.iloc[0]  # each element is list-of-lists

a    [[1, 2, 3], [], [4, 5]]
Name: 0, dtype: list

In [5]:
# series accessor
df.a.ak

In [6]:
# the accessor exposes the ak.* namespace; many names mirror their numpy equivalents
dir(df.a.ak)

['Mask',
 'all',
 'almost_equal',
 'angle',
 'annotations',
 'any',
 'apply',
 'argcartesian',
 'argcombinations',
 'argmax',
 'argmin',
 'argsort',
 'array',
 'attrs',
 'awkward',
 'backend',
 'behavior',
 'behaviors',
 'broadcast_arrays',
 'broadcast_fields',
 'builder',
 'cartesian',
 'categories',
 'combinations',
 'concatenate',
 'contents',
 'copy',
 'corr',
 'count',
 'count_nonzero',
 'covar',
 'cpp_type',
 'cppyy',
 'drop_none',
 'dt',
 'enforce_type',
 'errors',
 'explode',
 'fields',
 'fill_none',
 'firsts',
 'flatten',
 'forms',
 'forth',
 'from_arrow',
 'from_arrow_schema',
 'from_avro_file',
 'from_buffers',
 'from_categorical',
 'from_cupy',
 'from_dlpack',
 'from_feather',
 'from_iter',
 'from_jax',
 'from_json',
 'from_numpy',
 'from_parquet',
 'from_rdataframe',
 'from_regular',
 'full_like',
 'highlevel',
 'imag',
 'index',
 'is_categorical',
 'is_none',
 'is_tuple',
 'is_valid',
 'isclose',
 'jax',
 'layout',
 'linear_fit',
 'local_index',
 'mask',
 'max',
 'mean',


In [7]:
df.a.ak.sum(axis=None)

array(28000000)

In [8]:
# if output was array-like, it stays on the GPU
# (the reduction is recomputed here rather than read from IPython's `_`,
# so this cell does not depend on which cell happened to run last)
type(df.a.ak.sum(axis=None))

cupy.ndarray

In [9]:
# fast reduction across three levels of nesting
%timeit df.a.ak.sum(axis=None)

4.15 ms ± 75.2 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
# ufunc maintains structure
np.negative(df.a.ak)

0          [[-1, -2, -3], [], [-4, -5]]
1          [[-1, -2, -3], [], [-4, -5]]
2          [[-1, -2, -3], [], [-4, -5]]
3          [[-1, -2, -3], [], [-4, -5]]
4          [[-1, -2, -3], [], [-4, -5]]
                       ...             
1999995                      [[-6, -7]]
1999996                      [[-6, -7]]
1999997                      [[-6, -7]]
1999998                      [[-6, -7]]
1999999                      [[-6, -7]]
Length: 2000000, dtype: list

In [11]:
gpu_mem()  # created new arrays on GPU, made new cuDF series

252MiB


In [12]:
# operator overload
print((df.a.ak + 1).head())

0    [[2, 3, 4], [], [5, 6]]
1    [[2, 3, 4], [], [5, 6]]
2    [[2, 3, 4], [], [5, 6]]
3    [[2, 3, 4], [], [5, 6]]
4    [[2, 3, 4], [], [5, 6]]
dtype: list


#### numba

In [13]:
import numba.cuda
ak.numba.register_and_check()

@numba.cuda.jit(extensions=[ak.numba.cuda])
def inner_sum(array, out):
    """CUDA kernel: store in ``out[i]`` the sum of all values nested under ``array[i]``.

    Each thread takes the entry at its 1-D grid index and walks the two
    nested levels beneath it. Threads whose index is not below
    ``len(array)`` do nothing.

    Parameters
    ----------
    array : indexable whose entries iterate to iterables of numbers
        (in this notebook, an awkward array passed through the
        ``ak.numba.cuda`` extension).
    out : writable array indexed by thread id; ``out[i]`` receives the total
        for entry ``i`` (0 for an entry with no values).
    """
    tid = numba.cuda.grid(1)
    if tid < len(array):
        # Accumulate in a thread-local variable and store once, rather than
        # doing a read-modify-write on out[tid] for every element.
        total = 0
        for x in array[tid]:
            for y in x:
                total += y
        out[tid] = total

# Launch configuration: one thread per row, rounded up to whole blocks.
n_rows = len(df.a)
blocksize = 256
numblocks = (n_rows + blocksize - 1) // blocksize

# Result buffer on the GPU; allocated uninitialised, the kernel writes out[tid]
# for each entry it processes.
out = cp.empty(n_rows, dtype="int32")


def launch_inner_sum(layout):
    """Run ``inner_sum`` over ``layout`` with the launch configuration above.

    ``ak.drop_none(..., axis=0)`` is applied before the kernel sees the data
    (presumably to strip the option type from the column; TODO confirm).
    Results land in the module-level ``out``; nothing is returned.
    """
    inner_sum[numblocks, blocksize](ak.drop_none(layout, axis=0), out)


df.a.ak.apply(launch_inner_sum)
out


array([15, 15, 15, ..., 13, 13, 13], dtype=int32)

In [14]:
%timeit df.a.ak.apply(lambda x: inner_sum[numblocks, blocksize](ak.drop_none(x, axis=0), out))

4.32 ms ± 119 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
gpu_mem() 

260MiB
