# Lesson 5: Python on GPUs (workbook)

In [None]:
# Python standard library
import math

# Scientific Python ecosystem
import numpy as np
import cupy as cp
import numba as nb
import numba.cuda

# Particle physics tools
import awkward as ak
import uproot

## CuPy

In [None]:
# NumPy array of 100 million random numbers drawn uniformly between 0 and 1;
# this is the CPU-side (RAM) array used by the timing cells below.
array_in_ram = np.random.uniform(0, 1, 100000000)

In [None]:
# CuPy counterpart of array_in_ram: cp.asarray converts the NumPy array into a
# CuPy array (the "from RAM to GPU" transfer timed in a later cell).
array_on_gpu = cp.asarray(array_in_ram)

In [None]:
%%timeit -r1 -n10

# CPU baseline: add 1 to every element of the NumPy array, in place.
# Each of the 10 timed loops mutates array_in_ram again.
array_in_ram[:] += 1

In [None]:
%%timeit -r1 -n10

# GPU version of the same in-place "+= 1", applied to the CuPy array.
# NOTE(review): CuPy launches kernels asynchronously, so this may measure
# less than the full execution time unless the device is synchronized —
# confirm how the reported number should be interpreted.
array_on_gpu[:] += 1

In [None]:
%%timeit -r1 -n10

# Time the transfer itself; the resulting CuPy array is discarded each loop.
cp.asarray(array_in_ram)   # from RAM to GPU

In [None]:
%%timeit -r1 -n10

# Time the opposite transfer; the returned host array is discarded each loop.
array_on_gpu.get()         # from GPU to RAM

In [None]:
# Six-element NumPy array, so the valid indices are 0..5.
array = np.array([0.0, 1.1, 2.2, 3.3, 4.4, 5.5])

# Integer-array indexing where 6, 7 and 8 are past the end of the array.
# NumPy is expected to reject these with an IndexError; compare with the
# CuPy version of the same expression in the next cell.
array[np.array([2, 3, 5, 6, 7, 8])]

In [None]:
# Same six values as the NumPy cell above, now as a CuPy array
# (this rebinds the name `array`).
array = cp.array([0.0, 1.1, 2.2, 3.3, 4.4, 5.5])

# Same index list, with 6, 7 and 8 out of range for a 6-element array.
# NOTE(review): CuPy documents that out-of-bounds indices in integer-array
# indexing wrap around by default rather than raising — confirm against the
# installed CuPy version.
array[cp.array([2, 3, 5, 6, 7, 8])]

In [None]:
# Coefficients of one million quadratic equations a*x**2 + b*x + c = 0,
# generated directly with CuPy. With a between 5 and 10, b between 10 and 20
# and c between -0.1 and 0.1, b**2 - 4*a*c stays positive (b**2 >= 100 while
# |4*a*c| <= 4), so the square root used in later cells is always real.
# The generator is consumed in order, so the draws happen as a, then b, then c.
a, b, c = (
    cp.random.uniform(low, high, 1000000)
    for low, high in [(5, 10), (10, 20), (-0.1, 0.1)]
)

In [None]:
# Custom element-wise CUDA kernel built from a C snippet.
# Inputs: x (float64) and n (int64); output: out (float64).
# For each element, out starts at 1.0 and is multiplied by x in a loop that
# runs n times, i.e. x**n for a non-negative integer n (the loop body never
# runs when n <= 0, leaving out at 1.0).
intpow = cp.ElementwiseKernel("float64 x, int64 n", "float64 out", '''
    out = 1.0;
    for (int i = 0;  i < n;  i++) {
        out *= x;
    }
''', "intpow")
# Last expression of the cell: display the kernel object.
intpow

In [None]:
# Square every element of b with the custom kernel (n = 2 passed as a scalar).
intpow(b, 2)

In [None]:
# The same computation with CuPy's built-in power operator, for comparison
# with intpow(b, 2) above.
b**2

In [None]:
# Element-wise CUDA kernel for one root of a*x**2 + b*x + c = 0.
# Inputs a, b, c and the output are float64; only the "+" branch
# (-b + sqrt(b*b - 4*a*c)) / (2*a) is computed, in a single expression.
quadratic_formula = cp.ElementwiseKernel("float64 a, float64 b, float64 c", "float64 out", '''
    out = (-b + sqrt(b*b - 4*a*c)) / (2*a);
''', "quadratic_formula")

# Apply the kernel to the coefficient arrays defined earlier and display the result.
quadratic_formula(a, b, c)

In [None]:
%%timeit -r1 -n1000

# The "+" root written as ordinary CuPy array arithmetic, timed for comparison
# with the quadratic_formula kernel in the next timing cell.
# NOTE(review): CuPy launches kernels asynchronously, so this may measure
# less than the full execution time unless the device is synchronized — confirm.
(-b + cp.sqrt(b**2 - 4*a*c)) / (2*a)

In [None]:
%%timeit -r1 -n1000

# The same formula evaluated by the single custom quadratic_formula kernel.
# NOTE(review): CuPy launches kernels asynchronously, so this may measure
# less than the full execution time unless the device is synchronized — confirm.
quadratic_formula(a, b, c)

In [None]:
# The "+" root of a*x**2 + b*x + c = 0 written as a complete CUDA kernel.
# Unlike an ElementwiseKernel, a RawKernel must compute its own global thread
# index and guard against threads that fall past the end of the arrays.
quadratic_formula_raw = cp.RawKernel(r'''
extern "C" __global__
void quadratic_formula_raw(const double* a, const double* b, const double* c, int length, double* out) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < length) {
        out[i] = (-b[i] + sqrt(b[i]*b[i] - 4*a[i]*c[i])) / (2*a[i]);
    }
}
''', "quadratic_formula_raw")

# Output buffer with the same shape and dtype as a; the kernel fills every element.
out = cp.empty_like(a)

# Launch configuration: enough blocks to cover every element. The block count
# is derived from threads_per_block (integer ceiling division) so the two
# values cannot drift apart if the block size is changed.
threads_per_block = 1024
blocks_per_grid = (len(out) + threads_per_block - 1) // threads_per_block

# RawKernel arguments are passed without type checking and a Python int is
# sent as a 64-bit integer, so the length is wrapped in np.int32 to match the
# kernel's `int length` parameter.
quadratic_formula_raw(
    (blocks_per_grid,),
    (threads_per_block,),
    (a, b, c, np.int32(len(out)), out),
)

out

## Numba

In [None]:
@nb.cuda.jit
def quadratic_formula_numba_cuda(a, b, c, out):
    """Write the "+" root of a*x**2 + b*x + c = 0 into ``out``, one element per thread.

    ``a``, ``b``, ``c`` and ``out`` are indexed with the same 1-dimensional
    global thread index. Threads whose index is not below ``len(out)`` do
    nothing, so the grid may be larger than the arrays.
    """
    i = nb.cuda.grid(1)   # 1-dimensional
    if i < len(out):
        out[i] = (-b[i] + math.sqrt(b[i]**2 - 4*a[i]*c[i])) / (2*a[i])

# Output buffer with the same shape and dtype as a; the kernel fills every element.
out = cp.empty_like(a)

# Launch configuration: the block count is derived from threads_per_block
# (integer ceiling division) rather than repeating the literal 1024, so
# changing the block size cannot leave part of the array uncovered.
threads_per_block = 1024
blocks_per_grid = (len(out) + threads_per_block - 1) // threads_per_block

quadratic_formula_numba_cuda[blocks_per_grid, threads_per_block](a, b, c, out)

out

## JAX

## Awkward Array

In [None]:
# Electron branches to read from the "Events" tree, in the order in which
# they are unpacked below.
electron_branches = ["Electron_pt", "Electron_eta", "Electron_phi", "Electron_charge"]

# The context manager closes the file once the four arrays have been read;
# how=tuple returns one array per requested branch.
with uproot.open("data/SMHiggsToZZTo4L.root:Events") as tree:
    events_pt, events_eta, events_phi, events_charge = tree.arrays(electron_branches, how=tuple)

In [None]:
# Field name -> per-event array, as read from the ROOT file.
electron_fields = {
    "pt": events_pt,
    "eta": events_eta,
    "phi": events_phi,
    "charge": events_charge,
}

# Zip the four arrays into one array of records named "Momentum4D", then move
# the result to the "cuda" backend.
electrons = ak.to_backend(ak.zip(electron_fields, with_name="Momentum4D"), "cuda")

electrons

In [None]:
# Every pair of electrons within each event, split into first and second members.
e1, e2 = ak.unzip(ak.combinations(electrons, 2))

# Pair mass: sqrt(2 * pt1 * pt2 * (cosh(eta1 - eta2) - cos(phi1 - phi2))).
# Unlike the Numba kernels further down, no charge requirement is applied here.
pt_term = 2*e1.pt*e2.pt
angle_term = np.cosh(e1.eta - e2.eta) - np.cos(e1.phi - e2.phi)
z_mass = np.sqrt(pt_term * angle_term)

# Largest pair mass in each event (reduction over the innermost axis).
np.max(z_mass, axis=-1)

In [None]:
ak.numba.register_and_check()

@nb.cuda.jit(extensions=[ak.numba.cuda])
def mass_of_heaviest_dielectron(electrons, out):
    """Store, per event, the largest mass among opposite-charge electron pairs.

    One thread handles one event (``electrons[thread_idx]``). ``out`` must be
    initialised by the caller: ``out[thread_idx]`` is only overwritten when a
    pair with a larger mass is found, so events without any opposite-charge
    pair keep their initial value.
    """
    thread_idx = nb.cuda.grid(1)
    if thread_idx < len(electrons):
        electrons_in_one_event = electrons[thread_idx]
        # Visit each unordered pair once: e2 only ranges over electrons after e1.
        for i, e1 in enumerate(electrons_in_one_event):
            for e2 in electrons_in_one_event[i + 1:]:
                if e1.charge != e2.charge:
                    m = math.sqrt(
                        2*e1.pt*e2.pt * (math.cosh(e1.eta - e2.eta) - math.cos(e1.phi - e2.phi))
                    )
                    if m > out[thread_idx]:
                        out[thread_idx] = m

# Launch configuration: the block count is derived from threads_per_block
# (integer ceiling division) rather than repeating the literal 1024, so
# changing the block size cannot leave some events without a thread.
threads_per_block = 1024
blocks_per_grid = (len(electrons) + threads_per_block - 1) // threads_per_block

# Zero-initialised so that any positive mass replaces the starting value.
out = cp.zeros(len(electrons), dtype=np.float32)
mass_of_heaviest_dielectron[blocks_per_grid, threads_per_block](electrons, out)

out

In [None]:
@nb.cuda.jit(extensions=[ak.numba.cuda], device=True)
def compute_mass(event):
    """Return the largest mass among opposite-charge electron pairs in one event.

    Device function (callable only from other CUDA-compiled code). The running
    maximum starts at 0, which is also the result when no opposite-charge
    pair exists.
    """
    heaviest = np.float32(0)
    for first_index, first in enumerate(event):
        # Only electrons after `first`, so each unordered pair is seen once.
        for second in event[first_index + 1:]:
            if first.charge != second.charge:
                pair_mass = math.sqrt(
                    2*first.pt*second.pt
                    * (math.cosh(first.eta - second.eta) - math.cos(first.phi - second.phi))
                )
                if pair_mass > heaviest:
                    heaviest = pair_mass
    return heaviest

@nb.cuda.jit(extensions=[ak.numba.cuda])
def mass_of_heaviest_dielectron_2(events, out):
    """Fill ``out`` with ``compute_mass`` of each event, one event per thread."""
    event_index = nb.cuda.grid(1)
    if event_index < len(events):
        out[event_index] = compute_mass(events[event_index])

# The launch configuration (threads_per_block, blocks_per_grid) is reused
# from the previous cell.

out = cp.zeros(len(electrons), dtype=np.float32)
mass_of_heaviest_dielectron_2[blocks_per_grid, threads_per_block](electrons, out)

out