In [1]:
import numpy as np
import pycuda.autoinit
from pycuda import gpuarray

In [2]:
data_size = 50_000_000
host_data = np.float32(np.random.random(data_size))

In [3]:
%%time

host_data_x2 = host_data * np.float32(2)

Wall time: 67.8 ms


In [4]:
device_data = gpuarray.to_gpu(host_data)

In [5]:
%%time

# need compile time
device_data_x2 = device_data * np.float32(2)

Wall time: 3.41 s


In [6]:
%%timeit -n 10

device_data_x2 = device_data * np.float32(2)

12.4 ms ± 7.76 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [7]:
from_device = device_data_x2.get()

np.allclose(from_device, host_data_x2)

True

In [8]:
from pycuda.elementwise import ElementwiseKernel

gpu_2x_ker = ElementwiseKernel(
    "float *in, float *out",
    "out[i] = 2 * in[i];",
    "gpu_2x_ker"
)

In [9]:
device_data_2x = gpuarray.empty_like(device_data)

In [10]:
# need compile time
gpu_2x_ker(device_data, device_data_2x)

In [11]:
%%timeit -n 10

gpu_2x_ker(device_data, device_data_2x)

17.8 µs ± 11.2 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
from_device = device_data_2x.get()
np.allclose(from_device, host_data_x2)

True