In [7]:
import cupy as cp
import numpy as np
import time

for xp in [np,cp]:
    start = time.time()
    x = xp.random.rand(100000)
    W = xp.random.rand(100000,5000)
    y = xp.dot(x,W)
    end = time.time()
    print(f"Runtime by {xp.__name__}: {end-start}")


Runtime by numpy: 4.458429336547852
Runtime by cupy: 4.644579648971558


In [7]:
for xp in [np,cp]:
    start = time.time()
    x = xp.arange(1,30000000)
    l2 = xp.linalg.norm(x)
    end = time.time()
    print(f"Runtime by {xp.__name__}: {end-start}")

Runtime by numpy: 0.16229581832885742
Runtime by cupy: 0.0009274482727050781


In [None]:
print(x.device)
with cp.cuda.Device(1):
    x_on_gpu1 = cp.array([1, 2, 3, 4, 5])
x_on_gpu0 = cp.array([1, 2, 3, 4, 5])

In [8]:
current_stream = cp.cuda.get_current_stream()
current_stream

<Stream 0 (device -1)>

In [None]:
#move array to a device
x_cpu = np.array([1,2,3])
x_gpu = cp.asarray(x_cpu)
#accepts np.ndarray
with cp.cuda.device(0):
    x_gpu_0 = cp.ndarray([1,2,3])
with cp.cuda.device(1):
    x_gpu_1 = cp.asarray(x_gpu_0)

In [16]:
x_gpu = cp.array([1,2,3])
x_cpu = cp.asnumpy(x_gpu)
#or
x_cpu = x_gpu.get()

In [17]:
x_cpu

array([1, 2, 3])

In [18]:
def softplus(x):
    xp = cp.get_array_module(x)
    print("Using:", xp.__name__)
    return xp.maximum(0,x) + xp.log1p(xp.exp(-abs(x))) 

In [21]:
softplus(x_gpu)
softplus(x_cpu)

Using: cupy
Using: numpy


array([1.31326169, 2.12692801, 3.04858735])

In [28]:
x_cpu = np.array([1,2,3])
y_cpu = np.array([4,5,6])
print(x_cpu + y_cpu)
x_gpu = cp.asarray(x_cpu)
#x_gpu + y_cpu
cp.asnumpy(x_gpu)+y_cpu
cp.asnumpy(x_gpu)+cp.asnumpy(y_cpu)
x_gpu + cp.asarray(y_cpu)
cp.asarray(x_gpu) + cp.asarray(y_cpu)

[5 7 9]


array([5, 7, 9])

In [30]:
#Elementwise kernel
squared_diff = cp.ElementwiseKernel(
    'float32 x, float32 y', #input
    'float32 z', #output
    'z = (x-y)*(x-y)', #loop body
    'squared_diff') #name    

In [35]:
x = cp.arange(10, dtype = np.float32).reshape(2,5)
y = cp.arange(5, dtype = np.float32)
squared_diff(x,y)
squared_diff(x,5)

array([[25., 16.,  9.,  4.,  1.],
       [ 0.,  1.,  4.,  9., 16.]], dtype=float32)

In [36]:
z = cp.empty((2,5),dtype = np.float32)
squared_diff(x,y,z)

array([[ 0.,  0.,  0.,  0.,  0.],
       [25., 25., 25., 25., 25.]], dtype=float32)

In [37]:
#Type-generic kernel
squared_diff_generic = cp.ElementwiseKernel(
    'T x, T y', #input
    'T z', #output
    'z = (x-y)*(x-y)', #loop body
    'squared_diff_generic') #name    

In [39]:
#Type-generic kernel
squared_diff_generic = cp.ElementwiseKernel(
    'T x, T y', #input
    'T z', #output
    '''
        T diff = x-y
        z = diff*diff''', #loop body
    'squared_diff_generic') #name    

In [40]:
#Type-generic kernel
squared_diff_super_generic = cp.ElementwiseKernel(
    'X x, Y y', #input
    'Z z', #output
    'z = (x-y)*(x-y)', #loop body
    'squared_diff_super_generic') #name    

In [47]:
#raw indexing 
add_reverse = cp.ElementwiseKernel(
    'T x, raw T y', 'T z', 
    'z = x + y[_ind.size() - i - 1]',
    'add_reverse')
# z = x + y[::-1]
print(x)
print(y)
add_reverse(x,y)


[[0. 1. 2. 3. 4.]
 [5. 6. 7. 8. 9.]]
[0. 1. 2. 3. 4.]


array([[9., 9., 9., 9., 9.],
       [9., 9., 9., 9., 9.]], dtype=float32)

In [6]:
#TextureObject() # no idea how it works

In [12]:
#Reduction kernels
l2norm_kernel = cp.ReductionKernel(
    'T x', #input
    'T y', #output
    'x * x', # map
    'a+b', #reduce
    'y = sqrt(a)', #post-production map 
    '0',#identity value
    'l2norm' #kernel name
)
x = cp.arange(10, dtype = cp.float32).reshape(2,5)
l2norm_kernel(x,axis=1)

array([ 5.477226 , 15.9687195], dtype=float32)

In [13]:
#raw kernel by c code 
add_kernel = cp.RawKernel(r'''
extern "C" __global__
void my_add(const float* x1, const float* x2, float* y){
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    y[tid] = x1[tid] + x2[tid];
}
''', 'my_add')
x1 = cp.arange(25,dtype=cp.float32).reshape(5,5)
x2 = cp.arange(25,dtype=cp.float32).reshape(5,5)
y = cp.zeros((5,5), dtype=cp.float32)
add_kernel((5,),(5,),(x1,x2,y))
y

array([[ 0.,  2.,  4.,  6.,  8.],
       [10., 12., 14., 16., 18.],
       [20., 22., 24., 26., 28.],
       [30., 32., 34., 36., 38.],
       [40., 42., 44., 46., 48.]], dtype=float32)

In [15]:
#using complex variables 
complex_kernel = cp.RawKernel(r'''
#include <cupy/complex.cuh>
extern "C" __global__
void my_func(const complex<float>* x1, const complex<float>* x2, complex<float>* y,float a){
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    y[tid] = x1[tid] + a*x2[tid];
}
''', 'my_func')
x1 = cp.arange(25,dtype=cp.complex64).reshape(5,5)
x2 = cp.arange(25,dtype=cp.complex64).reshape(5,5)
y = cp.zeros((5,5), dtype=cp.complex64)
complex_kernel((5,),(5,),(x1,x2,y,cp.float32(2.0)))
y

array([[ 0.+0.j,  3.+0.j,  6.+0.j,  9.+0.j, 12.+0.j],
       [15.+0.j, 18.+0.j, 21.+0.j, 24.+0.j, 27.+0.j],
       [30.+0.j, 33.+0.j, 36.+0.j, 39.+0.j, 42.+0.j],
       [45.+0.j, 48.+0.j, 51.+0.j, 54.+0.j, 57.+0.j],
       [60.+0.j, 63.+0.j, 66.+0.j, 69.+0.j, 72.+0.j]], dtype=complex64)

In [19]:
add_kernel = cp.RawKernel(r'''
extern "C" __global__
void my_add(const float* x1, const float* x2, float* y){
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    y[tid] = x1[tid] + x2[tid];
}
''', 'my_add')
add_kernel.attributes
add_kernel.max_dynamic_shared_size_bytes
add_kernel.max_dynamic_shared_size_bytes = 50000
add_kernel.max_dynamic_shared_size_bytes

50000

In [None]:
# primitive types and numpy scalars passed by value
# array should be nd array
# no validation by cupy 
# dtype should match the one in c kernel 
# cp.float32 >> float* 
# cp.uint64 >> unsigned long long* 
# float3 : cupy doesn't support but you can cast in kernel only 
# int >> long long 
# float >> double 
# complex >> cuDoubleComplex
# bool >> bool 
