In [None]:
%env CUPY_ACCELERATORS = cub
import cupy as cp
import numpy as np
import time
from cupyx.profiler import benchmark

def my_func():
    """Multiply two random 10000x10000 matrices with CuPy and return the product.

    Returns:
        The result of ``cp.dot(x, W)``. The original version computed the
        product but dropped it, so ``out = my_func()`` always bound ``None``.
    """
    x = cp.random.rand(10000, 10000)
    W = cp.random.rand(10000, 10000)
    return cp.dot(x, W)

# Time a single my_func() call with two clocks: a pair of CUDA events and
# the host's time.perf_counter().
start_gpu, end_gpu = cp.cuda.Event(), cp.cuda.Event()

start_gpu.record()
start_cpu = time.perf_counter()
out = my_func()
end_cpu = time.perf_counter()
end_gpu.record()

# Wait until the closing event has completed before reading the elapsed time.
end_gpu.synchronize()

t_cpu = end_cpu - start_cpu
t_gpu = cp.cuda.get_elapsed_time(start_gpu, end_gpu)

print(f"Runtime by CPU: {t_cpu}")
print(f"Runtime by GPU: {t_gpu}")

def my_func(a):
    """Return the square root of the sum of squares of ``a`` along its last axis."""
    squared = a**2
    totals = cp.sum(squared, axis=-1)
    return cp.sqrt(totals)


# Benchmark the function on a 1024x2048 random array, 100 repetitions.
a = cp.random.random((1024, 2048))
timing_report = benchmark(my_func, (a,), n_repeat=100)
print(timing_report)

In [None]:
import cupy as cp
import numpy as np

# Compare NumPy and CuPy on the same L2-norm computation.
for xp in [np, cp]:
    if xp is cp:
        # Drain any GPU work queued by earlier cells so it is not billed here.
        cp.cuda.get_current_stream().synchronize()
    start = time.perf_counter()
    x = xp.arange(1, 30000000)
    l2 = xp.linalg.norm(x)
    if xp is cp:
        # CuPy returns as soon as the kernels are queued; wait for them to
        # finish, otherwise only the launch overhead is measured.
        cp.cuda.get_current_stream().synchronize()
    end = time.perf_counter()
    print(f"Runtime by {xp.__name__}: {end-start}")

In [None]:
import cupy as cp
import numpy as np

# Build the same integer range once with NumPy and once with CuPy ...
x_cpu = np.arange(1, 30000000)
x_gpu = cp.arange(1, 30000000)

# ... then take the norm of each with the matching library.
l2_cpu = np.linalg.norm(x_cpu)
l2_gpu = cp.linalg.norm(x_gpu)

In [1]:
import cupy as cp
from cupyx.profiler import benchmark
# Benchmark a full sum over a 256x256x256 float32 array, 100 repetitions.
a = cp.random.random((256, 256, 256), dtype=cp.float32)
sum_report = benchmark(a.sum, (), n_repeat=100)
print(sum_report)

sum                 :    CPU:   23.618 us   +/-10.574 (min:   14.739 / max:   64.493) us     GPU-0:10884.833 us   +/-269.084 (min:10702.848 / max:11978.752) us


In [None]:
# Compare NumPy and CuPy on the same L2-norm computation.
for xp in [np, cp]:
    if xp is cp:
        # Drain any GPU work queued by earlier cells so it is not billed here.
        cp.cuda.get_current_stream().synchronize()
    start = time.perf_counter()
    x = xp.arange(1, 30000000)
    l2 = xp.linalg.norm(x)
    if xp is cp:
        # CuPy returns as soon as the kernels are queued; wait for them to
        # finish, otherwise only the launch overhead is measured.
        cp.cuda.get_current_stream().synchronize()
    end = time.perf_counter()
    print(f"Runtime by {xp.__name__}: {end-start}")

In [None]:
# Report which device holds `x` (left over from the timing loop).
print(x.device)

# Arrays created inside the context manager are allocated while device 1 is
# selected; this needs a second GPU to be present.
with cp.cuda.Device(1):
    x_on_gpu1 = cp.array([1, 2, 3, 4, 5])

# Outside the context manager the array goes to whichever device is current
# (presumably device 0, as the variable name suggests).
x_on_gpu0 = cp.array([1, 2, 3, 4, 5])

In [None]:
# Fetch the stream CuPy is currently using and keep a handle to it.
current_stream = cp.cuda.get_current_stream()
# Bare expression on the last line so the notebook displays the stream object.
current_stream

In [None]:
# Move an array from the host to the current device.
x_cpu = np.array([1, 2, 3])
x_gpu = cp.asarray(x_cpu)  # cp.asarray accepts a np.ndarray

# The context manager is the class cp.cuda.Device (capital D);
# cp.cuda.device is a module and calling it raises TypeError.
with cp.cuda.Device(0):
    # cp.array copies the listed values. cp.ndarray([1, 2, 3]) would instead
    # allocate an uninitialised array of shape (1, 2, 3).
    x_gpu_0 = cp.array([1, 2, 3])
# Requires a second GPU: copy the array that lives on device 0 over to device 1.
with cp.cuda.Device(1):
    x_gpu_1 = cp.asarray(x_gpu_0)

In [None]:
# Build an array with CuPy, then convert it to a NumPy array in two ways.
x_gpu = cp.array([1,2,3])
# Option 1: the module-level helper.
x_cpu = cp.asnumpy(x_gpu)
# Option 2: the array's own method. This rebinds x_cpu, replacing the value above.
x_cpu = x_gpu.get()

In [None]:
# Display the host-side array produced by the preceding conversion cell.
x_cpu

In [None]:
def softplus(x):
    """Evaluate ``max(0, x) + log1p(exp(-|x|))`` with the array module owning ``x``.

    The module is looked up through ``cp.get_array_module`` and its name is
    printed, so the same function serves whichever module is reported for
    the argument.
    """
    backend = cp.get_array_module(x)
    print("Using:", backend.__name__)
    positive_part = backend.maximum(0, x)
    correction = backend.log1p(backend.exp(-abs(x)))
    return positive_part + correction

In [None]:
# Call softplus once per array; each call prints the module it dispatched to.
softplus(x_gpu)
# Only this last expression's value is shown as the cell output.
softplus(x_cpu)

In [None]:
# Two NumPy arrays: plain host-side addition.
x_cpu = np.array([1,2,3])
y_cpu = np.array([4,5,6])
print(x_cpu + y_cpu)
# Same values as x_cpu, converted with cp.asarray.
x_gpu = cp.asarray(x_cpu)
# Left disabled: adds a CuPy array directly to a NumPy array.
# NOTE(review): presumably disabled because mixing the two array types raises - confirm.
#x_gpu + y_cpu
# Convert the operands with cp.asnumpy before adding ...
cp.asnumpy(x_gpu)+y_cpu
cp.asnumpy(x_gpu)+cp.asnumpy(y_cpu)
# ... or convert them with cp.asarray before adding.
x_gpu + cp.asarray(y_cpu)
# Only this last expression is displayed as the cell output.
cp.asarray(x_gpu) + cp.asarray(y_cpu)

In [None]:
# Element-wise kernel computing (x - y)^2 for float32 operands.
squared_diff = cp.ElementwiseKernel(
    in_params='float32 x, float32 y',
    out_params='float32 z',
    operation='z = (x-y)*(x-y)',
    name='squared_diff',
)

In [None]:
# x: the values 0..9 as float32 arranged as 2x5; y: the values 0..4 as float32.
x = cp.arange(10, dtype = np.float32).reshape(2,5)
y = cp.arange(5, dtype = np.float32)
# Kernel applied to two arrays of different shapes (2x5 and 5).
squared_diff(x,y)
# Kernel applied to an array and a Python scalar; this value is the cell output.
squared_diff(x,5)

In [None]:
# Pre-allocate a 2x5 float32 array with cp.empty.
z = cp.empty((2,5),dtype = np.float32)
# Pass z as a third positional argument after the two inputs.
# NOTE(review): presumably the kernel writes its result into z - confirm against the ElementwiseKernel docs.
squared_diff(x,y,z)

In [None]:
# Type-generic kernel: the placeholder T is shared by both inputs and the output.
squared_diff_generic = cp.ElementwiseKernel(
    in_params='T x, T y',
    out_params='T z',
    operation='z = (x-y)*(x-y)',
    name='squared_diff_generic',
)

In [None]:
# Type-generic kernel with a multi-statement body that uses a local of type T.
# The body is CUDA C, so every statement needs its own terminating ';'.
# Without them the kernel fails to compile the first time it is called.
squared_diff_generic = cp.ElementwiseKernel(
    'T x, T y',  # input
    'T z',  # output
    '''
        T diff = x - y;
        z = diff * diff;
    ''',  # loop body
    'squared_diff_generic')  # name

In [None]:
# Fully generic kernel: each argument gets its own type placeholder (X, Y, Z).
# NOTE(review): Z is not tied to any input, so callers presumably have to
# supply the output array explicitly - confirm against the CuPy docs.
squared_diff_super_generic = cp.ElementwiseKernel(
    in_params='X x, Y y',
    out_params='Z z',
    operation='z = (x-y)*(x-y)',
    name='squared_diff_super_generic',
)

In [None]:
# Raw indexing: `y` is declared `raw`, so the loop body indexes it by hand.
add_reverse = cp.ElementwiseKernel(
    'T x, raw T y', 'T z',
    'z = x + y[_ind.size() - i - 1]',
    'add_reverse')
# Element i of the output is x[i] + y[n - 1 - i], where n = _ind.size().
# The body reads y at indices up to n - 1, so y must hold at least as many
# elements as x. The 2x5 `x` and 5-element `y` defined in the earlier cell
# would make the kernel read past the end of y, so matching inputs are
# built here instead.
x_fwd = cp.arange(10, dtype=cp.float32).reshape(2, 5)
y_rev = cp.arange(10, dtype=cp.float32)
print(x_fwd)
print(y_rev)
add_reverse(x_fwd, y_rev)


In [None]:
# TODO: TextureObject() - not explored yet; it is still unclear how it works.

In [None]:
# Reduction kernel: square every element, add the squares up, then take the
# square root of the accumulated value.
l2norm_kernel = cp.ReductionKernel(
    in_params='T x',
    out_params='T y',
    map_expr='x * x',             # applied to each input element
    reduce_expr='a+b',            # combines two partial results
    post_map_expr='y = sqrt(a)',  # applied once to the reduced value
    identity='0',                 # starting value of the reduction
    name='l2norm',
)

# Reduce each row of a 2x5 float32 array (axis=1).
x = cp.arange(10, dtype=cp.float32).reshape(2, 5)
l2norm_kernel(x, axis=1)

In [None]:
# Raw kernel written in CUDA C: element-wise addition of two float buffers.
add_kernel = cp.RawKernel(
    code=r'''
extern "C" __global__
void my_add(const float* x1, const float* x2, float* y){
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    y[tid] = x1[tid] + x2[tid];
}
''',
    name='my_add',
)

x1 = cp.arange(25, dtype=cp.float32).reshape(5, 5)
x2 = cp.arange(25, dtype=cp.float32).reshape(5, 5)
y = cp.zeros((5, 5), dtype=cp.float32)

# 5 blocks x 5 threads = 25 threads, one per element. The kernel has no
# bounds check, so the launch must not exceed the array size.
grid_dim, block_dim = (5,), (5,)
add_kernel(grid_dim, block_dim, (x1, x2, y))
y

In [None]:
# Raw kernel operating on complex<float> buffers: y = x1 + a * x2.
complex_kernel = cp.RawKernel(
    code=r'''
#include <cupy/complex.cuh>
extern "C" __global__
void my_func(const complex<float>* x1, const complex<float>* x2, complex<float>* y,float a){
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    y[tid] = x1[tid] + a*x2[tid];
}
''',
    name='my_func',
)

x1 = cp.arange(25, dtype=cp.complex64).reshape(5, 5)
x2 = cp.arange(25, dtype=cp.complex64).reshape(5, 5)
y = cp.zeros((5, 5), dtype=cp.complex64)

# The scalar is wrapped in cp.float32 to match the kernel's `float a` parameter.
scale = cp.float32(2.0)
complex_kernel((5,), (5,), (x1, x2, y, scale))
y

In [None]:
# Re-create the addition kernel, then read and change one of its attributes.
add_kernel = cp.RawKernel(
    code=r'''
extern "C" __global__
void my_add(const float* x1, const float* x2, float* y){
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    y[tid] = x1[tid] + x2[tid];
}
''',
    name='my_add',
)

# These two expressions are evaluated but not displayed: a notebook cell
# only shows the value of its last expression.
add_kernel.attributes
add_kernel.max_dynamic_shared_size_bytes

# Assign a new value and display it.
add_kernel.max_dynamic_shared_size_bytes = 50000
add_kernel.max_dynamic_shared_size_bytes

In [None]:
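# Python primitive types and NumPy scalars are passed to the kernel by value.
# Array arguments must be passed as CuPy ndarrays.
# CuPy performs no validation of the arguments passed to a kernel.
# The dtype of each array must match the pointer type declared in the C kernel, e.g.:
#   cp.float32 array >> float*
#   cp.uint64 array  >> unsigned long long*
# float3: not supported by CuPy directly, but a pointer can be cast to float3* inside the kernel.
# Python scalars are converted as follows:
#   int     >> long long
#   float   >> double
#   complex >> cuDoubleComplex
#   bool    >> bool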


In [None]:
#custom user types
import numpy as np 
# Structured dtype with three float32 fields named x, y and z.
names = ['x', 'y', 'z']
types = [np.float32 for _ in names]
float3 = np.dtype({'names': names, 'formats': types})

# Reinterpret three random float32 values as a single float3 record.
arg = (
    np.random.rand(3)
    .astype(np.float32)
    .view(float3)
)
print(arg)

# Fields are addressed by name; overwrite x and show the record again.
arg['x'] = 42.0
print(arg)

In [None]:
import numpy as np
# Structured dtype with a single field, 'dummy', holding a 5x5 float32 block.
float5x5 = np.dtype({
    'names': ['dummy'],
    'formats': [(np.float32, (5, 5))],
})

# Reinterpret 25 random float32 values as one float5x5 record.
arg = (
    np.random.rand(25)
    .astype(np.float32)
    .view(float5x5)
)
print(arg.itemsize)
arg

In [None]:
# Two kernels compiled together from one CUDA source string via RawModule.
loaded_from_source = r'''
extern "C"{

__global__ void test_sum(const float* x1, const float* x2, float* y, unsigned int N)
{ 
    unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if(tid < N)
    {
        y[tid] = x1[tid] + x2[tid];
    }
}

__global__ void test_multiply(const float* x1, const float* x2, float* y, unsigned int N)
{ 
    unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if(tid < N)
    {
        y[tid] = x1[tid] * x2[tid];
    }
}

}'''
module = cp.RawModule(code=loaded_from_source)
ker_sum = module.get_function('test_sum')
ker_times = module.get_function('test_multiply')
N = 10
# Both kernels declare `unsigned int N`. A bare Python int is handed to the
# kernel as a 64-bit `long long` and CuPy does not validate argument types,
# so pass a scalar whose width matches the declaration.
n_elems = cp.uint32(N**2)
x1 = cp.arange(N**2, dtype=cp.float32).reshape(N, N)
x2 = cp.ones((N, N), dtype=cp.float32)
y = cp.zeros((N, N), dtype=cp.float32)
# N blocks of N threads: one thread per element of the NxN arrays.
ker_sum((N,), (N,), (x1, x2, y, n_elems))
assert cp.allclose(y, x1 + x2)
print(y)
print(y.device)
ker_times((N,), (N,), (x1, x2, y, n_elems))
assert cp.allclose(y, x1 * x2)
y


In [None]:
# C++ template kernel: each instantiation is requested through name_expressions.
code = r'''
template<typename T>
__global__ void fx3(T* arr, int N){
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if(tid<N){
        arr[tid] = arr[tid]*3;
    }
}
'''
name_exp = ['fx3<float>', 'fx3<double>']
mod = cp.RawModule(code=code, options=('-std=c++11',),
                   name_expressions=name_exp)
N = 10
# The kernel declares `int N`. A bare Python int is handed over as a 64-bit
# `long long` and CuPy does not validate argument types, so pass a 32-bit
# scalar that matches the declaration.
n_arg = cp.int32(N)

# float instantiation: multiplies the float32 array by 3 in place.
ker_float = mod.get_function(name_exp[0])
a = cp.arange(N, dtype=cp.float32)
print(a)
ker_float((1,), (N,), (a, n_arg))

# double instantiation: same operation on a float64 array.
ker_double = mod.get_function(name_exp[1])
a = cp.arange(N, dtype=cp.float64)
print(a)
ker_double((1,), (N,), (a, n_arg))
a

In [None]:
# Note: this rebinds the name `squared_diff`, previously bound to the
# ElementwiseKernel of the same name.
@cp.fuse()
def squared_diff(x, y):
    """Return the element-wise squared difference of ``x`` and ``y``."""
    diff = x - y
    return diff * diff


# Apply it to a 2x5 float32 array and a length-5 float32 vector.
x = cp.arange(10, dtype=np.float32).reshape(2, 5)
y = cp.arange(5, dtype=np.float32)
squared_diff(x, y)

In [None]:
# JIT kernel: the Python function body below is compiled by cupyx.jit.
from cupyx import jit


@jit.rawkernel()
def elementwise_copy(x, y, size):
    # Grid-stride loop: each thread starts at its global index and advances
    # by the total number of launched threads until `size` is reached.
    thread_id = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x
    n_threads = jit.gridDim.x * jit.blockDim.x
    for idx in range(thread_id, size, n_threads):
        y[idx] = x[idx]


size = cp.uint32(2**22)
x = cp.random.normal(size=(size,), dtype=cp.float32)
y = cp.empty((size,), dtype=cp.float32)

for shown in (x, y, size):
    print(shown)

# Launch form 1: (grid, block, args) tuples, as with RawKernel.
elementwise_copy((128,), (1024,), (x, y, size))
assert (x == y).all()
y

# Launch form 2: kernel[grid, block](*args).
elementwise_copy[128, 1024](x, y, size)
assert (x == y).all()
y

In [None]:
# Streams and events: run work on a newly created stream via a `with` block.
a_np = np.arange(10)
s = cp.cuda.Stream()
# Bare expression; not displayed because it is not the last line of the cell.
s
with s:
    # Work issued inside the block: convert the NumPy array and sum it.
    a_cp = cp.asarray(a_np)
    b_cp = cp.sum(a_cp)
    # Inside the block, `s` is what get_current_stream() reports.
    assert s == cp.cuda.get_current_stream()

In [None]:
# Alternative to the `with` block: select a stream explicitly with use().
s = cp.cuda.Stream()
s.use() 
# Convert b_cp (created in the preceding stream cell) with cp.asnumpy.
b_np = cp.asnumpy(b_cp)
# After use(), `s` is what get_current_stream() reports.
assert s == cp.cuda.get_current_stream()
# Switch back to cp.cuda.Stream.null and check that it is current again.
cp.cuda.Stream.null.use()
assert cp.cuda.Stream.null == cp.cuda.get_current_stream()

In [None]:
# Display the cp.cuda.Stream.null object.
cp.cuda.Stream.null

In [None]:
# Bracket a computation with two events, then copy the result on a second stream.
e1 = cp.cuda.Event()
e1.record()
a_cp = b_cp * a_cp + 8
# record() on the current stream returns the object used as the closing event below.
e2 = cp.cuda.get_current_stream().record()
s2 = cp.cuda.Stream()
# Make s2 wait for e2 before the copy issued inside the `with` block.
s2.wait_event(e2)
with s2: 
    a_np = cp.asnumpy(a_cp)
# Wait for e2 before reading the elapsed time between the two events.
e2.synchronize()
# e2 is recorded before the copy is issued, so the interval covers the
# computation only, not the copy.
t=cp.cuda.get_elapsed_time(e1,e2) #only include the compute time not the copy time

In [None]:
!nvidia_smi