<a href="https://colab.research.google.com/github/jeenraju/Projects/blob/main/GPU_Cupy_numba.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import cupy as cp

In [None]:
import numpy as np

In [None]:
# 2000 x 2000 matrix of random integers on the host; `high` is exclusive,
# so the values lie in 0..254.
array_cpu = np.random.randint(low=0, high=255, size=(2000, 2000))

In [None]:
# Preview the host (NumPy) array.
array_cpu

array([[ 37,   9,  75, ..., 216, 159,   0],
       [ 26,  15,  75, ...,  85, 240, 220],
       [113,  12,  57, ...,  43,  50,  40],
       ...,
       [ 94, 126,  88, ...,  37,  60,  82],
       [  4,  37, 253, ...,  61, 250, 190],
       [111, 229, 168, ..., 106, 143, 160]])

In [None]:
# Memory footprint of the host array in megabytes (bytes / 1e6).
array_cpu.nbytes/ 1e6

32.0

To send the array to the GPU:

In [None]:
# Build a CuPy array from the NumPy array (host -> device copy).
array_gpu = cp.asarray(array_cpu)

In [None]:
# Preview the CuPy array; it holds the same values as array_cpu.
array_gpu

array([[ 37,   9,  75, ..., 216, 159,   0],
       [ 26,  15,  75, ...,  85, 240, 220],
       [113,  12,  57, ...,  43,  50,  40],
       ...,
       [ 94, 126,  88, ...,  37,  60,  82],
       [  4,  37, 253, ...,  61, 250, 190],
       [111, 229, 168, ..., 106, 143, 160]])

In [None]:
%%timeit
# Time the conversion of the 2000 x 2000 NumPy array into a CuPy array.
cp.asarray(array_cpu)

6.08 ms ± 84.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Larger host array (4000 x 4000, i.e. four times as many elements) used to
# see how the transfer time scales; `high` is exclusive.
new_array_cpu = np.random.randint(low=0, high=255, size=(4000, 4000))

In [None]:
%%timeit
# Time the same conversion for the 4000 x 4000 array.
cp.asarray(new_array_cpu)

24.6 ms ± 433 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# Check the container type returned by cp.asarray.
type(array_gpu)

cupy.ndarray

In [None]:
from scipy import fft

In [None]:
%%timeit
# CPU baseline: SciPy's N-dimensional FFT on the NumPy array.
fft.fftn(array_cpu)

77 ms ± 5.32 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# Demonstration: SciPy's FFT cannot consume a CuPy array, because the implicit
# device -> host conversion is refused. The expected TypeError is caught and
# printed so that "Restart & Run All" does not stop at this cell.
try:
    fft.fftn(array_gpu)
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: Implicit conversion to a NumPy array is not allowed. Please use `.get()` to construct a NumPy array explicitly.

In [None]:
from cupyx.scipy import fft as fft_gpu

In [None]:
%%timeit
fft_gpu.fftn(array_gpu)

181 µs ± 69.3 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Run the same transform on the CPU (SciPy) and on the GPU (CuPy), bring the
# GPU result back to host memory, and check that both agree within
# floating-point tolerance.
fft_cpu = fft.fftn(array_cpu)
fft_send_back = fft_gpu.fftn(array_gpu).get()
np.allclose(fft_cpu, fft_send_back)

True

In [None]:
# Demonstration: the CuPy FFT only accepts cupy.ndarray inputs, so passing the
# NumPy array fails. The expected TypeError is caught and printed so that
# "Restart & Run All" does not stop at this cell.
try:
    fft_gpu.fftn(array_cpu)
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: The input array a must be a cupy.ndarray

In [None]:
# Demonstration: the reverse mismatch also fails, because SciPy's FFT refuses
# the implicit device -> host conversion of a CuPy array. The expected
# TypeError is caught and printed so that "Restart & Run All" keeps going.
try:
    fft.fftn(array_gpu)
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: Implicit conversion to a NumPy array is not allowed. Please use `.get()` to construct a NumPy array explicitly.

Some NumPy functions also work on CuPy arrays, and the result stays a CuPy array. For example:

In [None]:
# Call a NumPy function directly on the CuPy array.
np.max(array_gpu)

array(254)

In [None]:
# Inspect the type of the result returned by np.max for a CuPy input.
type(np.max(array_gpu))

cupy.ndarray

We can also generate an array directly on the GPU instead of creating it on the CPU and then sending it to the GPU.

In [None]:
# Same call as the NumPy version, but through CuPy's random module; the result
# is only displayed, not stored.
cp.random.randint(0,255, size=(2000,2000))

array([[ 80,  39,  99, ...,   4,  59, 211],
       [ 20,  26,  97, ...,  63, 151,  45],
       [ 26, 102,  66, ...,  62,  29,  75],
       ...,
       [198,  32,   4, ...,  33, 161, 157],
       [173,   1, 245, ...,  89,  94, 131],
       [235,   5,  43, ...,  63, 227, 180]])

**NUMBA**

In [None]:
from numba import cuda

In [None]:
# List the CUDA devices Numba can see and whether they are supported.
cuda.detect()

Found 1 CUDA devices
id 0             b'Tesla T4'                              [SUPPORTED]
                      Compute Capability: 7.5
                           PCI Device ID: 4
                              PCI Bus ID: 0
                                    UUID: GPU-6ccf373c-8d20-1064-e95a-7da0dcf47eac
                                Watchdog: Disabled
             FP32/FP64 Performance Ratio: 32
Summary:
	1/1 devices are supported


True

In [None]:
# Copy the NumPy array to the device with Numba; the result is a Numba
# DeviceNDArray rather than a CuPy array.
d_array = cuda.to_device(array_cpu)
d_array

<numba.cuda.cudadrv.devicearray.DeviceNDArray at 0x7a8da5f6de50>

In [None]:
# CuPy can build an array from the Numba device array.
# NOTE(review): presumably this shares the device memory without copying — confirm.
cp.asarray(d_array)

array([[ 37,   9,  75, ..., 216, 159,   0],
       [ 26,  15,  75, ...,  85, 240, 220],
       [113,  12,  57, ...,  43,  50,  40],
       ...,
       [ 94, 126,  88, ...,  37,  60,  82],
       [  4,  37, 253, ...,  61, 250, 190],
       [111, 229, 168, ..., 106, 143, 160]])

In [None]:
# Bring the Numba device array back to the host as a NumPy array.
d_array.copy_to_host()

array([[ 37,   9,  75, ..., 216, 159,   0],
       [ 26,  15,  75, ...,  85, 240, 220],
       [113,  12,  57, ...,  43,  50,  40],
       ...,
       [ 94, 126,  88, ...,  37,  60,  82],
       [  4,  37, 253, ...,  61, 250, 190],
       [111, 229, 168, ..., 106, 143, 160]])

Matrix Multiplication

In [None]:
@cuda.jit
def matmul(A, B, C):
  """
  CUDA kernel that computes the matrix product C = A * B.

  Each thread handles one output element: the thread at 2-D grid position
  (row, col) accumulates the dot product of row `row` of A with column `col`
  of B and writes it to C[row, col]. The result is stored in C in place;
  nothing is returned.
  """
  row, col = cuda.grid(2)
  # Threads whose grid position falls outside C have nothing to compute.
  if row >= C.shape[0] or col >= C.shape[1]:
    return
  acc = 0
  for k in range(A.shape[1]):
    acc += A[row, k] * B[k, col]
  C[row, col] = acc

In [None]:
# Fixed seed so that the random operands are the same on every run.
cp.random.seed(42)
# Two 2000 x 2000 float64 operands with values drawn uniformly between 1 and 10.
A = cp.random.uniform(low=1, high=10, size=(2000, 2000), dtype=np.float64)
B = cp.random.uniform(low=1, high=10, size=(2000, 2000), dtype=np.float64)
# Output buffer for the product, zero-initialised on the GPU.
C = cp.zeros(shape=(2000, 2000), dtype=np.float64)

In [None]:
# Show the output buffer before the kernel runs (all zeros).
C

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Launch configuration: 16 x 16 threads per block, and enough blocks along
# each axis to cover every element of C.
threadsperblock = (16, 16)
# Integer ceiling division: rounds up without the float round-trip of
# int(np.ceil(a / b)).
blockspergrid_x = (C.shape[0] + threadsperblock[0] - 1) // threadsperblock[0]
blockspergrid_y = (C.shape[1] + threadsperblock[1] - 1) // threadsperblock[1]
blockspergrid = (blockspergrid_x, blockspergrid_y)
print(blockspergrid)
# Report the thread count along both axes and in total, with a unit.
threads_x = threadsperblock[0] * blockspergrid_x
threads_y = threadsperblock[1] * blockspergrid_y
print(f"The kernel will be executed with {threads_x} x {threads_y} = {threads_x * threads_y} threads")

(125, 125)
The kernel will be executed with 2000


In [None]:
# Launch the kernel with the grid and block dimensions computed above; the
# product is written into C.
# NOTE(review): the recorded output shows this launch failing with
# CUDA_ERROR_UNSUPPORTED_PTX_VERSION (PTX .version 8.5 requested, 8.4 supported).
# That looks like a mismatch between the installed Numba/CUDA toolchain and the
# driver of the runtime — confirm the environment before relying on C.
matmul[blockspergrid, threadsperblock](A, B, C)

ERROR:numba.cuda.cudadrv.driver:Call to cuLinkAddData results in CUDA_ERROR_UNSUPPORTED_PTX_VERSION


LinkerError: [222] Call to cuLinkAddData results in CUDA_ERROR_UNSUPPORTED_PTX_VERSION
ptxas application ptx input, line 9; fatal   : Unsupported .version 8.5; current version is '8.4'