In [4]:
import numpy as np
import cupy as cp
cp.__version__

'11.0.0'

In [3]:
from numba import cuda
# List the CUDA devices Numba can see; prints a per-device report followed by a
# summary of how many devices are supported.
cuda.detect()

Found 1 CUDA devices
id 0             b'Tesla T4'                              [SUPPORTED]
                      Compute Capability: 7.5
                           PCI Device ID: 4
                              PCI Bus ID: 0
                                    UUID: GPU-ccb037d0-d99d-521f-e848-a486e5af6f4e
                                Watchdog: Disabled
             FP32/FP64 Performance Ratio: 32
Summary:
	1/1 devices are supported


True

## Sending data to GPU

In [8]:
# Host-side array: NumPy allocates this in ordinary CPU (main) memory.
# NOTE(review): `high` is exclusive, so values fall in 0..254 - use 256 if the
# full 8-bit range was intended.
array_cpu = np.random.randint(low=0, high=255, size=(4000, 4000))
array_cpu

array([[227,  77, 223, ..., 243,  34, 172],
       [202, 126, 187, ..., 105, 214, 228],
       [125,  81, 228, ..., 139,  42, 222],
       ...,
       [ 35, 122,  97, ...,  49, 219,  79],
       [ 67,  50, 179, ..., 178,  75, 167],
       [ 66, 154,  62, ...,  86,  67,  82]])

In [5]:
# Array size in megabytes (1 MB = 1e6 bytes): 4000 * 4000 elements at
# 8 bytes each gives the 128 MB reported below.
array_cpu.nbytes/1e6

128.0

In [9]:
# Copy the host (NumPy) array into GPU memory; the result is a CuPy array that
# the GPU FFT cells below operate on.
array_gpu = cp.asarray(array_cpu)

In [10]:
# Confirm the transferred array is a cupy.ndarray rather than a numpy.ndarray.
type(array_gpu)

cupy.ndarray

In [14]:
from scipy import fft

In [15]:
%%timeit

# CPU baseline: N-dimensional FFT of the host array with scipy.fft.
fft.fftn(array_cpu)

409 ms ± 73.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
from cupyx.scipy import fft as fft_gpu

In [17]:
%%timeit

fft_gpu.fftn(array_gpu)

102 µs ± 50.1 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
# Check that the CPU and GPU transforms agree, comparing both on the host.
fft_cpu = fft.fftn(array_cpu)
# cp.asnumpy copies the device result back into a NumPy array. cp.asarray would
# leave it in GPU memory, so the comparison would mix a host array with a
# device array instead of comparing two host arrays.
fft_gpu_sent_back = cp.asnumpy(fft_gpu.fftn(array_gpu))
np.allclose(fft_cpu, fft_gpu_sent_back)

array(True)

In [5]:
%%time

# Create the random array directly on the GPU with CuPy's own generator,
# instead of building it with NumPy and copying it over.
# NOTE(review): presumably most of the wall time recorded below is one-off
# initialization on the first CuPy random call - confirm by re-running the cell.
gpu_array = cp.random.randint(0, 255, size=(4000, 4000))

CPU times: user 1.73 s, sys: 222 ms, total: 1.95 s
Wall time: 2.62 s


In [6]:
# Display the CuPy array; its repr is formatted like a NumPy array's.
gpu_array

array([[143, 158, 253, ...,  73, 103,  27],
       [251, 148, 177, ..., 246, 211, 132],
       [ 22, 112,  97, ..., 224,  55, 226],
       ...,
       [228, 224,  33, ..., 107,  23, 237],
       [ 95, 252, 134, ..., 221, 227, 202],
       [249,  43, 126, ..., 251, 233, 193]])

## Numba device array

In [1]:
from numba import cuda

In [2]:
# Check again that Numba can see a supported CUDA device before using it.
cuda.detect()

Found 1 CUDA devices
id 0             b'Tesla T4'                              [SUPPORTED]
                      Compute Capability: 7.5
                           PCI Device ID: 4
                              PCI Bus ID: 0
                                    UUID: GPU-c4db7992-b570-548e-abda-16066c812fee
                                Watchdog: Disabled
             FP32/FP64 Performance Ratio: 32
Summary:
	1/1 devices are supported


True

In [9]:
# Copy the host (NumPy) array to the GPU with Numba; the result is a Numba
# DeviceNDArray, not a CuPy array.
# Note: this rebinds `gpu_array`, which the CuPy section bound to an array made
# with cp.random.randint.
gpu_array = cuda.to_device(array_cpu)
gpu_array

<numba.cuda.cudadrv.devicearray.DeviceNDArray at 0x7e8f04bde830>

In [10]:
# View the Numba device array as a CuPy array; the values match array_cpu.
# NOTE(review): presumably this goes through the CUDA Array Interface without
# copying the data - confirm against the CuPy interoperability docs.
cp.asarray(gpu_array)

array([[227,  77, 223, ..., 243,  34, 172],
       [202, 126, 187, ..., 105, 214, 228],
       [125,  81, 228, ..., 139,  42, 222],
       ...,
       [ 35, 122,  97, ...,  49, 219,  79],
       [ 67,  50, 179, ..., 178,  75, 167],
       [ 66, 154,  62, ...,  86,  67,  82]])

In [11]:
# Copy the Numba device array from GPU memory back to the host (CPU); the
# values match the original array_cpu.
gpu_array.copy_to_host()

array([[227,  77, 223, ..., 243,  34, 172],
       [202, 126, 187, ..., 105, 214, 228],
       [125,  81, 228, ..., 139,  42, 222],
       ...,
       [ 35, 122,  97, ...,  49, 219,  79],
       [ 67,  50, 179, ..., 178,  75, 167],
       [ 66, 154,  62, ...,  86,  67,  82]])

256