In [1]:
import tensorflow as tf

In [2]:
# Ask TensorFlow for the list of physical GPU devices it can see.
gpus = tf.config.list_physical_devices(device_type="GPU")

In [3]:
# Report every detected GPU, or state that none is available.
if not gpus:
    print("Failed to detect a GPU.")
else:
    for gpu in gpus:
        print("Found a GPU with the name:", gpu)

Found a GPU with the name: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [4]:
# Shell escape: print the NVIDIA driver's view of the GPU
# (driver / CUDA version, memory usage, utilization).
!nvidia-smi

Wed Apr  3 20:25:10 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 551.86                 Driver Version: 551.86         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4060 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   47C    P4             13W /   35W |       0MiB /   8188MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [11]:
import numpy as np
from numba import cuda

In [21]:
# Define the CUDA kernel function
@cuda.jit
def add_to_array(arr):
    """
    CUDA kernel: add 1.0 in place to each element of a 1-D device array.

    Each thread handles at most one element, selected by its global
    (grid-wide) index, so the kernel stays correct when the launch uses
    more than one block.

    Parameters
    ----------
    arr : 1-D device array
        Modified in place; the kernel returns nothing.
    """
    # Absolute position of this thread within the whole grid, i.e.
    # threadIdx.x + blockIdx.x * blockDim.x. Using threadIdx.x alone would
    # make every block update the same leading elements and never reach
    # the remainder of an array larger than one block.
    idx = cuda.grid(1)

    # The launch may contain more threads than elements; surplus threads
    # must not write past the end of the array.
    if idx < arr.size:
        arr[idx] += 1.0

In [22]:
# Build the small host-side (CPU) NumPy array used as the kernel's input.
arr = np.array([1.0, 2.0, 3.0], dtype=np.float64)

In [23]:
# Transfer the host array to the GPU (device); d_arr is the device-side
# array that the kernel launch below receives as its argument.
d_arr = cuda.to_device(arr)

In [33]:
# Number of threads per block for the kernel launch (adjust as needed);
# the grid size in the launch cell is derived from this value.
threadsperblock = 512

In [34]:
# Launch configuration: enough blocks that every element gets a thread.
# Ceiling division avoids the surplus block that
# `arr.size // threadsperblock + 1` launches whenever the size is an exact
# multiple of the block size; max(1, ...) keeps the grid non-empty even
# for a zero-length array.
num_blocks = max(1, (arr.size + threadsperblock - 1) // threadsperblock)
gridsize = (num_blocks, 1)

# NOTE: the kernel updates d_arr in place, so every re-run of this cell
# adds another 1.0 to the data already on the device.
add_to_array[gridsize, threadsperblock](d_arr)

In [35]:
# Bring the device data back into host memory (copy_to_host, not to_host)
# and rebind `arr` to that copy, so `host_arr` and `arr` both name the result.
host_arr = arr = d_arr.copy_to_host()

In [36]:
# Display the result copied back from the GPU.
# NOTE(review): the recorded output below equals the input plus 2.0, not
# plus 1.0 — presumably the launch cell ran more than once in this session
# (execution counts jump from 23 to 33). A fresh Restart & Run All should
# show [2., 3., 4.] — confirm.
arr

array([3., 4., 5.])