# HPC assignment 17

In [35]:
!nvidia-smi

Tue Apr 16 04:33:16 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P0              27W /  70W |    113MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## 1. To print hello message on the screen using kernal function

In [32]:
%%writefile hello_1_1.cu

#include <stdio.h>

__global__ void cuda_hello_1_1() {
    printf("Hello World from GPU with grid dimension (1, 1) and block dimension (1, 1)!\n");
}

int main() {
    cuda_hello_1_1<<<1,1>>>();
    cudaDeviceSynchronize(); // Make sure all GPU work is done before exiting
    return 0;
}

Writing hello_1_1.cu


In [33]:
!nvcc -o hello_1_1 hello_1_1.cu

In [34]:
!./hello_1_1

Hello World from GPU with grid dimension (1, 1) and block dimension (1, 1)!


## 2. To add two vectors of size 100 and 20000 and analyze the performance comparison between cpu and gpu processing

### GPU

In [5]:
!pip install pycuda

Collecting pycuda
  Downloading pycuda-2024.1.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pytools>=2011.2 (from pycuda)
  Downloading pytools-2024.1.1-py2.py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.1/85.1 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Collecting mako (from pycuda)
  Downloading Mako-1.3.3-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.8/78.8 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: pycuda
  Building wheel for pycuda (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pycuda: filename=pycuda-2024.1-cp310-cp310-linux_x86_64.whl size=661204 sha256=f6a73d96

In [6]:
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import time

In [7]:
# CUDA kernel function to add two vectors
cuda_kernel_code = """
__global__ void vector_add(float *a, float *b, float *c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}
"""

In [8]:
# Compile the CUDA kernel code
cuda_module = SourceModule(cuda_kernel_code)

# Get a reference to the CUDA kernel function
vector_add_cuda = cuda_module.get_function("vector_add")

In [9]:
def vector_add_gpu(a, b):
    n = a.size

    # Create device arrays
    a_gpu = cuda.mem_alloc(a.nbytes)
    b_gpu = cuda.mem_alloc(b.nbytes)
    c_gpu = cuda.mem_alloc(b.nbytes)

    # Copy data to device
    cuda.memcpy_htod(a_gpu, a)
    cuda.memcpy_htod(b_gpu, b)

    # Define block and grid dimensions
    block_dim = (256, 1, 1)
    grid_dim = ((n + block_dim[0] - 1) // block_dim[0], 1)

    start_time = time.time()

    # Launch the CUDA kernel
    vector_add_cuda(a_gpu, b_gpu, c_gpu, np.int32(n), block=block_dim, grid=grid_dim)

    # Synchronize threads to ensure all output is calculated
    cuda.Context.synchronize()

    end_time = time.time()

    # Copy result back to host
    c = np.empty_like(a)
    cuda.memcpy_dtoh(c, c_gpu)

    return c, end_time - start_time

In [10]:
vector_size_1 = 100
vector_size_2 = 20000
a = np.random.randn(vector_size_2).astype(np.float32)
b = np.random.randn(vector_size_2).astype(np.float32)

# Perform vector addition on GPU
result_gpu1, gpu_time1 = vector_add_gpu(a[:vector_size_1], b[:vector_size_1])
result_gpu2, gpu_time2 = vector_add_gpu(a[:vector_size_2], b[:vector_size_2])

In [11]:
print("Vector addition of size", vector_size_1, "on GPU took", gpu_time1, "seconds.")

print("Vector addition of size", vector_size_2, "on GPU took", gpu_time2, "seconds.")

Vector addition of size 100 on GPU took 0.0007691383361816406 seconds.
Vector addition of size 20000 on GPU took 7.128715515136719e-05 seconds.


* Vector addition of size 100 on GPU took 0.0007691383361816406 seconds.
* Vector addition of size 20000 on GPU took 7.128715515136719e-05 seconds.

### CPU

In [1]:
import numpy as np
import time

In [2]:
def vector_add_cpu(a, b):
    start_time = time.time()
    result = a + b
    end_time = time.time()
    return result, end_time - start_time

In [3]:
vector_size_1 = 100
vector_size_2 = 20000
a = np.random.randn(vector_size_2).astype(np.float32)
b = np.random.randn(vector_size_2).astype(np.float32)

# Perform vector addition on CPU
result_cpu1, cpu_time1 = vector_add_cpu(a[:vector_size_1], b[:vector_size_1])
result_cpu2, cpu_time2 = vector_add_cpu(a[:vector_size_2], b[:vector_size_2])

In [4]:
print("Vector addition of size", vector_size_1, "on CPU took", cpu_time1, "seconds.")
print("Vector addition of size", vector_size_2, "on CPU took", cpu_time2, "seconds.")

Vector addition of size 100 on CPU took 4.649162292480469e-05 seconds.
Vector addition of size 20000 on CPU took 8.153915405273438e-05 seconds.


* Vector addition of size 100 on CPU took 2.384185791015625e-05 seconds.
* Vector addition of size 20000 on CPU took 1.9788742065429688e-05 seconds.

## 3. To multply two matrix of size 20 X 20 and 1024 X 1024 analyze the performance comparison between cpu and gpu processing

* GPU

In [12]:
def matrix_multiply_gpu(a, b):
    # Define CUDA kernel code for matrix multiplication
    cuda_code = """
    __global__ void matrix_multiply(float *a, float *b, float *c, int n) {
        int row = blockIdx.y * blockDim.y + threadIdx.y;
        int col = blockIdx.x * blockDim.x + threadIdx.x;

        if (row < n && col < n) {
            float sum = 0.0;
            for (int i = 0; i < n; ++i) {
                sum += a[row * n + i] * b[i * n + col];
            }
            c[row * n + col] = sum;
        }
    }
    """

    # Compile CUDA kernel code
    mod = SourceModule(cuda_code)

    # Get kernel function
    matrix_multiply_cuda = mod.get_function("matrix_multiply")

    # Allocate memory on device
    a_gpu = cuda.mem_alloc(a.nbytes)
    b_gpu = cuda.mem_alloc(b.nbytes)
    c_gpu = cuda.mem_alloc(a.nbytes)

    # Copy input matrices to device
    cuda.memcpy_htod(a_gpu, a)
    cuda.memcpy_htod(b_gpu, b)

    # Define grid and block dimensions
    block_size = (16, 16, 1)
    grid_size = ((a.shape[1] + block_size[0] - 1) // block_size[0], (a.shape[0] + block_size[1] - 1) // block_size[1], 1)

    # Call CUDA kernel
    matrix_multiply_cuda(a_gpu, b_gpu, c_gpu, np.int32(a.shape[0]), block=block_size, grid=grid_size)

    # Copy result back to host
    c = np.empty_like(a)
    cuda.memcpy_dtoh(c, c_gpu)

    return c

In [13]:
# Function to generate random matrices
def generate_random_matrix(rows, cols):
    return np.random.rand(rows, cols).astype(np.float32)


In [14]:
# Function to measure time taken for matrix multiplication
def measure_time(matrix_size, func, *args):
    start_time = time.time()
    result = func(*args)
    end_time = time.time()
    return result, end_time - start_time

In [15]:
# Sizes of matrices to be multiplied
matrix_sizes = [(20, 20), (1024, 1024)]

In [17]:
for size in matrix_sizes:
    print(f"\nMatrix size: {size}")
    a = generate_random_matrix(*size)
    b = generate_random_matrix(*size)

    # GPU matrix multiplication
    gpu_result, gpu_time = measure_time(size, matrix_multiply_gpu, a, b)
    print(f"GPU time: {gpu_time:.6f} seconds")


Matrix size: (20, 20)
GPU time: 0.703994 seconds

Matrix size: (1024, 1024)
GPU time: 0.014648 seconds



* Matrix size: (20, 20)
* GPU time: 0.703994 seconds

* Matrix size: (1024, 1024)
* GPU time: 0.014648 seconds

* CPU

In [5]:
def multiply_matrices_cpu(a, b):
    return np.dot(a, b)

In [6]:
def compare_performance_multiply_matrices_cpu():
    # Generate random input matrices
    a = np.random.rand(1024, 1024)
    b = np.random.rand(1024, 1024)
    c = np.random.rand(20, 20)
    d = np.random.rand(20, 20)

    # CPU processing
    start_time_cpu1 = time.time()
    result_cpu1 = multiply_matrices_cpu(a, b)
    end_time_cpu1 = time.time()

    start_time_cpu2 = time.time()
    result_cpu2 = multiply_matrices_cpu(c, d)
    end_time_cpu2 = time.time()

    # Print performance comparison
    print("Performance Comparison for Multiplying Two Matrices (CPU):")
    print("CPU Time for 1024:", end_time_cpu1 - start_time_cpu1, "seconds")
    print("CPU Time for 20:", end_time_cpu2 - start_time_cpu2, "seconds")

In [7]:
compare_performance_multiply_matrices_cpu()

Performance Comparison for Multiplying Two Matrices (CPU):
CPU Time for 1024: 0.08771014213562012 seconds
CPU Time for 20: 0.0022869110107421875 seconds


* CPU Time for 1024: 0.12308359146118164 seconds
* CPU Time for 20: 0.0019140243530273438 seconds

## 4. To obtain CUDA device information and print the output

In [18]:
import pycuda.driver as cuda

# Initialize PyCUDA
cuda.init()

# Get the number of CUDA devices
num_devices = cuda.Device.count()

print("Number of CUDA devices:", num_devices)

# Iterate over each CUDA device and print its properties
for i in range(num_devices):
    device = cuda.Device(i)
    print("\nCUDA Device:", i)
    print("  Name:", device.name())
    print("  Compute Capability:", device.compute_capability())
    print("  Total Memory:", device.total_memory() / (1024 ** 3), "GB")
    print("  Max Threads per Block:", device.max_threads_per_block)
    print("  Multiprocessor Count:", device.multiprocessor_count)
    print("  Clock Rate:", device.clock_rate / 1e6, "GHz")


Number of CUDA devices: 1

CUDA Device: 0
  Name: Tesla T4
  Compute Capability: (7, 5)
  Total Memory: 14.74810791015625 GB
  Max Threads per Block: 1024
  Multiprocessor Count: 40
  Clock Rate: 1.59 GHz


Number of CUDA devices: 1

CUDA Device: 0
  * Name: Tesla T4
  * Compute Capability: (7, 5)
  * Total Memory: 14.74810791015625 GB
  * Max Threads per Block: 1024
  * Multiprocessor Count: 40
  * Clock Rate: 1.59 GHz