In [2]:
!pip install pycuda

Collecting pycuda
  Downloading pycuda-2025.1.tar.gz (1.7 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m65.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pytools>=2011.2 (from pycuda)
  Downloading pytools-2025.1.3-py3-none-any.whl.metadata (3.0 kB)
Collecting siphash24>=1.6 (from pytools>=2011.2->pycuda)
  Downloading siphash24-1.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.3 kB)
Downloading pytools-2025.1.3-py3-none-any.whl (93 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading siphash24-1.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.w

In [9]:
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import time

# CUDA C source for the two GPU kernels used below; main() compiles this
# string with SourceModule and looks the kernels up by name.
#
# vectorAdd(A, B, C, N):
#   One thread per element: thread `idx` writes C[idx] = A[idx] + B[idx].
#   The `idx < N` guard lets the launch grid be rounded up past N.
#
# matrixMul(A, B, C, M, N, K):
#   One thread per output element. The indexing (A[row * N + i],
#   B[i * K + col], C[row * K + col]) treats A as M x N, B as N x K and
#   C as M x K, all stored flat in row-major order. The x grid dimension
#   runs over columns of C and the y dimension over rows; threads that fall
#   outside the M x K bounds do nothing.
kernel_code = """
__global__ void vectorAdd(const float *A, const float *B, float *C, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) {
        C[idx] = A[idx] + B[idx];
    }
}

__global__ void matrixMul(const float *A, const float *B, float *C, int M, int N, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < M && col < K) {
        float sum = 0.0f;
        for (int i = 0; i < N; i++) {
            sum += A[row * N + i] * B[i * K + col];
        }
        C[row * K + col] = sum;
    }
}
"""

def get_positive_int(prompt):
    """Keep asking on stdin until the user supplies an integer above zero.

    Args:
        prompt: Text shown to the user on every attempt.

    Returns:
        The first entered value that parses as an int and is > 0.
    """
    while True:
        try:
            number = int(input(prompt))
            if number > 0:
                return number
            # Parsed fine, but zero or negative: ask again.
            print("Please enter a positive integer.")
        except ValueError:
            print("Invalid input. Please enter a positive integer.")

def get_float_array(prompt, size):
    """Read exactly `size` whitespace-separated floats from one line of stdin.

    The heading is printed once; the input line is re-requested until it
    holds the right number of tokens and every token parses as a float.

    Args:
        prompt: Label printed in the heading before the first attempt.
        size: Number of values the line must contain.

    Returns:
        A 1-D float32 NumPy array with the entered values.
    """
    print(f"{prompt} ({size} values):")
    while True:
        try:
            tokens = input("Enter values separated by spaces: ").strip().split()
            if len(tokens) == size:
                parsed = list(map(float, tokens))
                return np.array(parsed).astype(np.float32)
            # Wrong token count: report it and prompt again.
            print(f"Please enter exactly {size} values.")
        except ValueError:
            print("Invalid input. Please enter valid floating-point numbers.")

def get_matrix(prompt, rows, cols):
    """Read a `rows` x `cols` matrix from stdin, one row per input line.

    Each row is re-requested until it contains exactly `cols` tokens that all
    parse as floats; only then does input advance to the next row.

    Args:
        prompt: Label printed in the heading before the first row.
        rows: Number of rows to read.
        cols: Number of values required on each row.

    Returns:
        A float32 NumPy array built from the accepted rows.
    """
    print(f"{prompt} ({rows}x{cols} matrix, enter {rows} rows with {cols} values each):")
    collected = []
    row_number = 1
    while row_number <= rows:
        try:
            tokens = input(f"Row {row_number}: ").strip().split()
            if len(tokens) != cols:
                print(f"Please enter exactly {cols} values for row {row_number}.")
            else:
                # Conversion happens before the append, so a bad token
                # leaves `collected` untouched and the row is asked again.
                collected.append([float(tok) for tok in tokens])
                row_number += 1
        except ValueError:
            print("Invalid input. Please enter valid floating-point numbers.")
    return np.array(collected).astype(np.float32)

def _timed_launch(kernel, *args, **launch_config):
    """Launch `kernel` and return its GPU execution time in seconds.

    Kernel launches return to the host before the kernel has finished, so
    wall-clock timing around the call only measures launch overhead. Here
    the launch is bracketed by two CUDA events and the host waits on the
    second one, so the result covers the kernel's actual execution.

    Args:
        kernel: A PyCUDA function obtained from `SourceModule.get_function`.
        *args: Positional kernel arguments.
        **launch_config: Launch keywords such as `block` and `grid`.

    Returns:
        Elapsed GPU time between the two events, in seconds.
    """
    start_event = cuda.Event()
    stop_event = cuda.Event()
    start_event.record()
    kernel(*args, **launch_config)
    stop_event.record()
    stop_event.synchronize()
    # Event.time_till() reports milliseconds.
    return start_event.time_till(stop_event) * 1e-3


def main():
    """Read a vector-add and a matrix-multiply problem from stdin and run both on the GPU.

    Prompts for a vector length and two vectors, then for matrix dimensions
    M, N, K and matrices A (M x N) and B (N x K). Both inputs are copied to
    the device, the `vectorAdd` and `matrixMul` kernels from `kernel_code`
    are run, and the results plus per-kernel GPU execution times are printed.
    """
    # Get vector dimensions and values
    print("Enter dimensions for vector addition:")
    N = get_positive_int("Vector size (N): ")
    h_A_vec = get_float_array("Vector A", N)
    h_B_vec = get_float_array("Vector B", N)
    h_C_vec = np.zeros(N, dtype=np.float32)

    # Get matrix dimensions and values
    print("\nEnter dimensions for matrix multiplication (A: M x N, B: N x K, C: M x K):")
    M = get_positive_int("Rows of matrix A (M): ")
    N_mat = get_positive_int("Columns of matrix A / Rows of matrix B (N): ")
    K = get_positive_int("Columns of matrix B (K): ")

    h_A_mat = get_matrix("Matrix A", M, N_mat)
    h_B_mat = get_matrix("Matrix B", N_mat, K)
    h_C_mat = np.zeros((M, K), dtype=np.float32)

    # Allocate device memory; each buffer is sized from the host array it mirrors.
    d_A_vec = cuda.mem_alloc(h_A_vec.nbytes)
    d_B_vec = cuda.mem_alloc(h_B_vec.nbytes)
    d_C_vec = cuda.mem_alloc(h_C_vec.nbytes)

    d_A_mat = cuda.mem_alloc(h_A_mat.nbytes)
    d_B_mat = cuda.mem_alloc(h_B_mat.nbytes)
    d_C_mat = cuda.mem_alloc(h_C_mat.nbytes)

    # Copy inputs to device
    cuda.memcpy_htod(d_A_vec, h_A_vec)
    cuda.memcpy_htod(d_B_vec, h_B_vec)
    cuda.memcpy_htod(d_A_mat, h_A_mat)
    cuda.memcpy_htod(d_B_mat, h_B_mat)

    # Compile and get kernel functions
    mod = SourceModule(kernel_code)
    vector_add = mod.get_function("vectorAdd")
    matrix_mul = mod.get_function("matrixMul")

    # Set up grid and block dimensions for vector addition.
    # Ceiling division so the grid covers all N elements; the kernel's own
    # bounds check discards the surplus threads in the last block.
    block_size_vec = 256
    grid_size_vec = (N + block_size_vec - 1) // block_size_vec

    # Execute vector addition kernel
    vector_time = _timed_launch(
        vector_add,
        d_A_vec, d_B_vec, d_C_vec, np.int32(N),
        block=(block_size_vec, 1, 1),
        grid=(grid_size_vec, 1)
    )

    # Set up grid and block dimensions for matrix multiplication.
    # x spans the K columns of C and y spans its M rows, matching the
    # col/row computation inside the kernel.
    block_size_mat = (16, 16, 1)
    grid_size_mat = ((K + block_size_mat[0] - 1) // block_size_mat[0],
                     (M + block_size_mat[1] - 1) // block_size_mat[1])

    # Execute matrix multiplication kernel
    matrix_time = _timed_launch(
        matrix_mul,
        d_A_mat, d_B_mat, d_C_mat,
        np.int32(M), np.int32(N_mat), np.int32(K),
        block=block_size_mat,
        grid=grid_size_mat
    )

    # Copy results back to host
    cuda.memcpy_dtoh(h_C_vec, d_C_vec)
    cuda.memcpy_dtoh(h_C_mat, d_C_mat)

    # Print vector addition result
    print("\nVector addition result:")
    print(np.round(h_C_vec, 2))

    print(f"\nVector addition time: {vector_time:.6f} seconds")

    # Print matrix multiplication result
    print("\nMatrix multiplication result:")
    print(np.round(h_C_mat, 2))

    print(f"\nMatrix multiplication time: {matrix_time:.6f} seconds")

if __name__ == "__main__":
    # No install-on-ImportError fallback here: pycuda is installed by the
    # notebook's first cell and imported at the top of this cell, so a missing
    # package would already have failed before this point is reached.
    main()

Enter dimensions for vector addition:
Vector size (N): 5
Vector A (5 values):
Enter values separated by spaces: 1 2 3 4 5
Vector B (5 values):
Enter values separated by spaces: 1 2 3 4 5

Enter dimensions for matrix multiplication (A: M x N, B: N x K, C: M x K):
Rows of matrix A (M): 2
Columns of matrix A / Rows of matrix B (N): 3
Columns of matrix B (K): 2 
Matrix A (2x3 matrix, enter 2 rows with 3 values each):
Row 1: 1 2 3
Row 2: 1 2 3
Matrix B (3x2 matrix, enter 3 rows with 2 values each):
Row 1: 1 2 
Row 2: 1 2
Row 3: 1 2

Vector addition result:
[ 2.  4.  6.  8. 10.]

Vector addition time: 0.000083 seconds

Matrix multiplication result:
[[ 6. 12.]
 [ 6. 12.]]

Matrix multiplication time: 0.000025 seconds
