<a href="https://colab.research.google.com/github/idavoong/CSC369-Analysis/blob/main/CSC364_Lab_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Part 3, Task 1

import cupy as cp

# Define two vectors
x = cp.array([2, 4, 6, 8])
y = cp.array([1, 3, 5, 7])

# Perform element-wise multiplication (fill this part)
z = x * y

# Display the result
print("Result of element-wise multiplication:", cp.asnumpy(z))

Result of element-wise multiplication: [ 2 12 30 56]


In [3]:
# Part 3, Task 2

import cupy as cp

x = cp.array([2, 4, 6, 8])
y = cp.array([1, 3, 5, 7])

z = cp.dot(x, y)

print("Result of dot product:", cp.asnumpy(z))

Result of dot product: 100


In [4]:
# Part 3, Task 3

import cupy as cp

def squares(vector):
  return vector * vector

example = cp.array([1, 2, 3, 4, 5])
result = squares(example)
print(result)

[ 1  4  9 16 25]


In [5]:
# Task 6, Part A

import cupy as cp

# Define the custom CUDA kernel
custom_kernel = cp.RawKernel(r'''
extern "C" __global__
void elementwise_operation(const float *x, float *y, float *z, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) {
        y[idx] = abs(x[idx] - z[idx]);
    }
}
''', 'elementwise_operation')

# Input and output arrays
N = 10**6
x = cp.arange(1, N+1, dtype=cp.float32)
y = cp.zeros_like(x)
z = cp.arange(1, N+1, dtype=cp.float32)

# Launch the custom kernel
threads_per_block = 256
blocks_per_grid = (N + threads_per_block - 1) // threads_per_block
custom_kernel((blocks_per_grid,), (threads_per_block,), (x, y, z, N))

# Verify the result
print("First 10 results:", cp.asnumpy(z[:10]))

First 10 results: [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]


In [7]:
# Task 6, Part B

import cupy as cp

custom_kernel = cp.RawKernel(r'''
extern "C" __global__
void dot_product(const float *x, float *y, float *z, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    float temp = 0.0;
    while (idx < N) {
        temp += x[idx] * z[idx];
        idx += gridDim.x * blockDim.x;
    }

    y[blockIdx.x * blockDim.x + threadIdx.x] = temp;
}
''', 'dot_product')

N = 4
x = cp.array([2, 4, 6, 8], dtype=cp.float32)
z = cp.array([1, 3, 5, 7], dtype=cp.float32)
y = cp.zeros_like(x)

threads_per_block = 256
blocks_per_grid = (N + threads_per_block - 1) // threads_per_block
custom_kernel((blocks_per_grid,), (threads_per_block,), (x, y, z, N))

print("Dot product:", cp.sum(y))

Dot product: 100.0


In [16]:
# Task 6, Part C

import cupy as cp

custom_kernel = cp.RawKernel(r'''
extern "C" __global__
void matmul(const float *A, const float *B, float *C, int M, int K, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < M && col < N) {
        float sum = 0.0;
        for (int k = 0; k < K; k++) {
            sum += A[row * K + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}
''', 'matmul')

M, K, N = 4, 3, 2

A = cp.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]], dtype=cp.float32)
B = cp.array([[1, 4], [2, 5], [3, 6]], dtype=cp.float32)
C = cp.zeros((M, N), dtype=cp.float32)

threads_per_block = (16, 16)
blocks_per_grid = ((N + threads_per_block[0] - 1) // threads_per_block[0],
                   (M + threads_per_block[1] - 1) // threads_per_block[1])

blocks_per_grid = (int(blocks_per_grid[0]), int(blocks_per_grid[1]))
threads_per_block = (int(threads_per_block[0]), int(threads_per_block[1]))

custom_kernel(blocks_per_grid, threads_per_block, (A, B, C, M, K, N))

print("Matrix C:\n", cp.asnumpy(C))

Matrix C:
 [[ 14.  32.]
 [ 32.  77.]
 [ 50. 122.]
 [ 68. 167.]]


In [17]:
# Part 7

import cupy as cp
import time

# Define a large input array on CPU
N = 20**7  # Size of the array
x = cp.array(N, dtype=cp.float32)

# Measure the time taken to square each element
start_time = time.time()
y = x**2
cpu_time = time.time() - start_time

print(f"CPU Time: {cpu_time:.6f} seconds")

CPU Time: 0.000712 seconds


Part 7 Response

Describe how and why the amount of time it takes to complete the task changes.

The amount of time it takes to complete the task changes because GPUs have thousands of cores and a higher memory bandwidth, which makes it suitable for calculating repetitive operations.