In [None]:
import numpy as np
import time
#from pycuda.compiler import SourceModule
from numba import njit, prange
import tensorflow as tf
import torch

# Create two random matrices

In [None]:
# Seed the generator so that every "Restart & Run All" multiplies the same
# matrices and the spot-check values printed by the later cells are reproducible.
SEED = 42
rng = np.random.default_rng(SEED)

# Float64 operands with compatible inner dimensions: (400 x 300) @ (300 x 500).
matrix1 = rng.random((400, 300))
matrix2 = rng.random((300, 500))

# Naive method

In [None]:
# Baseline: pure-Python triple loop computing result = matrix1 @ matrix2.

# The product is only defined when the inner dimensions agree.
n_inner = matrix1.shape[1]
if n_inner != matrix2.shape[0]:
    raise ValueError("Matrix dimensions do not match for multiplication")

n_rows = matrix1.shape[0]
n_cols = matrix2.shape[1]

# Accumulator with one cell per (row of matrix1, column of matrix2).
result = np.zeros((n_rows, n_cols))

# Only the loop nest itself is timed.
start_time = time.time()

for row in range(n_rows):
    for col in range(n_cols):
        for inner in range(n_inner):
            result[row, col] += matrix1[row, inner] * matrix2[inner, col]

end_time = time.time()
elapsed = end_time - start_time
print("Time taken for matrix multiplication using nested loops:", elapsed, "seconds")

# Spot-check value for comparison against the other implementations.
print(result[0, 0])


# Using Numba JIT compilation (parallelised across CPU cores)

In [None]:
@njit(parallel=True)
def matrix_multiply_numba(matrix1, matrix2):
    """Multiply two 2-D arrays with a Numba-compiled triple loop.

    Rows of the output are distributed across threads via ``prange``; each
    output element is written by exactly one iteration, so there is no race.

    Parameters
    ----------
    matrix1 : 2-D array, shape (n, k)
    matrix2 : 2-D array, shape (k, m)

    Returns
    -------
    2-D float64 array of shape (n, m) holding ``matrix1 @ matrix2``.

    Raises
    ------
    ValueError
        If the number of columns of ``matrix1`` differs from the number of
        rows of ``matrix2``.
    """
    # Numba does not bounds-check array indexing by default, so a shape
    # mismatch would silently read out-of-bounds memory; reject it up front.
    if matrix1.shape[1] != matrix2.shape[0]:
        raise ValueError("Matrix dimensions do not match for multiplication")

    result = np.zeros((matrix1.shape[0], matrix2.shape[1]))
    for i in prange(matrix1.shape[0]):
        for j in range(matrix2.shape[1]):
            # Accumulate in a thread-private scalar and store once per element.
            acc = 0.0
            for k in range(matrix1.shape[1]):
                acc += matrix1[i, k] * matrix2[k, j]
            result[i, j] = acc
    return result

# Perform matrix multiplication using Numba.
# The first call of an @njit function compiles it for the given argument types;
# run it once untimed so the measurement below covers execution only.
matrix_multiply_numba(matrix1, matrix2)

# perf_counter is a monotonic, high-resolution clock intended for benchmarking.
start_time = time.perf_counter()
result_numba = matrix_multiply_numba(matrix1, matrix2)
end_time = time.perf_counter()

print("Time taken for matrix multiplication using Numba:", end_time - start_time, "seconds")
# Spot-check value for comparison against the other implementations.
print(result_numba[0, 0])

# Using CUDA JIT compilation with Numba

In [None]:
from numba import cuda

@cuda.jit
def matrix_multiply_cuda(matrix1, matrix2, result):
    """CUDA kernel: each thread computes one element of ``result``.

    The thread's 2-D grid position selects the output element
    ``result[row, col]``, which is set to the dot product of row ``row`` of
    ``matrix1`` and column ``col`` of ``matrix2``.
    """
    row, col = cuda.grid(2)

    # Threads that fall outside the output matrix have nothing to compute.
    if row >= result.shape[0] or col >= result.shape[1]:
        return

    acc = 0
    for inner in range(matrix1.shape[1]):
        acc += matrix1[row, inner] * matrix2[inner, col]
    result[row, col] = acc

# Copy the operands to the device and reserve device memory for the product.
matrix1_device = cuda.to_device(matrix1)
matrix2_device = cuda.to_device(matrix2)
result_device = cuda.device_array((matrix1.shape[0], matrix2.shape[1]))

# Define thread and block dimensions: one thread per output element, with the
# grid rounded up so partially filled tiles are still covered.
# The grid is sized from this cell's own output buffer rather than from a
# variable created by an earlier cell.
threads_per_block = (20, 20)
blocks_per_grid_x = (result_device.shape[0] + threads_per_block[0] - 1) // threads_per_block[0]
blocks_per_grid_y = (result_device.shape[1] + threads_per_block[1] - 1) // threads_per_block[1]
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)

# Warm-up launch: the first launch of a @cuda.jit kernel compiles it, which
# would otherwise dominate the timing below.
matrix_multiply_cuda[blocks_per_grid, threads_per_block](matrix1_device, matrix2_device, result_device)
cuda.synchronize()

# Perform matrix multiplication using CUDA.
# Kernel launches are asynchronous, so synchronize before stopping the clock.
start_time = time.perf_counter()
matrix_multiply_cuda[blocks_per_grid, threads_per_block](matrix1_device, matrix2_device, result_device)
cuda.synchronize()
end_time = time.perf_counter()

# Copy the result back to the host
result_cuda = result_device.copy_to_host()

print("Time taken for matrix multiplication using CUDA with Numba:", end_time - start_time, "seconds")
# Spot-check value for comparison against the other implementations.
print(result_cuda[0, 0])

# Using TensorFlow

In [None]:
# Convert numpy arrays to TensorFlow tensors.
# NOTE: the operands are cast to float32 here, whereas the NumPy inputs are
# float64, so the spot-check value below only matches the float64
# implementations to single precision.
matrix1_tf = tf.convert_to_tensor(matrix1, dtype=tf.float32)
matrix2_tf = tf.convert_to_tensor(matrix2, dtype=tf.float32)

# Warm-up: TensorFlow performs one-time initialisation the first time an op
# runs on a device; keep that out of the measurement.
tf.matmul(matrix1_tf, matrix2_tf).numpy()

# Perform matrix multiplication.
# On a GPU tf.matmul may return before the kernel has finished; fetching the
# value with .numpy() forces completion (and therefore also includes the copy
# of the result back to host memory in the measured time).
start_time = time.perf_counter()
result_tf = tf.matmul(matrix1_tf, matrix2_tf)
result_tf_host = result_tf.numpy()
end_time = time.perf_counter()

print("Time taken for matrix multiplication using TensorFlow:", end_time - start_time, "seconds")

print(result_tf_host.shape)

# Spot-check value for comparison against the other implementations.
print(result_tf_host[0, 0])


# Using CuPy

In [None]:
import cupy as cp

# Transfer matrices to GPU
matrix1_cupy = cp.asarray(matrix1)
matrix2_cupy = cp.asarray(matrix2)

# Warm-up: the first call pays one-time library initialisation costs that
# should not be part of the measurement.
cp.matmul(matrix1_cupy, matrix2_cupy)
cp.cuda.Stream.null.synchronize()

# Perform matrix multiplication.
# CuPy launches GPU work asynchronously; without the synchronize() the timer
# would stop after the launch was queued, not after the product was computed.
start_time = time.perf_counter()
result_cupy = cp.matmul(matrix1_cupy, matrix2_cupy)
cp.cuda.Stream.null.synchronize()
end_time = time.perf_counter()

print("Time taken for matrix multiplication using CuPy:", end_time - start_time, "seconds")

# Transfer result back to CPU and print the first element
result_cupy_cpu = cp.asnumpy(result_cupy)
print(result_cupy_cpu[0, 0])

In [None]:
from pycuda.compiler import SourceModule
import numpy as np

import pycuda.driver as cuda
import pycuda.autoinit

# Define the CUDA kernel for matrix multiplication.
# One thread per output element; all three buffers are single-precision and
# indexed as flat row-major arrays.
kernel_code = """
__global__ void matrix_multiply(float *matrix1, float *matrix2, float *result, int width1, int height1, int width2) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < height1 && col < width2) {
        float temp = 0;
        for (int k = 0; k < width1; ++k) {
            temp += matrix1[row * width1 + k] * matrix2[k * width2 + col];
        }
        result[row * width2 + col] = temp;
    }
}
"""

# Compile the kernel
mod = SourceModule(kernel_code)
matrix_multiply = mod.get_function("matrix_multiply")

# The kernel reads and writes `float` (32-bit) buffers, but the host matrices
# are float64. Copying the float64 bytes verbatim would make the kernel
# reinterpret them as pairs of floats and produce garbage, so convert to
# contiguous float32 first and read the result back as float32 as well.
matrix1_f32 = np.ascontiguousarray(matrix1, dtype=np.float32)
matrix2_f32 = np.ascontiguousarray(matrix2, dtype=np.float32)
result_pycuda = np.empty((matrix1_f32.shape[0], matrix2_f32.shape[1]), dtype=np.float32)

# Transfer matrices to GPU
matrix1_gpu = cuda.mem_alloc(matrix1_f32.nbytes)
matrix2_gpu = cuda.mem_alloc(matrix2_f32.nbytes)
result_gpu = cuda.mem_alloc(result_pycuda.nbytes)

cuda.memcpy_htod(matrix1_gpu, matrix1_f32)
cuda.memcpy_htod(matrix2_gpu, matrix2_f32)

# Define thread and block dimensions.
# x indexes output columns and y indexes output rows, matching the kernel;
# the grid is rounded up so partially filled tiles are still covered.
threads_per_block = (20, 20, 1)
blocks_per_grid_x = (result_pycuda.shape[1] + threads_per_block[0] - 1) // threads_per_block[0]
blocks_per_grid_y = (result_pycuda.shape[0] + threads_per_block[1] - 1) // threads_per_block[1]
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y, 1)

# Perform matrix multiplication using PyCUDA.
# The launch is asynchronous, so synchronize before stopping the clock.
start_time = time.perf_counter()
matrix_multiply(matrix1_gpu, matrix2_gpu, result_gpu,
                np.int32(matrix1_f32.shape[1]), np.int32(matrix1_f32.shape[0]), np.int32(matrix2_f32.shape[1]),
                block=threads_per_block, grid=blocks_per_grid)
cuda.Context.synchronize()
end_time = time.perf_counter()

# Copy the result back to the host
cuda.memcpy_dtoh(result_pycuda, result_gpu)

print("Time taken for matrix multiplication using PyCUDA:", end_time - start_time, "seconds")
# Spot-check value (float32) for comparison against the other implementations.
print(result_pycuda[0, 0])

In [None]:
# NOTE(review): presumably intended to put the MSVC toolchain on PATH for the
# CUDA kernel compilation in the neighbouring cells — TODO confirm. `!` runs the
# command in a separate shell, so verify that the environment it sets up actually
# reaches this kernel. The path is an absolute, machine-specific install location.
! "C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\vsdevcmd.bat"


In [None]:
from cuda import cuda, nvrtc
import numpy as np

# Define the CUDA kernel for matrix multiplication.
# One thread per output element; all three buffers are single-precision and
# indexed as flat row-major arrays.
kernel_code = """
extern "C" __global__
void matrix_multiply(const float *matrix1, const float *matrix2, float *result, int width1, int height1, int width2) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < height1 && col < width2) {
        float temp = 0;
        for (int k = 0; k < width1; ++k) {
            temp += matrix1[row * width1 + k] * matrix2[k * width2 + col];
        }
        result[row * width2 + col] = temp;
    }
}
"""


def _check(call_result):
    """Unwrap a cuda-python return tuple ``(status, *values)``.

    Every cuda-python binding returns its status code as the first tuple
    element. Raise ``RuntimeError`` when the status is non-zero; otherwise
    return ``None``, the single value, or the tuple of values.
    """
    status = call_result[0]
    if status.value:
        raise RuntimeError(f"cuda-python call failed with {status!r}")
    values = call_result[1:]
    if not values:
        return None
    return values[0] if len(values) == 1 else values


# --- Compile the kernel to PTX with NVRTC ------------------------------------
# NVRTC expects bytes for the source and the program name.
prog = _check(nvrtc.nvrtcCreateProgram(kernel_code.encode(), b"matrix_multiply.cu", 0, [], []))
compile_status = nvrtc.nvrtcCompileProgram(prog, 0, [])[0]
if compile_status.value:
    # Surface the compiler log instead of failing later with an empty PTX.
    log_size = _check(nvrtc.nvrtcGetProgramLogSize(prog))
    compile_log = b" " * log_size
    _check(nvrtc.nvrtcGetProgramLog(prog, compile_log))
    raise RuntimeError("NVRTC compilation failed:\n" + compile_log.decode(errors="replace"))

# The PTX is written into a caller-provided buffer of the reported size.
ptx_size = _check(nvrtc.nvrtcGetPTXSize(prog))
ptx = b" " * ptx_size
_check(nvrtc.nvrtcGetPTX(prog, ptx))
_check(nvrtc.nvrtcDestroyProgram(prog))

# --- Initialise the driver API and make a context current --------------------
_check(cuda.cuInit(0))
device = _check(cuda.cuDeviceGet(0))
context = _check(cuda.cuDevicePrimaryCtxRetain(device))
# Push (rather than set) so the previously current context can be restored.
_check(cuda.cuCtxPushCurrent(context))

# Load the module and get the kernel function
ptx_array = np.char.array(ptx)
module = _check(cuda.cuModuleLoadData(ptx_array.ctypes.data))
matrix_multiply = _check(cuda.cuModuleGetFunction(module, b"matrix_multiply"))

# --- Prepare host data -------------------------------------------------------
# The kernel works on `float` (32-bit) buffers, but the host matrices are
# float64; convert explicitly so the device does not reinterpret raw float64
# bytes, and read the result back into a float32 array.
matrix1_f32 = np.ascontiguousarray(matrix1, dtype=np.float32)
matrix2_f32 = np.ascontiguousarray(matrix2, dtype=np.float32)
height1, width1 = matrix1_f32.shape
width2 = matrix2_f32.shape[1]
result_cuda_python = np.empty((height1, width2), dtype=np.float32)

# Allocate memory on the device
matrix1_device = _check(cuda.cuMemAlloc(matrix1_f32.nbytes))
matrix2_device = _check(cuda.cuMemAlloc(matrix2_f32.nbytes))
result_device = _check(cuda.cuMemAlloc(result_cuda_python.nbytes))

_check(cuda.cuMemcpyHtoD(matrix1_device, matrix1_f32.ctypes.data, matrix1_f32.nbytes))
_check(cuda.cuMemcpyHtoD(matrix2_device, matrix2_f32.ctypes.data, matrix2_f32.nbytes))

# Define thread and block dimensions.
# x indexes output columns and y indexes output rows, matching the kernel;
# the grid is rounded up so partially filled tiles are still covered.
threads_per_block = (20, 20, 1)
blocks_per_grid_x = (width2 + threads_per_block[0] - 1) // threads_per_block[0]
blocks_per_grid_y = (height1 + threads_per_block[1] - 1) // threads_per_block[1]
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y, 1)

# cuLaunchKernel takes an array of pointers to the argument values. Each
# value lives in its own NumPy array (kept alive in `kernel_values`), and
# `kernel_args` holds their host addresses in kernel-signature order.
kernel_values = (
    np.array([int(matrix1_device)], dtype=np.uint64),
    np.array([int(matrix2_device)], dtype=np.uint64),
    np.array([int(result_device)], dtype=np.uint64),
    np.array([width1], dtype=np.int32),
    np.array([height1], dtype=np.int32),
    np.array([width2], dtype=np.int32),
)
kernel_args = np.array([value.ctypes.data for value in kernel_values], dtype=np.uint64)

stream = _check(cuda.cuStreamCreate(0))

# Perform matrix multiplication using CUDA.
# The launch is asynchronous, so wait for the stream before stopping the clock.
start_time = time.perf_counter()
_check(cuda.cuLaunchKernel(matrix_multiply,
                           blocks_per_grid[0], blocks_per_grid[1], blocks_per_grid[2],
                           threads_per_block[0], threads_per_block[1], threads_per_block[2],
                           0, stream,
                           kernel_args.ctypes.data, 0))
_check(cuda.cuStreamSynchronize(stream))
end_time = time.perf_counter()

# Copy the result back to the host
_check(cuda.cuMemcpyDtoH(result_cuda_python.ctypes.data, result_device, result_cuda_python.nbytes))

# Release device resources and restore the previously current context.
_check(cuda.cuStreamDestroy(stream))
_check(cuda.cuMemFree(matrix1_device))
_check(cuda.cuMemFree(matrix2_device))
_check(cuda.cuMemFree(result_device))
_check(cuda.cuModuleUnload(module))
_check(cuda.cuCtxPopCurrent())
_check(cuda.cuDevicePrimaryCtxRelease(device))

print("Time taken for matrix multiplication using cuda-python:", end_time - start_time, "seconds")
# Spot-check value (float32) for comparison against the other implementations.
print(result_cuda_python[0, 0])