In [None]:
# Install the nvcc4jupyter plugin (the `!` prefix runs pip as a shell command
# from inside the notebook).
!pip install nvcc4jupyter

# Load the extension into the notebook; later cells start with the %%cuda
# cell magic, which presumably comes from this extension.
%load_ext nvcc4jupyter


In [None]:
%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when `status` signals an error, false on cudaSuccess.
static bool cudaCallFailed(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return false;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return true;
}

// Lists every visible CUDA device with its name, memory clock, memory bus
// width and the theoretical peak memory bandwidth derived from the two.
//
// Returns 0 on success and 1 as soon as any CUDA runtime call fails, so a
// missing driver or device is reported instead of printing garbage.
int main() {
    // Initialised so the loop below is skipped if the count query fails.
    int nDevices = 0;
    if (cudaCallFailed(cudaGetDeviceCount(&nDevices), "cudaGetDeviceCount")) {
        return 1;
    }

    for (int i = 0; i < nDevices; i++) {
        cudaDeviceProp prop;
        if (cudaCallFailed(cudaGetDeviceProperties(&prop, i), "cudaGetDeviceProperties")) {
            return 1;
        }

        // Clock and bus width are read through device attributes rather than
        // the cudaDeviceProp fields, which recent CUDA toolkits mark as
        // deprecated. Units are the same: kHz and bits.
        int memoryClockRate = 0;
        int memoryBusWidth = 0;
        if (cudaCallFailed(cudaDeviceGetAttribute(&memoryClockRate, cudaDevAttrMemoryClockRate, i),
                           "cudaDeviceGetAttribute(cudaDevAttrMemoryClockRate)") ||
            cudaCallFailed(cudaDeviceGetAttribute(&memoryBusWidth, cudaDevAttrGlobalMemoryBusWidth, i),
                           "cudaDeviceGetAttribute(cudaDevAttrGlobalMemoryBusWidth)")) {
            return 1;
        }

        printf("Device Number: %d\n", i);
        printf("  Device name: %s\n", prop.name);
        printf("  Memory Clock Rate (KHz): %d\n", memoryClockRate);
        printf("  Memory Bus Width (bits): %d\n", memoryBusWidth);
        // 2.0 accounts for double-data-rate memory; bus width is converted
        // from bits to bytes; 1.0e6 turns kHz * bytes into GB/s.
        printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
               2.0*memoryClockRate*(memoryBusWidth/8)/1.0e6);
    }
    return 0;
}

In [None]:
%%cuda
#include <iostream>
#include <cuda_runtime.h>
#include <cmath>

// Validates the status of a CUDA runtime call; wrap every CUDA call with
// CUDA_CHECK so failures are never silently ignored.
//
//   result - status code returned by the CUDA call
//   func   - name printed as the location (CUDA_CHECK passes __func__, i.e.
//            the function that contains the call site)
//   line   - source line printed as the location
//
// Returns `result` unchanged when it is cudaSuccess. Otherwise writes the
// numeric code, location and error string to stderr and exits with status 1.
inline cudaError_t checkCudaErr(cudaError_t result, char const *const func, const int line) {
    if (result == cudaSuccess) {
        return result;
    }

    const int errorCode = static_cast<int>(result);
    const char *errorText = cudaGetErrorString(result);
    std::cerr << "CUDA error = " << errorCode << " at "
              << func << ":" << line
              << " '" << errorText << "'" << std::endl;
    exit(1);
}
#define CUDA_CHECK(val) checkCudaErr((val), __func__, __LINE__)

// Matrix multiply reading operands directly from the A and B pointers:
// C = A * B with A indexed as n x k, B as k x m and C as n x m, all row-major.
//
// Launch layout: 2D grid of 2D blocks; x indexes columns of C, y indexes rows.
// Each thread computes one element of C; threads that fall outside the n x m
// output do nothing, so the grid may over-cover the matrix.
__global__ void matmul_rec_glob(float* A, float* B, float* C, int n, int k, int m) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard for grids that do not divide the output evenly.
    if (row >= n || col >= m) {
        return;
    }

    const float* aRow = A + row * k;  // start of row `row` of A
    const float* bCol = B + col;      // top of column `col` of B (stride m)

    float acc = 0.0f;
    for (int p = 0; p < k; ++p) {
        acc += aRow[p] * bCol[p * m];
    }
    C[row * m + col] = acc;
}
// Tiled matrix multiply using shared memory: C = A * B with A indexed as
// n x k, B as k x m and C as n x m, all row-major.
//
// Launch layout: 2D grid of 2D blocks; x indexes columns of C, y indexes rows.
// PRECONDITION: blockDim must be exactly (TILE, TILE) = (32, 32). The shared
// tiles are indexed with threadIdx, so a larger block writes out of bounds and
// a smaller one leaves tile entries unwritten before they are read.
// Shared memory: two static TILE x TILE float tiles (8 KiB per block).
//
// Each block walks the k dimension in TILE-wide steps. Every thread loads one
// element of A and one of B per step (zero when out of range, so partial
// tiles contribute nothing) and accumulates its own element of C.
__global__ void matmul_rec_shar(float* A, float* B, float* C, int n, int k, int m) {
    constexpr int TILE = 32;  // tile edge; must equal blockDim.x and blockDim.y

    const int tx = threadIdx.x, ty = threadIdx.y;
    const int row = blockIdx.y * blockDim.y + ty;
    const int col = blockIdx.x * blockDim.x + tx;
    float sum = 0.0f;

    __shared__ float sA[TILE][TILE];
    __shared__ float sB[TILE][TILE];

    const int numTiles = (k + TILE - 1) / TILE;  // ceil(k / TILE)
    for (int t = 0; t < numTiles; ++t) {
        const int aCol = t * TILE + tx;  // column of A loaded by this thread
        const int bRow = t * TILE + ty;  // row of B loaded by this thread

        // Float literals keep the zero padding in single precision.
        sA[ty][tx] = (row < n && aCol < k) ? A[row * k + aCol] : 0.0f;
        sB[ty][tx] = (col < m && bRow < k) ? B[bRow * m + col] : 0.0f;

        // All loads of this tile must land before any thread consumes it.
        __syncthreads();

        for (int i = 0; i < TILE; ++i) {
            sum += sA[ty][i] * sB[i][tx];
        }

        // Nobody may overwrite the tiles while a neighbour is still reading.
        __syncthreads();
    }

    // Out-of-range threads still took part in the loads and barriers above;
    // only the final store is guarded.
    if (row < n && col < m)
        C[row * m + col] = sum;
}


// Fills a rows x cols float buffer with pseudo-random values in [0, 1].
// Values come from rand() divided by RAND_MAX; the generator is not seeded
// here, so the sequence depends on whatever seed is currently active.
void initializeMatrix(float* matrix, int rows, int cols) {
    const int elementCount = rows * cols;
    const float maxRand = static_cast<float>(RAND_MAX);

    int idx = 0;
    while (idx < elementCount) {
        const float sample = static_cast<float>(rand());
        matrix[idx] = sample / maxRand;
        ++idx;
    }
}

// Host reference for the product C (n x m) = A (n x k) * B (k x m), row-major.
// The loop order (row, inner, col) keeps the accesses to B and C contiguous.
static void matmulHostReference(const float* A, const float* B, float* C, int n, int k, int m) {
    for (int row = 0; row < n; ++row) {
        float* cRow = C + static_cast<size_t>(row) * m;
        for (int col = 0; col < m; ++col) {
            cRow[col] = 0.0f;
        }
        for (int p = 0; p < k; ++p) {
            const float a = A[static_cast<size_t>(row) * k + p];
            const float* bRow = B + static_cast<size_t>(p) * m;
            for (int col = 0; col < m; ++col) {
                cRow[col] += a * bRow[col];
            }
        }
    }
}

// Compares a kernel result against the reference with a mixed absolute and
// relative tolerance; bit-exact equality between host and device floats is
// not expected. Reports the first mismatch and returns false, or returns true
// when every element is within tolerance.
static bool verifyResult(const char* label, const float* actual, const float* expected, size_t count) {
    const float absTol = 1e-5f;
    const float relTol = 1e-3f;
    for (size_t i = 0; i < count; ++i) {
        const float diff = std::fabs(actual[i] - expected[i]);
        const float limit = absTol + relTol * std::fabs(expected[i]);
        // Written as !(diff <= limit) so that a NaN result is a failure too.
        if (!(diff <= limit)) {
            std::cout << "Result verification failed at element " << i << ": " << actual[i]
                      << " (expected " << expected[i] << ", kernel " << label << ")" << std::endl;
            return false;
        }
    }
    std::cout << "Result verification passed for " << label << std::endl;
    return true;
}

// Runs both matrix-multiply kernels on random inputs and checks each result
// against a host reference. Returns 0 when both kernels match the reference
// and 1 on allocation failure or mismatch; CUDA errors terminate the process
// through CUDA_CHECK.
int main() {
    const int n = 640, k = 480, m = 800; // Example dimensions
    // Widen before multiplying so large dimensions cannot overflow int.
    const size_t sizeA = static_cast<size_t>(n) * k * sizeof(float);
    const size_t sizeB = static_cast<size_t>(k) * m * sizeof(float);
    const size_t sizeC = static_cast<size_t>(n) * m * sizeof(float);
    const size_t countC = static_cast<size_t>(n) * m;

    float *d_A, *d_B, *d_C;

    // Allocate host memory (h_ref holds the host-computed reference product)
    float* h_A = (float*)malloc(sizeA);
    float* h_B = (float*)malloc(sizeB);
    float* h_C = (float*)malloc(sizeC);
    float* h_ref = (float*)malloc(sizeC);
    if (h_A == nullptr || h_B == nullptr || h_C == nullptr || h_ref == nullptr) {
        std::cerr << "Host memory allocation failed" << std::endl;
        free(h_A);
        free(h_B);
        free(h_C);
        free(h_ref);
        return 1;
    }

    // Initialize matrices and compute the reference product on the host
    initializeMatrix(h_A, n, k);
    initializeMatrix(h_B, k, m);
    matmulHostReference(h_A, h_B, h_ref, n, k, m);

    // Allocate device memory
    CUDA_CHECK(cudaMalloc((void**)&d_A, sizeA));
    CUDA_CHECK(cudaMalloc((void**)&d_B, sizeB));
    CUDA_CHECK(cudaMalloc((void**)&d_C, sizeC));

    // Copy data from host to device
    CUDA_CHECK(cudaMemcpy(d_A, h_A, sizeA, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, sizeB, cudaMemcpyHostToDevice));

    // Setup execution parameters. The shared-memory kernel uses 32x32 tiles
    // indexed by threadIdx, so the block shape must stay 32x32.
    dim3 threadsPerBlock(32, 32);
    dim3 blocksPerGrid((m + threadsPerBlock.x - 1) / threadsPerBlock.x, (n + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Launch the global-memory version and verify its output
    matmul_rec_glob<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, n, k, m);
    CUDA_CHECK(cudaGetLastError());       // Check for any errors launching the kernel
    CUDA_CHECK(cudaDeviceSynchronize());  // Surface errors raised during execution
    CUDA_CHECK(cudaMemcpy(h_C, d_C, sizeC, cudaMemcpyDeviceToHost));
    const bool globOk = verifyResult("matmul_rec_glob", h_C, h_ref, countC);

    // Clear the output so the second check cannot pass on the first kernel's data
    CUDA_CHECK(cudaMemset(d_C, 0, sizeC));

    // Launch the shared memory version and verify its output
    matmul_rec_shar<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, n, k, m);
    CUDA_CHECK(cudaGetLastError());       // Check for any errors launching the kernel
    CUDA_CHECK(cudaDeviceSynchronize());  // Surface errors raised during execution
    CUDA_CHECK(cudaMemcpy(h_C, d_C, sizeC, cudaMemcpyDeviceToHost));
    const bool sharOk = verifyResult("matmul_rec_shar", h_C, h_ref, countC);

    // Free device memory
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_ref);

    return (globOk && sharOk) ? 0 : 1;
}
