In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Wed_Nov_22_10:17:15_PST_2023
Cuda compilation tools, release 12.3, V12.3.107
Build cuda_12.3.r12.3/compiler.33567101_0


## Install the nvcc plugin for Python notebooks

In [2]:
!pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


## Load the plugin extension

In [3]:
%load_ext nvcc4jupyter

Detected platform "Kaggle". Running its setup...
Updating the package lists...
Installing nvidia-cuda-toolkit, this may take a few minutes...
Source files will be saved in "/tmp/tmpfghygz6i".


In [None]:
%%cuda
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>

#define MATRIX_HEIGHT 8192
#define MATRIX_WIDTH 512    // for much larger matrices, consider disabling the host-side verify() call: the serial check is slow
#define BLOCK_START_DIM 2    // smallest block/tile edge tried by the sweeps in main(); main() skips configurations above 1024 threads per block (edge 32)
#define MASK_SIZE 5          // edge length of the square convolution mask (assumed odd, see mask_c below)
#define FILTER_RADIUS (MASK_SIZE/2)  // number of neighbours on each side of the centre element (integer division)

// Constant-memory copy of the mask, read by the tiled kernel.
// Its edge is 2*FILTER_RADIUS+1, which equals MASK_SIZE only when MASK_SIZE is odd.
__constant__ int mask_c[2*FILTER_RADIUS+1][2*FILTER_RADIUS+1];

// Tiled 2D convolution whose coefficients are read from constant memory (mask_c).
//
// Each block stages an in_tile_dim x in_tile_dim patch of the input (output
// tile plus a FILTER_RADIUS-wide halo) in shared memory; only the inner
// out_tile_dim x out_tile_dim threads then produce an output element.
// Elements outside the matrix are treated as zero.
//
// Expected launch configuration:
//   block  : (in_tile_dim, in_tile_dim), with in_tile_dim == out_tile_dim + 2*FILTER_RADIUS
//   grid   : (ceil(width / out_tile_dim), ceil(height / out_tile_dim))
//   shared : in_tile_dim * in_tile_dim * sizeof(int) bytes of dynamic shared memory
//
// Parameters:
//   input        - row-major height x width matrix
//   mask         - unused; kept so existing call sites stay valid (mask_c is read instead)
//   output       - row-major height x width result matrix
//   height/width - matrix dimensions
//   in_tile_dim  - edge of the shared-memory input tile (== blockDim.x == blockDim.y)
//   out_tile_dim - edge of the output tile computed by one block
__global__ void convolution_tiled_2D_const_mem_kernel(int *input, int *mask ,int *output, int height, int width, int in_tile_dim, int out_tile_dim) {
    (void)mask;  // coefficients come from mask_c

    // Global coordinates of the input element this thread stages. The built-in
    // indices are unsigned, so cast before subtracting the halo offset.
    const int col = (int)blockIdx.x * out_tile_dim + (int)threadIdx.x - FILTER_RADIUS;
    const int row = (int)blockIdx.y * out_tile_dim + (int)threadIdx.y - FILTER_RADIUS;
    const bool insideMatrix = row >= 0 && row < height && col >= 0 && col < width;

    // Load the input tile; halo cells outside the matrix are zero-padded.
    extern __shared__ int sharedInput[];
    sharedInput[threadIdx.y * in_tile_dim + threadIdx.x] = insideMatrix ? input[row * width + col] : 0;

    // Every thread of the block must reach the barrier before the tile is read.
    __syncthreads();

    // Position of this thread relative to the output tile.
    const int tileCol = (int)threadIdx.x - FILTER_RADIUS;
    const int tileRow = (int)threadIdx.y - FILTER_RADIUS;
    const bool insideOutputTile = tileCol >= 0 && tileCol < out_tile_dim &&
                                  tileRow >= 0 && tileRow < out_tile_dim;

    // Halo threads only helped with loading; they produce no output.
    if (insideMatrix && insideOutputTile) {
        // Integer accumulator: all operands and the output are int, so a float
        // accumulator would round sums above 2^24 and disagree with the
        // integer reference result.
        int Pvalue = 0;
        for (int fRow = 0; fRow < 2*FILTER_RADIUS+1; fRow++) {
            for (int fCol = 0; fCol < 2*FILTER_RADIUS+1; fCol++) {
                Pvalue += mask_c[fRow][fCol] * sharedInput[(tileRow + fRow) * in_tile_dim + (tileCol + fCol)];
            }
        }
        output[row * width + col] = Pvalue;
    }
}

// Direct (untiled) 2D convolution: one thread per output element, reading the
// input and the MASK_SIZE x MASK_SIZE mask straight from the pointers passed in.
// Neighbours that fall outside the height x width matrix contribute nothing.
//
// Expected launch configuration: a 2D grid of 2D blocks covering at least
// width x height threads (x maps to columns, y maps to rows).
__global__ void gpu_matrix_convolute(int *input, int *mask, int *output, int height, int width)
{
    const int out_col = blockIdx.x * blockDim.x + threadIdx.x;
    const int out_row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads past the matrix edge have no element to compute.
    if (out_col >= width || out_row >= height) {
        return;
    }

    const int half = MASK_SIZE / 2;
    int acc = 0;

    // Walk the mask; skip taps whose source pixel lies outside the matrix.
    for (int m_row = 0; m_row < MASK_SIZE; ++m_row) {
        const int src_row = out_row - half + m_row;
        if (src_row < 0 || src_row >= height) {
            continue;
        }
        for (int m_col = 0; m_col < MASK_SIZE; ++m_col) {
            const int src_col = out_col - half + m_col;
            if (src_col < 0 || src_col >= width) {
                continue;
            }
            acc += input[src_row * width + src_col] * mask[m_row * MASK_SIZE + m_col];
        }
    }

    output[out_row * width + out_col] = acc;
}


// Serial host-side reference check.
// Recomputes the convolution of `input` (height x width, row-major) with
// `mask` (MASK_SIZE x MASK_SIZE, row-major) element by element and compares
// each value with `result`. Prints a failure message and returns at the first
// mismatch; prints a success message when every element matches.
void verify(int *input, int *mask, int *result, int height, int width){
    const int half = MASK_SIZE / 2;

    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++) {
            int expected = 0;

            // Accumulate every mask tap whose source pixel is inside the matrix.
            for (int m_row = 0; m_row < MASK_SIZE; m_row++) {
                const int src_row = row - half + m_row;
                if (src_row < 0 || src_row >= height) {
                    continue;
                }
                for (int m_col = 0; m_col < MASK_SIZE; m_col++) {
                    const int src_col = col - half + m_col;
                    if (src_col < 0 || src_col >= width) {
                        continue;
                    }
                    expected += input[src_row * width + src_col] * mask[m_row * MASK_SIZE + m_col];
                }
            }

            // Stop at the first element that disagrees with the reference.
            if (result[row * width + col] != expected) {
                printf("fail convolution; ");
                return;
            }
        }
    }

    printf("successs convolution; ");
}


// Abort with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Print a few properties of every visible CUDA device.
static void printDeviceInfo(void)
{
    int nDevices = 0;
    checkCuda(cudaGetDeviceCount(&nDevices), "cudaGetDeviceCount");
    for (int i = 0; i < nDevices; i++) {
        cudaDeviceProp prop;
        checkCuda(cudaGetDeviceProperties(&prop, i), "cudaGetDeviceProperties");
        printf("Device Number: %d\n", i);
        printf("  Device name: %s\n", prop.name);
        printf("  max Blocks Per MultiProcessor: %d\n", prop.maxBlocksPerMultiProcessor);
        printf("  max Threads Per MultiProcessor: %d\n", prop.maxThreadsPerMultiProcessor);
        printf("  max Threads Per Block: %d\n", prop.maxThreadsPerBlock);
        printf("  num SM: %d\n", prop.multiProcessorCount);
        // These two fields are size_t, so %d would be a format mismatch.
        printf("  num bytes sharedMem Per Block: %zu\n", prop.sharedMemPerBlock);
        printf("  num bytes sharedMem Per Multiprocessor: %zu\n", prop.sharedMemPerMultiprocessor);
        printf("  Memory Clock Rate (KHz): %d\n",
           prop.memoryClockRate);
        printf("  Memory Bus Width (bits): %d\n",
           prop.memoryBusWidth);
        printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
           2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
    }
}

// Close a timed region: record `stop` right after the timed launch, surface
// launch/execution errors, and return the elapsed time in milliseconds.
static float finishTiming(cudaEvent_t start, cudaEvent_t stop)
{
    // Record before any host-side blocking call so the host wait is not timed.
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

    float elapsed_ms = 0.0f;
    checkCuda(cudaEventElapsedTime(&elapsed_ms, start, stop), "cudaEventElapsedTime");
    return elapsed_ms;
}

// Copy the kernel output to the host buffer and compare it with the serial reference.
static void fetchAndVerify(int *input, int *mask, int *output, int *result, size_t matrixBytes)
{
    checkCuda(cudaMemcpy(result, output, matrixBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(result)");
    verify(input, mask, result, MATRIX_HEIGHT, MATRIX_WIDTH);
}

// Benchmark driver: prints device information, then times the direct and the
// tiled convolution kernels for edge sizes BLOCK_START_DIM .. BLOCK_START_DIM*8
// (doubling each step), verifying every result against the serial reference.
// Returns 0 on success; exits with EXIT_FAILURE on any CUDA error.
int main(int argc, char const *argv[])
{
    (void)argc;
    (void)argv;

    // retrieve some info about the CUDA device
    printDeviceInfo();

    const size_t matrixBytes = sizeof(int) * MATRIX_HEIGHT * MATRIX_WIDTH;
    const size_t maskBytes = sizeof(int) * MASK_SIZE * MASK_SIZE;

    // Buffers are allocated once and shared by every configuration instead of
    // being re-allocated on each loop iteration.
    int *input = NULL, *mask = NULL, *output = NULL;
    checkCuda(cudaMallocManaged((void **) &input, matrixBytes), "cudaMallocManaged(input)");
    checkCuda(cudaMallocManaged((void **) &mask, maskBytes), "cudaMallocManaged(mask)");
    checkCuda(cudaMallocManaged((void **) &output, matrixBytes), "cudaMallocManaged(output)");

    // initialize the input matrix
    for (int i = 0; i < MATRIX_HEIGHT; ++i) {
        for (int j = 0; j < MATRIX_WIDTH; ++j) {
            input[i * MATRIX_WIDTH + j] = 2;
        }
    }

    // initialize the mask
    for (int i = 0; i < MASK_SIZE; ++i) {
        for (int j = 0; j < MASK_SIZE; ++j) {
            mask[i * MASK_SIZE + j] = i + j;
        }
    }

    // copy the mask to constant memory for the tiled kernel
    checkCuda(cudaMemcpyToSymbol(mask_c, mask, maskBytes), "cudaMemcpyToSymbol(mask_c)");

    // host copy of the result, used by verify()
    int *result = new int[MATRIX_HEIGHT * MATRIX_WIDTH];

    // events used to measure kernel execution time
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // execute common 2D-convolution
    for(int block_size = BLOCK_START_DIM; block_size <= BLOCK_START_DIM * 8; (block_size *= 2))
    {
        // skip configurations that exceed the per-block thread limit
        if(block_size * block_size > 1024){
            printf("The block size is excceed\n\n");
            continue;
        }

        // define block and grid size
        dim3 block(block_size, block_size);
        dim3 grid((MATRIX_WIDTH + block_size - 1) / block_size, (MATRIX_HEIGHT + block_size - 1) / block_size);

        // Clear the shared output buffer so a stale result from a previous
        // configuration cannot make the verification pass by accident.
        checkCuda(cudaMemset(output, 0, matrixBytes), "cudaMemset(output)");

        // warm-up execution
        gpu_matrix_convolute<<<grid, block>>>(input, mask, output, MATRIX_HEIGHT, MATRIX_WIDTH);
        checkCuda(cudaGetLastError(), "gpu_matrix_convolute warm-up launch");

        // timed execution
        checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
        gpu_matrix_convolute<<<grid, block>>>(input, mask, output, MATRIX_HEIGHT, MATRIX_WIDTH);
        float naive_gpu_elapsed_time_ms = finishTiming(start, stop);

        // verify result
        fetchAndVerify(input, mask, output, result, matrixBytes);

        printf("Time elapsed on naive GPU matrix common-convolution of %dx%d. block(%d): %f ms.\n\n", MATRIX_HEIGHT, MATRIX_WIDTH, block_size, naive_gpu_elapsed_time_ms);
    }

    printf("----------------------------------------------------------------------------------------------------------------\n\n");

    // execute tiled 2D-convolution
    for(int out_tile_dim = BLOCK_START_DIM; out_tile_dim <= BLOCK_START_DIM * 8; (out_tile_dim *= 2))
    {
        // the block also covers the halo around the output tile
        int in_tile_dim = out_tile_dim + 2 * FILTER_RADIUS;

        // skip configurations that exceed the per-block thread limit
        if(out_tile_dim * out_tile_dim > 1024){
            printf("The out_tile size is excceed\n\n");
            continue;
        }else if(in_tile_dim * in_tile_dim > 1024){
            printf("The in_tile size is excceed\n\n");
            continue;
        }

        // define block and grid size
        dim3 block(in_tile_dim, in_tile_dim);
        dim3 grid((MATRIX_WIDTH + out_tile_dim - 1) / out_tile_dim, (MATRIX_HEIGHT + out_tile_dim - 1) / out_tile_dim);
        size_t sharedMemorySize = in_tile_dim * in_tile_dim * sizeof(int);

        // Clear the shared output buffer (see the note in the first sweep).
        checkCuda(cudaMemset(output, 0, matrixBytes), "cudaMemset(output)");

        // warm-up execution
        convolution_tiled_2D_const_mem_kernel<<<grid, block, sharedMemorySize>>>(input, mask, output, MATRIX_HEIGHT, MATRIX_WIDTH, in_tile_dim, out_tile_dim);
        checkCuda(cudaGetLastError(), "convolution_tiled_2D_const_mem_kernel warm-up launch");

        // timed execution
        checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
        convolution_tiled_2D_const_mem_kernel<<<grid, block, sharedMemorySize>>>(input, mask, output, MATRIX_HEIGHT, MATRIX_WIDTH, in_tile_dim, out_tile_dim);
        float naive_gpu_elapsed_time_ms = finishTiming(start, stop);

        // verify result
        fetchAndVerify(input, mask, output, result, matrixBytes);

        printf("Time elapsed on naive GPU matrix tiled-convolution of %dx%d. tile(%d): %f ms.\n\n", MATRIX_HEIGHT, MATRIX_WIDTH, out_tile_dim, naive_gpu_elapsed_time_ms);
    }

    // release resources; `result` came from new[], so it needs delete[] (not free)
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    delete[] result;
    checkCuda(cudaFree(input), "cudaFree(input)");
    checkCuda(cudaFree(mask), "cudaFree(mask)");
    checkCuda(cudaFree(output), "cudaFree(output)");

    return 0;
}


Device Number: 0
  Device name: Tesla T4
  max Blocks Per MultiProcessor: 16
  max Threads Per MultiProcessor: 1024
  max Threads Per Block: 1024
  num SM: 40
  num bytes sharedMem Per Block: 49152
  num bytes sharedMem Per Multiprocessor: 65536
  Memory Clock Rate (KHz): 5001000
  Memory Bus Width (bits): 256
  Peak Memory Bandwidth (GB/s): 320.064000

Device Number: 1
  Device name: Tesla T4
  max Blocks Per MultiProcessor: 16
  max Threads Per MultiProcessor: 1024
  max Threads Per Block: 1024
  num SM: 40
  num bytes sharedMem Per Block: 49152
  num bytes sharedMem Per Multiprocessor: 65536
  Memory Clock Rate (KHz): 5001000
  Memory Bus Width (bits): 256
  Peak Memory Bandwidth (GB/s): 320.064000

successs convolution; Time elapsed on naive GPU matrix common-convolution of 8192x512. block(2): 14.498272 ms.

successs convolution; Time elapsed on naive GPU matrix common-convolution of 8192x512. block(4): 11.850944 ms.

successs convolution; Time elapsed on naive GPU matrix common-co