<a href="https://colab.research.google.com/github/jayshah1819/CUDA_PREP/blob/main/register_usage.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%writefile test_kernel.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Debug kernel that fills a per-thread scratch array and reports how many
// slots were filled.
//
// Parameters:
//   dummy_count - requested number of scratch slots to fill. Values outside
//                 [0, 64] are clamped so the fixed-size array is never
//                 indexed out of bounds.
//
// Launch layout: indexes with the .x components only; only the thread whose
// global index is 0 prints.
//
// NOTE(review): dummy_count is a runtime argument, so it cannot change the
// number of registers the compiler assigned to this kernel at build time.
// The array is also indexed by a runtime loop variable, so the compiler may
// keep it in local memory rather than registers - confirm with
// `nvcc --ptxas-options=-v` or cudaFuncGetAttributes before drawing
// conclusions about register pressure.
__global__ void kernel_with_regs(int dummy_count) {
    constexpr int kMaxDummy = 64;
    int dummy[kMaxDummy];

    // Clamp the request to the array capacity; the unclamped loop wrote past
    // the end of `dummy` whenever dummy_count exceeded 64.
    int fill = dummy_count;
    if (fill < 0) fill = 0;
    if (fill > kMaxDummy) fill = kMaxDummy;

    for (int i = 0; i < fill; i++) {
        dummy[i] = threadIdx.x;
    }

    // Read the values back so the stores above are observable; a write-only
    // array triggers a "set but never used" warning and can be removed
    // entirely by the optimizer.
    int checksum = 0;
    for (int i = 0; i < fill; i++) {
        checksum += dummy[i];
    }

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx == 0) {
        printf("Dummy registers used: %d (checksum: %d)\n", fill, checksum);
    }
}

// Reports the occupancy of kernel_with_regs for a fixed block size of 256
// threads and no dynamic shared memory.
//
// Returns 0 on success and 1 if any CUDA runtime call fails (the error text
// is written to stderr).
int main() {
    const int block_size = 256;

    // Ask the runtime how many registers per thread the compiled kernel
    // really uses; this is the quantity that feeds the occupancy calculation.
    cudaFuncAttributes attr;
    cudaError_t err = cudaFuncGetAttributes(&attr, kernel_with_regs);
    if (err != cudaSuccess) {
        fprintf(stderr, "Error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // The occupancy query depends only on the compiled kernel, the block size
    // and the dynamic shared-memory size, none of which change below, so it
    // is issued once instead of once per loop iteration.
    int max_active_blocks = 0;
    err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_active_blocks,
        kernel_with_regs,
        block_size,
        0);
    if (err != cudaSuccess) {
        fprintf(stderr, "Error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    const int max_active_threads = max_active_blocks * block_size;

    printf("Compiled registers per thread: %d\n", attr.numRegs);

    // reg_count is only a label here: it is never handed to the kernel or to
    // the occupancy API, so every row reports the same occupancy figures.
    for (int reg_count = 8; reg_count <= 64; reg_count += 8) {
        printf("Registers (dummy count): %d, Max active blocks/SM: %d, Max active threads/SM: %d\n",
               reg_count, max_active_blocks, max_active_threads);
    }

    return 0;
}


Writing test_kernel.cu


In [None]:
!nvcc -o test_kernel test_kernel.cu


      int dummy[64];
          ^




In [None]:
!./test_kernel


Error: the provided PTX was compiled with an unsupported toolchain.


In [None]:
!nvcc -arch=sm_75 -o test_kernel test_kernel.cu


      int dummy[64];
          ^




In [None]:
!./test_kernel


Registers (dummy count): 8, Max active blocks/SM: 4, Max active threads/SM: 1024
Registers (dummy count): 16, Max active blocks/SM: 4, Max active threads/SM: 1024
Registers (dummy count): 24, Max active blocks/SM: 4, Max active threads/SM: 1024
Registers (dummy count): 32, Max active blocks/SM: 4, Max active threads/SM: 1024
Registers (dummy count): 40, Max active blocks/SM: 4, Max active threads/SM: 1024
Registers (dummy count): 48, Max active blocks/SM: 4, Max active threads/SM: 1024
Registers (dummy count): 56, Max active blocks/SM: 4, Max active threads/SM: 1024
Registers (dummy count): 64, Max active blocks/SM: 4, Max active threads/SM: 1024
