<a href="https://colab.research.google.com/github/hiteshscoep/gpu-blocksize-analysis/blob/main/GPU_Research.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Sat Feb 28 08:38:18 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.82.07              Driver Version: 580.82.07      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   70C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

In [2]:
!pip install cupy-cuda12x



In [3]:
import cupy as cp

x = cp.arange(5)
print(x)
print(type(x))

[0 1 2 3 4]
<class 'cupy.ndarray'>


In [19]:
%%writefile poc1_transpose.cu
#include <stdio.h>
#include <cuda_runtime.h>

#define N 4096
#define TILE 32

#define CUDA(call) do { \
    cudaError_t e = call; \
    if (e != cudaSuccess) { \
        printf("CUDA error %s:%d: %s\n", \
               __FILE__, __LINE__, cudaGetErrorString(e)); \
        return 1; \
    } \
} while(0)

__global__ void transpose(float *out, const float *in, int n) {
    __shared__ float tile[TILE][TILE + 1];

    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < n && y < n)
        tile[threadIdx.y][threadIdx.x] = in[y * n + x];

    __syncthreads();

    x = blockIdx.y * blockDim.x + threadIdx.x;
    y = blockIdx.x * blockDim.y + threadIdx.y;

    if (x < n && y < n)
        out[y * n + x] = tile[threadIdx.x][threadIdx.y];
}

float run(dim3 block, float *dO, float *dI) {
    dim3 grid((N + block.x - 1) / block.x,
              (N + block.y - 1) / block.y);

    cudaEvent_t s, e;
    cudaEventCreate(&s); cudaEventCreate(&e);

    for (int i = 0; i < 3; i++)
        transpose<<<grid, block>>>(dO, dI, N);

    cudaEventRecord(s);
    for (int i = 0; i < 10; i++)
        transpose<<<grid, block>>>(dO, dI, N);
    cudaEventRecord(e);
    cudaEventSynchronize(e);

    float ms;
    cudaEventElapsedTime(&ms, s, e);
    return ms / 10.0f;
}

int main() {
    size_t bytes = (size_t)N * N * sizeof(float);
    float *h = (float*)malloc(bytes);
    for (int i = 0; i < N*N; i++) h[i] = (float)i;

    float *dI, *dO;
    CUDA(cudaMalloc(&dI, bytes));
    CUDA(cudaMalloc(&dO, bytes));
    CUDA(cudaMemcpy(dI, h, bytes, cudaMemcpyHostToDevice));

    int cfg[][2] = {
        {16,16}, {32,8}, {8,32},
        {32,16}, {16,32}, {32,32}
    };

    printf("Block     Threads   Time(ms)   BW(GB/s)\n");
    printf("------------------------------------------------\n");

    for (int i = 0; i < 6; i++) {
        dim3 block(cfg[i][0], cfg[i][1]);
        if (block.x * block.y > 1024) continue;

        float ms = run(block, dO, dI);
        double bw = 2.0 * bytes / (ms * 1e6);

        printf("%2dx%-2d %8d %10.3f %10.1f\n",
               block.x, block.y,
               block.x * block.y, ms, bw);
    }

    cudaFree(dI); cudaFree(dO); free(h);
    return 0;
}

Overwriting poc1_transpose.cu


In [20]:
!nvcc -O3 -arch=sm_75 poc1_transpose.cu -o poc1_fixed

In [21]:
!./poc1_fixed

Block     Threads   Time(ms)   BW(GB/s)
------------------------------------------------
16x16      256      0.747      179.6
32x8       256      0.530      253.0
 8x32      256      0.692      194.0
32x16      512      0.626      214.3
16x32      512      0.704      190.8
32x32     1024      0.824      162.9


In [24]:
%%writefile cc.cu

#include<stdio.h>
#include<cuda_runtime.h>

int main(){
	int device;
	cudaGetDevice(&device);

	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop,device);

	printf("GPU: %s\n", prop.name);
	printf("Compute Capability: %d.%d\n", prop.major, prop.minor);
	printf("SMs: %d\n", prop.multiProcessorCount);
	printf("Max threads/block: %d\n", prop.maxThreadsPerBlock);
	printf("Shared mem/block: %zu KB\n", prop.sharedMemPerBlock / 1024);
	printf("Shared mem/SM: %zu KB\n", prop.sharedMemPerMultiprocessor / 1024);
	printf("Total VRAM: %.2f GB\n", prop.totalGlobalMem / 1e9);

	return 0;
}


Writing cc.cu


In [27]:
!nvcc -O3 -arch=sm_75 cc.cu -o cc

In [29]:
!./cc

GPU: Tesla T4
Compute Capability: 7.5
SMs: 40
Max threads/block: 1024
Shared mem/block: 48 KB
Shared mem/SM: 64 KB
Total VRAM: 15.64 GB
