<a href="https://colab.research.google.com/github/lesliee94/cudaLearn/blob/master/coalesce.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi


Fri Jan  2 08:13:37 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   56C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [10]:
%%writefile card4_coalescing.cu
#include <cuda_runtime.h>
#include <iostream>
#include <cstdlib>
#include <cmath>

#define CUDA_CHECK(call) do {                                 \
  cudaError_t e = (call);                                     \
  if (e != cudaSuccess) {                                     \
    std::cerr << "CUDA error: " << cudaGetErrorString(e)      \
              << " at " << __FILE__ << ":" << __LINE__ << "\n"; \
    std::exit(1);                                             \
  }                                                           \
} while(0)

// 工业常用：grid-stride loop（无论 n 多大都能覆盖）
__global__ void vadd_coalesced(const float* a, const float* b, float* c, int n) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x;
       i < n;
       i += blockDim.x * gridDim.x) {
    c[i] = a[i] + b[i];
  }
}

// 刻意“跳着访问”：每个线程处理 idx = tid * stride
__global__ void vadd_strided(const float* a, const float* b, float* c, int n, int stride) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int idx = tid * stride;
  if (idx < n) c[idx] = a[idx] + b[idx];
}

float time_ms_coalesced(const float* a, const float* b, float* c, int n, int iters, cudaStream_t s) {
  int block = 256;
  int grid  = std::min((n + block - 1) / block, 65535);

  // warmup
  vadd_coalesced<<<grid, block, 0, s>>>(a, b, c, n);
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaStreamSynchronize(s));

  cudaEvent_t st, ed;
  CUDA_CHECK(cudaEventCreate(&st));
  CUDA_CHECK(cudaEventCreate(&ed));

  CUDA_CHECK(cudaEventRecord(st, s));
  for (int t = 0; t < iters; ++t) vadd_coalesced<<<grid, block, 0, s>>>(a, b, c, n);
  CUDA_CHECK(cudaEventRecord(ed, s));
  CUDA_CHECK(cudaEventSynchronize(ed));

  float ms = 0.f;
  CUDA_CHECK(cudaEventElapsedTime(&ms, st, ed));
  CUDA_CHECK(cudaEventDestroy(st));
  CUDA_CHECK(cudaEventDestroy(ed));
  return ms / iters;
}

float time_ms_strided(const float* a, const float* b, float* c, int n, int stride, int iters, cudaStream_t s) {
  // 实际会触及大约 n/stride 个元素（为了让线程数匹配）
  int n_eff = (n + stride - 1) / stride;
  int block = 256;
  int grid  = std::min((n_eff + block - 1) / block, 65535);

  // warmup
  vadd_strided<<<grid, block, 0, s>>>(a, b, c, n, stride);
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaStreamSynchronize(s));

  cudaEvent_t st, ed;
  CUDA_CHECK(cudaEventCreate(&st));
  CUDA_CHECK(cudaEventCreate(&ed));

  CUDA_CHECK(cudaEventRecord(st, s));
  for (int t = 0; t < iters; ++t) vadd_strided<<<grid, block, 0, s>>>(a, b, c, n, stride);
  CUDA_CHECK(cudaEventRecord(ed, s));
  CUDA_CHECK(cudaEventSynchronize(ed));

  float ms = 0.f;
  CUDA_CHECK(cudaEventElapsedTime(&ms, st, ed));
  CUDA_CHECK(cudaEventDestroy(st));
  CUDA_CHECK(cudaEventDestroy(ed));
  return ms / iters;
}

int main(int argc, char** argv) {
  int n     = (argc > 1) ? std::atoi(argv[1]) : (1 << 26);
  int iters = (argc > 2) ? std::atoi(argv[2]) : 200;

  size_t bytes = (size_t)n * sizeof(float);

  float *ha=nullptr, *hb=nullptr;
  CUDA_CHECK(cudaMallocHost(&ha, bytes));
  CUDA_CHECK(cudaMallocHost(&hb, bytes));
  for (int i = 0; i < n; ++i) { ha[i] = 1.0f; hb[i] = 2.0f; }

  float *da=nullptr, *db=nullptr, *dc=nullptr;
  CUDA_CHECK(cudaMalloc(&da, bytes));
  CUDA_CHECK(cudaMalloc(&db, bytes));
  CUDA_CHECK(cudaMalloc(&dc, bytes));

  cudaStream_t s;
  CUDA_CHECK(cudaStreamCreate(&s));
  CUDA_CHECK(cudaMemcpyAsync(da, ha, bytes, cudaMemcpyHostToDevice, s));
  CUDA_CHECK(cudaMemcpyAsync(db, hb, bytes, cudaMemcpyHostToDevice, s));
  CUDA_CHECK(cudaStreamSynchronize(s));

  auto GBs = [&](double ms_per, double elems_touched){
    double sec = ms_per / 1000.0;
    double moved = 12.0 * elems_touched; // read a + read b + write c
    return (moved / sec) / 1e9;
  };

  float ms_coal = time_ms_coalesced(da, db, dc, n, iters, s);
  std::cout << "coalesced: " << ms_coal << " ms/iter, ~" << GBs(ms_coal, n) << " GB/s\n";

  // 试几个 stride：1,2,4,8,16,32
  for (int stride : {2, 4, 8, 16, 32}) {
    float ms = time_ms_strided(da, db, dc, n, stride, iters, s);
    double n_eff = (n + stride - 1) / stride;
    std::cout << "strided x" << stride << ": " << ms << " ms/iter, ~" << GBs(ms, n_eff) << " GB/s (effective)\n";
  }

  CUDA_CHECK(cudaStreamDestroy(s));
  CUDA_CHECK(cudaFree(da)); CUDA_CHECK(cudaFree(db)); CUDA_CHECK(cudaFree(dc));
  CUDA_CHECK(cudaFreeHost(ha)); CUDA_CHECK(cudaFreeHost(hb));
  return 0;
}

Writing card4_coalescing.cu


In [11]:
!nvcc -O3 -std=c++17 card4_coalescing.cu -o card4 \
  -gencode arch=compute_60,code=sm_60 \
  -gencode arch=compute_70,code=sm_70 \
  -gencode arch=compute_75,code=sm_75 \
  -gencode arch=compute_80,code=sm_80 \
  -gencode arch=compute_86,code=sm_86 \
  -gencode arch=compute_89,code=sm_89

In [12]:
!./card4 $((1<<26)) 200

coalesced: 3.16566 ms/iter, ~254.388 GB/s
strided x2: 2.05763 ms/iter, ~195.688 GB/s (effective)
strided x4: 4.24736 ms/iter, ~47.4004 GB/s (effective)
strided x8: 4.32093 ms/iter, ~23.2967 GB/s (effective)
strided x16: 3.42307 ms/iter, ~14.7037 GB/s (effective)
strided x32: 2.15612 ms/iter, ~11.6718 GB/s (effective)
