<a href="https://colab.research.google.com/github/lesliee94/cudaLearn/blob/master/reduceCompare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi


Fri Jan  2 08:13:37 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   56C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [1]:
%%writefile card5_reduce.cu
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <chrono>
#include <cmath>
#include <cstdlib>

// Evaluates a CUDA runtime call exactly once. If the returned status is not
// cudaSuccess, prints the error string together with the call site's file and
// line to stderr and terminates the process with exit code 1.
// The do { ... } while (0) wrapper makes the macro behave as one statement,
// so it is safe inside an unbraced if/else.
#define CUDA_CHECK(call)                                                      \
  do {                                                                        \
    const cudaError_t cuda_check_status_ = (call);                            \
    if (cuda_check_status_ != cudaSuccess) {                                  \
      std::cerr << "CUDA error: " << cudaGetErrorString(cuda_check_status_)   \
                << " at " << __FILE__ << ":" << __LINE__ << "\n";             \
      std::exit(1);                                                           \
    }                                                                         \
  } while (0)

// Block-level sum reduction. Each block folds up to 2 * blockDim.x consecutive
// elements of `in` into one partial sum and stores it at out[blockIdx.x].
//
// Launch requirements:
//   - 1-D grid and 1-D block; blockDim.x must be a power of two and >= 32.
//   - Dynamic shared memory of blockDim.x * sizeof(float) bytes.
//   - gridDim.x * 2 * blockDim.x >= n, so every input element is covered.
//   - `out` must have room for gridDim.x floats.
//
// Parameters:
//   in  - input values (read-only).
//   out - one partial sum per block.
//   n   - number of valid elements in `in`; a non-positive n is treated as
//         empty input (each block writes 0).
__global__ void reduce_sum(const float* __restrict__ in,
                           float* __restrict__ out,
                           int n) {
  extern __shared__ float s[];  // dynamic shared memory, one float per thread
  const unsigned int tid = threadIdx.x;
  const unsigned int i = blockIdx.x * (blockDim.x * 2) + tid;
  // Do the bounds checks in unsigned arithmetic on purpose: a negative n
  // must not be converted into a huge unsigned bound.
  const unsigned int count = n > 0 ? static_cast<unsigned int>(n) : 0u;

  // Each thread pre-adds two elements that are blockDim.x apart, so the
  // global loads stay coalesced and the shared-memory tree is half as deep.
  float x = 0.f;
  if (i < count) x += in[i];
  if (i + blockDim.x < count) x += in[i + blockDim.x];

  s[tid] = x;
  __syncthreads();  // the whole block must finish writing shared memory

  // Classic tree reduction in shared memory (e.g. 512 / 256 / 128 / 64),
  // stopping once 64 partial sums remain.
  for (unsigned int stride = blockDim.x / 2; stride > 32; stride >>= 1) {
    if (tid < stride) s[tid] += s[tid + stride];
    __syncthreads();
  }

  // The first warp finishes in registers with shuffles. The *_sync intrinsic
  // synchronizes the participating lanes itself, so this does not depend on
  // implicit lock-step execution of the warp (which is not guaranteed under
  // independent thread scheduling).
  if (tid < 32) {
    float v = s[tid];
    // Only blocks with at least 64 threads have an upper half to fold in;
    // for a 32-thread block s[tid + 32] would be out of bounds.
    if (blockDim.x >= 64) v += s[tid + 32];
    for (int offset = 16; offset > 0; offset >>= 1) {
      v += __shfl_down_sync(0xffffffffu, v, offset);
    }
    // After the shuffles only lane 0 holds the complete block sum.
    if (tid == 0) out[blockIdx.x] = v;
  }
}

// Host reference: adds the first n floats of `a`, accumulating in double.
// Returns 0.0 when n is zero or negative (the loop body never runs).
double cpu_sum(const float* a, int n) {
  double total = 0.0;
  const float* cursor = a;
  for (int remaining = n; remaining > 0; --remaining) {
    total += *cursor;
    ++cursor;
  }
  return total;
}

// Sums n floats that live in device memory by launching reduce_sum repeatedly
// until one value is left, then copies that value back to the host.
//
// Parameters:
//   d_in   - device pointer to the input; it is only read.
//   n      - number of elements; n <= 0 returns 0 without touching the GPU.
//   block  - threads per block; must be a power of two in [64, 1024].
//   stream - stream used for every launch and for the final copy.
//
// Returns the sum as a float. The call synchronizes `stream` before returning.
// An invalid `block` or any CUDA failure prints a message to stderr and exits
// with status 1, matching CUDA_CHECK.
float gpu_reduce_sum(const float* d_in, int n, int block, cudaStream_t stream) {
  if (n <= 0) return 0.f;  // empty input; a zero-size grid would fail to launch

  // The kernel's tree reduction needs a power-of-two block size; anything
  // else would drop elements or index outside the shared buffer. A block of
  // zero would also divide by zero below.
  const bool block_ok =
      block >= 64 && block <= 1024 && (block & (block - 1)) == 0;
  if (!block_ok) {
    std::cerr << "gpu_reduce_sum: block must be a power of two in [64, 1024], got "
              << block << "\n";
    std::exit(1);
  }

  // Every block consumes 2 * block elements. Written as (x - 1) / d + 1 so the
  // ceiling division cannot overflow int when n is close to INT_MAX.
  const int per_block = block * 2;
  const int first_blocks = (n - 1) / per_block + 1;
  const int second_blocks = (first_blocks - 1) / per_block + 1;

  // Ping-pong scratch buffers. The first pass writes first_blocks partial
  // sums into d_buf1; every later pass writes at most second_blocks values,
  // so d_buf2 only needs that much room.
  float *d_buf1 = nullptr, *d_buf2 = nullptr;
  CUDA_CHECK(cudaMalloc(&d_buf1, (size_t)first_blocks * sizeof(float)));
  CUDA_CHECK(cudaMalloc(&d_buf2, (size_t)second_blocks * sizeof(float)));

  const float* cur_in = d_in;
  float* cur_out = d_buf1;
  int cur_n = n;

  while (true) {
    const int grid = (cur_n - 1) / per_block + 1;
    const size_t shmem = (size_t)block * sizeof(float);
    reduce_sum<<<grid, block, shmem, stream>>>(cur_in, cur_out, cur_n);
    CUDA_CHECK(cudaGetLastError());  // surfaces launch-configuration errors

    cur_n = grid;
    if (cur_n <= 1) break;  // cur_out[0] now holds the full sum

    // Next pass: the previous output becomes the input.
    cur_in = cur_out;
    cur_out = (cur_out == d_buf1) ? d_buf2 : d_buf1;
  }

  float result = 0.f;
  CUDA_CHECK(cudaMemcpyAsync(&result, cur_out, sizeof(float),
                             cudaMemcpyDeviceToHost, stream));
  // The copy is asynchronous; wait for it before reading `result` or freeing.
  CUDA_CHECK(cudaStreamSynchronize(stream));

  CUDA_CHECK(cudaFree(d_buf1));
  CUDA_CHECK(cudaFree(d_buf2));
  return result;
}

// Benchmark driver for the GPU sum reduction.
//
// Usage: card5 [n] [iters] [block]
//   n     - number of floats to sum (default 1 << 26, about 67M)
//   iters - number of timed repetitions (default 50)
//   block - threads per block, a power of two in [64, 1024] (default 256)
//
// Fills the input with 1.0f so the expected sum equals n, times `iters` calls
// to gpu_reduce_sum, and prints the average time, an effective read bandwidth
// and the absolute error against a double-precision CPU sum.
// Returns 1 on invalid arguments, 0 otherwise.
int main(int argc, char** argv) {
  const int n = (argc > 1) ? std::atoi(argv[1]) : (1 << 26);   // ~67M
  const int iters = (argc > 2) ? std::atoi(argv[2]) : 50;
  const int block = (argc > 3) ? std::atoi(argv[3]) : 256;

  // atoi yields 0 for non-numeric text and passes negatives through. Reject
  // those here: n sizes the host vector, iters is a divisor, and block is a
  // divisor and the launch configuration inside gpu_reduce_sum.
  const bool block_ok =
      block >= 64 && block <= 1024 && (block & (block - 1)) == 0;
  if (n <= 0 || iters <= 0 || !block_ok) {
    std::cerr << "usage: " << argv[0]
              << " [n > 0] [iters > 0] [block: power of two in 64..1024]\n";
    return 1;
  }

  const size_t bytes = (size_t)n * sizeof(float);

  // Host input: all ones, so the exact sum is n.
  std::vector<float> h(n, 1.0f);

  // Device input.
  float* d = nullptr;
  CUDA_CHECK(cudaMalloc(&d, bytes));
  CUDA_CHECK(cudaMemcpy(d, h.data(), bytes, cudaMemcpyHostToDevice));

  cudaStream_t s;
  CUDA_CHECK(cudaStreamCreate(&s));

  // Warm-up call so one-time startup cost is not part of the measurement.
  (void)gpu_reduce_sum(d, n, block, s);

  cudaEvent_t st, ed;
  CUDA_CHECK(cudaEventCreate(&st));
  CUDA_CHECK(cudaEventCreate(&ed));

  // Note: every gpu_reduce_sum call allocates and frees its scratch buffers
  // and synchronizes the stream, so the interval below measures the whole
  // call, not only kernel execution.
  CUDA_CHECK(cudaEventRecord(st, s));
  float gpu_res = 0.f;
  for (int t = 0; t < iters; ++t) {
    gpu_res = gpu_reduce_sum(d, n, block, s);
  }
  CUDA_CHECK(cudaEventRecord(ed, s));
  CUDA_CHECK(cudaEventSynchronize(ed));

  float total_ms = 0.f;
  CUDA_CHECK(cudaEventElapsedTime(&total_ms, st, ed));
  const float ms_per = total_ms / iters;

  // CPU reference for correctness (accumulates in double).
  const double cpu_res = cpu_sum(h.data(), n);
  const double abs_err = std::fabs(cpu_res - (double)gpu_res);

  // The reduction mainly reads n floats (4n bytes) and writes very little,
  // so the read bandwidth is used as a simple throughput indicator.
  const double GBs = (4.0 * n) / (ms_per / 1000.0) / 1e9;

  std::cout << "n=" << n << " block=" << block << " iters=" << iters << "\n";
  std::cout << "GPU reduce: " << ms_per << " ms/iter, ~" << GBs << " GB/s (read)\n";
  std::cout << "CPU sum:    " << cpu_res << "\n";
  std::cout << "GPU sum:    " << gpu_res << "\n";
  std::cout << "abs_err:    " << abs_err << "\n";

  CUDA_CHECK(cudaEventDestroy(st));
  CUDA_CHECK(cudaEventDestroy(ed));
  CUDA_CHECK(cudaStreamDestroy(s));
  CUDA_CHECK(cudaFree(d));
  return 0;
}

Writing card5_reduce.cu


In [2]:
!nvcc -O3 -std=c++17 card5_reduce.cu -o card5 \
  -gencode arch=compute_60,code=sm_60 \
  -gencode arch=compute_70,code=sm_70 \
  -gencode arch=compute_75,code=sm_75 \
  -gencode arch=compute_80,code=sm_80 \
  -gencode arch=compute_86,code=sm_86 \
  -gencode arch=compute_89,code=sm_89

In [3]:
!./card5 $((1<<26)) 30 256

n=67108864 block=256 iters=30
GPU reduce: 2.22707 ms/iter, ~120.533 GB/s (read)
CPU sum:    6.71089e+07
GPU sum:    6.71089e+07
abs_err:    0
