<a href="https://colab.research.google.com/github/lesliee94/cudaLearn/blob/master/benchmarktest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi


Fri Jan  2 08:13:37 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   56C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
%%writefile card3_bench_vadd.cu
#include <cuda_runtime.h>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <vector>

// Evaluates a CUDA runtime call exactly once. If the returned status is not
// cudaSuccess, prints the error string plus the file and line of the call
// site to stderr and terminates the process with exit status 1.
#define CUDA_CHECK(call)                                                \
  do {                                                                  \
    const cudaError_t cuda_status_ = (call);                            \
    if (cuda_status_ == cudaSuccess) break;                             \
    std::cerr << "CUDA error: " << cudaGetErrorString(cuda_status_)     \
              << " at " << __FILE__ << ":" << __LINE__ << "\n";         \
    std::exit(1);                                                       \
  } while (0)

/**
 * Element-wise vector add: c[i] = a[i] + b[i] for 0 <= i < n.
 *
 * Launch layout: 1-D grid of 1-D blocks, one element per thread, so the grid
 * must cover at least n threads. Threads whose index falls at or beyond n do
 * nothing. Uses no shared memory.
 *
 * @param a  first input vector (read only)
 * @param b  second input vector (read only)
 * @param c  output vector
 * @param n  number of elements to process
 */
__global__ void vadd(const float* a, const float* b, float* c, int n) {
  // blockIdx/blockDim/threadIdx are unsigned 32-bit. Widen before multiplying
  // so the global index can neither wrap in 32 bits nor turn negative when
  // narrowed to int; either would let an out-of-range thread pass the guard.
  const long long idx = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (idx < n) c[idx] = a[idx] + b[idx];
}

/**
 * Times a single-threaded host implementation of c = a + b.
 *
 * Runs one untimed pass first, then `iters` timed passes measured with
 * std::chrono::steady_clock.
 *
 * @param a      first input array, at least n floats
 * @param b      second input array, at least n floats
 * @param c      output array, at least n floats; overwritten on every pass
 * @param n      number of elements added per pass
 * @param iters  number of timed passes; the total time is divided by this
 * @return average wall-clock milliseconds per timed pass
 */
double cpu_vadd_ms(const float* a, const float* b, float* c, int n, int iters) {
  using timer  = std::chrono::steady_clock;
  using millis = std::chrono::duration<double, std::milli>;

  // One full sweep over the arrays.
  const auto add_pass = [a, b, c, n]() {
    for (int k = 0; k < n; ++k) {
      c[k] = a[k] + b[k];
    }
  };

  // Untimed warm-up pass.
  add_pass();

  const auto begin = timer::now();
  for (int pass = 0; pass < iters; ++pass) {
    add_pass();
  }
  const auto end = timer::now();

  const millis elapsed = end - begin;
  return elapsed.count() / iters;
}

/**
 * Measures the average device time of one vadd launch, excluding transfers.
 *
 * Performs one untimed warm-up launch, then enqueues `iters` launches on
 * stream `s` between two CUDA events and returns the elapsed event time
 * divided by `iters`. Blocks the host until the last timed launch finishes.
 *
 * @param da     first input vector; passed straight to the kernel, so it must
 *               be device-accessible and already populated
 * @param db     second input vector; same requirements as `da`
 * @param dc     output vector; overwritten by every launch
 * @param n      number of elements
 * @param iters  number of timed launches
 * @param s      stream that all launches and events are issued on
 * @return average milliseconds per launch as reported by cudaEventElapsedTime
 */
float gpu_kernel_only_ms(const float* da, const float* db, float* dc, int n, int iters, cudaStream_t s) {
  const int block = 256;
  // Ceil-div in 64-bit: `n + block - 1` overflows int when n is near INT_MAX.
  const unsigned grid =
      static_cast<unsigned>((static_cast<long long>(n) + block - 1) / block);

  // warmup
  vadd<<<grid, block, 0, s>>>(da, db, dc, n);
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaStreamSynchronize(s));

  cudaEvent_t st, ed;
  CUDA_CHECK(cudaEventCreate(&st));
  CUDA_CHECK(cudaEventCreate(&ed));

  CUDA_CHECK(cudaEventRecord(st, s));
  for (int t = 0; t < iters; ++t) {
    vadd<<<grid, block, 0, s>>>(da, db, dc, n);
  }
  // Kernel launches return no status. Check once after the loop so a failed
  // launch aborts the run instead of yielding a bogus (near-zero) timing.
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaEventRecord(ed, s));
  CUDA_CHECK(cudaEventSynchronize(ed));

  float ms = 0.f;
  CUDA_CHECK(cudaEventElapsedTime(&ms, st, ed));
  CUDA_CHECK(cudaEventDestroy(st));
  CUDA_CHECK(cudaEventDestroy(ed));
  return ms / iters;
}

/**
 * Measures the average time of a full round trip: copy both inputs host to
 * device, run vadd, copy the result device to host.
 *
 * Performs one untimed warm-up round trip, then enqueues `iters` round trips
 * on stream `s` between two CUDA events and returns the elapsed event time
 * divided by `iters`. On return, `hc` holds the result of the last round trip.
 *
 * @param ha     host input a, n floats
 * @param hb     host input b, n floats
 * @param hc     host output, n floats; overwritten every iteration
 * @param da     device buffer receiving `ha`
 * @param db     device buffer receiving `hb`
 * @param dc     device buffer holding the kernel output
 * @param n      number of elements
 * @param iters  number of timed round trips
 * @param s      stream that all copies, launches and events are issued on
 * @return average milliseconds per round trip as reported by
 *         cudaEventElapsedTime
 *
 * NOTE(review): whether the copies are truly asynchronous presumably depends
 * on the host buffers being pinned (the caller chooses) - confirm against the
 * cudaMemcpyAsync documentation when interpreting the numbers.
 */
float gpu_end_to_end_ms(const float* ha, const float* hb, float* hc,
                        float* da, float* db, float* dc,
                        int n, int iters, cudaStream_t s) {
  const size_t bytes = (size_t)n * sizeof(float);
  const int block = 256;
  // Ceil-div in 64-bit: `n + block - 1` overflows int when n is near INT_MAX.
  const unsigned grid =
      static_cast<unsigned>((static_cast<long long>(n) + block - 1) / block);

  // warmup
  CUDA_CHECK(cudaMemcpyAsync(da, ha, bytes, cudaMemcpyHostToDevice, s));
  CUDA_CHECK(cudaMemcpyAsync(db, hb, bytes, cudaMemcpyHostToDevice, s));
  vadd<<<grid, block, 0, s>>>(da, db, dc, n);
  CUDA_CHECK(cudaGetLastError());  // launch status, checked right after the launch
  CUDA_CHECK(cudaMemcpyAsync(hc, dc, bytes, cudaMemcpyDeviceToHost, s));
  CUDA_CHECK(cudaStreamSynchronize(s));

  cudaEvent_t st, ed;
  CUDA_CHECK(cudaEventCreate(&st));
  CUDA_CHECK(cudaEventCreate(&ed));

  CUDA_CHECK(cudaEventRecord(st, s));
  for (int t = 0; t < iters; ++t) {
    CUDA_CHECK(cudaMemcpyAsync(da, ha, bytes, cudaMemcpyHostToDevice, s));
    CUDA_CHECK(cudaMemcpyAsync(db, hb, bytes, cudaMemcpyHostToDevice, s));
    vadd<<<grid, block, 0, s>>>(da, db, dc, n);
    CUDA_CHECK(cudaMemcpyAsync(hc, dc, bytes, cudaMemcpyDeviceToHost, s));
  }
  // Kernel launches return no status. Check once after the loop so a failed
  // launch aborts the run instead of timing copies only.
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaEventRecord(ed, s));
  CUDA_CHECK(cudaEventSynchronize(ed));

  float ms = 0.f;
  CUDA_CHECK(cudaEventElapsedTime(&ms, st, ed));
  CUDA_CHECK(cudaEventDestroy(st));
  CUDA_CHECK(cudaEventDestroy(ed));
  return ms / iters;
}

/**
 * Benchmark driver for vector addition.
 *
 * Usage: card3 [n] [iters] [pinned]
 *   n       element count (default 1 << 24)
 *   iters   timed iterations per measurement (default 200)
 *   pinned  nonzero: allocate host buffers with cudaMallocHost; 0: std::malloc
 *
 * Reports the CPU time per iteration, the GPU kernel-only time per iteration
 * with an effective-bandwidth estimate, the GPU end-to-end time per iteration
 * including transfers, and the maximum absolute error over a leading sample
 * of the device result.
 *
 * @return 0 on success, 1 on bad arguments or host allocation failure
 *         (CUDA errors exit with status 1 via CUDA_CHECK)
 */
int main(int argc, char** argv) {
  int n     = (argc > 1) ? std::atoi(argv[1]) : (1 << 24); // ~16M
  int iters = (argc > 2) ? std::atoi(argv[2]) : 200;
  int pinned= (argc > 3) ? std::atoi(argv[3]) : 0;         // nonzero: cudaMallocHost

  if (n <= 0 || iters <= 0) { std::cerr << "bad args\n"; return 1; }

  size_t bytes = (size_t)n * sizeof(float);

  // -------- Host buffers --------
  float *ha=nullptr, *hb=nullptr, *hc=nullptr;
  if (pinned) {
    CUDA_CHECK(cudaMallocHost(&ha, bytes));
    CUDA_CHECK(cudaMallocHost(&hb, bytes));
    CUDA_CHECK(cudaMallocHost(&hc, bytes));
  } else {
    ha = (float*)std::malloc(bytes);
    hb = (float*)std::malloc(bytes);
    hc = (float*)std::malloc(bytes);
    // malloc reports failure by returning nullptr; bail out before the
    // initialisation loop dereferences it.
    if (!ha || !hb || !hc) {
      std::cerr << "host malloc failed\n";
      std::free(ha); std::free(hb); std::free(hc);
      return 1;
    }
  }

  for (int i = 0; i < n; ++i) { ha[i] = (i % 1000) * 0.001f; hb[i] = (i % 777) * 0.001f; }

  // CPU baseline
  double cpu_ms = cpu_vadd_ms(ha, hb, hc, n, iters);

  // -------- Device buffers --------
  float *da=nullptr, *db=nullptr, *dc=nullptr;
  CUDA_CHECK(cudaMalloc(&da, bytes));
  CUDA_CHECK(cudaMalloc(&db, bytes));
  CUDA_CHECK(cudaMalloc(&dc, bytes));

  cudaStream_t s;
  CUDA_CHECK(cudaStreamCreate(&s));

  // Upload the inputs first so the kernel-only measurement has data on the GPU.
  CUDA_CHECK(cudaMemcpyAsync(da, ha, bytes, cudaMemcpyHostToDevice, s));
  CUDA_CHECK(cudaMemcpyAsync(db, hb, bytes, cudaMemcpyHostToDevice, s));
  CUDA_CHECK(cudaStreamSynchronize(s));

  float kernel_ms = gpu_kernel_only_ms(da, db, dc, n, iters, s);
  float e2e_ms    = gpu_end_to_end_ms(ha, hb, hc, da, db, dc, n, iters, s);

  // Correctness (sampled): compare the leading elements of the device result
  // against a host reference. The sample is clamped to n so that small inputs
  // neither read past ha/hb nor copy past the end of dc.
  const int sample = std::min(n, 1024);
  std::vector<float> hc_cpu(sample);
  for (int i = 0; i < sample; ++i) hc_cpu[i] = ha[i] + hb[i];
  std::vector<float> hc_gpu(sample);
  CUDA_CHECK(cudaMemcpyAsync(hc_gpu.data(), dc, (size_t)sample * sizeof(float), cudaMemcpyDeviceToHost, s));
  CUDA_CHECK(cudaStreamSynchronize(s));
  float max_err = 0.f;
  for (int i = 0; i < sample; ++i) max_err = std::max(max_err, std::fabs(hc_cpu[i] - hc_gpu[i]));

  // Vector add moves roughly 12 bytes per element (read a + read b + write c).
  auto to_GBs = [&](double ms_per_iter, double elements_touched){
    double sec = ms_per_iter / 1000.0;
    double bytes_moved = 12.0 * elements_touched;
    return (bytes_moved / sec) / 1e9;
  };

  std::cout << "n=" << n << " iters=" << iters << " pinned=" << pinned << "\n";
  std::cout << "CPU compute:      " << cpu_ms   << " ms/iter\n";
  std::cout << "GPU kernel-only:  " << kernel_ms<< " ms/iter   (~" << to_GBs(kernel_ms, n) << " GB/s)\n";
  std::cout << "GPU end-to-end:   " << e2e_ms   << " ms/iter\n";
  std::cout << "max_err(sample):  " << max_err << "\n";

  // cleanup
  CUDA_CHECK(cudaStreamDestroy(s));
  CUDA_CHECK(cudaFree(da)); CUDA_CHECK(cudaFree(db)); CUDA_CHECK(cudaFree(dc));
  if (pinned) {
    CUDA_CHECK(cudaFreeHost(ha)); CUDA_CHECK(cudaFreeHost(hb)); CUDA_CHECK(cudaFreeHost(hc));
  } else {
    std::free(ha); std::free(hb); std::free(hc);
  }
  return 0;
}

Writing card3_bench_vadd.cu


In [7]:
!nvcc -O3 -std=c++17 card3_bench_vadd.cu -o card3 \
  -gencode arch=compute_60,code=sm_60 \
  -gencode arch=compute_70,code=sm_70 \
  -gencode arch=compute_75,code=sm_75 \
  -gencode arch=compute_80,code=sm_80 \
  -gencode arch=compute_86,code=sm_86 \
  -gencode arch=compute_89,code=sm_89

In [8]:
!ls


card3  card3_bench_vadd.cu  sample_data


In [9]:
!./card3 $((1<<24)) 200 0

n=16777216 iters=200 pinned=0
CPU compute:      14.8571 ms/iter
GPU kernel-only:  0.769895 ms/iter   (~261.499 GB/s)
GPU end-to-end:   45.29 ms/iter
max_err(sample):  0
