In [1]:
!which nvcc

/usr/local/cuda/bin/nvcc


In [2]:
!nvidia-smi

Thu Jul  4 12:56:43 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# 新段落

In [3]:
!nvidia-smi

Thu Jul  4 12:57:37 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [110]:
%%writefile vec_add.cu
#include <iostream>
#include <vector>
#include <cuda_runtime.h>

// CUDA kernel function for summing multiple vectors
__global__ void vector_add(const float* in, size_t n, size_t dim, float* out) {
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;
    for (size_t i = 0; i < dim; ++i) {
        sum += in[idx + i * n];
    }
    out[idx] = sum;
}

int main() {
    size_t n = 4; // Number of vectors
    size_t dim = 1024*1024*128; // Dimension of each vector
    std::vector<float> h_in(n * dim);
    std::vector<float> h_out(n);
    for (size_t i = 0; i < n * dim; ++i) {
        h_in[i] = static_cast<float>(i);
    }
    float* d_in;
    float* d_out;
    cudaMalloc(&d_in, n * dim * sizeof(float));
    cudaMalloc(&d_out, n * sizeof(float));
    cudaMemcpy(d_in, h_in.data(), n * dim * sizeof(float), cudaMemcpyHostToDevice);
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((n + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (dim + threadsPerBlock.y - 1) / threadsPerBlock.y);
    vector_add<<<blocksPerGrid, threadsPerBlock>>>(d_in, n, dim, d_out);
    cudaMemcpy(h_out.data(), d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
//for (size_t i = 0; i < n * dim; i++){
//    //std::cout<<i<<std::endl;
//    std::cout<<h_in[i]<<std::endl;
//}
//for (size_t i = 0; i < n; i++) {
//    //std::cout<<i<<std::endl;
//    std::cout<<h_out[i]<<std::endl;
//}
//
//std::cout << "Success!" << std::endl;
    cudaFree(d_in);
    cudaFree(d_out);

    return 0;
}


Overwriting vec_add.cu


In [111]:
!nvcc -std=c++11 -arch=sm_75 vec_add.cu -o vec_add

In [112]:
!time ./vec_add


real	0m6.080s
user	0m4.831s
sys	0m1.150s


In [114]:
%%writefile add_vec.cu
#include <iostream>
#include <vector>
void vector_add(const std::vector<float>& in, size_t n, size_t dim, std::vector<float>& out) {
    for (size_t i = 0; i < n; ++i) {
        float sum = 0.0f;
        for (size_t j = 0; j < dim; ++j) {
            sum += in[i + j * n];
        }
        out[i] = sum;
    }
}
int main() {
    size_t n = 4; // Number of vectors
    size_t dim = 1024*1024*128; // Dimension of each vector
    std::vector<float> h_in(n * dim);
    std::vector<float> h_out(n);
    for (size_t i = 0; i < n * dim; ++i) {
        h_in[i] = static_cast<float>(i);
    }
    vector_add(h_in, n, dim, h_out);
    for (size_t i = 0; i < n; ++i) {
        float sum = 0.0f;
        for (size_t j = 0; j < dim; ++j) {
            sum += h_in[i + j * n];
        }
        if (h_out[i] != sum) {
            std::cerr << "Error at index " << i << "! Expected " << sum << " but got " << h_out[i] << std::endl;
            return 1;
        }
    }
    std::cout << "Success!" << std::endl;
    return 0;
}


Overwriting add_vec.cu


In [115]:
!nvcc -std=c++11 -arch=sm_75 add_vec.cu -o add_vec_c

In [116]:
!time ./add_vec_c

Success!

real	0m9.265s
user	0m8.143s
sys	0m1.029s
