In [1]:
!which nvcc

/usr/local/cuda/bin/nvcc


In [2]:
!nvidia-smi

Fri Jul  5 14:07:27 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   57C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# New section

In [3]:
!nvidia-smi

Fri Jul  5 14:07:29 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   56C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [32]:
%%writefile matmul.cu
#include <iostream>
#include <vector>
#include <cuda_runtime.h>

// Naive dense matrix multiplication kernel: output = A * B.
//
// Layout (all row-major, as fixed by the indexing below):
//   A      : m x k
//   B      : k x n
//   output : m x n
//
// Launch layout: a 2D grid of 2D blocks in which the x dimension covers the
// columns of `output` and the y dimension covers its rows. Each thread
// computes exactly one element output[row][col]. The grid may overshoot the
// matrix; surplus threads exit without touching memory. No shared memory is
// used.
__global__ void matmul(const float* A, const float* B, size_t m, size_t k, size_t n, float* output) {
    // blockIdx.y  : index of this block along the row direction of the grid
    // blockDim.y  : number of threads per block along the row direction
    // threadIdx.y : index of this thread within its block (row direction)
    // The products are widened to size_t so the global index cannot overflow
    // a 32-bit integer and compares cleanly against the size_t bounds.
    const size_t row = static_cast<size_t>(blockIdx.y) * blockDim.y + threadIdx.y;
    const size_t col = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    // Guard the grid tail: when m or n is not a multiple of the block shape,
    // some threads fall outside the matrix and must not read or write.
    if (row >= m || col >= n) {
        return;
    }

    // output[row][col] = sum over i of A[row][i] * B[i][col]
    float sum = 0.0f;
    for (size_t i = 0; i < k; ++i) {
        sum += A[row * k + i] * B[i * n + col];
    }
    output[row * n + col] = sum;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise; `what` names
// the operation for the diagnostic message.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Multiplies two 1024 x 1024 matrices on the GPU with the `matmul` kernel.
// Prints "Success!" and returns 0 only if every CUDA call succeeded;
// otherwise an error is written to stderr and 1 is returned.
int main() {
    // A is m x k, B is k x n, C = A * B is m x n (row-major).
    const size_t m = 1024;
    const size_t n = 1024;
    const size_t k = 1024;

    std::vector<float> A(m * k);
    std::vector<float> B(k * n);
    std::vector<float> C(m * n);
    for (size_t i = 0; i < A.size(); i++) {
        A[i] = static_cast<float>(i);
    }
    for (size_t i = 0; i < B.size(); i++) {
        B[i] = static_cast<float>(i);
    }

    const size_t bytesA = A.size() * sizeof(float);
    const size_t bytesB = B.size() * sizeof(float);
    const size_t bytesC = C.size() * sizeof(float);

    // Start from nullptr so the unconditional cudaFree calls below are safe
    // even when an allocation failed part-way through.
    float* d_A = nullptr;
    float* d_B = nullptr;
    float* d_C = nullptr;

    // Short-circuit chain: the first failing step stops the rest.
    bool ok = cudaOk(cudaMalloc(&d_A, bytesA), "cudaMalloc(d_A)")
           && cudaOk(cudaMalloc(&d_B, bytesB), "cudaMalloc(d_B)")
           && cudaOk(cudaMalloc(&d_C, bytesC), "cudaMalloc(d_C)")
           && cudaOk(cudaMemcpy(d_A, A.data(), bytesA, cudaMemcpyHostToDevice), "copy A to device")
           && cudaOk(cudaMemcpy(d_B, B.data(), bytesB, cudaMemcpyHostToDevice), "copy B to device");

    if (ok) {
        // Every block is a 16 x 16 tile of threads.
        const dim3 threadsPerBlock(16, 16);
        // Ceil-divide so the grid covers the whole m x n output matrix:
        // x spans the n columns, y spans the m rows.
        const dim3 blocksPerGrid(
            static_cast<unsigned int>((n + threadsPerBlock.x - 1) / threadsPerBlock.x),
            static_cast<unsigned int>((m + threadsPerBlock.y - 1) / threadsPerBlock.y));

        // Argument order must match the kernel signature (m, k, n): rows of
        // A, shared inner dimension, columns of B.
        matmul<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, m, k, n, d_C);

        // cudaGetLastError catches an invalid launch configuration; the
        // blocking device-to-host copy then waits for the kernel and reports
        // any fault raised while it executed.
        ok = cudaOk(cudaGetLastError(), "matmul kernel launch")
          && cudaOk(cudaMemcpy(C.data(), d_C, bytesC, cudaMemcpyDeviceToHost), "copy C to host");
    }

    // Debugging aid: uncomment to dump the m x n result.
    // for (size_t i = 0; i < m * n; i++) {
    //     std::cout << C[i] << std::endl;
    // }

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    if (!ok) {
        return 1;
    }
    std::cout << "Success!" << std::endl;
    return 0;
}


Overwriting matmul.cu


In [33]:
!nvcc -std=c++11 -arch=sm_75 matmul.cu -o matmul

In [34]:
!time ./matmul

Success!

real	0m0.224s
user	0m0.052s
sys	0m0.157s


In [38]:
%%writefile mulmat.cu
#include <iostream>
#include <vector>

// Reference CPU matrix multiplication: output = A * B.
// A holds m x n values, B holds n x k values and output receives m x k
// values, all stored row-major in flat vectors. `output` must already be
// sized to at least m * k elements; every one of those elements is
// overwritten.
void matmul(const std::vector<float>& A, const std::vector<float>& B, size_t m, size_t n, size_t k, std::vector<float>& output) {
    for (size_t row = 0; row != m; ++row) {
        // Offsets of the current row inside A and inside output.
        const size_t aRowStart = row * n;
        const size_t outRowStart = row * k;
        for (size_t col = 0; col != k; ++col) {
            float acc = 0.0f;
            // bIndex walks down column `col` of B, one row (k elements) per step.
            size_t bIndex = col;
            for (size_t inner = 0; inner != n; ++inner, bIndex += k) {
                acc += A[aRowStart + inner] * B[bIndex];
            }
            output[outRowStart + col] = acc;
        }
    }
}

// CPU baseline driver: fills two 1024 x 1024 matrices with their flat element
// indices and multiplies them with the host `matmul`. The result is computed
// but not printed.
int main() {
    // A is m x n, B is n x k, C = A * B is m x k (row-major).
    const size_t m = 1024;
    const size_t n = 1024;
    const size_t k = 1024;

    std::vector<float> A(m * n);
    std::vector<float> B(n * k);
    std::vector<float> C(m * k);

    // Each element is set to its own flat index, converted to float.
    size_t position = 0;
    for (float& value : A) {
        value = static_cast<float>(position++);
    }
    position = 0;
    for (float& value : B) {
        value = static_cast<float>(position++);
    }

    matmul(A, B, m, n, k, C);

    // Debugging aid: uncomment to dump the m x k result.
    // for (size_t i = 0; i < m; ++i) {
    //     for (size_t j = 0; j < k; ++j) {
    //         std::cout << C[i * k + j] << std::endl;
    //     }
    // }
    return 0;
}



Overwriting mulmat.cu


In [39]:
!nvcc -std=c++11 -arch=sm_75 mulmat.cu -o mulmat_c

In [40]:
!time ./mulmat_c


real	0m13.633s
user	0m13.546s
sys	0m0.016s
