<a href="https://colab.research.google.com/github/kristenzeng94/cuda_practice/blob/main/run_cuda_on_jupiter_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

如果运行结果不符合预期，参考这里https://github.com/flin3500/Cuda-Google-Colab

In [22]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [23]:
!pip install nvcc4jupyter



In [3]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpqo0d38l_".


In [28]:
%%writefile  hello.cu
#include <stdio.h>

__global__ void hello(){
    printf("Hello from block: %u, thread: %u\n", blockIdx.x, threadIdx.x);
}

int main(){
    printf("here");
    hello<<<2, 2>>>();
    cudaDeviceSynchronize();
}


Overwriting hello.cu


In [29]:
!nvcc -arch=sm_75 -gencode=arch=compute_75,code=sm_75 hello.cu -o hello

In [30]:
!./hello

hereHello from block: 0, thread: 0
Hello from block: 0, thread: 1
Hello from block: 1, thread: 0
Hello from block: 1, thread: 1


In [5]:
%%cuda_group_save --group shared --name "error_handling.h"
// error checking macro
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

In [6]:
%%cuda
#include <cstdio>
#include <iostream>

    using namespace std;

__global__ void maxi(int* a, int* b, int n)
{
    int block = 256 * blockIdx.x;
    int max = 0;

    for (int i = block; i < min(256 + block, n); i++) {

        if (max < a[i]) {
            max = a[i];
        }
    }
    b[blockIdx.x] = max;
}

int main()
{

    int n;
    n = 3 >> 2;
    int a[n];

    for (int i = 0; i < n; i++) {
        a[i] = rand() % n;
        cout << a[i] << "\t";
    }

    cudaEvent_t start, end;
    int *ad, *bd;
    int size = n * sizeof(int);
    cudaMalloc(&ad, size);
    cudaMemcpy(ad, a, size, cudaMemcpyHostToDevice);
    int grids = ceil(n * 1.0f / 256.0f);
    cudaMalloc(&bd, grids * sizeof(int));

    dim3 grid(grids, 1);
    dim3 block(1, 1);

    cudaEventCreate(&start);
    cudaEventCreate(&end);
    cudaEventRecord(start);

    while (n > 1) {
        maxi<<<grids, block>>>(ad, bd, n);
        n = ceil(n * 1.0f / 256.0f);
        cudaMemcpy(ad, bd, n * sizeof(int), cudaMemcpyDeviceToDevice);
    }

    cudaEventRecord(end);
    cudaEventSynchronize(end);

    float time = 0;
    cudaEventElapsedTime(&time, start, end);

    int ans[2];
    cudaMemcpy(ans, ad, 4, cudaMemcpyDeviceToHost);

    cout << "The maximum element is : " << ans[0] << endl;

    cout << "The time required : ";
    cout << time << endl;
}

The maximum element is : 0
The time required : 0.003584



In [24]:
%%writefile reduce.cu

#include <cstdio>
#include <iostream>
#include <cmath>

using namespace std;

__global__ void reduce(int *input, int *output, int n) {
    extern __shared__ int sdata[];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;

    // 每个线程处理两个数据点
    sdata[tid] = (i < n) ? input[i] : 0;
    if (i + blockDim.x < n) {
        sdata[tid] += input[i + blockDim.x];
    }
    __syncthreads();

    // 逐步合并结果
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // 写入结果
    if (tid == 0) {
        output[blockIdx.x] = sdata[0];
    }
}

int main()
{
    int n = 10;
    int *input, *output;
    int size = n * sizeof(int);

    // 分配设备内存
    cudaMalloc(&input, size);
    cudaMalloc(&output, size);

    // 初始化输入数据
    int *h_input = new int[n];
    for(int i = 0; i < n; i++){
        h_input[i] = i + 1;
    }
    // 将输入数据从主机复制到设备
    cudaMemcpy(input, h_input, size, cudaMemcpyHostToDevice);

    // 计算网格和线程块大小
    int blockSize = 256;
    int gridSize = (n + blockSize - 1) / blockSize;

    // 启动内核
    reduce<<<gridSize, blockSize, blockSize * sizeof(int)>>>(input, output, n);

    // 同步设备
    cudaDeviceSynchronize();
    int *h_output = new int[(n + 255) / 256];
    // 将输出数据从设备复制回主机
    cudaMemcpy(h_output, output, (gridSize * sizeof(int)), cudaMemcpyDeviceToHost);
    // 检查输出结果
    int totalSum = 0;
    for (int i = 0; i < gridSize; i++) {
        totalSum += h_output[i];
        std::cout << "Block " << i << " result: " << h_output[i] << std::endl;
    }
    // 释放设备内存
    cudaFree(input);
    cudaFree(output);

    // 释放主机内存
    delete[] h_input;
    delete[] h_output;

    return 0;
}

Overwriting reduce.cu


In [25]:
!nvcc -arch=sm_75 -gencode=arch=compute_75,code=sm_75 reduce.cu -o reduce

In [26]:
! ./reduce

Block 0 result: 55


In [8]:
%%writefile hello.cu

#include<stdio.h>
__global__ void hello(void)
{
    printf("GPU: Hello!\n");
}
int main(int argc,char **argv)
{
    printf("CPU: Hello!\n");
    hello<<<1,10>>>();
    cudaDeviceReset();
    return 0;
}

Writing hello.cu


In [16]:
!nvcc -arch=sm_75 -gencode=arch=compute_75,code=sm_75 hello.cu -o hello

In [17]:
! ./hello

CPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!


hello  hello.cu  reduce  reduce.cu  sample_data
