<a href="https://colab.research.google.com/github/kristenzeng94/cuda_practice/blob/main/run_cuda_on_jupiter_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

如果运行结果不符合预期，请参考这里： https://github.com/flin3500/Cuda-Google-Colab

In [36]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [37]:
!pip install nvcc4jupyter



In [38]:
%load_ext nvcc4jupyter

The nvcc4jupyter extension is already loaded. To reload it, use:
  %reload_ext nvcc4jupyter


In [43]:
%%writefile  hello.cu
#include <stdio.h>

// Device kernel: each launched thread prints one line containing its own
// blockIdx.x and threadIdx.x using device-side printf.
__global__ void hello(){
    const unsigned int block = blockIdx.x;
    const unsigned int thread = threadIdx.x;
    printf("Hello from block: %u, thread: %u\n", block, thread);
}

// Host entry point: prints a marker, launches hello on 2 blocks of 2 threads,
// then waits for the kernel so its device-side printf output is flushed.
// Returns 0 on success and 1 when the launch or the kernel execution fails.
int main(){
    // No trailing newline: the first line printed by the kernel follows "here" directly.
    printf("here");
    hello<<<2, 2>>>();

    // A kernel launch yields no status of its own; launch failures (for
    // example a binary built for a different GPU architecture) are only
    // visible through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        // Faults raised while the kernel runs are reported by the next
        // synchronizing call.
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "\nCUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}


Overwriting hello.cu


In [44]:
!nvcc -arch=sm_75 -gencode=arch=compute_75,code=sm_75 hello.cu -o hello

In [45]:
!./hello

hereHello from block: 0, thread: 0
Hello from block: 0, thread: 1
Hello from block: 1, thread: 0
Hello from block: 1, thread: 1


In [46]:
%%writefile reduce.cu

#include <cstdio>
#include <iostream>
#include <cmath>

using namespace std;

/*
 * Per-block sum reduction.
 *
 * Each block adds up to 2 * blockDim.x consecutive ints of `input`, starting
 * at index blockIdx.x * 2 * blockDim.x, and thread 0 stores the block's
 * partial sum in output[blockIdx.x]. Indices at or beyond n contribute 0.
 *
 * Launch requirements (only the x dimension of grid and block is used):
 *   - dynamic shared memory of blockDim.x * sizeof(int) bytes
 *   - gridDim.x * 2 * blockDim.x >= n, otherwise trailing elements are skipped
 *   - `output` must have room for gridDim.x ints
 * blockDim.x does not need to be a power of two.
 *
 * input:  device-accessible array holding at least n ints (read only)
 * output: device-accessible array receiving one partial sum per block
 * n:      number of valid elements in `input`; values <= 0 are treated as 0
 */
__global__ void reduce(int *input, int *output, int n) {
    extern __shared__ int sdata[];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;

    // Clamp before comparing with unsigned indices: a negative n would
    // otherwise convert to a huge unsigned bound and allow out-of-range reads.
    const unsigned int count = (n > 0) ? static_cast<unsigned int>(n) : 0u;

    // Each thread loads two elements, one block-width apart, and pre-adds them.
    int partial = 0;
    if (i < count) {
        partial = input[i];
    }
    if (i + blockDim.x < count) {
        partial += input[i + blockDim.x];
    }
    sdata[tid] = partial;
    __syncthreads();

    // Tree reduction in shared memory. `active` is the number of live partial
    // sums; each step folds the upper part onto the lower part. Rounding the
    // half up keeps the middle element when `active` is odd, so no value is
    // dropped for block sizes that are not powers of two.
    for (unsigned int active = blockDim.x; active > 1; ) {
        const unsigned int half = (active + 1) / 2;
        if (tid + half < active) {
            sdata[tid] += sdata[tid + half];
        }
        // Reached by every thread of the block: the loop bounds depend only
        // on blockDim.x, and the barrier sits outside the divergent branch.
        __syncthreads();
        active = half;
    }

    // Publish the block's result.
    if (tid == 0) {
        output[blockIdx.x] = sdata[0];
    }
}

// Prints a diagnostic on stderr when a CUDA runtime call failed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver for the reduce kernel: fills an array with 1..n, reduces it on
// the device, prints one partial sum per block and checks the combined total
// against a sum computed on the host.
// Returns 0 on success, 1 on any CUDA error or on a sum mismatch.
int main()
{
    const int n = 10;
    const int blockSize = 256;
    // reduce() consumes two elements per thread, so one block covers
    // 2 * blockSize inputs; size the grid accordingly (ceiling division).
    const int elementsPerBlock = 2 * blockSize;
    const int gridSize = (n + elementsPerBlock - 1) / elementsPerBlock;
    const size_t inputBytes = n * sizeof(int);
    const size_t outputBytes = gridSize * sizeof(int);  // one partial sum per block

    // Initialize the host input and compute the reference sum.
    int *h_input = new int[n];
    int *h_output = new int[gridSize];
    int expectedSum = 0;
    for (int i = 0; i < n; i++) {
        h_input[i] = i + 1;
        expectedSum += h_input[i];
    }

    // Allocate device memory and copy the input from host to device.
    int *input = nullptr;
    int *output = nullptr;
    bool ok = cudaSucceeded(cudaMalloc(&input, inputBytes), "cudaMalloc(input)")
           && cudaSucceeded(cudaMalloc(&output, outputBytes), "cudaMalloc(output)")
           && cudaSucceeded(cudaMemcpy(input, h_input, inputBytes, cudaMemcpyHostToDevice),
                            "cudaMemcpy(host to device)");

    if (ok) {
        // Launch the kernel with one int of dynamic shared memory per thread.
        reduce<<<gridSize, blockSize, blockSize * sizeof(int)>>>(input, output, n);

        // The launch itself returns nothing: configuration errors surface via
        // cudaGetLastError(), execution faults at the following synchronization.
        ok = cudaSucceeded(cudaGetLastError(), "reduce launch")
          && cudaSucceeded(cudaDeviceSynchronize(), "reduce execution")
          && cudaSucceeded(cudaMemcpy(h_output, output, outputBytes, cudaMemcpyDeviceToHost),
                           "cudaMemcpy(device to host)");
    }

    if (ok) {
        // Report the per-block results and verify their total.
        int totalSum = 0;
        for (int i = 0; i < gridSize; i++) {
            totalSum += h_output[i];
            std::cout << "Block " << i << " result: " << h_output[i] << std::endl;
        }
        if (totalSum != expectedSum) {
            fprintf(stderr, "Sum mismatch: device %d, host %d\n", totalSum, expectedSum);
            ok = false;
        }
    }

    // Release device memory; cudaFree on a null pointer performs no operation,
    // so this is safe even when an allocation above failed.
    cudaFree(input);
    cudaFree(output);

    // Release host memory.
    delete[] h_input;
    delete[] h_output;

    return ok ? 0 : 1;
}

Overwriting reduce.cu


In [47]:
!nvcc -arch=sm_75 -gencode=arch=compute_75,code=sm_75 reduce.cu -o reduce

In [48]:
! ./reduce

Block 0 result: 55


In [49]:
%%writefile hello.cu

#include<stdio.h>
// Device kernel: every launched thread emits the same greeting line through
// device-side printf.
__global__ void hello(void) {
    printf("GPU: Hello!\n");
}
// Host entry point: prints a greeting from the CPU, launches hello with one
// block of 10 threads, waits for it to finish and then resets the device.
// argc/argv are accepted but not used.
// Returns 0 on success, 1 when the launch, the kernel execution or the
// device reset reports an error.
int main(int argc,char **argv)
{
    (void)argc;
    (void)argv;

    printf("CPU: Hello!\n");
    hello<<<1,10>>>();

    // The launch has no return value; query the runtime for launch errors,
    // then synchronize explicitly so execution errors are reported and the
    // device-side printf buffer is flushed before the device is torn down.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();
    }

    // Reset unconditionally so device resources are released even after a
    // failure; keep the first error encountered for the report.
    const cudaError_t resetErr = cudaDeviceReset();
    if (err == cudaSuccess) {
        err = resetErr;
    }

    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}

Overwriting hello.cu


In [50]:
!nvcc -arch=sm_75 -gencode=arch=compute_75,code=sm_75 hello.cu -o hello

In [51]:
! ./hello

CPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!


/content
