1. Check if CUDA is installed successfully.

In [3]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Wed_Jul_22_19:09:09_PDT_2020
Cuda compilation tools, release 11.0, V11.0.221
Build cuda_11.0_bu.TC445_37.28845127_0


2. Install a small extension to run nvcc (CUDA compiler) from the Notebook cells.

In [4]:
pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-brh4u7f5
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-brh4u7f5
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4307 sha256=65bbf7d2eab861cd30d2703276b1ec6998345c2cb3ee536c29862598dc51857a
  Stored in directory: /tmp/pip-ephem-wheel-cache-8tytauti/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


3. Load the extension

In [5]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


4. Try a simple C/C++ program.

In [11]:
%%cu
#include <iostream>
// Entry point of the smoke-test program: writes a fixed greeting to
// standard output and exits with status 0.
int main()
{
    const char *greeting = "Welcome To Colab!\n";
    std::cout << greeting;
    return 0;
}

Welcome To Colab!



5. Now try the CUDA code that you are going to develop.

In [14]:
%%cu
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h> 

#define SIZE 10

// Element-wise vector addition: d_c[i] = d_a[i] + d_b[i] for 0 <= i < SIZE.
//
// Launch layout: 1-D grid of 1-D blocks, one thread per element. Any
// configuration that provides at least SIZE threads in total is valid;
// threads whose global index is >= SIZE do nothing.
// Shared memory: none.
// Preconditions: d_c, d_a and d_b are device-accessible buffers holding at
// least SIZE uint32_t elements each.
__global__ void add_kernel(uint32_t *d_c, uint32_t *d_a, uint32_t *d_b) {
   // Global index, so the kernel stays correct when launched with several blocks.
   uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;

   // Guard the tail: the launch may supply more threads than elements.
   if (tid < SIZE) {
      d_c[tid] = d_a[tid] + d_b[tid];
   }
}

// Evaluates a CUDA runtime call and, on failure, prints the file, line and
// CUDA error string to stderr and terminates the program. Without this, a
// failed call is silently ignored and the program prints garbage results.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(err_));                    \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Host driver for add_kernel.
//
// Allocates three pinned host buffers and three device buffers of SIZE
// uint32_t elements, fills the two inputs with h_a[i] = h_b[i] = i, copies
// them to the device, launches add_kernel with one block of SIZE threads,
// copies the sums back and prints them on one line after a "Results:" header.
//
// Parameters: argc / argv are accepted for the conventional signature but
// are not used.
// Returns: 0 on success. Any failing CUDA call terminates the process with
// EXIT_FAILURE via CUDA_CHECK.
int main(int argc, char* argv[]) {
    (void)argc;
    (void)argv;

    int i;
    const size_t bytes = SIZE * sizeof(uint32_t);
    uint32_t *h_a, *h_b, *h_c;  // host pointers (pinned memory)
    uint32_t *d_a, *d_b, *d_c;  // device pointers

    CUDA_CHECK(cudaMallocHost((void**)&h_a, bytes));
    CUDA_CHECK(cudaMallocHost((void**)&h_b, bytes));
    CUDA_CHECK(cudaMallocHost((void**)&h_c, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_a, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_b, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_c, bytes));

    for (i = 0; i < SIZE; i++)
    {
      h_a[i] = i;
      h_b[i] = i;
    }

    CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

    add_kernel<<<1, SIZE>>>(d_c, d_a, d_b);
    // A kernel launch returns no status; launch-configuration errors are
    // only visible through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());
    // This blocking copy waits for the kernel to finish, so a fault during
    // kernel execution is reported here.
    CUDA_CHECK(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));

    printf("Results:\n");
    for (i = 0; i < SIZE; i++) printf("%u ", h_c[i]);

    // Memory obtained from cudaMallocHost must be released with
    // cudaFreeHost; cudaFree is only valid for device allocations.
    CUDA_CHECK(cudaFreeHost(h_a));
    CUDA_CHECK(cudaFreeHost(h_b));
    CUDA_CHECK(cudaFreeHost(h_c));
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    CUDA_CHECK(cudaDeviceReset());
    return 0;
}


Results:
0 2 4 6 8 10 12 14 16 18 
