In [1]:
#!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
!pip install nvcc4jupyter
!nvcc --version

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [2]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpoggb52nz".


In [8]:
%%cuda

#include <stdio.h>
#include <stdlib.h>

// Number of elements in each vector. Used by main() to size the buffers and by
// the kernel as the upper bound of its bounds check.
const int N = 1000;

// One thread handles one element. main() launches whole blocks, so more than N
// threads may start; the kernel's bounds check makes the surplus threads do nothing.

// Element-wise addition kernel: d_c[i] = d_a[i] + d_b[i] for every i in [0, N).
//
// Expects a 1D grid of 1D blocks; each thread derives one flat index from its
// block and thread coordinates and handles at most one element.
//   d_a, d_b : input arrays, read only
//   d_c      : output array, written only
__global__ void Vector_Addition(const int *d_a, const int *d_b, int *d_c)
{
      const int idx = threadIdx.x + blockDim.x * blockIdx.x;

      // Threads whose index falls past the last element have nothing to do.
      if (idx >= N)
            return;

      const int lhs = d_a[idx];
      const int rhs = d_b[idx];
      d_c[idx] = lhs + rhs;
}

// Terminates the program with a diagnostic if a CUDA runtime call did not succeed.
//   status : return code of the CUDA call being checked
//   what   : short description of the call, included in the error message
static void checkCuda(cudaError_t status, const char *what)
{
      if (status != cudaSuccess)
      {
            fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
            exit(EXIT_FAILURE);
      }
}

// Fills two host vectors with random values in [0, 99], adds them on the GPU
// with Vector_Addition, times the kernel with CUDA events, and prints every
// sum followed by the elapsed kernel time in milliseconds.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main()
{
      int h_a[N], h_b[N], h_c[N];

      int *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;

      // size_t matches the type the CUDA allocation/copy APIs take.
      const size_t byte = sizeof(int) * N;

      checkCuda(cudaMalloc((void **)&d_a, byte), "cudaMalloc(d_a)");
      checkCuda(cudaMalloc((void **)&d_b, byte), "cudaMalloc(d_b)");
      checkCuda(cudaMalloc((void **)&d_c, byte), "cudaMalloc(d_c)");

      for (int i = 0; i < N; i++)
      {
            h_a[i] = rand() % 100;
            h_b[i] = rand() % 100;
      }

      checkCuda(cudaMemcpy(d_a, h_a, byte, cudaMemcpyHostToDevice), "copy h_a to device");
      checkCuda(cudaMemcpy(d_b, h_b, byte, cudaMemcpyHostToDevice), "copy h_b to device");

      const int NUM_THREADS = 1024;

      // Ceiling division: adds one extra block when N is not a multiple of
      // NUM_THREADS (e.g. N = 1025, NUM_THREADS = 1024 gives 2 blocks).
      const int NUM_BLOCKS = (N + NUM_THREADS - 1) / NUM_THREADS;

      // Create CUDA events for timing
      cudaEvent_t start, stop;
      checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
      checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

      // Record start event
      checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");

      Vector_Addition<<<NUM_BLOCKS, NUM_THREADS>>>(d_a, d_b, d_c);

      // A kernel launch returns nothing; an invalid launch configuration is
      // only reported through cudaGetLastError().
      checkCuda(cudaGetLastError(), "Vector_Addition launch");

      // Record stop event
      checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");

      // Wait for the stop event; this also surfaces errors raised while the
      // kernel was executing.
      checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

      // Calculate elapsed time
      float elapsedTime = 0.0f;
      checkCuda(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");

      // Copy result back to Host
      checkCuda(cudaMemcpy(h_c, d_c, byte, cudaMemcpyDeviceToHost), "copy d_c to host");

      for (int i = 0; i < N; i++)
            printf("%d + %d = %d\n", h_a[i], h_b[i], h_c[i]);

      printf("Elapsed Time: %f ms\n", elapsedTime);

      // Release the timing events as well as the device buffers.
      checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
      checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

      checkCuda(cudaFree(d_a), "cudaFree(d_a)");
      checkCuda(cudaFree(d_b), "cudaFree(d_b)");
      checkCuda(cudaFree(d_c), "cudaFree(d_c)");

      return 0;
}


83 + 86 = 169
77 + 15 = 92
93 + 35 = 128
86 + 92 = 178
49 + 21 = 70
62 + 27 = 89
90 + 59 = 149
63 + 26 = 89
40 + 26 = 66
72 + 36 = 108
11 + 68 = 79
67 + 29 = 96
82 + 30 = 112
62 + 23 = 85
67 + 35 = 102
29 + 2 = 31
22 + 58 = 80
69 + 67 = 136
93 + 56 = 149
11 + 42 = 53
29 + 73 = 102
21 + 19 = 40
84 + 37 = 121
98 + 24 = 122
15 + 70 = 85
13 + 26 = 39
91 + 80 = 171
56 + 73 = 129
62 + 70 = 132
96 + 81 = 177
5 + 25 = 30
84 + 27 = 111
36 + 5 = 41
46 + 29 = 75
13 + 57 = 70
24 + 95 = 119
82 + 45 = 127
14 + 67 = 81
34 + 64 = 98
43 + 50 = 93
87 + 8 = 95
76 + 78 = 154
88 + 84 = 172
3 + 51 = 54
54 + 99 = 153
32 + 60 = 92
76 + 68 = 144
39 + 12 = 51
26 + 86 = 112
94 + 39 = 133
95 + 70 = 165
34 + 78 = 112
67 + 1 = 68
97 + 2 = 99
17 + 92 = 109
52 + 56 = 108
1 + 80 = 81
86 + 41 = 127
65 + 89 = 154
44 + 19 = 63
40 + 29 = 69
31 + 17 = 48
97 + 71 = 168
81 + 75 = 156
9 + 27 = 36
67 + 56 = 123
97 + 53 = 150
86 + 65 = 151
6 + 83 = 89
19 + 24 = 43
28 + 71 = 99
32 + 29 = 61
3 + 19 = 22
70 + 68 = 138
8 + 15 = 23
