<a href="https://colab.research.google.com/github/kt-chan/cuda-demo/blob/main/cuda_cplusplus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 使用Google Colab寫C++程式並運行

Create a sample C++ source file with the `%%writefile filename.cpp` cell magic.

In [None]:
%%writefile demo.cpp

#include <iostream>
#include <string>
using namespace std;

// Entry point: builds the greeting from a std::string and prints it as one
// newline-terminated line. <string> is included explicitly because the
// program uses std::string and operator+ on it.
int main()
{
    const string text = "world2";
    cout << "hello, " + text << '\n';
    return 0;
}

Overwriting demo.cpp


Compile the code with the `%%shell` cell magic.

In [None]:
%%shell

# Compile demo.cpp into an executable named "demo" in the working directory
g++ demo.cpp -o demo



Run the compiled binary, again using the `%%shell` cell magic.

In [None]:
%%shell
# Run the "demo" executable from the working directory
./demo

Hello World!




# 配置 CUDA Environment

In [None]:
# Show the attached GPU, the driver version, and the CUDA version the driver supports
!nvidia-smi

Thu May 23 03:05:16 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Print the OS release files to identify the distribution and its version
!cat /etc/*release

DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=22.04
DISTRIB_CODENAME=jammy
DISTRIB_DESCRIPTION="Ubuntu 22.04.3 LTS"
PRETTY_NAME="Ubuntu 22.04.3 LTS"
NAME="Ubuntu"
VERSION_ID="22.04"
VERSION="22.04.3 LTS (Jammy Jellyfish)"
VERSION_CODENAME=jammy
ID=ubuntu
ID_LIKE=debian
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
UBUNTU_CODENAME=jammy


In [None]:
# Print the current working directory
!pwd

/content


Remove all legacy CUDA packages, then install the latest version. Installers and instructions are listed at: https://developer.nvidia.com/cuda-downloads

In [None]:
# Remove the pre-installed CUDA / NVIDIA packages before installing a new toolkit.
# -y answers apt's confirmation prompt so the cell runs unattended.
# The package patterns are quoted so the shell hands them to apt unchanged
# instead of expanding them against files in the working directory.
!apt-get -y --purge remove "cuda" "nvidia*" "libnvidia-*"
# Purge whatever cuda-* packages dpkg still lists; xargs -r skips dpkg entirely
# when the list is empty instead of invoking it with no package name.
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -r -n1 dpkg --purge
!apt-get -y remove "cuda-*"
# Drop dependencies that were only needed by the removed packages.
!apt-get -y autoremove
# Refresh the package index for the installation that follows.
!apt-get update

In [None]:
# Install the CUDA 12.5 toolkit from NVIDIA's local repository package for Ubuntu 22.04.
# wget -nc ("no clobber") skips a download whose target file already exists, so
# re-running this cell does not fetch the 3.1 GB installer again or leave a
# duplicate "*.deb.1" copy behind.
!wget -nc https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin
# The pin file sets the apt priority for the CUDA repository.
!sudo mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
!wget -nc https://developer.download.nvidia.com/compute/cuda/12.5.0/local_installers/cuda-repo-ubuntu2204-12-5-local_12.5.0-555.42.02-1_amd64.deb
# Installing the .deb unpacks a local apt repository under /var/cuda-repo-ubuntu2204-12-5-local.
!sudo dpkg -i cuda-repo-ubuntu2204-12-5-local_12.5.0-555.42.02-1_amd64.deb
# Trust the repository's signing key, then install the toolkit from it.
!sudo cp /var/cuda-repo-ubuntu2204-12-5-local/cuda-*-keyring.gpg /usr/share/keyrings/
!sudo apt-get update
!sudo apt-get -y install cuda-toolkit-12-5

--2024-05-22 09:49:35--  https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin
Resolving developer.download.nvidia.com (developer.download.nvidia.com)... 152.195.19.142
Connecting to developer.download.nvidia.com (developer.download.nvidia.com)|152.195.19.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 190 [application/octet-stream]
Saving to: ‘cuda-ubuntu2204.pin’


2024-05-22 09:49:36 (4.09 MB/s) - ‘cuda-ubuntu2204.pin’ saved [190/190]

--2024-05-22 09:49:36--  https://developer.download.nvidia.com/compute/cuda/12.5.0/local_installers/cuda-repo-ubuntu2204-12-5-local_12.5.0-555.42.02-1_amd64.deb
Resolving developer.download.nvidia.com (developer.download.nvidia.com)... 152.195.19.142
Connecting to developer.download.nvidia.com (developer.download.nvidia.com)|152.195.19.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3302514250 (3.1G) [application/x-deb]
Saving to: ‘cuda-repo-ubuntu2204

In [1]:
# After reinstalling the CUDA toolkit, check which nvcc version is found on the PATH

!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


# 使用Google Colab寫 cuda c++ 程式並運行

Set your runtime to use a GPU by clicking "Runtime" -> "Change runtime type" in the toolbar above, and select T4 GPU.

First, install the nvcc4jupyter plugin, which adds notebook magics for compiling and running CUDA code with nvcc.

In [3]:
# %pip installs into the environment of the running kernel; the version is
# pinned to the release this notebook was run with so reruns stay reproducible.
%pip install nvcc4jupyter==1.2.1

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


Then load the plugin.



In [4]:
# Load the nvcc4jupyter extension so its CUDA magics (%%cuda_group_save, %cuda_group_run) are available
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpvfmkalah".


In [63]:
%%cuda_group_save -n demo.cu -g share

#include <cstdio>
#include <iostream>

#define Debug false

using namespace std;

// Fills a[0..n) with pseudo-random values taken modulo n and echoes each value
// to stdout, tab-separated, ending the line with a newline.
// The no-argument rand() called inside is the C library generator; this
// two-argument function is an overload of the same name.
void rand(int *a, int n){
  int idx = 0;
  while (idx < n) {
    const int value = rand() % n;
    a[idx] = value;
    cout << value << "\t";
    ++idx;
  }
  cout << "\n";
}

// Element-wise vector add on the device: c[i] = a[i] + b[i] for i in [0, n).
//   a, b : device input arrays of n ints
//   c    : device output array of n ints
//   n    : number of elements
// One thread handles one element. The element index is built from the block
// and thread indices, so the kernel is correct for any 1-D launch, including
// the single-block launch <<<1, n>>> where blockIdx.x is always 0.
__global__ void vecadd_kernel (int *a, int *b, int *c, int n ) {
  int i = blockIdx.x * blockDim.x + threadIdx.x ; // global element index
  if (i < n ) {  // threads past the end of the arrays do nothing
      if(Debug) printf("calling thread index: %d, \n", i);
      if(Debug) printf("a value: %d, \t", a[i]);
      if(Debug) printf("b value: %d, \t", b[i]);
      c[i] = a[i] + b[i];
      if(Debug) printf("c value: %d, \t", c[i]);
  }
}

// Sequential CPU vector add: stores a[i] + b[i] into c[i] for every i in
// [0, n), in ascending index order. Does nothing when n <= 0.
void cpuadd (int *a, int *b, int *c, int n ) {
  int idx = 0;
  while (idx < n) {
    c[idx] = a[idx] + b[idx];
    ++idx;
  }
}


void gpuadd (int *a, int *b, int *c, int n ) {
  int *d_a , *d_b , *d_c;
  cudaMalloc(&d_a, n*sizeof(int));
  cudaMalloc(&d_b, n*sizeof(int));
  cudaMalloc(&d_c, n*sizeof(int));
  cudaMemcpy(d_a, a, n*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, n*sizeof(int), cudaMemcpyHostToDevice);
  vecadd_kernel<<<1,n>>>(d_a, d_b, d_c, n);
  cudaDeviceSynchronize();
  cudaMemcpy(c, d_c, n*sizeof(int), cudaMemcpyDeviceToHost);
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
}


// Driver: fills two N-element vectors with random values, adds them on the CPU
// and on the GPU, prints both results with their clock() timings, and checks
// that the GPU result equals the CPU result.
// The GPU result goes into its own buffer, pre-filled with -1 (never a valid
// sum here, since inputs are non-negative). Reusing the CPU buffer would let a
// failed GPU run print the leftover CPU sums and look correct.
// Returns 0 on success, 1 on host allocation failure or result mismatch.
int main(void) {

  const int N=1<<8;
  int *a = (int*)malloc(N*sizeof (int));
  int *b = (int*)malloc(N*sizeof (int));
  int *c = (int*)malloc(N*sizeof (int));      // CPU result
  int *c_gpu = (int*)malloc(N*sizeof (int));  // GPU result
  if (!a || !b || !c || !c_gpu) {
    fprintf(stderr, "host allocation failed\n");
    free(a);
    free(b);
    free(c);
    free(c_gpu);
    return 1;
  }

  cout << "a:\t";
  rand(a, N);
  cout << "b:\t";
  rand(b, N);

  cout << "\n\n@CPU, summing value... \n";

  clock_t t;

  // calling cpu
  t = clock();      // start time
  cpuadd(a,b,c,N);
  t = clock() - t;  // elapsed = end time - start time

  for (int i = 0; i < N; i++) {
    cout << c[i] << "\t";
  }
  cout << "\n";

  printf ("CPU Avg time = %lf ms.\n", 1000.0 * (double)t / CLOCKS_PER_SEC);

  // calling gpu
  cout << "\n\n@GPU, summing value... \n";

  for (int i = 0; i < N; i++) {
    c_gpu[i] = -1;  // sentinel: stays in place if the GPU writes nothing
  }

  // The timed interval covers everything gpuadd does (device allocation,
  // host<->device copies and the kernel), not the kernel alone.
  t = clock();      // start time
  gpuadd(a,b,c_gpu,N);
  t = clock() - t;  // elapsed = end time - start time

  for (int i = 0; i < N; i++) {
    cout << c_gpu[i] << "\t";
  }
  cout << "\n";

  printf ("GPU Avg time = %lf ms.\n", 1000.0 * (double)t / CLOCKS_PER_SEC);

  // Verify the GPU result against the CPU result.
  int mismatches = 0;
  for (int i = 0; i < N; i++) {
    if (c[i] != c_gpu[i]) mismatches++;
  }
  if (mismatches == 0) {
    printf ("GPU result matches CPU result.\n");
  } else {
    printf ("GPU result differs from CPU result in %d of %d elements.\n", mismatches, N);
  }

  free(a);
  free(b);
  free(c);
  free(c_gpu);

  return mismatches == 0 ? 0 : 1;
}

In [64]:
# Compile and run the source files saved in the "share" group
%cuda_group_run -g "share"

a:	103	198	105	115	81	255	74	236	41	205	186	171	242	251	227	70	124	194	84	248	27	232	231	141	118	90	46	99	51	159	201	154	102	50	13	183	49	88	163	90	37	93	5	23	88	233	94	212	171	178	205	198	155	180	84	17	14	130	116	65	33	61	220	135	112	233	62	161	65	225	252	103	62	1	126	151	234	220	107	150	143	56	92	42	236	176	59	251	50	175	60	84	236	24	219	92	2	26	254	67	251	250	170	58	251	41	209	230	5	60	124	148	117	216	190	97	137	249	92	187	168	153	15	149	177	235	241	179	5	239	247	0	233	161	58	229	202	11	203	208	72	71	100	189	31	35	30	168	28	123	100	197	20	115	90	197	94	75	121	99	59	112	100	36	17	158	9	220	170	212	172	242	27	16	175	59	51	205	227	80	72	71	21	92	187	111	34	25	186	155	125	245	11	225	26	28	127	35	248	41	248	164	27	19	181	202	78	232	152	50	56	224	121	77	61	52	188	95	78	119	250	203	108	5	172	134	33	43	170	26	85	162	190	112	181	115	59	4	92	211	54	148	179	175	226	240	228	158	79	50	21	73	253	130	78	169	
b:	8	112	212	178	138	41	84	72	154	10	188	213	14	24	168	68	172	91	243	142	76	215	45	155	9	6

CUDA example program: finding the maximum element of an array on the GPU. See the nvcc4jupyter documentation for the magic syntax:
https://nvcc4jupyter.readthedocs.io/en/latest/magics.html#cuda-magic

Example:
`%%cuda_group_save -n <FILENAME> -g <GROUPNAME>`: saves the code in the current cell to a group of source files.

In [None]:
%%cuda_group_save -n demo.cu -g share

#include <cstdio>
#include <iostream>

using namespace std;

#define N 64

// Per-block maximum: block k writes to b[k] the largest value found in
// a[256*k .. min(256*k + 256, n)). A block whose range is empty (256*k >= n)
// writes 0.
//   a : device input array of n ints
//   b : device output array with one int per launched block
//   n : number of valid elements in a
// The running maximum starts from the first element of the range rather than
// from 0, so ranges holding only negative values are handled correctly.
// The threads of a block split the range between them (any 1-D, 2-D or 3-D
// block shape) and merge their partial maxima with atomicMax on a shared
// variable, instead of every thread rescanning the whole range.
__global__ void maxi(int* a, int* b, int n)
{
	const int chunk = 256;  // elements reduced by one block
	const int first = chunk * blockIdx.x;
	const int last = min(first + chunk, n);

	// Linear thread id and thread count within the block.
	const int tid = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
	const int nthreads = blockDim.x * blockDim.y * blockDim.z;

	__shared__ int blockMax;
	if (tid == 0) {
		blockMax = (first < last) ? a[first] : 0;
	}
	__syncthreads();  // blockMax is initialised before anyone updates it

	if (first + tid < last) {  // this thread owns at least one element
		int localMax = a[first + tid];
		for (int i = first + tid + nthreads; i < last; i += nthreads) {
			if (localMax < a[i]) {
				localMax = a[i];
			}
		}
		atomicMax(&blockMax, localMax);
	}
	__syncthreads();  // all partial maxima are merged before the write-back

	if (tid == 0) {
		b[blockIdx.x] = blockMax;
	}
}

int main()
{

	int a[N];

  cout << N << "\n";
	for (int i = 0; i < N; i++) {
		a[i] = rand() % N;
		# output for debugg the a array
		# cout << a[i] << "\t";
	}

	cudaEvent_t start, end;
	int *ad, *bd;
	int size = N * sizeof(int);
	cudaMalloc(&ad, size);
	cudaMemcpy(ad, a, size, cudaMemcpyHostToDevice);
	int grids = ceil(N * 1.0f / 16.0f);
	cudaMalloc(&bd, grids * sizeof(int));

	dim3 grid(grids, 1);
	dim3 block(4, 4);

	cudaEventCreate(&start);
	cudaEventCreate(&end);
	cudaEventRecord(start);

	while (N > 1) {
		maxi<<<grids, block>>>(ad, bd, N);
		N = ceil(N * 1.0f / 16.0f);
		cudaMemcpy(ad, bd, n * sizeof(int), cudaMemcpyDeviceToDevice);
	}

	cudaEventRecord(end);
	cudaEventSynchronize(end);

	float time = 0;
	cudaEventElapsedTime(&time, start, end);

	int ans[2];
	cudaMemcpy(ans, ad, 4, cudaMemcpyDeviceToHost);

	cout << "The maximum element is : " << ans[0] << endl;

	cout << "The time required : ";
	cout << time << endl;
}


In [None]:
# Compile and run the source files saved in the "share" group
%cuda_group_run -g "share"

Hello, n
