## **Exercise: Hello world CUDA extended**

Modify the Hello world CUDA example from the previous step to complete the following tasks:

- define 2 blocks with 4 threads each
- extend the "Hello World" message so that each thread also reports its own thread number within its block (hint: use the built-in variable ```threadIdx.x```)



In [1]:
!nvidia-smi

Fri Feb  4 20:51:34 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.85       Driver Version: 472.47       CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro P400         Off  | 00000000:01:00.0 Off |                  N/A |
| 34%   28C    P8    N/A /  N/A |    108MiB /  2048MiB |    ERR!      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [2]:
 %%file hello.cu
#include <stdio.h>
 
#define NUM_BLOCKS 4
#define BLOCK_SIZE 1
 
// Kernel: every launched thread prints one greeting that carries the index of
// the block it belongs to (blockIdx.x). It takes no arguments and touches no
// memory; device-side printf is its only effect.
__global__ void hello()
{
    const int block = static_cast<int>(blockIdx.x);
    printf("Hello world! I'm a thread in block %d\n", block);
}
 
 
// Host entry point: launches hello() on NUM_BLOCKS blocks of BLOCK_SIZE
// threads, waits for the device to finish, then prints a closing message.
// Returns 0 on success and 1 if the launch or the kernel execution failed.
int main(int argc,char **argv)
{
    // launch the kernel
    hello<<<NUM_BLOCKS, BLOCK_SIZE>>>();

    // A kernel launch does not return a status itself; launch failures have to
    // be fetched explicitly, otherwise the program would silently print nothing.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // force the printf()s to flush; errors raised while the kernel was running
    // are reported by this call
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    printf("That's all!\n");

    return 0;
}

Overwriting hello.cu


In [3]:
!PATH=/usr/local/cuda-10.1/bin:${PATH} nvcc -o hello_cuda hello.cu && ./hello_cuda

Hello world! I'm a thread in block 0
Hello world! I'm a thread in block 1
Hello world! I'm a thread in block 2
Hello world! I'm a thread in block 3
That's all!


You can compare with our solution:

In [4]:
 %%file hello_exercise.cu
#include <stdio.h>
 
#define NUM_BLOCKS 2
#define BLOCK_SIZE 4
 
// Kernel: every launched thread prints one greeting containing its own index
// inside the block (threadIdx.x) followed by the index of its block
// (blockIdx.x). Device-side printf is its only effect.
__global__ void hello()
{
    printf("Hello world! I'm a thread %d in block %d\n",
           static_cast<int>(threadIdx.x),
           static_cast<int>(blockIdx.x));
}
 
 
// Host entry point: launches hello() on NUM_BLOCKS blocks of BLOCK_SIZE
// threads, waits for the device to finish, then prints a closing message.
// Returns 0 on success and 1 if the launch or the kernel execution failed.
int main(int argc,char **argv)
{
    // launch the kernel
    hello<<<NUM_BLOCKS, BLOCK_SIZE>>>();

    // A kernel launch does not return a status itself; launch failures have to
    // be fetched explicitly, otherwise the program would silently print nothing.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // force the printf()s to flush; errors raised while the kernel was running
    // are reported by this call
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    printf("That's all!\n");

    return 0;
}

Overwriting hello_exercise.cu


In [5]:
 !PATH=/usr/local/cuda-10.1/bin:${PATH} nvcc -o hello_cuda_exercise hello_exercise.cu && ./hello_cuda_exercise

Hello world! I'm a thread 0 in block 0
Hello world! I'm a thread 1 in block 0
Hello world! I'm a thread 2 in block 0
Hello world! I'm a thread 3 in block 0
Hello world! I'm a thread 0 in block 1
Hello world! I'm a thread 1 in block 1
Hello world! I'm a thread 2 in block 1
Hello world! I'm a thread 3 in block 1
That's all!


## Exercise: Hello world OpenCL extended

Modify the Hello world OpenCL example from the previous step to complete the following tasks:

- define 2 blocks (work-groups) with 4 threads (work-items) each
- extend the "Hello World" message so that each thread (work-item) also reports its own number within its block (work-group) (hint: use the built-in functions ```get_group_id(0)``` for work-groups and ```get_local_id(0)``` for work-items)

In [6]:
%%file hello.cl
// Kernel: every work-item prints one greeting that carries the index of the
// work-group ("block") it belongs to.
__kernel void hello() {
    // The message reports a block, so query the work-group index. The global
    // work-item index only coincides with it while the local size is 1.
    int group = (int)get_group_id(0);
    printf("Hello world! I'm a thread in block %d\n", group);
}

Overwriting hello.cl


In [7]:
 %%file hello_CL.c
#include <CL/cl.h>

#include <stdio.h>
#include <stdlib.h>

#define MAX_SOURCE_SIZE (0x100000)

#define GLOBAl_SIZE 4
#define LOCAL_SIZE 1

int main(int argc, char ** argv) {

	// Load kernel from file hello.cl

	FILE *kernelFile;
	char *kernelSource;
	size_t kernelSize;

	kernelFile = fopen("hello.cl", "r");

	if (!kernelFile) {

		fprintf(stderr, "No file named hello.cl was found\n");

		exit(-1);

	}
	kernelSource = (char*)malloc(MAX_SOURCE_SIZE);
	kernelSize = fread(kernelSource, 1, MAX_SOURCE_SIZE, kernelFile);
	fclose(kernelFile);

	// Getting platform and device information
	cl_platform_id platformId = NULL;
	cl_device_id deviceID = NULL;
	cl_uint retNumDevices;
	cl_uint retNumPlatforms;
	cl_int ret = clGetPlatformIDs(1, &platformId, &retNumPlatforms);
	ret = clGetDeviceIDs(platformId, CL_DEVICE_TYPE_DEFAULT, 1, &deviceID, &retNumDevices);

	// Creating context.
	cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL,  &ret);

	// Creating command queue
	cl_command_queue commandQueue = clCreateCommandQueue(context, deviceID, 0, &ret);

	// Create program from kernel source
	cl_program program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, (const size_t *)&kernelSize, &ret);	

	// Build program
	ret = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL);

	// Create kernel
	cl_kernel kernel = clCreateKernel(program, "hello", &ret);

	// Execute the kernel
	size_t globalItemSize = GLOBAl_SIZE;
	size_t localItemSize = LOCAL_SIZE;
	ret = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, NULL);

  printf("That's all!\n");		

	// Clean up, release memory.
	ret = clFlush(commandQueue);
	ret = clFinish(commandQueue);
	ret = clReleaseCommandQueue(commandQueue);
	ret = clReleaseKernel(kernel);
	ret = clReleaseProgram(program);	
	ret = clReleaseContext(context);

	return 0;
}

Overwriting hello_CL.c


In [8]:
 !nvcc -o hello_CL hello_CL.c -lOpenCL && ./hello_CL

Hello world! I'm a thread in block 3
Hello world! I'm a thread in block 0
Hello world! I'm a thread in block 1
Hello world! I'm a thread in block 2
That's all!


You can compare with our solution:

In [9]:
%%file hello_exercise.cl
// Kernel: every work-item prints one greeting containing its index inside its
// work-group (get_local_id) followed by the index of that work-group
// (get_group_id), both taken along dimension 0.
__kernel void hello() {
    const int workGroup = (int)get_group_id(0);
    const int workItem = (int)get_local_id(0);
    printf("Hello world! I'm a thread %d in block %d\n", workItem, workGroup);
}

Writing hello_exercise.cl


In [10]:
 %%file hello_CL_exercise.c
#include <CL/cl.h>

#include <stdio.h>
#include <stdlib.h>

#define MAX_SOURCE_SIZE (0x100000)

#define GLOBAl_SIZE 8
#define LOCAL_SIZE 4

int main(int argc, char ** argv) {

	// Load kernel from file hello.cl

	FILE *kernelFile;
	char *kernelSource;
	size_t kernelSize;

	kernelFile = fopen("hello_exercise.cl", "r");

	if (!kernelFile) {

		fprintf(stderr, "No file named hello_exercise.cl was found\n");

		exit(-1);

	}
	kernelSource = (char*)malloc(MAX_SOURCE_SIZE);
	kernelSize = fread(kernelSource, 1, MAX_SOURCE_SIZE, kernelFile);
	fclose(kernelFile);

	// Getting platform and device information
	cl_platform_id platformId = NULL;
	cl_device_id deviceID = NULL;
	cl_uint retNumDevices;
	cl_uint retNumPlatforms;
	cl_int ret = clGetPlatformIDs(1, &platformId, &retNumPlatforms);
	ret = clGetDeviceIDs(platformId, CL_DEVICE_TYPE_DEFAULT, 1, &deviceID, &retNumDevices);

	// Creating context.
	cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL,  &ret);

	// Creating command queue
	cl_command_queue commandQueue = clCreateCommandQueue(context, deviceID, 0, &ret);

	// Create program from kernel source
	cl_program program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, (const size_t *)&kernelSize, &ret);	

	// Build program
	ret = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL);

	// Create kernel
	cl_kernel kernel = clCreateKernel(program, "hello", &ret);

	// Execute the kernel
	size_t globalItemSize = GLOBAl_SIZE;
	size_t localItemSize = LOCAL_SIZE;
	ret = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, NULL);

  printf("That's all!\n");		

	// Clean up, release memory.
	ret = clFlush(commandQueue);
	ret = clFinish(commandQueue);
	ret = clReleaseCommandQueue(commandQueue);
	ret = clReleaseKernel(kernel);
	ret = clReleaseProgram(program);	
	ret = clReleaseContext(context);

	return 0;
}

Writing hello_CL_exercise.c


In [11]:
 !nvcc -o hello_CL_exercise hello_CL_exercise.c -lOpenCL && ./hello_CL_exercise

Hello world! I'm a thread 0 in block 0
Hello world! I'm a thread 1 in block 0
Hello world! I'm a thread 2 in block 0
Hello world! I'm a thread 3 in block 0
Hello world! I'm a thread 0 in block 1
Hello world! I'm a thread 1 in block 1
Hello world! I'm a thread 2 in block 1
Hello world! I'm a thread 3 in block 1
That's all!
