<a href="https://colab.research.google.com/github/hyeonji0401/CUDA_practice/blob/main/performance_measurements.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **성능 측정 테스트**


이전에 벡터 합의 성능을 측정할 때 사용한 chrono 방식과 cudaEvent 방식으로 각각 측정한 뒤, 두 결과를 비교해 보기


**1. chrono 방식**

In [12]:
%%cuda
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <chrono>
#include <iostream>

// The size of the vector
#define NUM_DATA 1024

// Element-wise vector addition: _c[i] = _a[i] + _b[i].
// Each thread handles the element selected by threadIdx.x alone, so the kernel
// is meant for a single-block launch (hence the 1024-element limit). There is
// no bounds check: launch exactly one thread per element.
__global__ void vecAdd(int* _a, int* _b, int* _c) {
	const int idx = threadIdx.x;
	const int sum = _a[idx] + _b[idx];
	_c[idx] = sum;
}

// Aborts the program with a file/line diagnostic when a CUDA runtime call fails.
#define CUDA_CHECK(call) \
	do { \
		cudaError_t status_ = (call); \
		if (status_ != cudaSuccess) { \
			fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
				cudaGetErrorString(status_)); \
			exit(EXIT_FAILURE); \
		} \
	} while (0)

/*
 * Times a NUM_DATA-element vector addition on the host and on the GPU using
 * std::chrono (host wall-clock), prints the per-phase timings in milliseconds,
 * and verifies the GPU result against the host reference.
 *
 * The kernel phase is closed with cudaDeviceSynchronize() so that the host
 * timestamp is taken only after the kernel has finished.
 */
int main(void)
{
	using Clock = std::chrono::high_resolution_clock;
	using Millis = std::chrono::duration<double, std::milli>;

	int* a, * b, * c, * hc;	// Vectors on the host (hc = host reference result)
	int* da, * db, * dc;	// Vectors on the device

	int memSize = sizeof(int) * NUM_DATA;
	printf("%d elements, memSize = %d bytes\n", NUM_DATA, memSize);

	// Memory allocation on the host-side (zero-filled)
	a = new int[NUM_DATA]; memset(a, 0, memSize);
	b = new int[NUM_DATA]; memset(b, 0, memSize);
	c = new int[NUM_DATA]; memset(c, 0, memSize);
	hc = new int[NUM_DATA]; memset(hc, 0, memSize);

	// Data generation
	for (int i = 0; i < NUM_DATA; i++) {
		a[i] = rand() % 10;
		b[i] = rand() % 10;
	}

	// Vector sum on host (for performance comparison)
	auto hostStart = Clock::now();
	for (int i = 0; i < NUM_DATA; i++)
		hc[i] = a[i] + b[i];
	auto hostEnd = Clock::now();
	// Elapsed time in milliseconds (fractional part kept)
	Millis hostElapsed = hostEnd - hostStart;

	// Memory allocation on the device-side (outside the timed region)
	CUDA_CHECK(cudaMalloc(&da, memSize)); CUDA_CHECK(cudaMemset(da, 0, memSize));
	CUDA_CHECK(cudaMalloc(&db, memSize)); CUDA_CHECK(cudaMemset(db, 0, memSize));
	CUDA_CHECK(cudaMalloc(&dc, memSize)); CUDA_CHECK(cudaMemset(dc, 0, memSize));

	auto GPUstart = Clock::now();

	// Data copy : Host -> Device
	auto h2dStart = Clock::now();
	CUDA_CHECK(cudaMemcpy(da, a, memSize, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(db, b, memSize, cudaMemcpyHostToDevice));
	auto h2dEnd = Clock::now();
	Millis h2dElapsed = h2dEnd - h2dStart;

	// Kernel call
	auto kernelStart = Clock::now();
	vecAdd <<<1, NUM_DATA >>> (da, db, dc);
	// Wait for the kernel to finish before taking the end timestamp.
	cudaError_t syncStatus = cudaDeviceSynchronize();
	auto kernelEnd = Clock::now();
	Millis kernelElapsed = kernelEnd - kernelStart;
	// Error checks are done after the timestamp so they stay out of the
	// measured kernel interval.
	CUDA_CHECK(cudaGetLastError());	// launch-configuration errors
	CUDA_CHECK(syncStatus);			// errors raised while the kernel ran

	// Copy results : Device -> Host
	auto d2hStart = Clock::now();
	CUDA_CHECK(cudaMemcpy(c, dc, memSize, cudaMemcpyDeviceToHost));
	auto d2hEnd = Clock::now();
	Millis d2hElapsed = d2hEnd - d2hStart;

	auto GPUend = Clock::now();
	Millis GPUelapsed = GPUend - GPUstart;

	// Release device memory
	CUDA_CHECK(cudaFree(da));
	CUDA_CHECK(cudaFree(db));
	CUDA_CHECK(cudaFree(dc));

	// Print results (milliseconds, with fractional part)
	std::cout << "Host time: " << hostElapsed.count() << " ms" << std::endl;
	std::cout << "Host -> Device: " << h2dElapsed.count() << " ms" << std::endl;
	std::cout << "Kernel: " << kernelElapsed.count() << " ms" << std::endl;
	std::cout << "Device -> Host: " << d2hElapsed.count() << " ms" << std::endl;
	std::cout << "CUDA Total Time: " << GPUelapsed.count() << " ms" << std::endl;

	// Check results: every GPU element must equal the host reference
	bool result = true;
	for (int i = 0; i < NUM_DATA; i++) {
		if (hc[i] != c[i]) {
			printf("[%d] The result is not matched! (%d, %d)\n"
				, i, hc[i], c[i]);
			result = false;
		}
	}

	if (result)
		printf("GPU works well!\n");

	// Release host memory, including the host reference buffer hc
	delete[] a; delete[] b; delete[] c; delete[] hc;

	return 0;
}

1024 elements, memSize = 4096 bytes
Host time: 0.002938 ms
Host -> Device: 0.022235 ms
Kernel: 0.152926 ms
Device -> Host: 0.016128 ms
CUDA Total Time: 0.191843 ms
GPU works well!



**3번 실행 결과 평균**

**Host :** 0.003306 ms

**Host -> Device :** 0.072495 ms

**Kernel:** 0.10874393 ms

**Device -> Host:** 0.0171623 ms

**CUDA Total time:** 0.210348 ms

**2. CUDA Event 방식**

> CUDA event 방식으로는 CPU(호스트) 코드의 성능을 측정할 수 없으므로, 호스트 연산은 그대로 chrono로 측정하고 그 외 구간만 CUDA event로 측정

In [16]:
%%cuda
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <chrono>
#include <iostream>

// The size of the vector
#define NUM_DATA 1024

// Element-wise vector addition: _c[i] = _a[i] + _b[i].
// Each thread handles the element selected by threadIdx.x alone, so the kernel
// is meant for a single-block launch (hence the 1024-element limit). There is
// no bounds check: launch exactly one thread per element.
__global__ void vecAdd(int* _a, int* _b, int* _c) {
	const int idx = threadIdx.x;
	const int sum = _a[idx] + _b[idx];
	_c[idx] = sum;
}

// Aborts the program with a file/line diagnostic when a CUDA runtime call fails.
#define CUDA_CHECK(call) \
	do { \
		cudaError_t status_ = (call); \
		if (status_ != cudaSuccess) { \
			fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
				cudaGetErrorString(status_)); \
			exit(EXIT_FAILURE); \
		} \
	} while (0)

/*
 * Times a NUM_DATA-element vector addition: the host loop with std::chrono and
 * the GPU phases (H2D copy, kernel, D2H copy, total) with CUDA events recorded
 * on stream 0. Prints the timings in milliseconds and verifies the GPU result
 * against the host reference.
 */
int main(void)
{

	int* a, * b, * c, * hc;	// Vectors on the host (hc = host reference result)
	int* da, * db, * dc;	// Vectors on the device

	int memSize = sizeof(int) * NUM_DATA;
	printf("%d elements, memSize = %d bytes\n", NUM_DATA, memSize);

	// Memory allocation on the host-side (zero-filled)
	a = new int[NUM_DATA]; memset(a, 0, memSize);
	b = new int[NUM_DATA]; memset(b, 0, memSize);
	c = new int[NUM_DATA]; memset(c, 0, memSize);
	hc = new int[NUM_DATA]; memset(hc, 0, memSize);

	// Data generation
	for (int i = 0; i < NUM_DATA; i++) {
		a[i] = rand() % 10;
		b[i] = rand() % 10;
	}

	// Vector sum on host (for performance comparison)
	auto start = std::chrono::high_resolution_clock::now();
	for (int i = 0; i < NUM_DATA; i++)
		hc[i] = a[i] + b[i];
	auto end = std::chrono::high_resolution_clock::now();
	// Elapsed time in milliseconds (fractional part kept)
	std::chrono::duration<double, std::milli> elapsed = end - start;

	// One start/stop event pair per measured phase
	cudaEvent_t totalStart, totalStop, h2dStart, h2dStop, kernelStart, kernelStop, d2hStart, d2hStop;
	CUDA_CHECK(cudaEventCreate(&totalStart));
	CUDA_CHECK(cudaEventCreate(&totalStop));
	CUDA_CHECK(cudaEventCreate(&h2dStart));
	CUDA_CHECK(cudaEventCreate(&h2dStop));
	CUDA_CHECK(cudaEventCreate(&kernelStart));
	CUDA_CHECK(cudaEventCreate(&kernelStop));
	CUDA_CHECK(cudaEventCreate(&d2hStart));
	CUDA_CHECK(cudaEventCreate(&d2hStop));

	// Memory allocation on the device-side (outside the timed region)
	CUDA_CHECK(cudaMalloc(&da, memSize)); CUDA_CHECK(cudaMemset(da, 0, memSize));
	CUDA_CHECK(cudaMalloc(&db, memSize)); CUDA_CHECK(cudaMemset(db, 0, memSize));
	CUDA_CHECK(cudaMalloc(&dc, memSize)); CUDA_CHECK(cudaMemset(dc, 0, memSize));

	CUDA_CHECK(cudaEventRecord(totalStart, 0));

	// Data copy : Host -> Device
	CUDA_CHECK(cudaEventRecord(h2dStart, 0));
	CUDA_CHECK(cudaMemcpy(da, a, memSize, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(db, b, memSize, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaEventRecord(h2dStop, 0));
	// The stop event must have completed before its timestamp can be read.
	CUDA_CHECK(cudaEventSynchronize(h2dStop));
	float h2dTime;
	CUDA_CHECK(cudaEventElapsedTime(&h2dTime, h2dStart, h2dStop));

	// Kernel call
	CUDA_CHECK(cudaEventRecord(kernelStart, 0));
	vecAdd <<<1, NUM_DATA >>> (da, db, dc);
	CUDA_CHECK(cudaEventRecord(kernelStop, 0));
	CUDA_CHECK(cudaGetLastError());	// launch-configuration errors
	CUDA_CHECK(cudaEventSynchronize(kernelStop));
	float kernelTime;
	CUDA_CHECK(cudaEventElapsedTime(&kernelTime, kernelStart, kernelStop));

	// Copy results : Device -> Host
	CUDA_CHECK(cudaEventRecord(d2hStart, 0));
	CUDA_CHECK(cudaMemcpy(c, dc, memSize, cudaMemcpyDeviceToHost));
	CUDA_CHECK(cudaEventRecord(d2hStop, 0));
	CUDA_CHECK(cudaEventSynchronize(d2hStop));
	float d2hTime;
	CUDA_CHECK(cudaEventElapsedTime(&d2hTime, d2hStart, d2hStop));

	CUDA_CHECK(cudaEventRecord(totalStop, 0));
	CUDA_CHECK(cudaEventSynchronize(totalStop));
	float totalTime;
	CUDA_CHECK(cudaEventElapsedTime(&totalTime, totalStart, totalStop));

	// Release the timing events
	CUDA_CHECK(cudaEventDestroy(totalStart));
	CUDA_CHECK(cudaEventDestroy(totalStop));
	CUDA_CHECK(cudaEventDestroy(h2dStart));
	CUDA_CHECK(cudaEventDestroy(h2dStop));
	CUDA_CHECK(cudaEventDestroy(kernelStart));
	CUDA_CHECK(cudaEventDestroy(kernelStop));
	CUDA_CHECK(cudaEventDestroy(d2hStart));
	CUDA_CHECK(cudaEventDestroy(d2hStop));

	// Release device memory
	CUDA_CHECK(cudaFree(da));
	CUDA_CHECK(cudaFree(db));
	CUDA_CHECK(cudaFree(dc));

	// Print results (milliseconds, with fractional part)
	std::cout << "Host time: " << elapsed.count() << " ms" << std::endl;
	std::cout << "Host -> Device: " << h2dTime << " ms" << std::endl;
	std::cout << "Kernel: " << kernelTime << " ms" << std::endl;
	std::cout << "Device -> Host: " << d2hTime << " ms" << std::endl;
	std::cout << "CUDA Total Time: " << totalTime << " ms" << std::endl;

	// Check results: every GPU element must equal the host reference
	bool result = true;
	for (int i = 0; i < NUM_DATA; i++) {
		if (hc[i] != c[i]) {
			printf("[%d] The result is not matched! (%d, %d)\n"
				, i, hc[i], c[i]);
			result = false;
		}
	}

	if (result)
		printf("GPU works well!\n");

	// Release host memory, including the host reference buffer hc
	delete[] a; delete[] b; delete[] c; delete[] hc;

	return 0;
}

1024 elements, memSize = 4096 bytes
Host time: 0.002987 ms
Host -> Device: 0.034208 ms
Kernel: 0.191776 ms
Device -> Host: 0.01856 ms
CUDA Total Time: 0.273536 ms
GPU works well!



**3번 실행 결과 평균**

**Host -> Device :** 0.0260293 ms

**Kernel:** 0.1693626667 ms

**Device -> Host:** 0.01904 ms

**CUDA Total time:** 0.2434773 ms

**만약 chrono 방식에서 동기화가 이루어지지 않는다면?**

In [19]:
%%cuda
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <chrono>
#include <iostream>

// The size of the vector
#define NUM_DATA 1024

// Element-wise vector addition: _c[i] = _a[i] + _b[i].
// Each thread handles the element selected by threadIdx.x alone, so the kernel
// is meant for a single-block launch (hence the 1024-element limit). There is
// no bounds check: launch exactly one thread per element.
__global__ void vecAdd(int* _a, int* _b, int* _c) {
	const int idx = threadIdx.x;
	const int sum = _a[idx] + _b[idx];
	_c[idx] = sum;
}

// Aborts the program with a file/line diagnostic when a CUDA runtime call fails.
#define CUDA_CHECK(call) \
	do { \
		cudaError_t status_ = (call); \
		if (status_ != cudaSuccess) { \
			fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
				cudaGetErrorString(status_)); \
			exit(EXIT_FAILURE); \
		} \
	} while (0)

/*
 * Same std::chrono measurement as the synchronized variant, but deliberately
 * WITHOUT an explicit synchronization after the kernel launch, to observe how
 * the reported timings change. Prints the per-phase timings in milliseconds
 * and verifies the GPU result against the host reference.
 */
int main(void)
{
	using Clock = std::chrono::high_resolution_clock;
	using Millis = std::chrono::duration<double, std::milli>;

	int* a, * b, * c, * hc;	// Vectors on the host (hc = host reference result)
	int* da, * db, * dc;	// Vectors on the device

	int memSize = sizeof(int) * NUM_DATA;
	printf("%d elements, memSize = %d bytes\n", NUM_DATA, memSize);

	// Memory allocation on the host-side (zero-filled)
	a = new int[NUM_DATA]; memset(a, 0, memSize);
	b = new int[NUM_DATA]; memset(b, 0, memSize);
	c = new int[NUM_DATA]; memset(c, 0, memSize);
	hc = new int[NUM_DATA]; memset(hc, 0, memSize);

	// Data generation
	for (int i = 0; i < NUM_DATA; i++) {
		a[i] = rand() % 10;
		b[i] = rand() % 10;
	}

	// Vector sum on host (for performance comparison)
	auto hostStart = Clock::now();
	for (int i = 0; i < NUM_DATA; i++)
		hc[i] = a[i] + b[i];
	auto hostEnd = Clock::now();
	// Elapsed time in milliseconds (fractional part kept)
	Millis hostElapsed = hostEnd - hostStart;

	// Memory allocation on the device-side (outside the timed region)
	CUDA_CHECK(cudaMalloc(&da, memSize)); CUDA_CHECK(cudaMemset(da, 0, memSize));
	CUDA_CHECK(cudaMalloc(&db, memSize)); CUDA_CHECK(cudaMemset(db, 0, memSize));
	CUDA_CHECK(cudaMalloc(&dc, memSize)); CUDA_CHECK(cudaMemset(dc, 0, memSize));

	auto GPUstart = Clock::now();

	// Data copy : Host -> Device
	auto h2dStart = Clock::now();
	CUDA_CHECK(cudaMemcpy(da, a, memSize, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(db, b, memSize, cudaMemcpyHostToDevice));
	auto h2dEnd = Clock::now();
	Millis h2dElapsed = h2dEnd - h2dStart;

	// Kernel call -- intentionally no synchronization before the end timestamp,
	// so this interval covers only what the host spends issuing the launch.
	auto kernelStart = Clock::now();
	vecAdd <<<1, NUM_DATA >>> (da, db, dc);
	auto kernelEnd = Clock::now();
	Millis kernelElapsed = kernelEnd - kernelStart;
	// Launch-configuration check only; cudaGetLastError() does not wait for the
	// kernel, so the experiment stays unsynchronized.
	CUDA_CHECK(cudaGetLastError());

	// Copy results : Device -> Host
	auto d2hStart = Clock::now();
	CUDA_CHECK(cudaMemcpy(c, dc, memSize, cudaMemcpyDeviceToHost));
	auto d2hEnd = Clock::now();
	Millis d2hElapsed = d2hEnd - d2hStart;

	auto GPUend = Clock::now();
	Millis GPUelapsed = GPUend - GPUstart;

	// Release device memory
	CUDA_CHECK(cudaFree(da));
	CUDA_CHECK(cudaFree(db));
	CUDA_CHECK(cudaFree(dc));

	// Print results (milliseconds, with fractional part)
	std::cout << "Host time: " << hostElapsed.count() << " ms" << std::endl;
	std::cout << "Host -> Device: " << h2dElapsed.count() << " ms" << std::endl;
	std::cout << "Kernel: " << kernelElapsed.count() << " ms" << std::endl;
	std::cout << "Device -> Host: " << d2hElapsed.count() << " ms" << std::endl;
	std::cout << "CUDA Total Time: " << GPUelapsed.count() << " ms" << std::endl;

	// Check results: every GPU element must equal the host reference
	bool result = true;
	for (int i = 0; i < NUM_DATA; i++) {
		if (hc[i] != c[i]) {
			printf("[%d] The result is not matched! (%d, %d)\n"
				, i, hc[i], c[i]);
			result = false;
		}
	}

	if (result)
		printf("GPU works well!\n");

	// Release host memory, including the host reference buffer hc
	delete[] a; delete[] b; delete[] c; delete[] hc;

	return 0;
}

1024 elements, memSize = 4096 bytes
Host time: 0.002958 ms
Host -> Device: 0.022639 ms
Kernel: 0.141264 ms
Device -> Host: 0.019987 ms
CUDA Total Time: 0.184389 ms
GPU works well!



**별 차이가 없다. 이유를 생각해보자면?**

**cudaMemcpy** 때문인 것 같다.

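cudaMemcpy는 호스트를 블로킹하는 동기 함수이고, 같은 기본 스트림에서 앞서 실행된 커널이 끝난 뒤에 수행된다. 그래서 간단한 연산에서는 굳이 동기화를 명시하지 않아도 Device -> Host 복사 시점에 동기화가 이루어지고, 결과 값과 전체 시간에도 차이가 없는 것 같다. 다만 Kernel 구간의 종료 시각은 cudaMemcpy 호출 이전에 기록되므로, 동기화가 없을 때의 Kernel 시간은 커널 실행 시간이 아니라 커널 호출(런치) 비용에 가깝다. 연산량이 매우 작아 실행 시간이 런치 비용에 비해 미미하기 때문에 두 측정값이 비슷하게 나온 것으로 보인다.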