<a href="https://colab.research.google.com/github/hyeonji0401/CUDA_practice/blob/main/CUDA_based_matrix_multiplication_program.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 7.1 행렬곱셈이란?
- 행렬 A 크기가 m*n, B의 크기가 k*n일때 행렬 C의 크기는 m*n이 됨
- 본 장에서 작성하고자 하는 행렬 곱셈 프로그램은 하나의 스레드 블록으로 처리할 수 없는 대규모 행렬의 연산을 목적으로 함(m,n,k의 각 크기가 256이상인 경우

In [17]:
%%cuda
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <chrono>
#include <iostream>

#define DO_CPU
#define DATA_TYPE int

#define SIZE_M (512*2)
#define SIZE_N (512*4)
#define SIZE_K (512*2)

template<class T> void allocNinitMem(T** p, long long size, double* memUsage = NULL);
bool compareMatrix(DATA_TYPE* _A, DATA_TYPE* _B, int _size);

/******************************************************************
* Complete this kernels
******************************************************************/
__global__ void MatMul(DATA_TYPE* matA, DATA_TYPE* matB, DATA_TYPE* matC, int m, int n, int k)
{
  unsigned int col = threadIdx.x + blockIdx.x * blockDim.x;
  unsigned int row = threadIdx.y + blockIdx.y * blockDim.y;
  unsigned int index = row*n + col;
  for(int i=0; i<k; i++){
      if(col<n&&row<m)
        matC[index]+=matA[(index%k)+(row*k)]*matB[(index%k*2)+col];
  }

}


int main(int argc, char* argv[])
{

	// set matrix size
	int m, n, k;
	m = SIZE_M;
	n = SIZE_N;
	k = SIZE_K;

	printf("Size : A = (%d by %d), B = (%d by %d), C = (%d by %d)\n", m, k, k, n, m, n);

	int sizeA = m * k;
	int sizeB = k * n;
	int sizeC = m * n;

	// Make matrix
	DATA_TYPE* A = NULL, * B = NULL;
	allocNinitMem<DATA_TYPE>(&A, sizeA);
	allocNinitMem<DATA_TYPE>(&B, sizeB);

	DATA_TYPE* Ccpu = NULL, * Cgpu = NULL;
	allocNinitMem<DATA_TYPE>(&Ccpu, sizeC);
	allocNinitMem<DATA_TYPE>(&Cgpu, sizeC);

	// generate input matrices
	for (int i = 0; i < sizeA; i++) A[i] = ((rand() % 10) + ((rand() % 100) / 100.0));
	for (int i = 0; i < sizeB; i++) B[i] = ((rand() % 10) + ((rand() % 100) / 100.0));

	// CPU algorithm
	auto hostStart = std::chrono::high_resolution_clock::now();
	for (int row = 0; row < m; row++) {
		for (int col = 0; col < n; col++) {
			int cIndex = row * n + col;
			Ccpu[cIndex] = 0;
			for (int i = 0; i < k; i++)
				Ccpu[cIndex] += (A[row * k + i] * B[i * n + col]);
		}
	}
	printf("CPU finished!\n");
	auto hostEnd = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::milli> hostElapsed = hostEnd - hostStart;


	auto GPUStart = std::chrono::high_resolution_clock::now();
	/******************************************************************
	* Write your codes for GPU algorithm from here
	******************************************************************/
	DATA_TYPE* dA, * dB, * dC;

	// 1. Allocate device memory for dA, dB, dC
	// Hint: cudaMalloc, cudaMemset
  cudaMalloc(&dA, sizeof(DATA_TYPE)*sizeA); cudaMemset(dA, 0, sizeof(DATA_TYPE)*sizeA);
  cudaMalloc(&dB, sizeof(DATA_TYPE)*sizeB); cudaMemset(dB, 0, sizeof(DATA_TYPE)*sizeB);
  cudaMalloc(&dC, sizeof(DATA_TYPE)*sizeC); cudaMemset(dC, 0, sizeof(DATA_TYPE)*sizeC);
	auto h2dStart = std::chrono::high_resolution_clock::now();

	// 2. Send(Copy) the input matrices to GPU (A -> dB, B -> dB)
	// Hint: cudaMemcpy
  cudaMemcpy(dA, A, sizeof(DATA_TYPE)*sizeA, cudaMemcpyHostToDevice);
  cudaMemcpy(dB, B, sizeof(DATA_TYPE)*sizeB, cudaMemcpyHostToDevice);

	auto h2dEnd = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::milli> h2dElapsed = h2dEnd - h2dStart;

	// 3. Set the thread layout
	//
	dim3 gridDim(ceil(sizeof(float)*SIZE_M/blockDim.x), ceil(sizeof(float)*SIZE_N/blockDim.y));
	dim3 blockDim(32, 32);

	printf("Grid(%d, %d), Block(%d, %d)\n", gridDim.x, gridDim.y, blockDim.x, blockDim.y);

	auto kernelStart = std::chrono::high_resolution_clock::now();

	// 4. Kernel call
	MatMul <<< gridDim, blockDim >>> (dA, dB, dC, m, n, k);

	cudaDeviceSynchronize(); // this is synchronization for mearusing the kernel processing time
	auto kernelEnd = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::milli> kernelElapsed = kernelEnd - kernelStart;

	auto d2hStart = std::chrono::high_resolution_clock::now();

	//5. Get(copy) the result from GPU to host memory (dC -> Cgpu)
	// Hint: cudaMemcpy
  cudaMemcpy(Cgpu, dC, sizeC, cudaMemcpyDeviceToHost);

	auto d2hEnd = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::milli> d2hElapsed = d2hEnd - d2hStart;

	// 6. Release device memory space (dA, dB, dC)
	// Hint: cudaFree
  cudaFree(dA);
  cudaFree(dB);
  cudaFree(dC);


	/******************************************************************
	******************************************************************/
	auto GPUEnd = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::milli> GPUElapsed = GPUEnd - GPUStart;

  std::cout << "Host time: " << hostElapsed.count() << " ms" << std::endl;
  std::cout<<"Host -> Device: " << h2dElapsed.count() << " ms" << std::endl;
  std::cout<<"Kernel: " << kernelElapsed.count() << " ms" << std::endl;
  std::cout<<"Device -> Host: " << d2hElapsed.count() << " ms" << std::endl;
  std::cout << "GPU time: " << GPUElapsed.count() << " ms" << std::endl;

	compareMatrix(Ccpu, Cgpu, sizeC);

	delete A;
	delete B;
	delete Ccpu;
	delete Cgpu;

	return 0;
}


// Utility functions
bool compareMatrix(DATA_TYPE* _A, DATA_TYPE* _B, int _size)
{
	bool isMatched = true;
	for (int i = 0; i < _size; i++) {
		if (_A[i] != _B[i]) {
			printf("[%d] not matched! (%f, %f)\n", i, (double)_A[i], (double)_B[i]);
			getchar();
			isMatched = false;
		}
	}
	if (isMatched)
		printf("Results are matched!\n");
	else
		printf("Results are not matched!!!!!!!!!!!\n");

	return isMatched;
}

template<class T>
void allocNinitMem(T** p, long long size, double* memUsage) {
	*p = new T[size];
	memset(*p, 0, sizeof(T) * size);

	if (memUsage != NULL) {
		*memUsage += sizeof(T) * size;
	}
}

/usr/bin/ld: /tmp/tmpxft_00003526_00000000-11_single_file.o: in function `main':
tmpxft_00003526_00000000-6_single_file.cudafe1.cpp:(.text+0x62a): undefined reference to `__device_builtin_variable_blockDim'
/usr/bin/ld: tmpxft_00003526_00000000-6_single_file.cudafe1.cpp:(.text+0x64e): undefined reference to `__device_builtin_variable_blockDim'
collect2: error: ld returned 1 exit status

