<a href="https://colab.research.google.com/github/leandrocodes/Paralel-Programming-UFMS-CPPP/blob/master/T1_PP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-x5wr0cnk
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-x5wr0cnk
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=6ac0f63c3d9ead5a0996db68000e8a9f8dc6d7b9f95e3f1ee6c04714589aa74f
  Stored in directory: /tmp/pip-ephem-wheel-cache-jcmiywti/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [2]:
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil

Collecting gputil
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-cp36-none-any.whl size=7410 sha256=ca6a1d8e32ec963643a209820c354531008fd9f897839a3f89e15da328c44aec
  Stored in directory: /root/.cache/pip/wheels/3d/77/07/80562de4bb0786e5ea186911a2c831fdd0018bda69beab71fd
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0


In [3]:
import GPUtil as GPU

# Ask GPUtil for the list of detected GPUs.
GPUs = GPU.getGPUs()

# A GPU is not guaranteed to be attached to the runtime; fail with an actionable
# message instead of an opaque IndexError on GPUs[0].
if not GPUs:
    raise RuntimeError(
        "No GPU detected. Enable a GPU runtime before running the CUDA cells."
    )

# Only the first GPU is reported.
gpu = GPUs[0]
print("GPU Name:",gpu.name)
print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))

GPU Name: Tesla T4
GPU RAM Free: 15079MB | Used: 0MB | Util   0% | Total 15079MB


In [7]:
%load_ext nvcc_plugin

The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [21]:
%%cu
#include <iostream>
#include <stdio.h>
#include <numeric>
#include <stdlib.h>
#include <cuda.h>

/* Error-checking helper (forward declaration; defined after main) and the macro that wraps it.
   CUDA_CHECK_RETURN(value) forwards the current source file, the line number, the stringified
   expression and the expression's cudaError_t result to CheckCudaErrorAux. */
static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)

/* Number of elements in each vector */
#define TAMANHO 32

/*
 * Kernel: writes B[i] = A[i] * 3.1415 for every i in [0, N).
 *
 * A - input vector (device pointer), at least N elements
 * B - output vector (device pointer), at least N elements
 * N - number of elements to process
 *
 * Each thread handles the single element at its global index, so the kernel
 * works for any 1-D grid/block configuration whose total thread count is >= N.
 * Threads whose global index falls beyond N do nothing.
 */
__global__ void MatAdd (float* A, float* B, int N){

	/* Global index: combine the block offset with the thread's position inside its block,
	   so that different blocks work on different elements instead of repeating the same ones */
	int index = blockIdx.x * blockDim.x + threadIdx.x;

	/* Bound on the N argument rather than on a compile-time macro */
	if (index < N){
		/* 3.1415 is a double literal: the product is evaluated in double precision and
		   narrowed to float on the store. Kept as-is so results match the original. */
		B[index] = A[index] * 3.1415;
	}
}

/*
 * Host entry point.
 *
 * Fills a host vector A with 0..N-1, copies it to the device, runs the MatAdd
 * kernel while timing it with CUDA events, copies the result B back and prints
 * both vectors. Every CUDA runtime call is checked through CUDA_CHECK_RETURN,
 * which reports the failing call and terminates the program.
 *
 * Returns 0 on success, 1 if host memory could not be allocated.
 */
int main(){

	/* Number of elements and buffer size in bytes */
	int N = TAMANHO;
	size_t size = (size_t)N * sizeof(float);
	/* sizeof yields a size_t, which must be printed with %zu */
	printf("sizeof(float): %zu\n", sizeof(float));
	printf("Tamanho em memória alocado para a variável size: %zu\n", size);

	/* Allocate and initialise the vectors on the HOST (CPU) */
	float* h_A = (float *)malloc(size);
	float* h_B = (float *)malloc(size);
	if (h_A == NULL || h_B == NULL){
		fprintf(stderr, "malloc failed for %zu bytes\n", size);
		free(h_A);	/* free(NULL) is a no-op */
		free(h_B);
		return 1;
	}

	for (int i = 0; i < N; i++){
		h_A[i] = i;
		h_B[i] = 0;
	}

	/* Allocate the vectors on the DEVICE (GPU) */
	float* d_A = NULL;
	float* d_B = NULL;
	CUDA_CHECK_RETURN(cudaMalloc((void **)&d_A, size));
	CUDA_CHECK_RETURN(cudaMalloc((void **)&d_B, size));

	/* Copy the vectors from the HOST to the DEVICE */
	CUDA_CHECK_RETURN(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
	CUDA_CHECK_RETURN(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

	/* Launch configuration: enough blocks of 32 threads to cover N elements (ceiling division) */
	const int threadsPerBlock = 32;
	const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;

	/* Timing section: the kernel is bracketed by two CUDA events and the elapsed
	   time between them is read back in milliseconds */
	cudaEvent_t start, stop;
	CUDA_CHECK_RETURN(cudaEventCreate(&start));
	CUDA_CHECK_RETURN(cudaEventCreate(&stop));
	float gpu_time = 0.0f;

	CUDA_CHECK_RETURN(cudaEventRecord(start, 0));

	/* Kernel launch */
	MatAdd<<<blocks, threadsPerBlock>>>(d_A, d_B, N);

	CUDA_CHECK_RETURN(cudaEventRecord(stop, 0));

	/* A kernel launch returns no status itself; query it explicitly so a bad
	   launch configuration is reported instead of silently producing zeros */
	CUDA_CHECK_RETURN(cudaGetLastError());

	CUDA_CHECK_RETURN(cudaEventSynchronize(stop));
	CUDA_CHECK_RETURN(cudaEventElapsedTime(&gpu_time, start, stop));

	/* Wait for all device work to finish; also surfaces errors raised while the kernel ran */
	CUDA_CHECK_RETURN(cudaDeviceSynchronize());

	printf("Tempo de Execução na GPU: %.4f ms \n\n", gpu_time);
	/* End of the timing section */

	/* Copy the result vector from the DEVICE back to the HOST */
	CUDA_CHECK_RETURN(cudaMemcpy(h_B, d_B, size, cudaMemcpyDeviceToHost));

	/* Release DEVICE memory and the timing events */
	CUDA_CHECK_RETURN(cudaFree(d_A));
	CUDA_CHECK_RETURN(cudaFree(d_B));
	CUDA_CHECK_RETURN(cudaEventDestroy(start));
	CUDA_CHECK_RETURN(cudaEventDestroy(stop));

	/* Print the input and the result */
	printf("A[] = ");
	for (int i = 0; i < N; i++){
		if (i == 0) printf("[");
		printf("%.0f",h_A[i]);
		if (i == N-1) printf("]\n\n");
		else printf(", ");
	}
	printf("B[] = ");
	for (int i = 0; i < N; i++){
		if (i == 0) printf("[");
		printf("%.4f",h_B[i]);
		if (i == N-1) printf("]\n\n");
		else printf(", ");
	}

	/* Release HOST memory */
	free(h_A);
	free(h_B);

	return 0;
}


/* Inspects the status returned by a CUDA runtime API call. On success it does nothing;
   otherwise it writes the failing statement, the error string, the numeric error code and
   the file:line location to stderr and terminates the program with exit status 1. */

static void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err){
	if (err != cudaSuccess){
		std::cerr << statement << " returned " << cudaGetErrorString(err);
		std::cerr << "(" << err << ") at " << file << ":" << line << std::endl;
		exit (1);
	}
}






sizeof(float): 4
Tamanho em memória alocado para a variável size: 128
Tempo de Execução na GPU: 0.0381 ms 

A[] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]

B[] = [0.0000, 3.1415, 6.2830, 9.4245, 12.5660, 15.7075, 18.8490, 21.9905, 25.1320, 28.2735, 31.4150, 34.5565, 37.6980, 40.8395, 43.9810, 47.1225, 50.2640, 53.4055, 56.5470, 59.6885, 62.8300, 65.9715, 69.1130, 72.2545, 75.3960, 78.5375, 81.6790, 84.8205, 87.9620, 91.1035, 94.2450, 97.3865]




In [6]:
%%cu
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>
/* Number of elements processed by the CPU baseline */
#define N 4096

/*
 * CPU baseline: multiplies every element of a[] by pi into b[] and prints the
 * processor time spent in that loop, in milliseconds.
 *
 * The buffers are float so that the timed loop performs a floating-point
 * multiply per element and keeps the fractional part of the result; with
 * integer buffers each product was truncated on store and the loop also timed
 * integer/float conversions.
 */
int main(){
    float a[N];
    float b[N];
    const float pi = 3.1415;

    /* Initialise the input and clear the output outside the timed region */
    for (long int i = 0; i < N; i++){
        a[i] = (float)i;
        b[i] = 0;
    }

    /* Timed region: only the multiplication loop */
    clock_t begin = clock();
    for (long int i = 0; i < N; i++){
        b[i] = a[i] * pi;
    }
    clock_t end = clock();

    /* Read one result through a volatile so an optimising compiler cannot
       discard the timed loop as dead code */
    volatile float sink = b[N - 1];
    (void)sink;

    /* clock() counts ticks of processor time; convert to seconds, then print as milliseconds */
    double elapsed_s = (double)(end - begin) / CLOCKS_PER_SEC;

	printf("Tempo gasto: %f ms", elapsed_s*1000);

    return 0;
}

Tempo gasto: 0.011000 ms
