In [3]:
%%writefile saxpy.cu
/*
 * GPU implementation of SAXPY (single-precision a*X plus Y)
 * Y = a*X + Y
 */

#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <math.h>

////////////////////////////////////////////////////////////////
//     Vector initialization
////////////////////////////////////////////////////////////////
/*
 * Fill the first `len` entries of `tab` with a ramp offset by `val`:
 * tab[i] = i + val. Nothing is written when `len` is zero or negative.
 */
void init_tab(float *tab, int len, float val) {
    int i = 0;
    while (i < len) {
        tab[i] = i + val;
        ++i;
    }
}

/*
 * Print the leading and trailing elements of a host array to stdout.
 *
 *   tab_name : label printed in the header line
 *   tab      : array to display (read only, dereferenced on the host)
 *   len      : number of valid elements in `tab`
 *
 * Up to 10 elements are shown from each end. When `len` is smaller than 10
 * both windows are clamped to [0, len) so no element outside the array is
 * ever read; for len >= 10 the output is exactly the first 10 and last 10.
 */
void print_tab(const char *tab_name, float *tab, int len){
   int k;
   /* Clamp the windows: the head stops at len, the tail never starts below 0. */
   int head_end   = (len < 10) ? len : 10;
   int tail_start = (len > 10) ? len - 10 : 0;

   printf("\n 10 first elements of %s: \n", tab_name);
   for (k=0; k<head_end; k++)
      printf("%.2f ", tab[k]);
   printf("\n 10 lasts : \n");
   for (k=tail_start; k<len; k++)
      printf("%.2f ", tab[k]);
   printf("\n");
}



////////////////////////////////////////////////////////////////
//     SAXPY kernel
////////////////////////////////////////////////////////////////
/*
 * SAXPY kernel: tabY[i] = a * tabX[i] + tabY[i] for every i in [0, len).
 *
 *   tabX : input vector, read on the device (at least `len` floats)
 *   tabY : input/output vector, updated in place on the device
 *   len  : number of elements to process; nothing is done when len <= 0
 *   a    : scalar multiplier
 *
 * Launch layout: 1D grid of 1D blocks, any size. Each thread starts at its
 * global index and advances by the total number of launched threads, so the
 * result is correct even if the grid does not cover `len` in a single pass.
 * No shared memory is used.
 */
__global__ void saxpy(float *tabX, float *tabY, int len, float a){
   if (len <= 0)
     return;

   // Index math is done in size_t: the unsigned product blockIdx.x*blockDim.x
   // narrowed to int could wrap negative on very large launches and pass an
   // `idx < len` guard, writing out of bounds.
   const size_t n      = (size_t) len;
   const size_t stride = (size_t) gridDim.x * blockDim.x;

   for (size_t idx = (size_t) blockIdx.x * blockDim.x + threadIdx.x;
        idx < n;
        idx += stride)
     tabY[idx] = a * tabX[idx] + tabY[idx];
}




////////////////////////////////////////////////////////////////
//     Main program
////////////////////////////////////////////////////////////////
/* Abort with a diagnostic on stderr when a CUDA runtime call reports an error. */
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: builds two vectors of `len` floats, runs Y = 2*X + Y on the
 * GPU and prints the head and tail of Y before and after the computation.
 * Returns EXIT_SUCCESS, or exits with EXIT_FAILURE if a host allocation or
 * any CUDA runtime call fails.
 */
int main( int argc, char** argv){
    (void) argc;   /* command-line arguments are not used */
    (void) argv;

    float *tabX_d = NULL, *tabX_h = NULL;
    float *tabY_d = NULL, *tabY_h = NULL;
    int len = 1000;
    const size_t bytes = sizeof(float) * len;

    /** Initialization of the grid **/
    int threadsPerBlock = 256;  // multiple of 32 to match the warp size
    int blocksPerGrid = (len + threadsPerBlock - 1) / threadsPerBlock; // ceiling division

    dim3 grid(blocksPerGrid);
    dim3 block(threadsPerBlock);

    /** Allocation in host memory **/
    tabX_h = (float *) malloc(bytes);
    tabY_h = (float *) malloc(bytes);
    if (tabX_h == NULL || tabY_h == NULL) {
        fprintf(stderr, "Host allocation of %d floats failed\n", len);
        free(tabX_h);
        free(tabY_h);
        return EXIT_FAILURE;
    }
    init_tab(tabX_h, len, 0.0f);
    init_tab(tabY_h, len, 1.0f);

    /** Allocation in device memory **/
    check_cuda(cudaMalloc((void**) &tabX_d, bytes), "cudaMalloc(tabX_d)");
    check_cuda(cudaMalloc((void**) &tabY_d, bytes), "cudaMalloc(tabY_d)");

    /** Pre-print of tabY **/
    printf("Before computation \n");
    print_tab("tabY_h",tabY_h, len);

    /** Transfer of data from host to device **/
    check_cuda(cudaMemcpy(tabX_d, tabX_h, bytes, cudaMemcpyHostToDevice),
               "copy of tabX to device");
    check_cuda(cudaMemcpy(tabY_d, tabY_h, bytes, cudaMemcpyHostToDevice),
               "copy of tabY to device");

    /** SAXPY kernel launch **/
    saxpy<<<grid, block>>>(tabX_d, tabY_d, len, 2.0f);
    // A launch returns no status itself; configuration errors are reported here.
    check_cuda(cudaGetLastError(), "saxpy kernel launch");

    /** Transfer of the result from device to host **/
    // Blocking copy: it also reports any error raised while the kernel ran.
    check_cuda(cudaMemcpy(tabY_h, tabY_d, bytes, cudaMemcpyDeviceToHost),
               "copy of tabY to host");

    /** Display of the result **/
    printf("After computation\n");
    print_tab("tabY_h", tabY_h, len);

    /** Memory free **/
    check_cuda(cudaFree(tabX_d), "cudaFree(tabX_d)");
    check_cuda(cudaFree(tabY_d), "cudaFree(tabY_d)");
    free(tabX_h);
    free(tabY_h);

    return EXIT_SUCCESS;
}

Overwriting saxpy.cu


In [4]:
! nvcc -arch=sm_75 saxpy.cu -o saxpy

In [5]:
! ./saxpy

Before computation 

 10 first elements of tabY_h: 
1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00 10.00 
 10 lasts : 
991.00 992.00 993.00 994.00 995.00 996.00 997.00 998.00 999.00 1000.00 
After computation

 10 first elements of tabY_h: 
1.00 4.00 7.00 10.00 13.00 16.00 19.00 22.00 25.00 28.00 
 10 lasts : 
2971.00 2974.00 2977.00 2980.00 2983.00 2986.00 2989.00 2992.00 2995.00 2998.00 
