# Hadamard Product


## C Implementation

In [None]:
%%writefile hadamard_c.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/*
 * Print a row-by-col float matrix to stdout.
 *
 * Emits the header "Array output: ", then one text line per matrix row;
 * every element is formatted with two decimals and followed by a space.
 */
void dumpArr(size_t row, size_t col, float arr[row][col])
{
  printf("Array output: \n");
  for (size_t r = 0; r != row; r++) {
    const float *line = arr[r];  /* start of the current matrix row */
    for (size_t c = 0; c != col; c++)
      printf("%.2f ", line[c]);
    printf("\n");
  }
}

/*
 * Element-wise (Hadamard) product of two square matrices.
 *
 * ARR_SIZE : number of rows and columns of every matrix
 * z        : output, z[i][j] = x[i][j] * y[i][j]
 * x, y     : input operands
 */
void C_hadamard(size_t ARR_SIZE, float z[ARR_SIZE][ARR_SIZE], float x[ARR_SIZE][ARR_SIZE], float y[ARR_SIZE][ARR_SIZE])
{
  for (size_t r = 0; r < ARR_SIZE; r++) {
    /* Work on one row at a time through plain row pointers. */
    float *out = z[r];
    const float *lhs = x[r];
    const float *rhs = y[r];
    for (size_t c = 0; c < ARR_SIZE; c++) {
      out[c] = lhs[c] * rhs[c];
    }
  }
}

/*
 * Benchmark driver for the plain C Hadamard product.
 *
 * Allocates three ARR_SIZE x ARR_SIZE float matrices on the heap, fills the
 * inputs with constants, runs C_hadamard once as a warm-up and then NUM_EXEC
 * timed iterations, verifies the result and prints the average time per
 * call in microseconds together with the number of mismatching elements.
 *
 * Returns 0 on completion, EXIT_FAILURE if the matrices cannot be allocated.
 */
int main()
{
  const size_t ARR_SIZE = 4096;
  const size_t NUM_EXEC = 10;

  // https://stackoverflow.com/questions/3911400/how-to-pass-2d-array-matrix-in-a-function-in-c
  // int (*array)[cols] = malloc(rows * cols * sizeof(array[0][0]));
  float (*x)[ARR_SIZE] = malloc(ARR_SIZE * ARR_SIZE * sizeof(x[0][0]));
  float (*y)[ARR_SIZE] = malloc(ARR_SIZE * ARR_SIZE * sizeof(y[0][0]));
  float (*z)[ARR_SIZE] = malloc(ARR_SIZE * ARR_SIZE * sizeof(z[0][0]));

  // Each matrix is 64 MiB; bail out cleanly instead of dereferencing NULL.
  if (x == NULL || y == NULL || z == NULL) {
    fprintf(stderr, "Failed to allocate %zu x %zu float matrices\n", ARR_SIZE, ARR_SIZE);
    free(x);  // free(NULL) is a no-op, so this is safe for any subset
    free(y);
    free(z);
    return EXIT_FAILURE;
  }

  for (size_t i = 0; i < ARR_SIZE; ++i) {
    for (size_t j = 0; j < ARR_SIZE; ++j) {
      x[i][j] = 1.0f;
      y[i][j] = 2.0f;
    }
  }

  clock_t start, end;
  double elapse = 0.0;  // accumulated CPU time over all timed runs, in microseconds

  // Untimed warm-up call so the timed runs do not pay for first-touch costs.
  C_hadamard(ARR_SIZE, z, x, y);

  for (size_t i = 0; i < NUM_EXEC; ++i) {
    start = clock();
    C_hadamard(ARR_SIZE, z, x, y);
    end = clock();
    // clock() ticks -> microseconds
    elapse += (double)(end - start) * 1E6 / CLOCKS_PER_SEC;
  }

  // Recompute every product and count elements that differ from z.
  size_t err_count = 0;

  for (size_t i = 0; i < ARR_SIZE; ++i) {
    for (size_t j = 0; j < ARR_SIZE; ++j) {
      if ((x[i][j] * y[i][j]) != z[i][j]) {
        err_count++;
      }
    }
  }

  // dumpArr(ARR_SIZE, ARR_SIZE, z);  // must stay above the free() calls if enabled
  printf("Function in C took an average time of %lf in microseconds\n", elapse / NUM_EXEC);
  // %zu is the portable conversion for size_t; end the line so later output is not glued to it.
  printf("Total error count: %zu\n", err_count);

  free(x);
  free(y);
  free(z);

  return 0;
}


Writing hadamard_c.c


In [None]:
%%shell
# Compile the C benchmark with extra warnings enabled; no -O level is passed on this command line.
gcc -Wall -Wextra -pedantic -o hadamard_c hadamard_c.c



In [None]:
%%shell
# Make the binary executable. Lower-case +x sets the execute bit unconditionally;
# upper-case +X only adds it to directories or files that already have an execute bit.
chmod +x ./hadamard_c



In [None]:
%%shell
# Run the C benchmark; it prints the average time per call and the error count.
./hadamard_c

Function in C took an average time of 61122.800000 in microseconds
Total error count: 0



In [None]:
%%writefile hadamard_cuda.cu
#include <cstddef>
#include <cstdio>
#include <stdlib.h>
#include <stdio.h>


// Debug helper: print a flat matrix to stdout with two decimals per value.
//
// The outer loop walks the column index and the inner loop the row index, so
// output line c holds Arr[r * numCol + c] for r = 0..numRow-1 (each printed
// line is one column of the row-major buffer).
void dump_arr(size_t numCol, size_t numRow, float* Arr)
{
  size_t c = 0;
  while (c < numCol) {
    for (size_t r = 0; r < numRow; r++) {
      const float value = Arr[r * numCol + c];
      printf("%.2f ", value);
    }
    printf("\n");
    c++;
  }
}

// Host reference implementation of the element-wise (Hadamard) product.
//
// numCol, numRow : matrix dimensions; the buffers are indexed row-major
//                  as [row * numCol + col]
// Z              : output buffer, receives X[k] * Y[k] for every element k
// X, Y           : input buffers
void C_Hadamard(size_t numCol, size_t numRow, float* Z, float* X, float* Y)
{
  for (size_t r = 0; r < numRow; r++) {
    // Offset of the first element of row r, shared by all three buffers.
    const size_t base = r * numCol;
    float* out = Z + base;
    const float* lhs = X + base;
    const float* rhs = Y + base;
    for (size_t c = 0; c < numCol; c++) {
      out[c] = lhs[c] * rhs[c];
    }
  }
}

// Element-wise (Hadamard) product on the GPU: Z[k] = X[k] * Y[k].
//
// numCol, numRow : matrix dimensions; buffers are row-major, [row * numCol + col]
// Z              : output buffer (device-accessible)
// X, Y           : input buffers (device-accessible)
//
// Launch layout: any 2D grid of 2D blocks. threadIdx.x walks the column
// (fastest-varying) index so that neighbouring threads touch neighbouring
// addresses, threadIdx.y walks the row index. Both dimensions use a
// grid-stride loop, so the result is complete and in bounds whether or not
// the grid exactly tiles the matrix. No shared memory is used.
__global__
void cuda_hadamard(size_t numCol, size_t numRow, float* Z, float* X, float* Y)
{
  // Widen before multiplying so the index math is done in size_t.
  const size_t firstCol = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  const size_t firstRow = (size_t)blockIdx.y * blockDim.y + threadIdx.y;
  const size_t colStride = (size_t)gridDim.x * blockDim.x;
  const size_t rowStride = (size_t)gridDim.y * blockDim.y;

  for (size_t row = firstRow; row < numRow; row += rowStride) {
    for (size_t col = firstCol; col < numCol; col += colStride) {
      const size_t idx = row * numCol + col;
      Z[idx] = X[idx] * Y[idx];
    }
  }
}



// Abort with file/line context when a CUDA runtime call that must succeed fails.
#define CUDA_CHECK(call)                                                     \
  do {                                                                       \
    cudaError_t err_ = (call);                                               \
    if (err_ != cudaSuccess) {                                               \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,          \
              cudaGetErrorString(err_));                                     \
      exit(EXIT_FAILURE);                                                    \
    }                                                                        \
  } while (0)

// For performance hints (advise/prefetch): report a failure but keep going,
// and clear the recorded error so it is not mistaken for a launch failure.
#define CUDA_WARN(call)                                                      \
  do {                                                                       \
    cudaError_t err_ = (call);                                               \
    if (err_ != cudaSuccess) {                                               \
      fprintf(stderr, "CUDA warning %s:%d: %s (continuing)\n", __FILE__,     \
              __LINE__, cudaGetErrorString(err_));                           \
      (void)cudaGetLastError();                                              \
    }                                                                        \
  } while (0)

// Driver for the 8x8-block CUDA Hadamard product.
//
// Allocates managed X, Y, Z buffers plus a host reference buffer C, fills the
// inputs, computes the reference with C_Hadamard, launches cuda_hadamard
// NUM_EXEC times, then compares Z against C element by element and prints the
// number of mismatches. Returns 0 on completion; exits with EXIT_FAILURE if an
// allocation, launch or synchronisation fails.
int main()
{
  const size_t ARRAY_DIM = 4096;
  const size_t ARRAY_BYTES = ARRAY_DIM * ARRAY_DIM * sizeof(float);
  const size_t NUM_EXEC = 30;

  // Managed buffers shared between host and device
  float* X = NULL;
  float* Y = NULL;
  float* Z = NULL;

  // Host-side result of the C implementation, reference for error checking
  float* C = (float*)malloc(ARRAY_BYTES);
  if (C == NULL) {
    fprintf(stderr, "Failed to allocate %zu bytes for the host reference\n", ARRAY_BYTES);
    return EXIT_FAILURE;
  }

  CUDA_CHECK(cudaMallocManaged(&X, ARRAY_BYTES));
  CUDA_CHECK(cudaMallocManaged(&Y, ARRAY_BYTES));
  CUDA_CHECK(cudaMallocManaged(&Z, ARRAY_BYTES));

  // get gpu ID
  int device = -1;
  CUDA_CHECK(cudaGetDevice(&device));

  // Mem advise: inputs are written once on the host and only read afterwards
  CUDA_WARN(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
  CUDA_WARN(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
  CUDA_WARN(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
  CUDA_WARN(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));

  // "prefetch data" to create CPU page memory for the inputs
  CUDA_WARN(cudaMemPrefetchAsync(X, ARRAY_BYTES, cudaCpuDeviceId, NULL));
  CUDA_WARN(cudaMemPrefetchAsync(Y, ARRAY_BYTES, cudaCpuDeviceId, NULL));
  // "prefetch data" to create GPU page memory for the output
  CUDA_WARN(cudaMemPrefetchAsync(Z, ARRAY_BYTES, device, NULL));

  // initialize array contents
  for (size_t i = 0; i < ARRAY_DIM; ++i) {
    for (size_t j = 0; j < ARRAY_DIM; ++j) {
      X[ARRAY_DIM * i + j] = 1.0f;
      Y[ARRAY_DIM * i + j] = 1.0f;
    }
  }

  // Compute the host reference while the inputs are still resident on the
  // CPU, i.e. before they are prefetched to the GPU below.
  C_Hadamard(ARRAY_DIM, ARRAY_DIM, C, X, Y);

  // "Prefetch data" from CPU-GPU
  CUDA_WARN(cudaMemPrefetchAsync(X, ARRAY_BYTES, device, NULL));
  CUDA_WARN(cudaMemPrefetchAsync(Y, ARRAY_BYTES, device, NULL));

  // setup CUDA kernel
  // https://www.cs.emory.edu/~cheung/Courses/355/Syllabus/94-CUDA/2D-grids.html
  const unsigned int threadDimBlockx = 8;
  const unsigned int threadDimBlocky = 8;

  const dim3 blockShape(threadDimBlockx, threadDimBlocky);
  // Round up so the grid still spans the matrix if ARRAY_DIM stops being a
  // multiple of the block size (with the current values it divides evenly;
  // other sizes additionally need the kernel to bounds-check).
  const dim3 gridShape((unsigned int)((ARRAY_DIM + threadDimBlockx - 1) / threadDimBlockx),
                       (unsigned int)((ARRAY_DIM + threadDimBlocky - 1) / threadDimBlocky));

  for (size_t i = 0; i < NUM_EXEC; ++i) {
    cuda_hadamard <<< gridShape, blockShape >>> (ARRAY_DIM, ARRAY_DIM, Z, X, Y);
    // Launches do not return a status; configuration errors surface here.
    CUDA_CHECK(cudaGetLastError());
  }

  // Errors raised while the kernels execute surface at this sync.
  CUDA_CHECK(cudaDeviceSynchronize());

  // "Prefetch data" from GPU-CPU
  CUDA_WARN(cudaMemPrefetchAsync(X, ARRAY_BYTES, cudaCpuDeviceId, NULL));
  CUDA_WARN(cudaMemPrefetchAsync(Y, ARRAY_BYTES, cudaCpuDeviceId, NULL));
  CUDA_WARN(cudaMemPrefetchAsync(Z, ARRAY_BYTES, cudaCpuDeviceId, NULL));
  // The prefetches are asynchronous; wait for them before the host reads Z.
  CUDA_CHECK(cudaDeviceSynchronize());

  // dump_arr(ARRAY_DIM, ARRAY_DIM, X);
  // printf("\n\n");
  // dump_arr(ARRAY_DIM, ARRAY_DIM, Z);

  // error checking: every GPU result must match the host reference
  size_t errCount = 0;

  for (size_t i = 0; i < ARRAY_DIM; ++i) {
    for (size_t j = 0; j < ARRAY_DIM; ++j) {
      if (C[i * ARRAY_DIM + j] != Z[i * ARRAY_DIM + j]) {
        errCount++;
      }
    }
  }

  // %zu is the portable conversion for size_t; the newline keeps profiler
  // output from being appended to this line.
  printf("Total error count: %zu\n", errCount);

  free(C);
  CUDA_CHECK(cudaFree(X));
  CUDA_CHECK(cudaFree(Y));
  CUDA_CHECK(cudaFree(Z));

  return 0;
}


Writing hadamard_cuda.cu


In [None]:
%%shell
# Build the 8x8-block CUDA version, generating code for the sm_75 architecture.
nvcc -o hadamard_cuda hadamard_cuda.cu -arch=sm_75



In [None]:
%%shell
# Run the 8x8-block binary under nvprof to collect kernel and API call timings.
nvprof ./hadamard_cuda

==818== NVPROF is profiling process 818, command: ./hadamard_cuda
==818== Profiling application: ./hadamard_cuda
==818== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  66.104ms        30  2.2035ms  2.1959ms  2.2085ms  cuda_hadamard(unsigned long, unsigned long, float*, float*, float*)
      API calls:   61.65%  239.87ms         3  79.956ms  32.372us  239.77ms  cudaMallocManaged
                   17.67%  68.735ms         8  8.5919ms  64.788us  18.983ms  cudaMemPrefetchAsync
                   16.95%  65.946ms         1  65.946ms  65.946ms  65.946ms  cudaDeviceSynchronize
                    3.55%  13.819ms         3  4.6063ms  3.8403ms  5.5836ms  cudaFree
                    0.11%  436.18us        30  14.539us  5.5870us  243.14us  cudaLaunchKernel
                    0.05%  190.15us       114  1.6670us     200ns  75.262us  cuDeviceGetAttribute
                    0.01%  32.732us         1  32.732us  32.732u



In [None]:
%%writefile hadamard_cuda16x16.cu
#include <cstddef>
#include <cstdio>
#include <stdlib.h>
#include <stdio.h>


// Debug helper: print a flat matrix to stdout with two decimals per value.
//
// The outer loop walks the column index and the inner loop the row index, so
// output line c holds Arr[r * numCol + c] for r = 0..numRow-1 (each printed
// line is one column of the row-major buffer).
void dump_arr(size_t numCol, size_t numRow, float* Arr)
{
  size_t c = 0;
  while (c < numCol) {
    for (size_t r = 0; r < numRow; r++) {
      const float value = Arr[r * numCol + c];
      printf("%.2f ", value);
    }
    printf("\n");
    c++;
  }
}

// Host reference implementation of the element-wise (Hadamard) product.
//
// numCol, numRow : matrix dimensions; the buffers are indexed row-major
//                  as [row * numCol + col]
// Z              : output buffer, receives X[k] * Y[k] for every element k
// X, Y           : input buffers
void C_Hadamard(size_t numCol, size_t numRow, float* Z, float* X, float* Y)
{
  for (size_t r = 0; r < numRow; r++) {
    // Offset of the first element of row r, shared by all three buffers.
    const size_t base = r * numCol;
    float* out = Z + base;
    const float* lhs = X + base;
    const float* rhs = Y + base;
    for (size_t c = 0; c < numCol; c++) {
      out[c] = lhs[c] * rhs[c];
    }
  }
}

// Element-wise (Hadamard) product on the GPU: Z[k] = X[k] * Y[k].
//
// numCol, numRow : matrix dimensions; buffers are row-major, [row * numCol + col]
// Z              : output buffer (device-accessible)
// X, Y           : input buffers (device-accessible)
//
// Launch layout: any 2D grid of 2D blocks. threadIdx.x walks the column
// (fastest-varying) index so that neighbouring threads touch neighbouring
// addresses, threadIdx.y walks the row index. Both dimensions use a
// grid-stride loop, so the result is complete and in bounds whether or not
// the grid exactly tiles the matrix. No shared memory is used.
__global__
void cuda_hadamard(size_t numCol, size_t numRow, float* Z, float* X, float* Y)
{
  // Widen before multiplying so the index math is done in size_t.
  const size_t firstCol = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  const size_t firstRow = (size_t)blockIdx.y * blockDim.y + threadIdx.y;
  const size_t colStride = (size_t)gridDim.x * blockDim.x;
  const size_t rowStride = (size_t)gridDim.y * blockDim.y;

  for (size_t row = firstRow; row < numRow; row += rowStride) {
    for (size_t col = firstCol; col < numCol; col += colStride) {
      const size_t idx = row * numCol + col;
      Z[idx] = X[idx] * Y[idx];
    }
  }
}



// Abort with file/line context when a CUDA runtime call that must succeed fails.
#define CUDA_CHECK(call)                                                     \
  do {                                                                       \
    cudaError_t err_ = (call);                                               \
    if (err_ != cudaSuccess) {                                               \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,          \
              cudaGetErrorString(err_));                                     \
      exit(EXIT_FAILURE);                                                    \
    }                                                                        \
  } while (0)

// For performance hints (advise/prefetch): report a failure but keep going,
// and clear the recorded error so it is not mistaken for a launch failure.
#define CUDA_WARN(call)                                                      \
  do {                                                                       \
    cudaError_t err_ = (call);                                               \
    if (err_ != cudaSuccess) {                                               \
      fprintf(stderr, "CUDA warning %s:%d: %s (continuing)\n", __FILE__,     \
              __LINE__, cudaGetErrorString(err_));                           \
      (void)cudaGetLastError();                                              \
    }                                                                        \
  } while (0)

// Driver for the 16x16-block CUDA Hadamard product.
//
// Allocates managed X, Y, Z buffers plus a host reference buffer C, fills the
// inputs, computes the reference with C_Hadamard, launches cuda_hadamard
// NUM_EXEC times, then compares Z against C element by element and prints the
// number of mismatches. Returns 0 on completion; exits with EXIT_FAILURE if an
// allocation, launch or synchronisation fails.
int main()
{
  const size_t ARRAY_DIM = 4096;
  const size_t ARRAY_BYTES = ARRAY_DIM * ARRAY_DIM * sizeof(float);
  const size_t NUM_EXEC = 30;

  // Managed buffers shared between host and device
  float* X = NULL;
  float* Y = NULL;
  float* Z = NULL;

  // Host-side result of the C implementation, reference for error checking
  float* C = (float*)malloc(ARRAY_BYTES);
  if (C == NULL) {
    fprintf(stderr, "Failed to allocate %zu bytes for the host reference\n", ARRAY_BYTES);
    return EXIT_FAILURE;
  }

  CUDA_CHECK(cudaMallocManaged(&X, ARRAY_BYTES));
  CUDA_CHECK(cudaMallocManaged(&Y, ARRAY_BYTES));
  CUDA_CHECK(cudaMallocManaged(&Z, ARRAY_BYTES));

  // get gpu ID
  int device = -1;
  CUDA_CHECK(cudaGetDevice(&device));

  // Mem advise: inputs are written once on the host and only read afterwards
  CUDA_WARN(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
  CUDA_WARN(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
  CUDA_WARN(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
  CUDA_WARN(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));

  // "prefetch data" to create CPU page memory for the inputs
  CUDA_WARN(cudaMemPrefetchAsync(X, ARRAY_BYTES, cudaCpuDeviceId, NULL));
  CUDA_WARN(cudaMemPrefetchAsync(Y, ARRAY_BYTES, cudaCpuDeviceId, NULL));
  // "prefetch data" to create GPU page memory for the output
  CUDA_WARN(cudaMemPrefetchAsync(Z, ARRAY_BYTES, device, NULL));

  // initialize array contents
  for (size_t i = 0; i < ARRAY_DIM; ++i) {
    for (size_t j = 0; j < ARRAY_DIM; ++j) {
      X[ARRAY_DIM * i + j] = 1.0f;
      Y[ARRAY_DIM * i + j] = 1.0f;
    }
  }

  // Compute the host reference while the inputs are still resident on the
  // CPU, i.e. before they are prefetched to the GPU below.
  C_Hadamard(ARRAY_DIM, ARRAY_DIM, C, X, Y);

  // "Prefetch data" from CPU-GPU
  CUDA_WARN(cudaMemPrefetchAsync(X, ARRAY_BYTES, device, NULL));
  CUDA_WARN(cudaMemPrefetchAsync(Y, ARRAY_BYTES, device, NULL));

  // setup CUDA kernel
  // https://www.cs.emory.edu/~cheung/Courses/355/Syllabus/94-CUDA/2D-grids.html
  const unsigned int threadDimBlockx = 16;
  const unsigned int threadDimBlocky = 16;

  const dim3 blockShape(threadDimBlockx, threadDimBlocky);
  // https://selkie.macalester.edu/csinparallel/modules/GPUProgramming/build/html/CUDA2D/CUDA2D.html
  // Round up so the grid still spans the matrix if ARRAY_DIM stops being a
  // multiple of the block size (with the current values it divides evenly;
  // other sizes additionally need the kernel to bounds-check).
  const dim3 gridShape((unsigned int)((ARRAY_DIM + threadDimBlockx - 1) / threadDimBlockx),
                       (unsigned int)((ARRAY_DIM + threadDimBlocky - 1) / threadDimBlocky));

  for (size_t i = 0; i < NUM_EXEC; ++i) {
    cuda_hadamard <<< gridShape, blockShape >>> (ARRAY_DIM, ARRAY_DIM, Z, X, Y);
    // Launches do not return a status; configuration errors surface here.
    CUDA_CHECK(cudaGetLastError());
  }

  // Errors raised while the kernels execute surface at this sync.
  CUDA_CHECK(cudaDeviceSynchronize());

  // "Prefetch data" from GPU-CPU
  CUDA_WARN(cudaMemPrefetchAsync(X, ARRAY_BYTES, cudaCpuDeviceId, NULL));
  CUDA_WARN(cudaMemPrefetchAsync(Y, ARRAY_BYTES, cudaCpuDeviceId, NULL));
  CUDA_WARN(cudaMemPrefetchAsync(Z, ARRAY_BYTES, cudaCpuDeviceId, NULL));
  // The prefetches are asynchronous; wait for them before the host reads Z.
  CUDA_CHECK(cudaDeviceSynchronize());

  // dump_arr(ARRAY_DIM, ARRAY_DIM, X);
  // printf("\n\n");
  // dump_arr(ARRAY_DIM, ARRAY_DIM, Z);

  // error checking: every GPU result must match the host reference
  size_t errCount = 0;

  for (size_t i = 0; i < ARRAY_DIM; ++i) {
    for (size_t j = 0; j < ARRAY_DIM; ++j) {
      if (C[i * ARRAY_DIM + j] != Z[i * ARRAY_DIM + j]) {
        errCount++;
      }
    }
  }

  // %zu is the portable conversion for size_t; the newline keeps profiler
  // output from being appended to this line.
  printf("Total error count: %zu\n", errCount);

  free(C);
  CUDA_CHECK(cudaFree(X));
  CUDA_CHECK(cudaFree(Y));
  CUDA_CHECK(cudaFree(Z));

  return 0;
}


Overwriting hadamard_cuda16x16.cu


In [None]:
%%shell
# Build the 16x16-block CUDA version, generating code for the sm_75 architecture.
nvcc -o hadamard_cuda16x16 hadamard_cuda16x16.cu -arch=sm_75



In [None]:
%%shell
# Run the 16x16-block binary under nvprof to collect kernel and API call timings.
nvprof ./hadamard_cuda16x16

==4109== NVPROF is profiling process 4109, command: ./hadamard_cuda16x16
Total error count: 0==4109== Profiling application: ./hadamard_cuda16x16
==4109== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  71.012ms        30  2.3671ms  2.3635ms  2.3706ms  cuda_hadamard(unsigned long, unsigned long, float*, float*, float*)
      API calls:   47.71%  110.19ms         3  36.731ms  30.468us  110.05ms  cudaMallocManaged
                   30.71%  70.933ms         1  70.933ms  70.933ms  70.933ms  cudaDeviceSynchronize
                   17.73%  40.937ms         8  5.1171ms  42.417us  14.588ms  cudaMemPrefetchAsync
                    3.61%  8.3321ms         3  2.7774ms  1.9757ms  3.3034ms  cudaFree
                    0.16%  363.44us        30  12.114us  3.5660us  243.84us  cudaLaunchKernel
                    0.06%  141.98us       114  1.2450us     107ns  58.433us  cuDeviceGetAttribute
                    0.01%  22.



## CUDA with Shared Memory

In [None]:
%%writefile hadamard_cuda_shared16x16.cu
#include <cstddef>
#include <cstdio>
#include <stdlib.h>
#include <stdio.h>


// Debug helper: print a flat matrix to stdout with two decimals per value.
//
// The outer loop walks the column index and the inner loop the row index, so
// output line c holds Arr[r * numCol + c] for r = 0..numRow-1 (each printed
// line is one column of the row-major buffer).
void dump_arr(size_t numCol, size_t numRow, float* Arr)
{
  size_t c = 0;
  while (c < numCol) {
    for (size_t r = 0; r < numRow; r++) {
      const float value = Arr[r * numCol + c];
      printf("%.2f ", value);
    }
    printf("\n");
    c++;
  }
}

// Element-wise (Hadamard) product on the GPU, staging each product through
// shared memory: Z[k] = X[k] * Y[k].
//
// numCol, numRow : matrix dimensions; buffers are row-major, [row * numCol + col]
// Z              : output buffer (device-accessible)
// X, Y           : input buffers (device-accessible)
//
// Launch layout: any grid of blocks with at most 1024 threads per block.
// threadIdx.x walks the column (fastest-varying) index, threadIdx.y the row
// index; blocks stride over the matrix tile by tile, so the result is
// complete and in bounds whether or not the grid exactly tiles the matrix.
// Shared memory: 4 KiB static (one float slot per possible thread).
//
// Every thread owns a private slot. A single shared element written by the
// whole block would be a data race and only appear correct when all products
// happen to be equal. Each value is produced and consumed by the same thread,
// so the staging demonstrates shared-memory use but provides no data reuse.
__global__
void cuda_hadamard(size_t numCol, size_t numRow, float* Z, float* X, float* Y)
{
  // 1024 is the maximum number of threads a block can have, so the flat
  // in-block thread index below always fits.
  __shared__ float shData[1024];

  const unsigned int slot =
      (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

  const size_t colStride = (size_t)gridDim.x * blockDim.x;
  const size_t rowStride = (size_t)gridDim.y * blockDim.y;

  // The loop bounds depend only on block-level values, so every thread of a
  // block executes the same number of iterations and reaches each
  // __syncthreads() together.
  for (size_t rowBase = (size_t)blockIdx.y * blockDim.y; rowBase < numRow; rowBase += rowStride) {
    for (size_t colBase = (size_t)blockIdx.x * blockDim.x; colBase < numCol; colBase += colStride) {
      const size_t row = rowBase + threadIdx.y;
      const size_t col = colBase + threadIdx.x;
      const bool inRange = (row < numRow) && (col < numCol);
      const size_t idx = row * numCol + col;

      if (inRange) {
        shData[slot] = X[idx] * Y[idx];
      }

      // Kept outside the guard: the barrier must be reached by all threads.
      __syncthreads();

      if (inRange) {
        Z[idx] = shData[slot];
      }
    }
  }
}



// Abort with file/line context when a CUDA runtime call that must succeed fails.
#define CUDA_CHECK(call)                                                     \
  do {                                                                       \
    cudaError_t err_ = (call);                                               \
    if (err_ != cudaSuccess) {                                               \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,          \
              cudaGetErrorString(err_));                                     \
      exit(EXIT_FAILURE);                                                    \
    }                                                                        \
  } while (0)

// For performance hints (advise/prefetch): report a failure but keep going,
// and clear the recorded error so it is not mistaken for a launch failure.
#define CUDA_WARN(call)                                                      \
  do {                                                                       \
    cudaError_t err_ = (call);                                               \
    if (err_ != cudaSuccess) {                                               \
      fprintf(stderr, "CUDA warning %s:%d: %s (continuing)\n", __FILE__,     \
              __LINE__, cudaGetErrorString(err_));                           \
      (void)cudaGetLastError();                                              \
    }                                                                        \
  } while (0)

// Driver for the shared-memory CUDA Hadamard product.
//
// Allocates managed X, Y, Z buffers, fills the inputs, launches cuda_hadamard
// NUM_EXEC times, then recomputes every product on the host and prints the
// number of elements of Z that differ. Returns 0 on completion; exits with
// EXIT_FAILURE if an allocation, launch or synchronisation fails.
int main()
{
  const size_t ARRAY_DIM = 4096;
  const size_t ARRAY_BYTES = ARRAY_DIM * ARRAY_DIM * sizeof(float);
  const size_t NUM_EXEC = 30;

  // Managed buffers shared between host and device
  float* X = NULL;
  float* Y = NULL;
  float* Z = NULL;

  CUDA_CHECK(cudaMallocManaged(&X, ARRAY_BYTES));
  CUDA_CHECK(cudaMallocManaged(&Y, ARRAY_BYTES));
  CUDA_CHECK(cudaMallocManaged(&Z, ARRAY_BYTES));

  // get gpu ID
  int device = -1;
  CUDA_CHECK(cudaGetDevice(&device));

  // Mem advise: inputs are written once on the host and only read afterwards
  CUDA_WARN(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
  CUDA_WARN(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
  CUDA_WARN(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
  CUDA_WARN(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));

  // "prefetch data" to create CPU page memory for the inputs
  CUDA_WARN(cudaMemPrefetchAsync(X, ARRAY_BYTES, cudaCpuDeviceId, NULL));
  CUDA_WARN(cudaMemPrefetchAsync(Y, ARRAY_BYTES, cudaCpuDeviceId, NULL));
  // "prefetch data" to create GPU page memory for the output
  CUDA_WARN(cudaMemPrefetchAsync(Z, ARRAY_BYTES, device, NULL));

  // initialize array contents
  for (size_t i = 0; i < ARRAY_DIM; ++i) {
    for (size_t j = 0; j < ARRAY_DIM; ++j) {
      X[ARRAY_DIM * i + j] = 1.0f;
      Y[ARRAY_DIM * i + j] = 1.0f;
    }
  }

  // "Prefetch data" from CPU-GPU
  CUDA_WARN(cudaMemPrefetchAsync(X, ARRAY_BYTES, device, NULL));
  CUDA_WARN(cudaMemPrefetchAsync(Y, ARRAY_BYTES, device, NULL));

  // setup CUDA kernel
  // https://www.cs.emory.edu/~cheung/Courses/355/Syllabus/94-CUDA/2D-grids.html
  // NOTE: the block is 32x32 (1024 threads) even though this file is named
  // "...shared16x16"; the value is left as it was.
  const unsigned int threadDimBlockx = 32;
  const unsigned int threadDimBlocky = 32;

  const dim3 blockShape(threadDimBlockx, threadDimBlocky);
  // https://selkie.macalester.edu/csinparallel/modules/GPUProgramming/build/html/CUDA2D/CUDA2D.html
  // https://medium.com/@harsh20111997/cuda-programming-2d-convolution-8476300f566e
  // Ceiling division: enough blocks to span the matrix in each dimension.
  const dim3 gridShape((unsigned int)((ARRAY_DIM + threadDimBlockx - 1) / threadDimBlockx),
                       (unsigned int)((ARRAY_DIM + threadDimBlocky - 1) / threadDimBlocky));

  for (size_t i = 0; i < NUM_EXEC; ++i) {
    cuda_hadamard <<< gridShape, blockShape >>> (ARRAY_DIM, ARRAY_DIM, Z, X, Y);
    // Launches do not return a status; configuration errors surface here.
    CUDA_CHECK(cudaGetLastError());
  }

  // Errors raised while the kernels execute surface at this sync.
  CUDA_CHECK(cudaDeviceSynchronize());

  // "Prefetch data" from GPU-CPU
  CUDA_WARN(cudaMemPrefetchAsync(X, ARRAY_BYTES, cudaCpuDeviceId, NULL));
  CUDA_WARN(cudaMemPrefetchAsync(Y, ARRAY_BYTES, cudaCpuDeviceId, NULL));
  CUDA_WARN(cudaMemPrefetchAsync(Z, ARRAY_BYTES, cudaCpuDeviceId, NULL));
  // The prefetches are asynchronous; wait for them before the host reads Z.
  CUDA_CHECK(cudaDeviceSynchronize());

  // error checking: recompute each product on the host and compare with Z
  size_t errCount = 0;

  for (size_t i = 0; i < ARRAY_DIM; ++i) {
    for (size_t j = 0; j < ARRAY_DIM; ++j) {
      if (X[i * ARRAY_DIM + j] * Y[i * ARRAY_DIM + j] != Z[i * ARRAY_DIM + j]) {
        errCount++;
      }
    }
  }

  // %zu is the portable conversion for size_t; the newline keeps profiler
  // output from being appended to this line.
  printf("Total error count: %zu\n", errCount);

  CUDA_CHECK(cudaFree(X));
  CUDA_CHECK(cudaFree(Y));
  CUDA_CHECK(cudaFree(Z));

  return 0;
}


Overwriting hadamard_cuda_shared16x16.cu


In [None]:
%%shell
# Build the shared-memory CUDA version, generating code for the sm_75 architecture.
nvcc -o hadamard_cuda_shared16x16 hadamard_cuda_shared16x16.cu -arch=sm_75



In [None]:
%%shell
# Run the shared-memory binary under nvprof to collect kernel and API call timings.
nvprof ./hadamard_cuda_shared16x16

==3976== NVPROF is profiling process 3976, command: ./hadamard_cuda_shared16x16
Total error count: 0==3976== Profiling application: ./hadamard_cuda_shared16x16
==3976== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  131.78ms        30  4.3927ms  4.1460ms  4.5828ms  cuda_hadamard(unsigned long, unsigned long, float*, float*, float*)
      API calls:   42.55%  131.70ms         1  131.70ms  131.70ms  131.70ms  cudaDeviceSynchronize
                   38.29%  118.52ms         3  39.506ms  22.614us  118.43ms  cudaMallocManaged
                   13.28%  41.096ms         8  5.1370ms  41.663us  14.596ms  cudaMemPrefetchAsync
                    3.46%  10.717ms        30  357.24us  3.4900us  10.599ms  cudaLaunchKernel
                    2.34%  7.2472ms         3  2.4157ms  2.2525ms  2.5137ms  cudaFree
                    0.06%  171.40us       114  1.5030us     108ns  69.912us  cuDeviceGetAttribute
                

