In [None]:
%%writefile matrix_multiplication.cu
#include<bits/stdc++.h>
#include<cuda_runtime.h>
using namespace std;

// Batched dense matrix multiply in which each thread computes one complete
// matrix product.
//
// A, B and C each hold a batch of row-major matrices stored back to back:
// every entry of A is M x N, of B is N x P and of C is M x P. The calling
// thread handles batch entry `offset + <its global x index>` and stores
// C_k = A_k * B_k.
//
// Launch layout: 1-D blocks in a 1-D grid. There is no batch-count parameter,
// so the kernel cannot bounds-check: the caller must launch only as many
// threads as there are batch entries left starting at `offset`.
//
// C is overwritten, not accumulated into, so it does not need to be zeroed
// before the launch.
__global__ void multiply(float* A, float* B, float* C,int M, int N, int P, int offset) {
  // 64-bit batch index: k * M * N exceeds INT_MAX for large batches.
  // blockIdx.x is 0 for single-block launches, so those behave as before.
  const long long k = (long long)offset + (long long)blockIdx.x * blockDim.x + threadIdx.x;

  const float* a = A + k * M * N;
  const float* b = B + k * N * P;
  float* c = C + k * M * P;

  for(int i = 0; i < M; i++) {
    for(int j = 0; j < P; j++) {
      // Accumulate in a register and store once: the result no longer depends
      // on the previous contents of C, and the inner loop does no global
      // read-modify-write.
      float sum = 0.0f;
      for(int l = 0; l < N; l++) {
        sum += a[i * N + l] * b[l * P + j];
      }
      c[i * P + j] = sum;
    }
  }
}

// Multiplies a batch of random 400x400 matrix pairs on the GPU, launching the
// `multiply` kernel over the batch in chunks of at most `thread limit`
// threads (one thread per matrix, one block per launch).
//
// Usage: ./a <number of matrices> <thread limit>
//
// Returns 0 on success and 1 on bad arguments; exits with status 1 on any
// CUDA error.
int main(int argc, char* argv[]) {
  if(argc < 3) {
    // argv[1] is the matrix count and argv[2] the thread limit.
    cout<<"provide number of matrices and thread limit"<<endl;
    return 1;
  }

  const int k = atoi(argv[1]);
  int threadLimit = atoi(argv[2]);

  // A non-positive thread limit would make every batch empty and the launch
  // loop below would never terminate; a non-positive k has nothing to compute.
  if(k <= 0 || threadLimit <= 0) {
    cout<<"number of matrices and thread limit must be positive integers"<<endl;
    return 1;
  }

  // Abort with a readable message as soon as any CUDA call fails, instead of
  // carrying on with a sticky error and reporting success.
  auto check = [](cudaError_t status, const char* what) {
    if(status != cudaSuccess) {
      cerr<<"CUDA error in "<<what<<": "<<cudaGetErrorString(status)<<endl;
      exit(1);
    }
  };

  // Every launch uses a single block, so the batch size cannot exceed the
  // device's per-block thread limit.
  int device = 0;
  int maxThreadsPerBlock = 0;
  check(cudaGetDevice(&device), "cudaGetDevice");
  check(cudaDeviceGetAttribute(&maxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock, device),
        "cudaDeviceGetAttribute");
  if(threadLimit > maxThreadsPerBlock) {
    cerr<<"thread limit clamped to device maximum of "<<maxThreadsPerBlock<<endl;
    threadLimit = maxThreadsPerBlock;
  }

  const int M = 400, N = 400, P = 400;
  // size_t arithmetic: k * M * N overflows int for large batches.
  const size_t sizeA = (size_t)k * M * N;
  const size_t sizeB = (size_t)k * N * P;
  const size_t sizeC = (size_t)k * M * P;

  // Host buffers are owned by vectors so they are released on every return.
  vector<float> h_a(sizeA);
  vector<float> h_b(sizeB);
  vector<float> h_c(sizeC);

  for(size_t i = 0; i < sizeA; i++) {
    h_a[i] = rand();
  }

  for(size_t i = 0; i < sizeB; i++) {
    h_b[i] = rand();
  }

  float* d_a = nullptr;
  float* d_b = nullptr;
  float* d_c = nullptr;

  check(cudaMalloc(&d_a, sizeA * sizeof(float)), "cudaMalloc(d_a)");
  check(cudaMemcpy(d_a, h_a.data(), sizeA * sizeof(float), cudaMemcpyHostToDevice),
        "cudaMemcpy(d_a)");

  check(cudaMalloc(&d_b, sizeB * sizeof(float)), "cudaMalloc(d_b)");
  check(cudaMemcpy(d_b, h_b.data(), sizeB * sizeof(float), cudaMemcpyHostToDevice),
        "cudaMemcpy(d_b)");

  // The output buffer must exist on the device before the kernel writes to it.
  // It is zeroed so a kernel that accumulates into C starts from a defined value.
  check(cudaMalloc(&d_c, sizeC * sizeof(float)), "cudaMalloc(d_c)");
  check(cudaMemset(d_c, 0, sizeC * sizeof(float)), "cudaMemset(d_c)");

  int remaining = k;
  int offset = 0;
  while(remaining > 0) {
    const int batchSize = min(remaining, threadLimit);

    multiply<<<1,batchSize>>>(d_a, d_b, d_c, M, N, P, offset);
    // Launch-configuration errors are only visible through cudaGetLastError.
    check(cudaGetLastError(), "multiply launch");

    remaining -= batchSize;
    offset += batchSize;
  }
  // Launches on the default stream run in submission order, so one
  // synchronization after the loop is enough to surface execution errors.
  check(cudaDeviceSynchronize(), "multiply execution");

  check(cudaMemcpy(h_c.data(), d_c, sizeC * sizeof(float), cudaMemcpyDeviceToHost),
        "cudaMemcpy(h_c)");

  check(cudaFree(d_a), "cudaFree(d_a)");
  check(cudaFree(d_b), "cudaFree(d_b)");
  check(cudaFree(d_c), "cudaFree(d_c)");

  cout<<"ALL Operation Done"<<endl;
  return 0;
}

Overwriting matrix_multiplication.cu


In [None]:
!nvcc -arch=sm_75 matrix_multiplication.cu -o a

In [None]:
!time ./a 10 1 && sleep 2

ALL Operation Done

real	0m0.289s
user	0m0.128s
sys	0m0.131s


In [None]:
!time ./a 10 3 && sleep 2

ALL Operation Done

real	0m0.253s
user	0m0.122s
sys	0m0.109s


In [None]:
!time ./a 10 7 && sleep 2

ALL Operation Done

real	0m0.247s
user	0m0.110s
sys	0m0.114s


In [None]:
!time ./a 10 10 && sleep 2

ALL Operation Done

real	0m0.363s
user	0m0.110s
sys	0m0.107s


In [None]:
!time ./a 25 15 && sleep 2

ALL Operation Done

real	0m0.433s
user	0m0.209s
sys	0m0.107s


In [None]:
!time ./a 25 20 && sleep 2

ALL Operation Done

real	0m0.337s
user	0m0.207s
sys	0m0.108s


In [None]:
!time ./a 25 25 && sleep 2

ALL Operation Done

real	0m0.440s
user	0m0.199s
sys	0m0.115s
