In [10]:
import torch
torch.cuda.is_available()

True

In [39]:
cuda_code = r"""
#include <stdio.h>
#include <cuda_runtime.h>

#define N 500

__global__ void add(int *a, int *b, int *c)
{
  int tid = threadIdx.x;
  if(tid<N)
  {
    c[tid] = a[tid]+b[tid];
  }
}

int main(void)
{
  int a[N], b[N], c[N];
  int *dev_a, *dev_b, *dev_c;

  cudaError_t err = cudaSuccess;

  err = cudaMalloc((void**)&dev_a, N*sizeof(int));
  if(err != cudaSuccess)
  {
    printf("Failed to allocate memory\n");
    printf("Error: %s\n", cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  cudaMalloc((void**)&dev_b, N*sizeof(int));
  cudaMalloc((void**)&dev_c, N*sizeof(int));

  for(int i=0; i<N; i++)
  {
    a[i] = i;
    b[i] = i*i;
    c[i] = 0;
  }

  cudaEvent_t start, end;
  cudaEventCreate(&start);
  cudaEventCreate(&end);
  cudaEventRecord(start);

  cudaMemcpy(dev_a, a, N*sizeof(int),cudaMemcpyHostToDevice);
  cudaMemcpy(dev_b, b, N*sizeof(int),cudaMemcpyHostToDevice);
  cudaMemcpy(dev_a, c, N*sizeof(int),cudaMemcpyHostToDevice);

  add<<<1, N>>>(dev_a, dev_b, dev_c);

  err = cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost);
  if(err != cudaSuccess)
  {
    printf("Failed to load from device\n");
    printf("Error: %s\n", cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  cudaEventRecord(end);
  cudaEventSynchronize(end);
  float time = 0;
  cudaEventElapsedTime(&time, start, end);
  printf("Execution Time: %f\n", time);

  for(int i=0; i<10; i++)
  {
    printf("%d + %d = %d\n", a[i], b[i],c[i]);
  }

  cudaFree(dev_a);
  cudaFree(dev_b);
  cudaFree(dev_c);

  return 0;
}
"""

with open("addVectors.cu", "w") as f:
  f.write(cuda_code)

In [41]:
!nvcc addVectors.cu -o addVectors -arch=sm_75

In [42]:
!./addVectors

Execution Time: 0.154368
0 + 0 = 0
1 + 1 = 1
2 + 4 = 4
3 + 9 = 9
4 + 16 = 16
5 + 25 = 25
6 + 36 = 36
7 + 49 = 49
8 + 64 = 64
9 + 81 = 81


In [23]:
cuda_code = r"""
#include <iostream>
#include <cstdlib>
#include <chrono>
#include <ctime>
#include <cuda_runtime.h>
using namespace std;
using namespace std::chrono;

__global__ void multiply(int* X, int* Y, int* Z, int M, int N, int K)
{
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;

  if(row<M && col<K)
  {
    int sum = 0;
    for(int i=0; i<N; i++)
    {
      sum += A[row*N+i] * B[col*i];
    }
    C[row*K+col] = sum;
  }
}

void initialize(int* matrix, int rows, int cols)
{
  for(int i=0; i<rows*cols; i++)
  {
    matrix[i] = rand()%10;

  }
}

void print(int* matrix, int rows, int cols)
{
  for(int row=0; row<rows; row++)
  {
    for(int col=0; col<cols; col++)
    {
      cout<<matrix[row*cols+col]<<" ";
    }
    cout<<"\n";
  }
  cout<<"\n";
}

for seqMultiply(int* A, int* B, int* C, int M, int N, int K)
{
  for(int row=0; row<M; row++)
  {
    for(int col=0; col<K; col++)
    {
      int sum=0;
      for(int i=0; i<N; i++)
      {
        sum += A[row*N+i] * B[i*N+col];
      }
      C[row*K+col] = sum;
    }
  }
}

int main()
{
  srand(time(0));

  int M,N,K;
  cout<<"M1 row: ";
  cin>>M;
  cout<<"M1 col: ";
  cin>>N;
  cout<<"M2 col: ";
  cin>>K;

  int *A = new int[M*N];
  int *B = new int[N*K];
  int *C = new int[M*K];

  initialize(A, M, N);
  initialize(B, N, K);

  cout<<"M1: \n";
  print(A, M, N);

  cout<<"M2: \n";
  print(B, N, K);

  int* x, int* Y, int* Z;
  cudaMalloc(&X, M*N*sizeof(int));
  cudaMalloc(&Y, N*K*sizeof(int));
  cudaMalloc(&Z, M*K*sizeof(int));

  cudaMemcpy(X, A, M*N*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(Y, B, N*K*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(Z, C, M*K*sizeof(int), cudaMemcpyHostToDevice);

  int THREADS = 16;
  int BLOCK_X = (K+THREADS-1)/THREADS;
  int BLOCK_Y = (M+THREADS-1)/THREADS;

  dim3 threads(THREADS, THREADS);
  dim3 blocks(BLOCK_X, BLOCK_Y);

  auto start = high_resolution_clock::now();
  seqMultiply(A, B, C, M, N, K);
  auto stop = high_resolution_clock::now();
  auto seqTime = duration_cast<microseconds>(stop-start);

  return 0;
}
"""

In [20]:

!nvcc mat_mult.cu -o mat_mult -arch=sm_75

In [21]:
!./mat_mult

Enter the number of rows and columns of the first matrix: 4
5
Enter the number of columns of the second matrix: 3
Matrix A: 
2 6 0 5 3 
4 1 3 6 6 
1 0 9 0 0 
3 6 6 8 1 

Matrix B: 
9 7 3 
8 2 2 
1 3 2 
1 3 4 
7 5 9 

Sequential Multiplication of matrix A and B: 
92 56 65 
95 87 98 
18 34 21 
96 80 74 

Parallel Multiplication of matrix A and B: 
92 56 65 
95 87 98 
18 34 21 
96 80 74 

Sequential Multiplication Time: 0 microseconds
Parallel Multiplication Time: 174 microseconds
