<a href="https://colab.research.google.com/github/ickma2311/mycolab/blob/main/cuda/conv2d.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Formula
$$
\left(Y*K \right)(i,j)= \sum_m \sum_n Y(i+m,j+n)\,K(m,n)
$$

In [1]:
import numpy as np

In [33]:
# CUDA C source for the `conv2d` kernel, kept as a string so it can be handed
# to cp.RawModule further down.
#
# Each thread computes one output element: i comes from the x block/thread
# indices and j from the y ones. Threads whose (i, j) lies outside the valid
# output extent (input_size - k_size + 1 along each axis) return early, so the
# launch grid may overshoot the output.
#
# All three buffers are flat arrays indexed as y * row_length + x, where the
# row length is input_size_a for `input`, k_size_a for `k`, and
# input_size_a - k_size_a + 1 for `result`; the "_a" sizes are therefore the
# extents along the fastest-varying (x) axis and the "_b" sizes along y.
#
# The filter is applied without flipping -- sum of input(i+m, j+n) * k(m, n) --
# i.e. a cross-correlation.
kernel_code="""
extern "C" __global__
void conv2d(float* input,float* k,int k_size_a, int k_size_b,
float* result,int input_size_a, int input_size_b){
  int i=blockIdx.x*blockDim.x+threadIdx.x;
  int j=blockIdx.y*blockDim.y+threadIdx.y;
  if(i>=input_size_a-k_size_a+1||j>=input_size_b-k_size_b+1){
    return;
  }
  int output_row_length=input_size_a-k_size_a+1;
  float sum=0.0;
  for(int m=0;m<k_size_a;m++){
    for(int n=0;n<k_size_b;n++){
      //current x for input is i+m,y is j+n
      //index is y*x_length+x
      //current x for kernel is m,y is n
      //index is y*x_length+x
      sum+=input[(j+n)*input_size_a+(i+m)]*k[n*k_size_a+m];
    }
  }
  result[j*output_row_length+i]=sum;
}
"""

In [3]:
import cupy as cp
import numpy as np

In [45]:
# Fixed seed so the random test data -- and therefore every value printed in
# the cells below -- is identical on each Restart & Run All.
SEED = 0
np.random.seed(SEED)

# 3x3 filter and 32x32 image, each stored as a flat row-major float32 buffer,
# which is the layout the CUDA kernel indexes into.
kernel = np.random.rand(3, 3).astype(np.float32).flatten()
inputs = np.random.rand(32, 32).astype(np.float32).flatten()

In [58]:
# Valid (no padding) output extent per axis: input_length - kernel_length + 1,
# i.e. 32 - 3 + 1 = 30.
outputs_shape = (30, 30)
# Flat float32 zero buffer with one slot per output element.
outputs = np.zeros(outputs_shape[0] * outputs_shape[1], dtype=np.float32)

In [59]:

import math

threads_per_block = (8, 8)
# Extent of the valid convolution output per axis: 32 - 3 + 1 = 30. The grid
# must supply at least this many threads in x and y; the kernel's bounds check
# discards the surplus threads of the last block. (The previous hard-coded 28
# only produced a large enough grid because ceil(28/8) == ceil(30/8).)
output_extent = (30, 30)
blocks_per_grid = tuple(
    math.ceil(extent / threads)
    for extent, threads in zip(output_extent, threads_per_block)
)

In [60]:
# Build a CuPy RawModule from the CUDA source and look up the `conv2d` entry
# point (declared extern "C" in kernel_code, so it is found under that name).
module = cp.RawModule(code=kernel_code)
moudle = module  # original misspelled name, kept so any cell still using it works
conv2d_kernel = module.get_function("conv2d")

In [61]:
# Put the host buffers on the GPU as float32 CuPy arrays.
m_kernel = cp.asarray(kernel, dtype=cp.float32)
m_inputs = cp.asarray(inputs, dtype=cp.float32)
m_outputs = cp.asarray(outputs, dtype=cp.float32)

# Arguments in the order of the kernel signature:
# (input, k, k_size_a, k_size_b, result, input_size_a, input_size_b).
# The sizes are wrapped in np.int32 to match the kernel's `int` parameters.
launch_args = (
    m_inputs,
    m_kernel,
    np.int32(3),
    np.int32(3),
    m_outputs,
    np.int32(32),
    np.int32(32),
)
conv2d_kernel(blocks_per_grid, threads_per_block, launch_args)

# Wait for the device to finish before the result is read back.
cp.cuda.runtime.deviceSynchronize()

In [62]:
# Copy the kernel output from the GPU back into a host NumPy array.
result = cp.asnumpy(m_outputs)

In [63]:
# Reference result from PyTorch, used to validate the CUDA kernel.
from torch import Tensor
from torch.nn.functional import conv2d

# torch's conv2d takes a 4-D input (batch, channels, height, width) and a 4-D
# weight (out_channels, in_channels, kernel_height, kernel_width).
reference_input = Tensor(inputs.reshape(1, 1, 32, 32))
reference_weight = Tensor(kernel.reshape(1, 1, 3, 3))

# Flattened so it lines up element-for-element with the flat CUDA result
# (30 * 30 = 900 values).
conv_2d = conv2d(reference_input, reference_weight).flatten()
conv_2d.numpy().shape

(900,)

In [64]:
# First ten values produced by the CUDA kernel, for a visual comparison with
# the PyTorch reference shown in the next cell.
result[:10]

array([2.226335 , 2.390226 , 2.4810112, 2.1960325, 2.0014567, 1.4875925,
       1.6018865, 1.4656465, 1.6580648, 1.2654059], dtype=float32)

In [65]:
# First ten values of the PyTorch reference; conv_2d is already flat, so it
# only needs converting to NumPy before slicing.
conv_2d.numpy()[:10]

array([2.226335 , 2.390226 , 2.481011 , 2.1960328, 2.0014567, 1.4875925,
       1.6018865, 1.4656465, 1.658065 , 1.2654059], dtype=float32)

In [66]:
# Element-wise agreement between the CUDA result and the PyTorch reference,
# allowing for float32 rounding differences.
np.allclose(result, conv_2d.numpy(), atol=1e-5)

True