In [41]:
import numpy as np
from typing import Tuple

def image2col(x, filter_size: Tuple, pad=0, stride=1):
    """Unfold the sliding windows of a 4-D batch into the rows of a 2-D matrix.

    Args:
        x: Array with shape (B, C, H, W).
        filter_size: (filter_height, filter_width) of the sliding window.
        pad: Number of zero rows/columns added on every side of H and W.
        stride: Step between consecutive window positions along H and W.

    Returns:
        A float64 array of shape (B * out_h * out_w, C * filter_h * filter_w).
        Each row holds one window; inside a row the values are ordered by
        channel, then filter row, then filter column.

    Raises:
        AssertionError: If ``x`` does not have exactly 4 dimensions.
    """
    assert len(x.shape) == 4, f"Input data should have 4 dimensions, but got {len(x.shape)} dimensions"

    B, C, H, W = x.shape
    filter_h, filter_w = filter_size[0], filter_size[1]

    out_h = (H + 2*pad - filter_h) // stride + 1
    out_w = (W + 2*pad - filter_w) // stride + 1

    # Pad the *argument*, never a notebook-level variable, so the function
    # works on a fresh kernel and for any input batch.
    x_pad = np.pad(x, [(0, 0), (0, 0), (pad, pad), (pad, pad)], 'constant')
    col = np.zeros((B, C, filter_h, filter_w, out_h, out_w))

    # For every offset inside the filter, copy the strided grid of pixels that
    # sit at that offset in each window. Loop names are distinct from ``x`` so
    # the input is not shadowed.
    for fy in range(filter_h):
        y_max = fy + stride * out_h

        for fx in range(filter_w):
            x_max = fx + stride * out_w

            col[:, :, fy, fx, :, :] = x_pad[:, :, fy:y_max:stride, fx:x_max:stride]

    # (B, C, fh, fw, out_h, out_w) -> (B, out_h, out_w, C, fh, fw) -> 2-D.
    col = col.transpose(0, 4, 5, 1, 2, 3).reshape(B * out_h * out_w, -1)

    return col

In [44]:
from torchvision.datasets import FashionMNIST

# Load Fashion-MNIST from the current directory; download=True asks
# torchvision to fetch the files when they are needed.
data = FashionMNIST(download=True, root=".")

# Keep the first 32 images and insert a channel axis at position 1, then
# convert to a NumPy array (resulting shape is shown below).
sample_data = data.data[:32].unsqueeze(1).numpy()

sample_data.shape

(32, 1, 28, 28)

In [45]:
# Unfold every 3x3 window of the sample batch (no padding, unit stride);
# each row of the result is one flattened window.
transformed_image = image2col(sample_data, filter_size=(3, 3), stride=1, pad=0)
transformed_image.shape

(21632, 9)

In [46]:
class Convolution:
    """2-D convolution layer (forward pass only) built on ``image2col``.

    Attributes:
        W: Filter weights with shape (FN, C, FH, FW).
        b: Bias added to the (rows, FN) product, so it must broadcast over
            the filter axis (e.g. shape (FN,)).
        stride: Step between consecutive filter positions.
        pad: Zero padding applied on every side of the input's H and W.
    """

    def __init__(self, W, b, stride, pad):
        self.W = W
        self.b = b
        self.stride = stride
        self.pad = pad

    def forward(self, x):
        """Convolve a batch with the stored filters.

        Args:
            x: Input array with shape (N, C, H, W).

        Returns:
            Array with shape (N, FN, out_h, out_w).
        """
        FN, C, FH, FW = self.W.shape
        N, C, H, W = x.shape

        # Same floor-division formula that image2col uses, so the reshape
        # below always matches the number of unfolded rows.
        out_h = (H + 2*self.pad - FH) // self.stride + 1
        out_w = (W + 2*self.pad - FW) // self.stride + 1

        # Unfold the input with this layer's filter size, padding and stride.
        col = image2col(x, (FH, FW), pad=self.pad, stride=self.stride)
        # One column per filter: (C*FH*FW, FN).
        col_W = self.W.reshape(FN, -1).T
        out = np.dot(col, col_W) + self.b

        # (N*out_h*out_w, FN) -> (N, out_h, out_w, FN) -> (N, FN, out_h, out_w).
        out = out.reshape(N, out_h, out_w, -1).transpose(0, 3, 1, 2)

        return out


# Backward-compatible alias for the original (misspelled) class name.
Covolution = Convolution

In [47]:
class Pooling:
    """Max-pooling layer (forward pass only) built on ``image2col``.

    Attributes:
        kernel_size: (kernel_height, kernel_width) of the pooling window.
        stride: Step between consecutive pooling windows.
        pad: Padding forwarded to ``image2col`` for H and W.
    """

    def __init__(self, kernel_size: Tuple, stride=1, pad=0):
        self.kernel_size = kernel_size
        self.stride = stride
        self.pad = pad

    def forward(self, x):
        """Take the maximum over every pooling window, per channel.

        Args:
            x: Input array with shape (B, C, H, W).

        Returns:
            Array with shape (B, C, out_h, out_w).
        """
        B, C, H, W = x.shape
        kernel_h, kernel_w = self.kernel_size[0], self.kernel_size[1]

        # Padding is part of the output size because the unfold below pads
        # the input; leaving it out makes the final reshape fail for pad > 0.
        out_h = (H + 2*self.pad - kernel_h) // self.stride + 1
        out_w = (W + 2*self.pad - kernel_w) // self.stride + 1

        # image2col takes the kernel as one tuple, then pad and stride.
        # NOTE(review): image2col pads with zeros, so with pad > 0 a padded
        # zero can win the max over all-negative windows -- confirm intended.
        col = image2col(x, (kernel_h, kernel_w), pad=self.pad, stride=self.stride)
        # Split the channel axis out of each row: one row per (window, channel).
        col = col.reshape(-1, kernel_h * kernel_w)

        out = np.max(col, axis=1)

        # Rows are ordered (B, out_h, out_w, C); move channels to axis 1.
        out = out.reshape(B, out_h, out_w, C).transpose(0, 3, 1, 2)

        return out

In [48]:
from collections import OrderedDict

class SimpleConvNet:
    """Small CNN: Conv -> ReLU -> 2x2 max-pool -> FC -> ReLU -> FC -> softmax loss.

    Args:
        input_dim: (channels, height, width) of one input sample.
        conv_param: Dict with any of the keys "filter_num", "filter_size"
            (side length of the square filter), "pad" and "stride". Missing
            keys, or ``None``, fall back to ``_DEFAULT_CONV_PARAM``.
        hidden_size: Number of units in the first fully connected layer.
        output_size: Number of output classes.
        weight_init_std: Standard deviation scale for the random weights.

    Attributes:
        param: Dict of parameter arrays "W1", "b1", "W2", "b2", "W3", "b3".
        pool_output_size: Flattened size of the pooling layer's output, i.e.
            the input width of the first fully connected layer.
    """

    # Defaults used for any key the caller does not supply.
    _DEFAULT_CONV_PARAM = {"filter_num": 30, "filter_size": 5,
                           "pad": 0, "stride": 1}

    def __init__(self, input_dim=(1, 28, 28),
                 conv_param=None,
                 hidden_size=100, output_size=10, weight_init_std=0.01):

        # Merge into a fresh dict so neither the class defaults nor the
        # caller's dict is ever shared or mutated.
        conv_param = {**self._DEFAULT_CONV_PARAM, **(conv_param or {})}

        self.filter_num = conv_param["filter_num"]
        self.filter_size = conv_param["filter_size"]
        self.filter_pad = conv_param["pad"]
        self.filter_stride = conv_param["stride"]

        self.input_dim = input_dim
        self.hidden_size = hidden_size
        self.output_size = output_size

        input_h, input_w = input_dim[1], input_dim[2]

        # Integer arithmetic throughout so the sizes match what the layers
        # actually produce (floor division, as in image2col).
        conv_out_h = (input_h - self.filter_size + 2 * self.filter_pad) // self.filter_stride + 1
        conv_out_w = (input_w - self.filter_size + 2 * self.filter_pad) // self.filter_stride + 1

        # Pool1 is a 2x2 window with stride 2 (see __define_layers).
        pool_out_h = (conv_out_h - 2) // 2 + 1
        pool_out_w = (conv_out_w - 2) // 2 + 1
        self.pool_output_size = self.filter_num * pool_out_h * pool_out_w

        self.__weight_init(weight_init_std)

    def __define_layers(self):
        """Build the ordered layer stack from the initialised parameters.

        Not called from ``__init__``: ``Relu``, ``Affine`` and
        ``SoftmaxWithLoss`` are still TODO in this notebook, and all four
        layer classes named here must be defined before this method is used.
        """
        self.layers = OrderedDict()

        self.layers["Conv1"] = Convolution(self.param["W1"],
                                           self.param["b1"],
                                           self.filter_stride,
                                           self.filter_pad)

        self.layers["Relu1"] = Relu()
        self.layers["Pool1"] = Pooling(kernel_size=(2, 2), stride=2)
        self.layers["FC1"] = Affine(self.param["W2"], self.param["b2"])
        self.layers["Relu2"] = Relu()
        self.layers["FC2"] = Affine(self.param["W3"], self.param["b3"])
        self.layers["Out"] = SoftmaxWithLoss()

    def __weight_init(self, weight_init_std):
        """Create the weight and bias arrays for all three parameterised layers."""
        self.param = {}

        # Conv layer: one bias per filter (not per filter pixel).
        self.param["W1"] = weight_init_std * np.random.randn(self.filter_num, self.input_dim[0],
                                                             self.filter_size, self.filter_size)
        self.param["b1"] = np.zeros(self.filter_num)

        # FC layer 1
        self.param["W2"] = weight_init_std * np.random.randn(self.pool_output_size, self.hidden_size)
        self.param["b2"] = np.zeros(self.hidden_size)

        # FC layer 2
        self.param["W3"] = weight_init_std * np.random.randn(self.hidden_size, self.output_size)
        self.param["b3"] = np.zeros(self.output_size)

In [None]:
# TODO
# Affine layer
# Relu layer
# SoftmaxWithLoss layer