In [1]:
import torch
import torch.nn as nn

class SimpleCNN(nn.Module):
    """Minimal single-channel CNN: one convolution, a ReLU and one max-pool.

    The forward pass prints the tensor shape after every stage so the effect
    of each layer on the dimensions can be followed.
    """

    def __init__(self):
        super().__init__()

        # 2x2 kernel, stride 1, no padding: height and width each shrink by 1.
        self.conv1 = nn.Conv2d(1, 1, kernel_size=(2, 2), stride=(1, 1), padding=0)
        # Non-linearity applied to the convolution output (shape is unchanged).
        self.relu = nn.ReLU()
        # Pooling is applied per channel, so it takes no channel arguments.
        # A 1x2 window with stride 1 shrinks only the width, by 1.
        self.max1 = nn.MaxPool2d(kernel_size=(1, 2), stride=(1, 1))

    def forward(self, input: torch.Tensor):
        """Apply conv -> ReLU -> max-pool to ``input``, printing each stage's shape.

        Args:
            input: Single-channel image batch, e.g. shaped (N, 1, H, W).

        Returns:
            The pooled feature map.
        """
        stage_output = input
        for stage in (self.conv1, self.relu, self.max1):
            stage_output = stage(stage_output)
            print(stage_output.shape)
        return stage_output

# Conv2d expects its input laid out as (N, C, H, W):
#   N -> batch size (4), C -> channels (1), H -> height (3), W -> width (4)
# Seed the global RNG first so that the random input and the layer's initial
# weights (and therefore the printed tensor) are identical on every
# Restart Kernel -> Run All.
torch.manual_seed(0)
input = torch.randn((4, 1, 3, 4))
simple_cnn = SimpleCNN()
output = simple_cnn(input)
print(output)

torch.Size([4, 1, 2, 3])
torch.Size([4, 1, 2, 3])
torch.Size([4, 1, 2, 2])
tensor([[[[0.0000, 0.0000],
          [0.0000, 0.0000]]],


        [[[0.0000, 0.3567],
          [0.3209, 0.5673]]],


        [[[0.0000, 0.0000],
          [0.5480, 0.5480]]],


        [[[0.3254, 0.3254],
          [0.0825, 0.3678]]]], grad_fn=<MaxPool2DWithIndicesBackward0>)


In [2]:
class MultiChannelCNN(nn.Module):
    """Conv -> ReLU -> max-pool feature extractor for multi-channel input.

    The forward pass prints the tensor shape after every stage.

    Args:
        in_channels: Number of channels of the input tensor. This is fixed by
            the data rather than tuned; it must match the input's channel
            dimension. Defaults to 3.
        out_channels: Number of filters, i.e. feature maps produced by the
            convolution. This is the hyper-parameter to tune. Defaults to 6.
    """

    def __init__(self, in_channels: int = 3, out_channels: int = 6):
        super().__init__()

        # 2x2 kernel, stride 1, no padding: height and width each shrink by 1.
        self.conv1 = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=(2, 2),
            stride=(1, 1),
            padding=0,
        )
        # Activation function after the convolution layer.
        self.relu = nn.ReLU()
        # Max pooling is applied per channel, so it has no channel arguments.
        self.max1 = nn.MaxPool2d(kernel_size=(1, 2), stride=(1, 1))

    def forward(self, input: torch.Tensor):
        """Apply conv -> ReLU -> max-pool to ``input``, printing each stage's shape.

        Args:
            input: Tensor whose channel dimension equals ``in_channels``,
                e.g. shaped (N, in_channels, H, W).

        Returns:
            The pooled feature map with ``out_channels`` channels.
        """
        conv1out = self.conv1(input)
        print(conv1out.shape)
        relu1 = self.relu(conv1out)
        print(relu1.shape)
        max1out = self.max1(relu1)
        print(max1out.shape)
        return max1out

In [3]:
# Random batch of 4 three-channel samples, each 3 high and 4 wide: (N, C, H, W).
input = torch.randn(4, 3, 3, 4)
multi_channel_cnn = MultiChannelCNN()
output = multi_channel_cnn(input)
print(output)


torch.Size([4, 6, 2, 3])
torch.Size([4, 6, 2, 3])
torch.Size([4, 6, 2, 2])
tensor([[[[0.6714, 0.6714],
          [0.0000, 1.9236]],

         [[1.7209, 1.7209],
          [0.0000, 0.0000]],

         [[0.7667, 0.8927],
          [0.0000, 0.4141]],

         [[0.5412, 0.5412],
          [0.5412, 0.0000]],

         [[0.0000, 0.0000],
          [1.0475, 1.0028]],

         [[0.0000, 0.0000],
          [0.6448, 0.8287]]],


        [[[0.4833, 0.4833],
          [0.6043, 0.6043]],

         [[0.3296, 0.3296],
          [0.4693, 0.4693]],

         [[0.0806, 0.7440],
          [0.3055, 0.6293]],

         [[0.0000, 0.1928],
          [0.3018, 0.2471]],

         [[0.3296, 0.3296],
          [0.6285, 0.6285]],

         [[0.0572, 0.0000],
          [0.0000, 0.0000]]],


        [[[0.0000, 0.0000],
          [0.5248, 0.4185]],

         [[0.9558, 0.9558],
          [0.1116, 0.4283]],

         [[0.1920, 0.1920],
          [0.4236, 0.4236]],

         [[0.0063, 0.8014],
          [0.7458, 0.74

- Convolutional neural networks (CNNs) expect a grid-shaped input: one matrix per channel, batched as `(N, C, H, W)`. Their output is a set of feature maps.
- Feedforward networks (FFNs) expect a vector as input. For classification, the output is a vector of length [number of classes].
- Flattening transforms a feature map into a vector by concatenating all of its elements row by row.
- Each feature map becomes a vector, and these vectors are concatenated into one long one-dimensional vector per sample.
- Pipeline: process the input with the CNN, flatten the result, and feed it to the feedforward network.

In [4]:
feature_map = torch.randn((4, 6, 2, 2))
print("Initial feature map shape: ", feature_map.shape)

# Derive the sizes from the tensor itself instead of hard-coding them.
batch_size = feature_map.shape[0]
total = feature_map.numel()  # 4 * 6 * 2 * 2 = 96 elements
print(total)
# Target shape is [batch_size, embedding_length], where embedding_length is
# the number of elements per sample. Integer division keeps it an int, which
# is what reshape requires for a dimension.
second_dim = total // batch_size
print('Second dimension: ', second_dim)

# Explicit target shape: [4, 24].
emb = feature_map.reshape((batch_size, second_dim))
# Passing -1 asks PyTorch to infer that dimension from the element count.
simple_emb = feature_map.reshape((batch_size, -1))

# We use flatten to connect a FFN (as predictor) to a CNN (using CNN as a feature extractor)

class DogCatPredictor(nn.Module):
    """Two-class classifier: a CNN feature extractor followed by a linear predictor.

    The linear layer takes 24 features per sample, which is what the extractor
    produces for input shaped (N, 3, 3, 4): convolution and pooling yield a
    (N, 6, 2, 2) feature map, i.e. 6 * 2 * 2 = 24 values per sample.
    The forward pass prints the tensor shape after every stage.
    """

    def __init__(self):
        super().__init__()

        # Feature extractor (multi-channel CNN).
        # in_channels is fixed by the data; out_channels is a hyper-parameter.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=(2, 2), stride=(1, 1), padding=0)
        # Activation function after the convolution layer.
        self.relu = nn.ReLU()
        # Max pooling is applied per channel, so it has no channel arguments.
        self.max1 = nn.MaxPool2d(kernel_size=(1, 2), stride=(1, 1))

        # Predictor (feedforward layer): 24 flattened features -> 2 class logits.
        self.pred = nn.Linear(in_features=24, out_features=2)

    def forward(self, input: torch.Tensor):
        """Extract features from ``input``, flatten them and return class logits.

        Args:
            input: Three-channel image batch, e.g. shaped (N, 3, 3, 4).

        Returns:
            Logits with one row per sample and two columns.

        Raises:
            RuntimeError: If the feature map does not hold exactly 24 values
                per sample (raised by the linear layer).
        """
        # Extract features.
        conv1out = self.conv1(input)
        print(conv1out.shape)
        relu1 = self.relu(conv1out)
        print(relu1.shape)
        max1out = self.max1(relu1)
        print(max1out.shape)

        # Flatten each sample's (C, H, W) feature map into one vector. The
        # per-sample length is read from the tensor rather than hard-coded:
        # with reshape((-1, 24)) an unexpected feature-map size could be
        # silently re-split into a different number of rows, whereas here the
        # batch dimension is preserved and the linear layer reports a mismatch.
        features_per_sample = max1out.shape[-3:].numel()
        emb = max1out.reshape((-1, features_per_sample))

        # Feed the flattened features to the predictor.
        logits = self.pred(emb)
        print(logits.shape)
        return logits

# Build the classifier and run it on a random batch shaped (4, 3, 3, 4).
dog_cat = DogCatPredictor()
input = torch.randn(4, 3, 3, 4)
logits = dog_cat(input)
print(logits)

Initial feature map shape:  torch.Size([4, 6, 2, 2])
96
Second dimension:  24.0
torch.Size([4, 6, 2, 3])
torch.Size([4, 6, 2, 3])
torch.Size([4, 6, 2, 2])
torch.Size([4, 2])
tensor([[ 0.1864, -0.1845],
        [ 0.3665, -0.1059],
        [ 0.4957, -0.0530],
        [ 0.0438, -0.1378]], grad_fn=<AddmmBackward0>)


In [5]:
class SimpleDogCat(nn.Module):
    """Two-class classifier with the feature extractor bundled in ``nn.Sequential``.

    The linear layer takes 24 features per sample, which is what the extractor
    produces for input shaped (N, 3, 3, 4): a (N, 6, 2, 2) feature map.
    """

    def __init__(self):
        super().__init__()
        # Conv -> ReLU -> max-pool, applied in this order.
        self.feat_extractor = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=6, kernel_size=(2, 2), stride=(1, 1), padding=0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(1, 2), stride=(1, 1))
        )
        # Predictor: 24 flattened features -> 2 class logits.
        self.pred = nn.Linear(in_features=24, out_features=2)

    def forward(self, input: torch.Tensor):
        """Extract features from ``input``, flatten them and return class logits.

        Args:
            input: Three-channel image batch, e.g. shaped (N, 3, 3, 4).

        Returns:
            Logits with one row per sample and two columns.

        Raises:
            RuntimeError: If the feature map does not hold exactly 24 values
                per sample (raised by the linear layer).
        """
        feat = self.feat_extractor(input)
        # Flatten each sample's (C, H, W) feature map into one vector. Reading
        # the per-sample length from the tensor keeps the batch dimension
        # intact; a hard-coded reshape((-1, 24)) could silently re-split an
        # unexpected feature-map size into a different number of rows.
        features_per_sample = feat.shape[-3:].numel()
        emb = feat.reshape((-1, features_per_sample))
        logits = self.pred(emb)
        return logits

# Build the compact classifier and run it on a random batch shaped (4, 3, 3, 4).
simple_dog_cat = SimpleDogCat()
input = torch.randn(4, 3, 3, 4)
logits = simple_dog_cat(input)
print(logits)

tensor([[-0.1182, -0.4845],
        [-0.2371, -0.4347],
        [-0.3365, -0.4321],
        [-0.1892, -0.2207]], grad_fn=<AddmmBackward0>)
