<a href="https://colab.research.google.com/github/ismael-rtellez/CNN2_Series_Assignment/blob/main/Convolutional_Neural_Network_2(CNN2)_Sprint.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## CNN2 Series Assignment: SimpleConv2d

### 【Problem 1】Creating a 2-D convolutional layer

In [3]:
from re import L
import numpy as np

# Problem 1: Creating a 2D convolutional layer
class Conv2d:
    """2-D convolution layer over (N, C, H, W) arrays with a built-in SGD step.

    Parameters
    ----------
    in_channels : int
        Number of channels of the input.
    out_channels : int
        Number of filters, i.e. channels of the output.
    kernel_size : int or (int, int)
        Filter height and width. A single int means a square filter.
    stride : int
        Step between two consecutive windows (same for both axes).
    padding : int
        Number of zero rows/columns added on every side of the input.
    learning_rate : float
        Step size used by ``backward`` when it updates ``W`` and ``B``.

    Attributes
    ----------
    W : ndarray, shape (out_channels, in_channels, kh, kw)
    B : ndarray, shape (out_channels,)
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, learning_rate=0.01):
        self.in_channels = in_channels
        self.out_channels = out_channels
        # A scalar kernel size is shorthand for a square (k, k) filter.
        if np.ndim(kernel_size) == 0:
            self.kh = self.kw = int(kernel_size)
        else:
            self.kh, self.kw = kernel_size
        self.stride = stride
        self.padding = padding
        self.lr = learning_rate

        # Xavier-style initialization: standard deviation 1 / sqrt(fan_in)
        scale = np.sqrt(1. / (in_channels * self.kh * self.kw))
        self.W = np.random.randn(out_channels, in_channels, self.kh, self.kw) * scale
        self.B = np.zeros(out_channels)

    def forward(self, x):
        """Convolve ``x`` (N, C, H, W) with the filters.

        Caches the raw and the zero-padded input for ``backward`` and returns
        an array of shape (N, out_channels, out_h, out_w).

        Raises
        ------
        ValueError
            If the channel axis of ``x`` does not match the filters.
        """
        self.x = x
        N, C, H, W = x.shape
        if C != self.W.shape[1]:
            # Without this check a single-channel mismatch would silently broadcast.
            raise ValueError(
                f"expected {self.W.shape[1]} input channels, got {C}"
            )
        out_h = (H + 2 * self.padding - self.kh) // self.stride + 1
        out_w = (W + 2 * self.padding - self.kw) // self.stride + 1
        self.out_shape = (N, self.out_channels, out_h, out_w)

        if self.padding > 0:
            pad = self.padding
            x = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode='constant')

        self.x_padded = x
        out = np.zeros(self.out_shape)

        # Only the spatial positions are looped over; the whole batch and all
        # filters are handled by one tensor contraction per position.
        for i in range(out_h):
            h_start = i * self.stride
            h_end = h_start + self.kh
            for j in range(out_w):
                w_start = j * self.stride
                w_end = w_start + self.kw
                region = x[:, :, h_start:h_end, w_start:w_end]      # (N, C, kh, kw)
                # Contract (C, kh, kw) of the window with (C, kh, kw) of every filter -> (N, OC)
                out[:, :, i, j] = np.tensordot(region, self.W, axes=([1, 2, 3], [1, 2, 3]))

        out += np.reshape(self.B, (1, -1, 1, 1))
        return out

    def backward(self, d_out):
        """Back-propagate ``d_out`` and apply one SGD step to ``W`` and ``B``.

        ``d_out`` has the shape returned by ``forward``. The returned gradient
        has the shape of the (unpadded) input given to ``forward`` and is
        computed with the weights as they were before the update.
        """
        d_out = np.asarray(d_out)
        # Accumulate in the widest dtype involved instead of a fixed float32,
        # so float64 parameters do not lose precision in their gradients.
        grad_dtype = np.result_type(self.W.dtype, d_out.dtype, self.x_padded.dtype)
        dx = np.zeros(self.x_padded.shape, dtype=grad_dtype)
        dW = np.zeros(self.W.shape, dtype=grad_dtype)

        _, _, out_h, out_w = d_out.shape

        for i in range(out_h):
            h_start = i * self.stride
            h_end = h_start + self.kh
            for j in range(out_w):
                w_start = j * self.stride
                w_end = w_start + self.kw
                region = self.x_padded[:, :, h_start:h_end, w_start:w_end]  # (N, C, kh, kw)
                grad = d_out[:, :, i, j]                                    # (N, OC)

                # Sum over the batch: (OC, N) . (N, C, kh, kw) -> (OC, C, kh, kw)
                dW += np.tensordot(grad, region, axes=([0], [0]))
                # Sum over the filters: (N, OC) . (OC, C, kh, kw) -> (N, C, kh, kw)
                dx[:, :, h_start:h_end, w_start:w_end] += np.tensordot(grad, self.W, axes=([1], [0]))

        # The bias is added to every output position, so its gradient is the plain sum
        dB = d_out.sum(axis=(0, 2, 3))

        # removing padding if added
        if self.padding > 0:
            dx = dx[:, :, self.padding:-self.padding, self.padding:-self.padding]

        # Updating weights
        self.W -= self.lr * dW
        self.B -= self.lr * dB

        return dx

### 【Problem 2】Experiments with 2D convolutional layers on small arrays

In [4]:
# Problem 2: Experiments with the 2D convolutional layer on small arrays

# Forward input for CNN2: the values 1..16 laid out as one 4x4 image, shape (1, 1, 4, 4)
x = np.arange(1, 17).reshape(1, 1, 4, 4)

# Hand-written filters: two 3x3 kernels, each with a single +1/-1 pair
w = np.zeros((2, 1, 3, 3), dtype=np.float32)
w[0, 0, 1, 1], w[0, 0, 2, 1] = 1, -1   # centre minus the pixel below
w[1, 0, 1, 1], w[1, 0, 1, 2] = -1, 1   # pixel to the right minus the centre

b = np.zeros(2, dtype=np.float32)

# 1 input channel -> 2 output channels, 3x3 kernel, no padding;
# the random initial parameters are replaced by the fixed ones above
conv = Conv2d(1, 2, (3, 3), stride=1, padding=0)
conv.W, conv.B = w.copy(), b.copy()

# forward pass
out = conv.forward(x)
print("Forward Output: \n", out)
print("\n")

# backward pass with a fixed upstream gradient of shape (1, 2, 2, 2)
d_out = np.asarray(
    [
        [[-4, -4], [-4, -4]],
        [[1, -7], [1, -11]],
    ],
    dtype=np.float32,
)[np.newaxis]
dx = conv.backward(d_out)
print("Backward Output (dx): \n", dx)
print("\n")

Forward Output: 
 [[[[-4. -4.]
   [-4. -4.]]

  [[ 1.  1.]
   [ 1.  1.]]]]


Backward Output (dx): 
 [[[[  0.   0.   0.   0.]
   [  0.  -5.   4.  -7.]
   [  0.  -1.  12. -11.]
   [  0.   4.   4.   0.]]]]




### 【Problem 3】Output size after 2-dimensional convolution

In [5]:
# Problem 3: Output size after 2-dimensional convolution
def conv2d_output_size(H_in, W_in, kernel_size, stride=1, padding=0):
    """Return ``(H_out, W_out)`` of a 2-D convolution.

    Each axis follows ``(size + 2 * padding - kernel) // stride + 1``.

    Parameters
    ----------
    H_in, W_in : int
        Input height and width.
    kernel_size : int or (int, int)
        Filter height and width; a single int means a square filter.
    stride : int
        Step between windows, shared by both axes.
    padding : int
        Padding added on each side, shared by both axes.
    """
    try:
        kh, kw = kernel_size
    except TypeError:
        # Not iterable: a single number describes a square kernel.
        kh = kw = kernel_size
    H_out = (H_in + 2 * padding - kh) // stride + 1
    W_out = (W_in + 2 * padding - kw) // stride + 1
    return H_out, W_out

### 【Problem 4】Creation of maximum pooling layer

In [6]:
# Problem 4: Creating a max pooling layer
class MaxPool2D:
    """Max pooling over the two trailing axes of an (N, C, H, W) array.

    Parameters
    ----------
    pool_size : int or (int, int)
        Window height and width; a single int means a square window.
    stride : int
        Step between two consecutive windows (same for both axes).
    """

    def __init__(self, pool_size=(2, 2), stride=1):
        # A scalar pool size is shorthand for a square window.
        if np.ndim(pool_size) == 0:
            self.ph = self.pw = int(pool_size)
        else:
            self.ph, self.pw = pool_size
        self.stride = stride

    def forward(self, x):
        """Return the per-window maxima, shape (N, C, out_h, out_w).

        Also stores in ``self.arg_max`` the row-major position of each
        maximum inside its window, which ``backward`` uses for routing.
        """
        self.x = x
        N, C, H, W = x.shape
        out_h = (H - self.ph) // self.stride + 1
        out_w = (W - self.pw) // self.stride + 1
        self.arg_max = np.zeros((N, C, out_h, out_w), dtype=np.int32)

        out = np.zeros((N, C, out_h, out_w))
        window_len = self.ph * self.pw

        for i in range(out_h):
            h_start = i * self.stride
            for j in range(out_w):
                w_start = j * self.stride
                # Flatten each window so argmax yields a row-major index (r * pw + c)
                windows = x[:, :, h_start:h_start + self.ph, w_start:w_start + self.pw]
                windows = windows.reshape(N, C, window_len)
                self.arg_max[:, :, i, j] = windows.argmax(axis=2)
                out[:, :, i, j] = windows.max(axis=2)
        return out

    def backward(self, d_out):
        """Route every upstream gradient to the input cell that held the maximum.

        Gradients of overlapping windows that share a maximum are summed.
        Returns an array with the shape of the input given to ``forward``.
        """
        d_out = np.asarray(d_out)
        N, C, H, W = self.x.shape
        out_h, out_w = d_out.shape[2:]
        # Do not inherit an integer dtype from the input: that would truncate
        # floating-point gradients when they are written into the buffer.
        dx = np.zeros(self.x.shape, dtype=np.result_type(d_out.dtype, self.x.dtype))
        n_idx, c_idx = np.indices((N, C))

        for i in range(out_h):
            h_start = i * self.stride
            for j in range(out_w):
                w_start = j * self.stride
                flat = self.arg_max[:, :, i, j]
                h_index = h_start + flat // self.pw
                w_index = w_start + flat % self.pw
                # For one (i, j) every (n, c) pair targets a distinct cell,
                # so the fancy-indexed in-place add cannot drop updates.
                dx[n_idx, c_idx, h_index, w_index] += d_out[:, :, i, j]
        return dx

### 【Problem 5】(Advanced task) Creating average pooling

In [7]:
# Problem 5: (Advance task) Creating average pooling
class AveragePool2D:
    """Average pooling over the two trailing axes of an (N, C, H, W) array.

    Parameters
    ----------
    pool_size : int or (int, int)
        Window height and width; a single int means a square window.
    stride : int
        Step between two consecutive windows (same for both axes).
    """

    def __init__(self, pool_size=(2, 2), stride=2):
        # A scalar pool size is shorthand for a square window.
        if np.ndim(pool_size) == 0:
            self.ph = self.pw = int(pool_size)
        else:
            self.ph, self.pw = pool_size
        self.stride = stride

    def forward(self, x):
        """Return the per-window means, shape (N, C, out_h, out_w)."""
        self.x = x
        N, C, H, W = x.shape
        out_h = (H - self.ph) // self.stride + 1
        out_w = (W - self.pw) // self.stride + 1

        out = np.zeros((N, C, out_h, out_w))

        for i in range(out_h):
            h_start = i * self.stride
            for j in range(out_w):
                w_start = j * self.stride
                windows = x[:, :, h_start:h_start + self.ph, w_start:w_start + self.pw]
                out[:, :, i, j] = windows.mean(axis=(2, 3))
        return out

    def backward(self, d_out):
        """Spread every upstream gradient evenly over the cells of its window.

        Returns an array with the shape of the input given to ``forward``.
        """
        d_out = np.asarray(d_out)
        out_h, out_w = d_out.shape[2:]
        # The shares are fractions, so the buffer must be floating point even
        # when the cached input (or d_out) has an integer dtype.
        grad_dtype = np.result_type(d_out.dtype, self.x.dtype, np.float32)
        dx = np.zeros(self.x.shape, dtype=grad_dtype)
        window_area = self.ph * self.pw

        for i in range(out_h):
            h_start = i * self.stride
            for j in range(out_w):
                w_start = j * self.stride
                share = d_out[:, :, i, j] / window_area            # (N, C)
                dx[:, :, h_start:h_start + self.ph, w_start:w_start + self.pw] += share[:, :, None, None]
        return dx

### 【Problem 6】Flattening

In [8]:
# Problem 6: Smoothing
class Flatten:
    """Collapse every axis after the first into one feature axis."""

    def forward(self, x):
        """Record ``x.shape`` and return ``x`` reshaped to ``(x.shape[0], -1)``."""
        shape = x.shape
        self.orig_shape = shape
        batch = shape[0]
        return x.reshape(batch, -1)

    def backward(self, d_out):
        """Give ``d_out`` the shape recorded by the last ``forward`` call."""
        restored = d_out.reshape(self.orig_shape)
        return restored

### 【Problem 7】Learning and estimation

In [2]:
import numpy as np
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical


# Problem 7: Learning and estimation
class ReLu:
    """Rectified linear unit: passes positive entries, zeroes the rest."""

    def forward(self, x):
        """Return ``x`` with every non-positive entry multiplied by zero."""
        positive = x > 0
        # Kept for backward: gradients only flow where the input was positive
        self.mask = positive
        return x * positive

    def backward(self, d_out):
        """Zero the gradient wherever the forward input was not positive."""
        gate = self.mask
        return d_out * gate

class Dense:
    """Fully connected layer ``x . W + B`` that applies its own SGD step in ``backward``."""

    def __init__(self, in_features, out_features, lr=0.01):
        # Normal weights with standard deviation 1 / sqrt(in_features); zero biases
        std = np.sqrt(1. / (in_features))
        self.W = std * np.random.randn(in_features, out_features)
        self.B = np.zeros(out_features)
        self.lr = lr

    def forward(self, x):
        """Cache ``x`` for the backward pass and return the affine map of it."""
        self.x = x
        projected = np.dot(x, self.W)
        return projected + self.B

    def backward(self, d_out):
        """Return the gradient w.r.t. the input, then update ``W`` and ``B``.

        The input gradient is computed first, so it uses the weights as
        they were before this call's update.
        """
        grad_input = np.dot(d_out, self.W.T)
        grad_W = np.dot(self.x.T, d_out)
        grad_B = np.sum(d_out, axis=0)

        self.W -= self.lr * grad_W
        self.B -= self.lr * grad_B
        return grad_input

class SoftmaxCrossEntropy:
    """Softmax over axis 1 followed by a batch-averaged cross-entropy loss."""

    def forward(self, x, y):
        """Return the loss of scores ``x`` against targets ``y``.

        Both the targets and the softmax probabilities are kept for ``backward``.
        """
        self.y = y
        probabilities = self._softmax(x)
        self.y_pred = probabilities
        return self._cross_entropy(probabilities, y)

    def backward(self):
        """Return ``(y_pred - y)`` divided by the number of rows in ``y``."""
        batch = self.y.shape[0]
        return (self.y_pred - self.y) / batch

    def _softmax(self, x):
        # Subtracting the row maximum keeps the exponentials from overflowing
        shifted = x - np.max(x, axis=1, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=1, keepdims=True)

    def _cross_entropy(self, y_pred, y_true):
        # The 1e-7 offset avoids log(0) when a probability is exactly zero
        log_likelihood = np.sum(y_true * np.log(y_pred + 1e-7))
        return -log_likelihood / y_true.shape[0]

# Preprocess MNIST
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Keep the first 1000 training / 200 test images, cast to float32, divide the
# pixel values by 255 and reshape to (N, 1, 28, 28) for the conv layers
x_train = (x_train[:1000].astype(np.float32) / 255.0).reshape(-1, 1, 28, 28)
x_test = (x_test[:200].astype(np.float32) / 255.0).reshape(-1, 1, 28, 28)

# One-hot encode the matching labels over 10 classes
y_train = to_categorical(y_train[:1000], 10)
y_test = to_categorical(y_test[:200], 10)

# Define simple CNN
class Scratch2dCNNClassifier:
    """Small CNN: conv -> ReLU -> max-pool -> flatten -> dense -> ReLU -> dense.

    The layer sizes are fixed for (N, 1, 28, 28) inputs and 10 output classes.
    """

    def __init__(self):
        # 3x3 conv with padding 1 keeps 28x28; the 2x2/stride-2 pool halves it to 14x14
        self.conv = Conv2d(in_channels=1, out_channels=8, kernel_size=(3, 3), stride=1, padding=1)
        self.relu1 = ReLu()
        self.pool = MaxPool2D(pool_size=(2, 2), stride=2)
        self.flatten = Flatten()
        self.fc1 = Dense(in_features=14 * 14 * 8, out_features=64)
        self.relu2 = ReLu()
        self.fc2 = Dense(in_features=64, out_features=10)
        self.loss_fn = SoftmaxCrossEntropy()

    def _pipeline(self):
        # Layers in forward order; rebuilt on every call so it always
        # reflects the current attribute values
        return (
            self.conv,
            self.relu1,
            self.pool,
            self.flatten,
            self.fc1,
            self.relu2,
            self.fc2,
        )

    def forward(self, x):
        """Run ``x`` through every layer in order and return the class scores."""
        for layer in self._pipeline():
            x = layer.forward(x)
        return x

    def backward(self, d_out):
        """Propagate ``d_out`` through the layers in reverse order.

        Each layer with parameters updates them inside its own ``backward``.
        Nothing is returned.
        """
        for layer in reversed(self._pipeline()):
            d_out = layer.backward(d_out)

    def train(self, x, y):
        """Do one forward/backward pass on a batch and return its loss."""
        scores = self.forward(x)
        loss = self.loss_fn.forward(scores, y)
        self.backward(self.loss_fn.backward())
        return loss

    def predict(self, x):
        """Return the index of the highest score for every sample."""
        scores = self.forward(x)
        return np.argmax(scores, axis=1)


# Training
model = Scratch2dCNNClassifier()
epochs = 3
batch_size = 100

for epoch in range(epochs):
    # Sum of the per-batch losses over one pass through the training subset
    loss_sum = 0
    for start in range(0, len(x_train), batch_size):
        stop = start + batch_size
        loss_sum += model.train(x_train[start:stop], y_train[start:stop])
    print(f"Epoch {epoch+1}, Loss: {loss_sum:.4f}")

# Accuracy: fraction of test samples whose predicted class matches the one-hot label
preds = model.predict(x_test)
true = y_test.argmax(axis=1)
accuracy = (preds == true).mean()
print("Test Accuracy: ", accuracy)
print("\n")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Epoch 1, Loss: 22.6792
Epoch 2, Loss: 22.0936
Epoch 3, Loss: 21.5214
Test Accuracy:  0.41




### 【Problem 8】(Advanced assignment) LeNet

In [3]:
# Problem 8: (Advanced Assignment) LeNet
class LeNet:
    """LeNet-style network: two conv/ReLU/max-pool stages and three dense layers.

    The layer sizes are fixed for (N, 1, 28, 28) inputs and 10 output classes.
    """

    def __init__(self):
        # Spatial size per stage: 28 -> conv 5x5 -> 24 -> pool -> 12 -> conv 5x5 -> 8 -> pool -> 4
        self.conv1 = Conv2d(in_channels=1, out_channels=6, kernel_size=(5, 5), stride=1, padding=0)
        self.relu1 = ReLu()
        self.pool1 = MaxPool2D(pool_size=(2, 2), stride=2)
        self.conv2 = Conv2d(in_channels=6, out_channels=16, kernel_size=(5, 5), stride=1, padding=0)
        self.relu2 = ReLu()
        self.pool2 = MaxPool2D(pool_size=(2, 2), stride=2)
        self.flatten = Flatten()
        self.fc1 = Dense(in_features=16 * 4 * 4, out_features=120)
        self.relu3 = ReLu()
        self.fc2 = Dense(in_features=120, out_features=84)
        self.relu4 = ReLu()
        self.fc3 = Dense(in_features=84, out_features=10)
        self.loss_fn = SoftmaxCrossEntropy()

    def _pipeline(self):
        # Layers in forward order; rebuilt on every call so it always
        # reflects the current attribute values
        return (
            self.conv1,
            self.relu1,
            self.pool1,
            self.conv2,
            self.relu2,
            self.pool2,
            self.flatten,
            self.fc1,
            self.relu3,
            self.fc2,
            self.relu4,
            self.fc3,
        )

    def forward(self, x):
        """Run ``x`` through every layer in order and return the class scores."""
        for layer in self._pipeline():
            x = layer.forward(x)
        return x

    def backward(self, d_out):
        """Propagate ``d_out`` through the layers in reverse order.

        Each layer with parameters updates them inside its own ``backward``.
        Nothing is returned.
        """
        for layer in reversed(self._pipeline()):
            d_out = layer.backward(d_out)

    def train(self, x, y):
        """Do one forward/backward pass on a batch and return its loss."""
        scores = self.forward(x)
        loss = self.loss_fn.forward(scores, y)
        self.backward(self.loss_fn.backward())
        return loss

    def predict(self, x):
        """Return the index of the highest score for every sample."""
        scores = self.forward(x)
        return np.argmax(scores, axis=1)

# Training LeNet
lenet = LeNet()
epochs = 3
batch_size = 100

for epoch in range(epochs):
    # Sum of the per-batch losses over one pass through the training subset
    loss_sum = 0
    for start in range(0, len(x_train), batch_size):
        stop = start + batch_size
        loss_sum += lenet.train(x_train[start:stop], y_train[start:stop])
    print(f"[LeNet] Epoch {epoch+1}, Loss: {loss_sum:.4f}")

# Accuracy: fraction of test samples whose predicted class matches the one-hot label
preds_ln = lenet.predict(x_test)
true_ln = y_test.argmax(axis=1)
acc = (preds_ln == true_ln).mean()
print("[LeNet] Test Accuracy: ", acc)
print("\n")

[LeNet] Epoch 1, Loss: 23.0647
[LeNet] Epoch 2, Loss: 22.7274
[LeNet] Epoch 3, Loss: 22.4579
[LeNet] Test Accuracy:  0.235




### 【Problem 9】(Advanced assignment) Survey of famous image recognition models

**AlexNet (2012)**

AlexNet can be considered one of the first deep convolutional neural networks to achieve a significant breakthrough in computer vision, and it helped make deep learning popular. It has five convolutional layers followed by three fully connected layers. AlexNet introduced several innovations. First, it replaced the traditional sigmoid or tanh activation functions with ReLU (Rectified Linear Unit), which sped up the training process and produced better results. It also used dropout regularization to reduce overfitting. It was trained in parallel on GPUs, which made it possible to train larger models. It also used the Local Response Normalization (LRN) technique, although this is seldom applied in contemporary architectures. The input of AlexNet is an RGB image of dimension 224x224.

**VGG16 (2014)**

VGG16 introduced a deeper architecture than AlexNet, with 13 convolutional layers and three fully connected layers, for a total of 16 learnable layers. The main contribution of VGG16 is that small (3x3) convolutional filters are used throughout the network without variation. This strategy showed that a stack of small filters can achieve higher performance than larger filters such as 7x7. The network has a plain and homogeneous structure, which makes it easy both to build and to extend. To reduce the spatial dimensions, max pooling layers are inserted at the end of some convolutional blocks. Although VGG16 achieved better accuracy and greater depth than AlexNet, it is also slower and consumes more memory because it has more parameters than AlexNet.

### 【Problem 10】Calculation of output size and number of parameters

In [4]:
# Problem 10: Calculation of output size and numbers of parameters
def compute_conv_output_and_parameters(H_in, W_in, C_in, kernel_size, C_out, stride=1, padding=0):
    """Return the output shape and parameter count of one convolution layer.

    Parameters
    ----------
    H_in, W_in, C_in : int
        Input height, width and number of channels.
    kernel_size : int or (int, int)
        Filter height and width; a single int means a square filter.
    C_out : int
        Number of filters (output channels).
    stride : int
        Step between windows, shared by both axes.
    padding : int
        Padding added on each side, shared by both axes.

    Returns
    -------
    ((H_out, W_out, C_out), total_params)
        Output shape in height-width-channel order and the number of
        weights plus biases.
    """
    try:
        kh, kw = kernel_size
    except TypeError:
        # Not iterable: a single number describes a square kernel.
        kh = kw = kernel_size

    # Output dimensions
    H_out = (H_in + 2 * padding - kh) // stride + 1
    W_out = (W_in + 2 * padding - kw) // stride + 1

    # Parameters per filter: C_in * kh * kw weights, plus 1 bias per output channel
    params_per_filter = C_in * kh * kw + 1
    total_params = params_per_filter * C_out

    return (H_out, W_out, C_out), total_params

# Each example prints the output shape (H, W, C) and the parameter count of one
# convolution layer. The labels are numbered to match the three cases.

# 1. Input: 144x144x3, Filter: 3x3, 6 filters, stride=1, padding=0
out1, params1 = compute_conv_output_and_parameters(144, 144, 3, (3, 3), 6)
print("Layer 1 Output: ", out1, "Params: ", params1)

# 2. Input: 60x60x24, Filter: 3x3, 48 filters, stride=1, padding=0
out2, params2 = compute_conv_output_and_parameters(60, 60, 24, (3, 3), 48)
print("Layer 2 Output: ", out2, "Params: ", params2)

# 3. Input: 20x20x10, Filter: 3x3, 20 filters, stride=2, padding=0
out3, params3 = compute_conv_output_and_parameters(20, 20, 10, (3, 3), 20, stride=2)
print("Layer 3 Output: ", out3, "Params: ", params3)

Layer 1 Output:  (142, 142, 6) Params:  168
Layer 1 Output:  (58, 58, 48) Params:  10416
Layer 1 Output:  (9, 9, 20) Params:  1820


### 【Problem 11】(Advanced assignment) Survey on filter size

**Why 3x3 filters are commonly used instead of larger ones such as 7x7**

Using small convolutional filters, e.g., 3x3, rather than large ones, e.g., 7x7, is a common convention in recent CNN architectures for multiple reasons. First, stacking several 3x3 layers adds more non-linearity to the network. An activation function is applied after each layer, so three 3x3 layers perform more activation operations than one 7x7 layer, and in this way deeper and more expressive networks are possible. Second, 3x3 filters use parameters more efficiently. For example, a 7x7 convolutional layer with $C$ input channels and $C$ output channels has $49C^2$ parameters, whereas three stacked 3x3 convolutional layers need only $3 \times 9C^2 = 27C^2$ parameters, a little more than half. The three 3x3 layers still cover a 7x7 receptive field despite the reduced parameter count, so the network can capture the same spatial information with the benefit of marginally more learning capacity and less computation.

**The effect of a 1x1 filter with no height or width direction**

A 1x1 convolution filter cannot extract spatial information because both its height and its width are one. Instead, it acts on the channel dimension and has the effect of applying a fully connected layer at every pixel position in the image. The primary applications of 1x1 filters are reducing the dimensionality of the input (e.g., reducing the number of channels before a more computationally expensive convolution) and adding depth to the network without expanding it spatially. Furthermore, 1x1 convolutions can be used to learn non-linear combinations of the feature channels. The idea was introduced in the Network-in-Network (NIN) architecture and was successfully adopted by GoogLeNet (Inception), where it was instrumental in reducing the channel depth of the feature maps and improving efficiency.