In [None]:
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.datasets import mnist

In [None]:
# Optional GPU setup, intentionally left disabled. Uncomment both lines to enable
# memory growth on the first GPU that TensorFlow lists.
# NOTE(review): `physical_devices[0]` raises IndexError when no GPU is listed, so
# guard it with `if physical_devices:` before enabling this on CPU-only machines.
# physical_devices = tf.config.list_physical_devices("GPU")
# tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [None]:
# Load the MNIST train/test splits.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Give both image arrays an explicit single-channel axis (N, 28, 28, 1) and
# rescale the pixel values by 1/255 as float32. The labels are left untouched.
x_train, x_test = (
    images.reshape(-1, 28, 28, 1).astype("float32") / 255.0
    for images in (x_train, x_test)
)

The fundamental building block of this CNN is Conv → BatchNorm → ReLU. If we want a network with, say, 10 such layers, it is not convenient to write the same code again for every layer. Instead, we can define the block once as a reusable layer class and instantiate it as many times as needed.

In [None]:
class CNNBlock(layers.Layer):
    """Conv2D -> BatchNormalization -> ReLU, packaged as one reusable layer.

    Args:
        out_channels: Number of filters of the convolution.
        kernel_size: Kernel size of the convolution (defaults to 3).
    """

    def __init__(self, out_channels, kernel_size=3):
        super().__init__()
        self.conv = layers.Conv2D(
            filters=out_channels,
            kernel_size=kernel_size,
            padding="same",
        )
        self.bn = layers.BatchNormalization()

    def call(self, input_tensor, training=False):
        """Run the forward pass.

        Args:
            input_tensor: Tensor fed to the convolution.
            training: Forwarded to the batch-normalization layer.

        Returns:
            The ReLU-activated, batch-normalized convolution output.
        """
        convolved = self.conv(input_tensor)
        normalized = self.bn(convolved, training=training)
        return tf.nn.relu(normalized)

**NOTE**
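- `out_channels`: the number of convolution filters, i.e. the depth (number of channels) of the layer's output.
- `self.conv`: the convolution layer of the block.
- `call`: the forward pass, which applies the convolution, batch normalization and ReLU in that order.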

In [47]:
class ResBlock(layers.Layer):
    """Three CNNBlocks with a convolutional shortcut added in before the third one.

    Args:
        channels: Indexable of (at least) three integers. ``channels[0]``,
            ``channels[1]`` and ``channels[2]`` are the filter counts of the
            three CNNBlocks; ``channels[1]`` is also the filter count of the
            shortcut convolution so that the two summed tensors match.
    """

    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        self.cnn1 = CNNBlock(channels[0], 3)
        self.cnn2 = CNNBlock(channels[1], 3)
        self.cnn3 = CNNBlock(channels[2], 3)
        self.pooling = layers.MaxPooling2D()
        # Shortcut branch: a convolution applied directly to the block input.
        self.identity_mapping = layers.Conv2D(channels[1], 3, padding="same")

    def call(self, input_tensor, training=False):
        """Run the forward pass.

        Args:
            input_tensor: Tensor fed to both the main branch and the shortcut.
            training: Forwarded to the three CNNBlocks.

        Returns:
            The max-pooled output of the third CNNBlock.
        """
        # Main branch: the first two conv blocks applied back to back.
        main_branch = self.cnn2(
            self.cnn1(input_tensor, training=training),
            training=training,
        )
        # Shortcut branch, evaluated after the main branch and before cnn3.
        shortcut = self.identity_mapping(input_tensor)
        merged = self.cnn3(main_branch + shortcut, training=training)
        return self.pooling(merged)

**NOTE**
- `channels`: a list of 3 integers giving the number of output channels (filters) of the 3 CNN blocks.
- identity_mapping: this is the main part of understanding of this block of code.
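  - The purpose of `self.identity_mapping` in the ResBlock is to introduce a shortcut (skip) connection. Strictly speaking it is a projection shortcut rather than a pure identity: the input passes through a convolution so that its channel count matches the output of the second CNN block before the two are added. This shortcut connection has several advantages.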
    - It adds the (projected) input tensor to the output of the second CNN block. Because of this addition, the stacked layers only have to learn the residual, i.e. the difference between the desired output and the input: $output = input + residual$. If the input already carries the relevant features, the residual can stay small, so the block "corrects" its input instead of recomputing everything, and the relevant information is preserved.
    - The term "shortcut" comes from the fact that this connection provides a shortcut for the gradient during backpropagation, allowing it to flow more easily through the network. In traditional deep networks, as the number of layers increases, the gradients can diminish (vanishing gradient) or explode (exploding gradient) as they propagate backward during training. This makes training deeper networks more challenging. With the shortcut connection, the gradients during backpropagation can "shortcut" through the addition operation without passing through the stacked convolutional blocks. The addition creates a "skip connection" that lets the gradient flow from the input of cnn3 straight back to the input of the block, effectively bypassing cnn1 and cnn2.

In [52]:
class ResNet_Like(keras.Model):
    """Small ResNet-style classifier.

    Three ResBlocks are followed by global average pooling and a Dense layer
    without activation, so the model outputs raw logits.

    Args:
        num_classes: Number of logits produced by the final Dense layer.
    """

    def __init__(self, num_classes=10):
        super(ResNet_Like, self).__init__()
        self.block1 = ResBlock([32, 32, 64])
        self.block2 = ResBlock([128, 128, 256])
        self.block3 = ResBlock([128, 256, 512])
        self.pool = layers.GlobalAveragePooling2D()  # alternative: layers.Flatten()
        self.classifier = layers.Dense(num_classes)

    def call(self, input_tensor, training=False):
        """Run the forward pass.

        Args:
            input_tensor: Batch of images fed to the first ResBlock.
            training: Forwarded to the three ResBlocks.

        Returns:
            Logits tensor produced by the final Dense layer.
        """
        x = self.block1(input_tensor, training=training)
        x = self.block2(x, training=training)
        x = self.block3(x, training=training)
        # The pooling layer has no training-specific behaviour, so the flag is
        # not forwarded to it.
        x = self.pool(x)
        return self.classifier(x)

    def model(self, input_shape=(28, 28, 1)):
        """Wrap this network in a functional ``keras.Model``.

        The functional wrapper exposes ``layers``, per-layer ``input`` /
        ``output`` tensors and a complete ``summary()``.

        Args:
            input_shape: Shape of a single input sample, without the batch
                axis. Defaults to (28, 28, 1), the shape used for MNIST here.

        Returns:
            A functional ``keras.Model`` that shares this instance's layers.
        """
        inputs = keras.Input(shape=input_shape)
        # Trace with training=None rather than the `call` default of False.
        # An explicit False would be recorded as a call argument of every block
        # in the functional graph, pinning the blocks (and their batch
        # normalization) to inference mode even while `fit()` is running.
        # With None, Keras supplies the flag at run time.
        return keras.Model(inputs=[inputs], outputs=self.call(inputs, training=None))

**NOTE**
- This model is ResNet-like; it is not an exact implementation of ResNet.
- Global Average Pooling 2D reduces the spatial dimensions of the tensor to a single value per channel by averaging the values across the spatial dimensions. This is often used as an alternative to traditional flattening before the final classification layer.

In [53]:
# Build the full network as a functional model so that its layers are addressable.
model = ResNet_Like().model()

# Keep everything from the input (layers[0]) up to and including layers[2],
# then attach a fresh Flatten + 10-unit Dense head on top of that output.
base_input, base_output = model.layers[0].input, model.layers[2].output
new_head = layers.Dense(10)
output = new_head(layers.Flatten()(base_output))

# From here on, `model` refers to the truncated network with the new head.
model = keras.Model(inputs=base_input, outputs=output)

In [54]:
# The network outputs raw logits (the last Dense layer has no activation),
# hence from_logits=True; labels are passed as integer class ids.
optimizer = keras.optimizers.Adam()
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"])


In [55]:
# One batch size shared by training and evaluation.
BATCH_SIZE = 64

# Train for a single epoch, measure on the held-out test split, then save the
# trained model under "pretrained".
model.fit(x=x_train, y=y_train, batch_size=BATCH_SIZE, epochs=1, verbose=2)
model.evaluate(x=x_test, y=y_test, batch_size=BATCH_SIZE, verbose=2)
model.save("pretrained")

938/938 - 590s - loss: 0.1025 - accuracy: 0.9680 - 590s/epoch - 629ms/step
157/157 - 19s - loss: 0.0308 - accuracy: 0.9905 - 19s/epoch - 122ms/step
INFO:tensorflow:Assets written to: pretrained\assets


INFO:tensorflow:Assets written to: pretrained\assets
