In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.datasets import mnist

In [None]:
# Small convnet for MNIST-sized inputs.
# Each input is (height, width, number of channels) = (28, 28, 1).
inputs = keras.Input(shape=(28, 28, 1))

# Two conv + max-pool stages. Every 3x3 convolution trims 2 from the height
# and width, and every 2x2 pooling halves them (rounding down):
#   32 filters: (26, 26, 32) -> pooled to (13, 13, 32)
#   64 filters: (11, 11, 64) -> pooled to (5, 5, 64)
x = inputs
for n_filters in (32, 64):
    x = layers.Conv2D(filters=n_filters, kernel_size=3, activation="relu")(x)
    x = layers.MaxPool2D(pool_size=2)(x)

# Last convolution is not followed by pooling: (3, 3, 128).
x = layers.Conv2D(filters=128, kernel_size=3, activation="relu")(x)

# Classifier head: flatten the feature map, then a 10-way softmax.
x = layers.Flatten()(x)
outputs = layers.Dense(10, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()

In [None]:
# Load MNIST and prepare the images for the convnet: add a trailing channel
# axis and rescale the pixel values by 1/255 as float32.
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# -1 lets NumPy infer the number of samples, so the reshape is not tied to the
# 60000/10000 split sizes; it still fails loudly if an image is not 28x28.
train_images = train_images.reshape((-1, 28, 28, 1)).astype("float32") / 255
test_images = test_images.reshape((-1, 28, 28, 1)).astype("float32") / 255

# Labels are passed unmodified, which is what the sparse variant of the
# categorical cross-entropy loss expects.
model.compile(optimizer="rmsprop",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

# Keep the returned History so per-epoch loss/accuracy can be inspected later
# (history.history); assigning it also keeps the bare History repr out of the
# cell output.
history = model.fit(train_images, train_labels, epochs=3, batch_size=64)

Epoch 1/3
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.8826 - loss: 0.3648
Epoch 2/3
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9852 - loss: 0.0474
Epoch 3/3
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9907 - loss: 0.0307


<keras.src.callbacks.history.History at 0x7e11732f4e30>

In [None]:
# Score the trained model on the test split, which was not used for fitting.
# With a single compiled metric, evaluate() yields the loss and the accuracy.
test_metrics = model.evaluate(x=test_images, y=test_labels)
test_loss, test_acc = test_metrics
print(f"Test accuracy: {test_acc:.4f}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9771 - loss: 0.0664
Test accuracy: 0.9804


In [None]:
"""
Dense layers learn global patterns, for example, patterns involving all the pixels of an MNIST image
(i) The convolution operation
Convolution layers learn local patterns: for example, patterns found in small 3*3 windows of the input
Local patterns include edges, textures, and so on

  (a) The patterns they learn are translation invariant, which makes convnets data-efficient when processing images
  e.g., a pattern learned in the lower-right corner can also be recognized in the upper-left corner

  (b) They can learn spatial hierarchies of patterns:
  the first layer learns small local patterns such as edges, the second convolution layer learns larger patterns made of the features of the first layer, and so on
  This lets convnets efficiently learn increasingly complex and abstract visual concepts

"""


In [None]:
"""
 (ii) The maxpooling operation
  - Downsamples feature maps: for example, after the first convolution layer, the max-pooling layer (MaxPool2D) downsamples 26*26 to 13*13
  - Works by extracting windows from the input feature maps and outputting the max value of each channel within each window
  - Max pooling is usually done with 2*2 windows and stride 2
  (a) Without max pooling, the final feature map would have a huge number of coefficients per sample, which leads to intense overfitting
  (b) Induce spatial-filter hierarchies by making successive convolution layers look at increasingly large windows (in terms of the fraction of the original input they cover)
"""