In [None]:
# 1. Deep Learning.
# a. Build a DNN with five hidden layers of 100 neurons each, He initialization, and the ELU activation function.
# b. Using Adam optimization and early stopping, try training it on MNIST but only on
# digits 0 to 4, as we will use transfer learning for digits 5 to 9 in the next exercise. You
# will need a softmax output layer with five neurons, and as always make sure to save
#  checkpoints at regular intervals and save the final model so you can reuse it later.
# c. Tune the hyperparameters using cross-validation and see what precision you can achieve.
# d. Now try adding Batch Normalization and compare the learning curves: is it converging faster than before? Does it produce a better 
# model?
# e. Is the model overfitting the training set? Try adding dropout to every layer and try again. Does it help?

# Sol:

import numpy as np
import torch
import torch.nn as nn

class DNN(nn.Module):
    """Fully connected classifier with five ELU hidden layers.

    The hidden layers use He (Kaiming) normal weight initialization with
    zero biases. The output layer keeps PyTorch's default initialization.

    Args:
        input_size: Number of features in each flattened input sample.
        hidden_size: Number of units in each of the five hidden layers.
        output_size: Number of output classes (logits).
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(DNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, hidden_size)
        self.fc5 = nn.Linear(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, output_size)
        self.activation = nn.ELU()
        self._init_hidden_layers()

    def _init_hidden_layers(self):
        """Apply He-normal weights and zero biases to the hidden layers."""
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5):
            # nn.Linear's default init is not He initialization, so set it
            # explicitly (fan-in mode, gain sqrt(2)).
            nn.init.kaiming_normal_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return raw class logits for a batch of flattened inputs.

        No softmax is applied here; the output of the last linear layer is
        returned as-is.
        """
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.activation(self.fc4(x))
        x = self.activation(self.fc5(x))
        x = self.output(x)
        return x

# MNIST images are 28x28 single-channel; they are flattened to one vector
# per image before being fed to the first linear layer.
input_size = 28 * 28
hidden_size = 100
output_size = 5  # Number of output classes

model = DNN(input_size, hidden_size, output_size)


import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

# Set up data loaders for digits 0 to 4
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Read the labels straight from the dataset's target tensor: enumerating the
# dataset would decode and transform every image just to look at its label.
indices = torch.nonzero(train_dataset.targets < 5).flatten().tolist()

# Hold out 10% of the 0-4 images as a validation set for early stopping.
shuffled_positions = torch.randperm(len(indices)).tolist()
num_val = len(indices) // 10
val_indices = [indices[i] for i in shuffled_positions[:num_val]]
train_indices = [indices[i] for i in shuffled_positions[num_val:]]

train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)
train_loader = DataLoader(train_dataset, batch_size=64, sampler=train_sampler)
val_loader = DataLoader(train_dataset, batch_size=64, sampler=val_sampler)

# The model only has five output classes, so the test loader must also be
# restricted to digits 0-4 (labels 5-9 are invalid targets for the loss).
test_indices = torch.nonzero(test_dataset.targets < 5).flatten().tolist()
test_loader = DataLoader(
    torch.utils.data.Subset(test_dataset, test_indices),
    batch_size=64,
    shuffle=False,
)

# Define loss function, optimizer, and early stopping
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())
num_epochs = 100  # upper bound; early stopping normally ends training sooner
patience = 5  # epochs without validation improvement before stopping
best_val_loss = float('inf')
epochs_without_improvement = 0

# Training loop
for epoch in range(num_epochs):
    model.train()
    for images, labels in train_loader:
        optimizer.zero_grad()
        # Flatten each image to a vector; keep the batch dimension explicit.
        outputs = model(images.view(images.size(0), -1))
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Evaluate on the held-out validation images.
    model.eval()
    val_loss_sum = 0.0
    with torch.no_grad():
        for images, labels in val_loader:
            outputs = model(images.view(images.size(0), -1))
            # criterion returns the batch mean; weight by batch size so the
            # final average is per-sample even with a short last batch.
            val_loss_sum += criterion(outputs, labels).item() * labels.size(0)
    val_loss = val_loss_sum / len(val_indices)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_without_improvement = 0
        # Checkpoint the best weights seen so far.
        torch.save(model.state_dict(), 'checkpoint.pth')
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            break

# Restore the best checkpoint before saving the final model
model.load_state_dict(torch.load('checkpoint.pth'))

# Save the model
torch.save(model.state_dict(), 'model.pth')


class BatchNormDNN(nn.Module):
    """Five-hidden-layer ELU network with batch normalization (exercise 1d).

    Same layout as the plain DNN, with a BatchNorm1d layer between every
    hidden linear layer and its activation.

    Args:
        input_size: Number of features in each flattened input sample.
        hidden_size: Number of units in each of the five hidden layers.
        output_size: Number of output classes (logits).
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, hidden_size)
        self.fc5 = nn.Linear(hidden_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.bn2 = nn.BatchNorm1d(hidden_size)
        self.bn3 = nn.BatchNorm1d(hidden_size)
        self.bn4 = nn.BatchNorm1d(hidden_size)
        self.bn5 = nn.BatchNorm1d(hidden_size)
        self.output = nn.Linear(hidden_size, output_size)
        self.activation = nn.ELU()
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5):
            # He-normal weights and zero biases for the hidden layers.
            nn.init.kaiming_normal_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return raw class logits; batch norm is applied before each ELU."""
        x = self.activation(self.bn1(self.fc1(x)))
        x = self.activation(self.bn2(self.fc2(x)))
        x = self.activation(self.bn3(self.fc3(x)))
        x = self.activation(self.bn4(self.fc4(x)))
        x = self.activation(self.bn5(self.fc5(x)))
        return self.output(x)

In [None]:
# 2. Transfer learning.
# a. Create a new DNN that reuses all the pretrained hidden layers of the previous
# model, freezes them, and replaces the softmax output layer with a new one.
# b. Train this new DNN on digits 5 to 9, using only 100 images per digit, and time how
# long it takes. Despite this small number of examples, can you achieve high precision?
# c. Try caching the frozen layers, and train the model again: how much faster is it now?
# d. Try again reusing just four hidden layers instead of five. Can you achieve a higher precision?
# e. Now unfreeze the top two hidden layers and continue training: can you get the model to perform even better?

# Sol:

# Load the pretrained model
pretrained_model = DNN(input_size, hidden_size, output_size)
pretrained_model.load_state_dict(torch.load('model.pth'))

# Freeze the pretrained hidden layers
for param in pretrained_model.parameters():
    param.requires_grad = False

# Replace the softmax output layer; the freshly created layer is trainable
# because new parameters default to requires_grad=True.
new_output_size = 5  # digits 5 to 9
pretrained_model.output = nn.Linear(hidden_size, new_output_size)


# Set up data loaders for digits 5 to 9 with limited images
indices = torch.nonzero(train_dataset.targets >= 5).flatten().tolist()
images_per_digit = 100
limited_indices = []
for digit in range(5, 10):
    # Sample per digit so that every class gets exactly images_per_digit
    # examples (a single random draw of 500 would be unbalanced).
    digit_indices = torch.nonzero(train_dataset.targets == digit).flatten()
    picked = digit_indices[torch.randperm(len(digit_indices))[:images_per_digit]]
    limited_indices.extend(picked.tolist())
train_sampler = SubsetRandomSampler(limited_indices)
train_loader = DataLoader(train_dataset, batch_size=64, sampler=train_sampler)

# Only the new output layer is trainable, so the optimizer is rebuilt over
# its parameters (the earlier optimizer is bound to the original model).
optimizer = optim.Adam(pretrained_model.output.parameters())
num_epochs = 20

# Modify the training loop
for epoch in range(num_epochs):
    pretrained_model.train()
    for images, labels in train_loader:
        optimizer.zero_grad()
        outputs = pretrained_model(images.view(images.size(0), -1))
        # Shift labels 5..9 to 0..4 to match the five output logits.
        loss = criterion(outputs, labels - 5)
        loss.backward()
        optimizer.step()

        
# c. To cache the frozen layers and train the model again, you can make use of the torch.no_grad() context manager to disable gradient 
# computation and cache the outputs of the frozen layers. This avoids unnecessary computations and speeds up the training process.

# d. To reuse only four hidden layers instead of five, you can modify the pretrained model by removing the last hidden layer. However, 
# achieving higher precision solely by reducing the number of hidden layers may not be guaranteed, as the model's performance depends on 
# various factors such as the complexity of the task and the available training data.

# e. To unfreeze the top two hidden layers and continue training, you can selectively enable gradient computation for those layers by 
# setting requires_grad = True. This allows the unfrozen layers to update their weights during training, potentially improving the 
# model's performance.

In [None]:
# 3. Pretraining on an auxiliary task.
# a. In this exercise you will build a DNN that compares two MNIST digit images and
# predicts whether they represent the same digit or not. Then you will reuse the lower
# layers of this network to train an MNIST classifier using very little training data. Start
# by building two DNNs (let’s call them DNN A and B), both similar to the one you built
# earlier but without the output layer: each DNN should have five hidden layers of 100
# neurons each, He initialization, and ELU activation. Next, add one more hidden layer
# with 10 units on top of both DNNs. To do this, you should use
# TensorFlow’s concat() function with axis=1 to concatenate the outputs of both DNNs
# for each instance, then feed the result to the hidden layer. Finally, add an output
# layer with a single neuron using the logistic activation function.
# b. Split the MNIST training set in two sets: split #1 should contain 55,000 images,
# and split #2 should contain 5,000 images. Create a function that generates a
# training batch where each instance is a pair of MNIST images picked from split #1.
# Half of the training instances should be pairs of images that belong to the same
# class, while the other half should be images from different classes. For each pair, the

# training label should be 0 if the images are from the same class, or 1 if they are from different classes.
# c. Train the DNN on this training set. For each image pair, you can simultaneously feed
# the first image to DNN A and the second image to DNN B. The whole network will
# gradually learn to tell whether two images belong to the same class or not.
# d. Now create a new DNN by reusing and freezing the hidden layers of DNN A and
# adding a softmax output layer on top with 10 neurons. Train this network on split #2
# and see if you can achieve high performance despite having only 500 images per class.

# Sol:

# NOTE(review): this cell uses `tf` and `input_size`, neither of which is
# defined inside the cell itself; `tf` is presumably TensorFlow imported
# elsewhere -- confirm before running.

# DNN A: five hidden layers of 100 ELU units with He-normal initialization.
# Only the first layer declares the input shape.
dnn_a = tf.keras.models.Sequential(
    [
        tf.keras.layers.Dense(
            100,
            activation='elu',
            kernel_initializer='he_normal',
            input_shape=(input_size,),
        )
    ]
    + [
        tf.keras.layers.Dense(100, activation='elu', kernel_initializer='he_normal')
        for _ in range(4)
    ]
)

# DNN B: same architecture as DNN A, with its own separate weights.
dnn_b = tf.keras.models.Sequential(
    [
        tf.keras.layers.Dense(
            100,
            activation='elu',
            kernel_initializer='he_normal',
            input_shape=(input_size,),
        )
    ]
    + [
        tf.keras.layers.Dense(100, activation='elu', kernel_initializer='he_normal')
        for _ in range(4)
    ]
)

# Join the two towers feature-wise (axis=1) for each instance.
concat = tf.keras.layers.Concatenate(axis=1)([dnn_a.output, dnn_b.output])

# Extra 10-unit hidden layer on top of the concatenated features.
hidden = tf.keras.layers.Dense(
    10, activation='elu', kernel_initializer='he_normal'
)(concat)

# Single sigmoid unit producing the "different digit" probability.
output = tf.keras.layers.Dense(1, activation='sigmoid')(hidden)

# Two-input model: first image goes to DNN A, second image to DNN B.
model = tf.keras.models.Model(
    inputs=[dnn_a.input, dnn_b.input],
    outputs=output,
)


def generate_training_batch(split1_images, split1_labels, batch_size):
    """Build a balanced batch of image pairs for the same/different task.

    The first ``batch_size // 2`` pairs are drawn from a single class
    (label 0); the remaining pairs are drawn from two distinct classes
    (label 1). The two images of a same-class pair are always distinct
    instances.

    Args:
        split1_images: Indexable collection of images.
        split1_labels: Class label for each image, in the same order.
        batch_size: Total number of pairs to generate.

    Returns:
        A tuple ``(pairs, labels)`` of NumPy arrays. ``pairs[i, 0]`` and
        ``pairs[i, 1]`` are the two images of pair ``i``; ``labels[i]`` is
        0 for a same-class pair and 1 for a different-class pair.

    Raises:
        ValueError: If same-class pairs are requested but no class has at
            least two images, or different-class pairs are requested but
            fewer than two classes are present.
    """
    labels = np.asarray(split1_labels)
    num_same = batch_size // 2
    num_diff = batch_size - num_same

    # Group instance indices by class once per call so that each pair can be
    # sampled directly instead of by rejection.
    indices_by_class = [np.flatnonzero(labels == cls) for cls in np.unique(labels)]
    pairable_classes = [members for members in indices_by_class if len(members) >= 2]

    if num_same > 0 and not pairable_classes:
        raise ValueError("need a class with at least two images to build same-class pairs")
    if num_diff > 0 and len(indices_by_class) < 2:
        raise ValueError("need at least two classes to build different-class pairs")

    batch_images = []
    batch_labels = []

    for _ in range(num_same):
        # Two distinct images from one randomly chosen class, label = 0
        members = pairable_classes[np.random.randint(len(pairable_classes))]
        idx1, idx2 = np.random.choice(members, size=2, replace=False)
        batch_images.append([split1_images[idx1], split1_images[idx2]])
        batch_labels.append(0)

    for _ in range(num_diff):
        # One image from each of two distinct classes, label = 1
        cls1, cls2 = np.random.choice(len(indices_by_class), size=2, replace=False)
        idx1 = np.random.choice(indices_by_class[cls1])
        idx2 = np.random.choice(indices_by_class[cls2])
        batch_images.append([split1_images[idx1], split1_images[idx2]])
        batch_labels.append(1)

    return np.array(batch_images), np.array(batch_labels)


# Training hyperparameters for the pair-comparison network.
# NOTE(review): split1_images / split1_labels are not defined anywhere in
# this file; they are assumed to hold split #1 of the MNIST training set and
# to be created by an earlier cell -- confirm.
num_epochs = 10
batch_size = 64
num_batches = len(split1_images) // batch_size

optimizer = tf.keras.optimizers.Adam()
loss_fn = tf.keras.losses.BinaryCrossentropy()

# The training loop previously appeared twice verbatim (each copy with a
# fresh optimizer); a single pass over num_epochs is kept.
for epoch in range(num_epochs):
    for batch in range(num_batches):
        # Generate a training batch
        batch_images, batch_labels = generate_training_batch(split1_images, split1_labels, batch_size)

        with tf.GradientTape() as tape:
            # Forward pass: the model has two inputs, so the stacked pairs
            # are split into the first image and the second image of each
            # pair, one per input.
            output = model([batch_images[:, 0], batch_images[:, 1]], training=True)
            loss = loss_fn(batch_labels, output)

        # Backward pass
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))