# HW5 Coding Part

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import TensorDataset, DataLoader, random_split
import torchvision.transforms as transforms
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import classification_report

## Problem 0: Pytorch Tutorial (12 points)

### Tensors
Tensors can be created from numpy data or by using pytorch directly.

In [None]:
# Build the same 2x2 matrix twice: directly from a nested list, and via a NumPy array.
x_data = [[1, 2], [3, 4]]
x = torch.tensor(x_data)

np_array = np.array(x_data)
x_np = torch.from_numpy(np_array)

# Draw a random (2, 3) tensor and view it as a NumPy array.
shape = (2, 3)
rand_tensor = torch.rand(shape)
np_rand_array = rand_tensor.numpy()

# Same three reports as before, emitted from one loop.
for caption, value in (
    ("Tensor from np", x_np),
    ("Rand Tensor", rand_tensor),
    ("Rand Numpy Array", np_rand_array),
):
    print(f"{caption}: \n {value} \n")

#### 1) Tensor squeezing, unsqueezing and viewing (3 points)

Tensor squeezing, unsqueezing and viewing are important methods to change the dimension of a Tensor, and the corresponding functions are [torch.squeeze](https://pytorch.org/docs/stable/torch.html#torch.squeeze), [torch.unsqueeze](https://pytorch.org/docs/stable/torch.html#torch.unsqueeze) and [torch.Tensor.view](https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view). Please read the documents of the functions, and finish the following practice.

In [None]:
# x is a tensor of size (3, 2)
x = torch.Tensor([[1, 2],[3, 4],[5, 6]])
print("Original shape:", x.shape)  # Should be torch.Size([3, 2])

############## YOUR CODE HERE ##############
# TODO: Add two new dimensions to x by using the function torch.unsqueeze(input, dim) -> Tensor, so that the size of x becomes (3, 1, 2, 1).
# Hint: Add a dimension at position 1 (second dimension)
# Hint: Add a dimension at position 3 (fourth dimension)

############## END YOUR CODE ##############
print(x.shape) # Should be torch.Size([3, 1, 2, 1])

############## YOUR CODE HERE ##############
# TODO: Remove the two dimensions just added by using the function torch.squeeze(input, dim) -> Tensor, and change the size of x back to (3, 2).
# Hint: Remove the dimension at position 3
# Hint: Remove the dimension at position 1

############## END YOUR CODE ##############
print(x.shape) # Should be torch.Size([3, 2])

############## YOUR CODE HERE ##############
# TODO: x is now a two-dimensional tensor, i.e. a matrix. Use torch.Tensor.view(*shape) to turn x into a one-dimensional vector of size (6,).

############## END YOUR CODE ##############
print("After view:", x.shape)  # Should be torch.Size([6])

#### 2) Tensor concatenation and stack (2 points)

Tensor concatenation and stack are operations to combine small tensors into big tensors. The corresponding functions are [torch.cat](https://pytorch.org/docs/stable/torch.html#torch.cat) and [torch.stack](https://pytorch.org/docs/stable/torch.html#torch.stack). Please read the documents of the functions, and finish the following practice.

**Hints:**<br>
1. `torch.stack((obj1, obj2))`: A new dimension is automatically added (dim=0 by default) and the input tensors are then stacked along that dimension.
2. `torch.cat((obj1, obj2))`: Dimensions are not added, but are spliced directly over existing dimensions, you should consider this case.

In [None]:
# x is a tensor of size (3, 2)
x = torch.Tensor([[1, 2], [3, 4], [5, 6]])

# y is a tensor of size (3, 2)
y = torch.Tensor([[-1, -2], [-3, -4], [-5, -6]])

############## YOUR CODE HERE ##############
# TODO: Our goal is to generate a tensor z of size (2, 3, 2) with z[0,:,:] = x and z[1,:,:] = y. Use torch.stack to generate such a z.
z = ... # Fill in this
############## END YOUR CODE ##############
print(z)

############## YOUR CODE HERE ##############
# TODO: Use torch.cat and torch.unsqueeze to generate the same z
z = ... # Fill in this
############## END YOUR CODE ##############
print(z)

# The two outputs are both expected to be tensor([[[ 1.,  2.], [ 3.,  4.], [ 5.,  6.]], [[-1., -2.], [-3., -4.], [-5., -6.]]])

#### 3) Tensor expansion (2 points)

Tensor expansion is to expand a tensor into a larger tensor along singleton dimensions. The corresponding functions are [torch.Tensor.expand](https://pytorch.org/docs/stable/tensors.html#torch.Tensor.expand) and [torch.Tensor.expand_as](https://pytorch.org/docs/stable/tensors.html#torch.Tensor.expand_as). Please read the documents of the functions, and finish the following practice.

In [None]:
# x is a tensor of size (3,)
x = torch.Tensor([1, 2, 3])

# Our goal is to generate a tensor z of size (2, 3), so that z[0,:] = x and z[1,:] = x.

############## YOUR CODE HERE ##############
# TODO: Change the size of x into (1, 3) by using torch.unsqueeze.
x = ...  # Fill in this
############## END YOUR CODE ##############
print(x) # Output is expected to be tensor([[1., 2., 3.]])

############## YOUR CODE HERE ##############
# TODO: Then expand the new tensor to the target tensor by using torch.Tensor.expand.
z = ...  # Fill in this
############## END YOUR CODE ##############
print(z) # Output is expected to be tensor([[1., 2., 3.], [1., 2., 3.]])

#### 4) Tensor reduction in a given dimension (5 points)

In deep learning, we often need to compute the mean/sum/max/min value in a given dimension of a tensor. Please read the document of [torch.mean](https://pytorch.org/docs/stable/torch.html#torch.mean), [torch.sum](https://pytorch.org/docs/stable/torch.html#torch.sum), [torch.max](https://pytorch.org/docs/stable/torch.html#torch.max), [torch.min](https://pytorch.org/docs/stable/torch.html#torch.min), [torch.topk](https://pytorch.org/docs/stable/torch.html#torch.topk), and finish the following practice.

In [None]:
# x is a random tensor of size (10, 50)
x = torch.randn(10, 50)

############## YOUR CODE HERE ##############
# TODO: Compute the mean value for each row of x.
# You need to generate a tensor x_mean of size (10), where x_mean[k] is the mean value of the k-th row of x.
# Hint: dim = 1 reduces over (eliminates) the second dimension
x_mean = ...  # Fill in this
############## END YOUR CODE ##############
print(x_mean.shape)

############## YOUR CODE HERE ##############
# TODO: Compute the sum value for each row of x.
# You need to generate a tensor x_sum of size (10).
x_sum = ...  # Fill in this
############## END YOUR CODE ##############
print(x_sum.shape)

############## YOUR CODE HERE ##############
# TODO: Compute the max value for each row of x.
# You need to generate a tensor x_max of size (10).
# Hint: torch.max() with a dim argument -> (max_val, indices)
(x_max, indices) = ...  # Fill in this
############## END YOUR CODE ##############
print(x_max.shape)

############## YOUR CODE HERE ##############
# TODO: Compute the min value for each row of x.
# You need to generate a tensor x_min of size (10).
# Hint: torch.min() with a dim argument -> (min_val, indices)
(x_min, indices) = ...  # Fill in this
############## END YOUR CODE ##############
print(x_min.shape)

############## YOUR CODE HERE ##############
# TODO: Compute the top-5 values for each row of x.
# You need to generate a tensor x_xtop of size (10, 5).
# Hint: torch.topk() -> (values, indices)
(x_xtop, indices) = ...  # Fill in this
############## END YOUR CODE ##############
print((x_xtop.shape))

### Autograd (0 point) (Highly recommend checking it out)
This small section shows you how pytorch computes gradients. When we create tensors, we can set `requires_grad` to be true to indicate that we are using gradients. For most of the work that you actually do, you will use the `nn` package, which automatically sets all parameter tensors to have `requires_grad=True`.

In [None]:
# Worked example: gradient of the logistic-regression loss for a single data point, via autograd.

x = torch.ones(5)   # input features
y = torch.zeros(1)  # target label
# requires_grad=True asks autograd to track these tensors so their gradients can be computed.
# (Layers from the nn package create their parameters with requires_grad=True for you.)
w = torch.randn(5, 1, requires_grad=True)
b = torch.randn(1, requires_grad=True)

logit = x @ w + b
pred = torch.sigmoid(logit)
loss = F.binary_cross_entropy(pred, y)
loss.backward()  # Computes the gradients and stores them in w.grad / b.grad
print("W gradient:", w.grad)
print("b gradient:", b.grad)

# To actually take an update step, hand the parameters to an optimizer
# (for a module this list would be model.parameters()).
optimizer = optim.SGD([w, b], lr=0.1)
print("Weight before", w)
optimizer.step()  # applies the gradients computed above
print("Updated weight", w)

# Tracking gradients costs time and memory.
# Wrap code that does not need them in a no_grad block:
with torch.no_grad():
    # Perform operations without gradients
    pass

### Devices (0 point) (Highly recommend checking it out)

Pytorch supports accelerating computation using GPUs which are available on google colab. To use a GPU on google colab, go to runtime -> change runtime type -> select GPU.

Note that there is some level of strategy for knowing when to use which runtime type. Colab will kick users off of GPU for a certain period of time if you use it too much. Thus, it's best to run simple models and prototype to get everything working on CPU, then switch the instance type over to GPU for training runs and parameter tuning.

It's best practice to make sure your code works on any device (GPU or CPU) for pytorch, but note that numpy operations can only run on the CPU. Here is a standard flow for using GPU acceleration:

In [None]:
# Use the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device", device)

# Create the tensor, then move it to the chosen device.
tensor = torch.zeros(4, 4, requires_grad=True)
tensor = tensor.to(device)

# Any computation from here on runs on `device` (and is tracked by autograd).
tensor = (tensor + 1) * 10

# To leave PyTorch: cut the autograd history first, then copy back to host memory.
tensor = tensor.detach()
tensor = tensor.cpu()

# Only now can the values be handed to NumPy.
tensor_np = tensor.numpy()

### Build an NN (0 point) (Highly recommend checking it out)
Pytorch implements composable blocks in `Module` classes. All layers and modules in pytorch inherit from `nn.Module`. When you make a module you need to implement two functions: `__init__(self, *args, **kwargs)` and `forward(self, *args, **kwargs)`. Modules also have some nice helper functions, namely `parameters` which will recursively return all of the parameters. Here is an example of a logistic regression model:

In [None]:
class Perceptron(nn.Module):
    """Logistic-regression model: a single linear layer followed by a sigmoid."""

    def __init__(self, in_dim):
        """Create the model for inputs with `in_dim` features."""
        super().__init__()
        # Linear layer computing Xw + b with one output unit.
        self.layer = nn.Linear(in_dim, 1)

    def forward(self, x):
        """Return sigmoid(layer(x)) with the trailing size-1 dimension removed."""
        logits = self.layer(x)
        probabilities = torch.sigmoid(logits)
        return probabilities.squeeze(-1)


# Build a 10-feature perceptron and move all of its tensors to the device in one step
# (Module.to returns the module itself, so the result can be bound directly).
perceptron = Perceptron(10).to(device)
# parameters() yields every parameter of the module; here that is the linear layer's W and b.
print("Parameters", [param for param in perceptron.parameters()])

### Datasets (0 point) (Highly recommend checking it out)
Pytorch has nice interfaces for using datasets. Suppose we create a logistic regression dataset as follows:

In [None]:
# Sample 500 two-dimensional points per class. The sampler already returns one row per point,
# so the (500, 2) arrays are kept as-is and the per-coordinate views are taken from them.
c1_X = np.random.multivariate_normal([-2.5, 3], [[1, 0.3], [0.3, 1]], 500)
c2_X = np.random.multivariate_normal([1, 1], [[2, 1], [1, 2]], 500)
c1_x1, c1_x2 = c1_X.T
c2_x1, c2_x2 = c2_X.T

# Class 1 is labelled 0 and class 2 is labelled 1.
X = np.concatenate((c1_X, c2_X))
y = np.concatenate((np.zeros(500), np.ones(500)))

# Shuffle features and labels with one shared permutation so the pairs stay aligned.
permutation = np.random.permutation(X.shape[0])
X, y = X[permutation, :], y[permutation]

# Scatter both classes on equal axes.
for xs, ys, marker in ((c1_x1, c1_x2, 'x'), (c2_x1, c2_x2, 'o')):
    plt.plot(xs, ys, marker)
plt.axis('equal')
plt.show()

We can then create a pytorch dataset object as follows. Often times, the default pytorch datasets will create these objects for you. Then, we can apply dataloaders to iterate over the dataset in batches.

In [None]:
# Wrap the NumPy features/labels in a TensorDataset (from_numpy keeps the NumPy dtype).
dataset = torch.utils.data.TensorDataset(
    torch.from_numpy(X), torch.from_numpy(y))
print(dataset)
# We can create a dataloader that iterates over the dataset in batches.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=10, shuffle=True)
# Peek at the first batch only. Dedicated names are used on purpose: binding the batch to
# `x`/`y` would overwrite the NumPy label array `y` this cell reads above, so re-running the
# cell would fail.
batch_x, batch_y = next(iter(dataloader))
print("Batch x:", batch_x)
print("Batch y:", batch_y)

# Clean up the dataloader as we make a new one later, you can ignore it here
del dataloader

Splitting Train, Validation and Test sets randomly.

In [None]:
# Training: 70%, Validation: 15%, Testing: 15%
train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
# The test split takes whatever remains so the three sizes always sum to len(dataset).
test_size = len(dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size]
)

# Create the data loaders.
# batch_size is passed to every loader: DataLoader defaults to batch_size=1, which would make
# validation/testing process one sample per forward pass.
batch_size = 10
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"Training size: {len(train_dataset)}")
print(f"Validation size: {len(val_dataset)}")
print(f"Testing size: {len(test_dataset)}")

### <font color='red'>Training Loop and Progress Bar (0 point) (Very important! Highly recommend checking it out)</font>

Here is an example of training a full Logistic Regression model in pytorch. Note the extensive use of modules -- modules can be used for storing networks, computation steps etc.

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device", device)

epochs = 10
batch_size = 10
learning_rate = 0.01

# Number of input features, read from the first sample of the dataset.
num_features = dataset[0][0].shape[0]
model = Perceptron(num_features).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
criterion = torch.nn.BCELoss()

# NOTE: the batch variables are named x_batch / y_batch (not x / y) on purpose. Rebinding the
# notebook-level `y` would overwrite the NumPy label array that the dataset cell reads, so
# that cell could no longer be re-run after training.
model.train()  # Put model in training mode
for epoch in range(epochs):
    training_losses = []
    train_correct = 0
    ProgressBar = tqdm(train_loader)
    for x_batch, y_batch in ProgressBar:
        x_batch, y_batch = x_batch.float().to(device), y_batch.float().to(device)
        # Gradients accumulate across backward() calls, so clear the ones left over from the
        # previous mini-batch before backpropagating again.
        optimizer.zero_grad()
        pred = model(x_batch)   # The value predicted using our model
        # compute loss
        loss = criterion(pred, y_batch)
        # Compute gradients.
        loss.backward()
        # update the parameters using the gradients just computed
        optimizer.step()
        training_losses.append(loss.item())
        # In a classification task, the output of a neural network model is typically the scores or probabilities for each class.
        # For example, in a 10-class classification task, the output of the last layer of the model is a tensor with the shape of (batch_size, 10). For each sample (i.e., each row in pred), we can obtain the predicted class index of each sample by using torch.argmax(pred, dim=1).
        # So for multi-class classification tasks, try to use: train_correct += torch.sum(torch.argmax(pred, dim=1) == y_batch).item()
        # Here the model outputs one probability per sample, so rounding it gives the predicted 0/1 label.
        train_correct += torch.sum(torch.round(pred) == y_batch).item()
    train_accuracy = train_correct / len(train_dataset)
    val_losses = []
    num_correct = 0
    with torch.no_grad():
        model.eval()  # Put model in eval mode
        for x_val, y_val in val_loader:
            x_val, y_val = x_val.float().to(device), y_val.float().to(device)
            pred_val = model(x_val)
            loss_val = criterion(pred_val, y_val)
            val_losses.append(loss_val.item())
            num_correct += torch.sum(torch.round(pred_val) == y_val).item()
        model.train()  # Put model back in train mode
    print(f"Epoch {epoch+1}/{epochs} | "
          f"Train Loss: {np.mean(training_losses):.4f} | "
          f"Train Acc: {train_accuracy:.4f} | "
          f"Val Loss: {np.mean(val_losses):.4f} | "
          f"Val Acc: {num_correct / len(val_dataset):.4f}")

# We can run predictions on the data to determine the Testing accuracy.
model.eval()
test_correct = 0
with torch.no_grad():
    for x_test, y_test in test_loader:
        x_test, y_test = x_test.float().to(device), y_test.float().to(device)
        pred_test = model(x_test)
        test_correct += torch.sum(torch.round(pred_test) == y_test).item()

print(f"\nTest Accuracy: {test_correct / len(test_dataset):.2%}")

## Problem 1: MLP for FashionMNIST (18 points)

Now you will train a multi-layer perceptron model on the FashionMNIST dataset. Your deliverables are as follows:

1. Code for training an MLP on FashionMNIST.
2. A plot of the training and validation loss for at least 8 epochs.
3. A plot of the training and validation accuracy for each epoch, achieving a final validation accuracy of at least 82% by the end of the training.

Below we will create the training and validation datasets for you. It is on you to implement an MLP / Feed Forward neural network yourself. Please leverage the example training loop from above.

Here are some pytorch components that you should definitely use:
1. `nn.Linear`
2. Some activation: `nn.ReLU`, `nn.Tanh`, `nn.Sigmoid`, etc.
3. `nn.CrossEntropyLoss`

Here are some challenges that you will need to overcome:
1. The data is, by default, configured in image form, i.e. a (28 x 28) tensor per sample, instead of single feature vector. You will need to **reshape** it somewhere to feed it in as vector to the MLP. There are many ways of doing this according to **Problem 0**.
2. You need to write code for plotting.
3. You need to find the appropriate hyper-parameters to achieve good accuracy.

Your underlying model must be fully connected or "dense", and may not use any convolutions etc., but you can use anything in `torch.optim` or any layers in `torch.nn` besides `nn.Linear` that do not have weights.

In [None]:
# Creating the datasets
# feel free to modify this as you see fit.
transform = torchvision.transforms.ToTensor()

# download=True fetches the files into ./data only when they are not already there, so the
# cell also works on a fresh environment (download=False raises if the data is missing).
training_data = torchvision.datasets.FashionMNIST(
    root="./data",
    train=True,
    download=True,
    transform=transform,
)

# The official FashionMNIST test split (train=False) serves as the validation set here.
validation_data = torchvision.datasets.FashionMNIST(
    root="./data",
    train=False,
    download=True,
    transform=transform,
)

In [None]:
# Show the first nine training images as a 3x3 grid.
images = [training_data[idx][0] for idx in range(9)]
grid = torchvision.utils.make_grid(torch.stack(images), nrow=3, padding=5)
# make_grid returns channels-first data; imshow wants channels last.
plt.imshow(grid.numpy().transpose((1, 2, 0)))

In [None]:
# Basic facts about the training and validation sets
print("number of training samples: " + str(len(training_data)) + "\n" +
      "number of validation samples: " + str(len(validation_data)))
print("datatype of the 1st training sample: ", training_data[0][0].type())
print("size of the 1st training sample: ", training_data[0][0].size())

# Find out how many categories there are in the training set.
# The labels are read from the dataset's `targets` attribute instead of indexing every
# sample: training_data[i] also loads and transforms the image, which is wasted work when
# only the label is needed.
labels = torch.as_tensor(training_data.targets)
max_label = int(labels.max())
min_label = int(labels.min())
print("max_label = "+str(max_label))
print("min_label = "+str(min_label))

In the cell below, you'll implement an MLP. Please follow the guidance in the comments. <br>
**Scoring criteria:**<br>
**1. Print out the train loss, train accuracy, validation loss, and validation accuracy for each epoch. (8 points)**<br>
**2. Grading criteria: <br>
At the last epoch, if the validation accuracy (validation_acc) of your model is greater than or equal to 0.86, you will get `10 points`. <br>
If 0.83 <= validation_acc < 0.86, you will get `6 points`. <br>
If 0.80 <= validation_acc < 0.83, you will get `4 points`. <br>
If validation_acc < 0.80, you will not get any score. <br>
You are free to use the *early stopping* strategy that can prevent overfitting.**

In [None]:
############## YOUR CODE HERE ##############
# In this part, please follow our instructions step by step to get familiar with Pytorch, which will reduce your workload.

# 1. Select the device: CUDA when it is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device", device)

# 2. TODO: Encapsulate your Multilayer Perceptron (MLP) model within a class.
# This class should include a constructor function __init__(...) and a forward(...) function that is used to carry out forward propagation.
# You can modify these two functions' input parameters.
class MLP(nn.Module):
    """Skeleton of the multilayer perceptron to be completed for Problem 1.

    Args:
        n_inputs: length of the flattened input vector fed to the first layer.
        n_outputs: number of output units (one per class).

    The signature matches the `MLP(n_inputs, n_outputs)` call made by the scaffold below.
    """

    def __init__(self, n_inputs, n_outputs):
        # nn.Module must be initialised before any layer is assigned and before .to(device).
        super().__init__()
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
    ######## YOUR CODE HERE ########
        # Define your layers here (e.g. nn.Linear and an activation).
    ######## END YOUR CODE ########

    def forward(self, x):
        """Forward propagation; to be implemented by the student."""
    ######## YOUR CODE HERE ########
        # Fail loudly until implemented; a bare `pass` would return None and only break
        # later inside the loss computation.
        raise NotImplementedError("MLP.forward has not been implemented yet")
    ######## END YOUR CODE ########


# 3. TODO: Initialize your model with proper input size and output size
######## YOUR CODE HERE ########
n_inputs = ...
n_outputs = ...
model = MLP(n_inputs, n_outputs).to(device)
######## END YOUR CODE ########

# 4. TODO: Define the Training Parameters like epochs, batch_size, learning_rate, optimizer, criterion_loss, etc.
######## YOUR CODE HERE ########
epochs = ...
batch_size = ...
learning_rate = ...
optimizer = ...
criterion = ...
######## END YOUR CODE ########

# 5. TODO: Put the training and validation data into DataLoaders
# you can use torch.utils.data.DataLoader() to complete this step.
######## YOUR CODE HERE ########
train_loader = ...
validation_loader = ...
######## END YOUR CODE ########

# 6. Training
# During the training process, append the per-epoch training accuracy, validation accuracy, training loss, and validation loss to the lists below.
# The plotting cell that follows reads these four lists, so they let you visualize the metrics and assess whether the model is overfitting.
train_acc = []
valid_acc = []
train_loss = []
valid_loss = []

model.train()  # Put model in training mode
for epoch in range(epochs):
    # TODO: Please mimic the training example of logistic regression above to write the training code here.
    ######## YOUR CODE HERE ########
    pass
    ######## END YOUR CODE ########

In [None]:
# Plot one figure per metric, each overlaying the training and validation curves.
for metric_name, curves in (
    ('Loss', ((train_loss, 'Training_Loss'), (valid_loss, 'Validation_Loss'))),
    ('Accuracy', ((train_acc, 'Training_Acc'), (valid_acc, 'Validation_Acc'))),
):
    plt.figure(figsize=(5, 5))
    for history, legend_name in curves:
        plt.plot(history, label=legend_name)
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.title(f"{metric_name} with epoches")
    plt.legend()
    plt.show()

## Problem 2: CNN for CIFAR-10 (20 points)

In this section, you will construct a Convolutional Neural Network (CNN) for the CIFAR - 10 dataset. You have already utilized this dataset in the coding part of Homework 2. However, in this particular part, there is no need to download the dataset separately. It is advisable to employ GPU acceleration for this section to enhance the operational efficiency. Nevertheless, this is not a mandatory requirement.

Here are some of the components you should consider using:
1. `nn.Conv2d`
2. `nn.ReLU`
3. `nn.Linear`
4. `nn.CrossEntropyLoss`
5. `nn.MaxPool2d` (Optional, many implementations without it exist)

We encourage you to explore different ways of improving your model to obtain higher accuracies. Here are some suggestions for things to look into:
1. Popular CNN architectures like ResNets, etc.
2. Different optimizers and their parameters (see `torch.optim`)
3. Image preprocessing / data augmentation (see `torchvision.transforms`)
4. Regularization or dropout (see `torch.optim` and `torch.nn` respectively)
5. Learning rate scheduling: https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate

Though we encourage you to explore, there are some rules:
1. You are not allowed to use any pre-defined architectures or feature extractors in your network.
2. You are not allowed to use **any** pretrained weights, i.e. no transfer learning.
3. You cannot train on the test data (that would pretty much defeat the whole point of machine learning).

<font color='red'>Scoring Criteria:</font>
1. The final test accuracy of your model should be $\geq 0.80$ to obtain **10 points**.<br>
If $0.7\leq$ test_acc $< 0.8$, you will get **5 points**. <br>
 If test_acc $< 0.7$, no score will be awarded.
2. Print out the train loss, train accuracy, validation loss, and validation accuracy for each epoch.**(5 points)**
3. Provide at least one training curve for your model. This curve should depict the training loss and validation loss per epoch or step after training for at least 10 epochs. **(5 points)**

In [None]:
# Creating the datasets, feel free to change this as long as you do the same to the test data.
# You can also modify this to split the data into training and validation.
# See https://pytorch.org/docs/stable/data.html#torch.utils.data.random_split

# Training transform: random augmentation followed by normalization.
transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Evaluation transform (validation and testing): normalization only, no augmentation.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Official training split, loaded with the augmenting transform.
# download=True fetches the files into ./data only if they are not already present.
train_val_data = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform_train,
)

# Second view of the same images with the evaluation transform. The validation subset is
# taken from this view so that validation metrics are not perturbed by random flips/crops.
train_val_data_eval = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=False,
    transform=transform_test,
)

train_size = int(0.9*len(train_val_data))
val_size = len(train_val_data) - train_size

# 9:1 random split of the original training set into training and validation indices
# (seeded so the split is the same on every run).
generator = torch.Generator().manual_seed(42)

train_data, valid_split = torch.utils.data.random_split(
    train_val_data,
    [train_size, val_size],
    generator=generator
)
# Same validation indices, but read through the non-augmented view.
valid_data = torch.utils.data.Subset(train_val_data_eval, valid_split.indices)

# Official testing set.
test_data = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform_test,
)

print(f"Training size: {len(train_data)}")
print(f"Validation size: {len(valid_data)}")
print(f"Testing size: {len(test_data)}")

Again, let's first visualize our data.

In [None]:
images = [train_data[i][0] for i in range(9)]
print(images[0].size())

# The samples are normalized, so many values fall outside [0, 1] and imshow would clip them,
# distorting the colors. Undo the normalization for display only.
# NOTE: these constants mirror the Normalize(...) call in the dataset cell; keep them in sync
# if you change that transform.
cifar_mean = torch.tensor((0.4914, 0.4822, 0.4465)).view(3, 1, 1)
cifar_std = torch.tensor((0.2023, 0.1994, 0.2010)).view(3, 1, 1)
display_images = [(img * cifar_std + cifar_mean).clamp(0, 1) for img in images]

plt.imshow(torchvision.utils.make_grid(torch.stack(display_images),
           nrow=3, padding=5).numpy().transpose((1, 2, 0)))

In [None]:
# Summarize the splits. valid_data is the validation split, so it is labelled as such
# (the message previously called it "testing samples").
print("number of training samples: " + str(len(train_data)) + "\n" +
      "number of validation samples: " + str(len(valid_data)))
print("datatype of the 1st training sample: ", train_data[0][0].type())
print("size of the 1st training sample: ", train_data[0][0].size())

In [None]:
# Label range of the training split.
# train_data is a Subset, so its labels are looked up in the underlying dataset's `targets`
# through the subset indices. Indexing train_data[i] instead would decode and augment every
# image several times just to read its label.
# assumes train_data is still the random_split Subset of the CIFAR10 dataset created above
subset_labels = [train_data.dataset.targets[i] for i in train_data.indices]
max_label = max(subset_labels)
min_label = min(subset_labels)
print("max_label = "+str(max_label))
print("min_label = "+str(min_label))

### CNN Construction and Training

Here are some training tips for CNN training.
1. **Optimization for Gradient Descent.**
   1. **Stochastic gradient descent (SGD)** <br>
    `torch.optim.SGD` <br>
    SGD updates the model parameters using only one training sample (or, in practice, one small mini-batch) at each update step rather than the entire dataset. Compared to traditional (full-batch) gradient descent, the memory and computational requirements per update are lower, which can accelerate convergence. Additionally, the randomness in the optimization process helps escape local minima. However, the noise in the updates may lead to a more erratic optimization path and unstable convergence speed, and the choice of learning rate becomes particularly crucial. <br>
    Moreover, you can use SGD with momentum. The momentum parameter (typically between 0.9 and 0.99) controls the influence of past gradients on the current update, enabling the algorithm to persistently move in promising directions.
    2. **RMSprop** <br>
        `torch.optim.RMSprop`<br>
        This method adaptively adjusts the learning rate based on the recent gradient history of parameters: it reduces the learning rate for parameters with consistently large gradients while increasing the update magnitude for those with smaller gradients, thereby balancing the step sizes of different parameter updates. The `alpha` parameter (decay rate, typically set to 0.9) in RMSprop controls how quickly old squared gradients are forgotten.
    3. **Adam** <br>
        `torch.optim.Adam`<br>
        Adam optimizer efficiently updates parameters by combining momentum (1st-order moment) and RMSprop (2nd-order moment) with adaptive learning rates and bias correction.
2. **Balancing hyperparameters (epochs, batch size, learning rate) is crucial during training.**
3. **Dealing with Overfitting**: <br>
    1. **Batch Normalization**: <br>
        `torch.nn.BatchNorm2d`<br>
        Batch normalization mitigates internal covariate shift by normalizing and rescaling layer inputs within each mini-batch.
    2. **MaxPooling Layers**:  <br>
        `nn.MaxPool2d`<br>
        spatial downsampling
    3. **Dropout**: <br>
        `torch.nn.Dropout`<br>
        Dropout randomly deactivates neurons during training to prevent overfitting and improve generalization.
    4. **L1/L2 Regularization**:<br>
        For L2 regularization, configure the `weight_decay` parameter in the `optimizer`; an L1 penalty has to be added to the loss manually.
    5. **Early Stopping**<br>
        Implement early stopping by continuously monitoring validation performance and halting training when no improvement is observed for a predefined patience period.
4. **Dealing with Underfitting**: <br>
   1. **Deeper networks**
   2. **More training epochs**
   3. **Choose suitable activation functions**
   4. **Use a classic network architecture, e.g. ResNet.**
5. You are encouraged to add more optimization in order to reach your goal.

In [None]:
# 1. Select the device: CUDA when it is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device", device)

# 2. TODO: Encapsulate your Convolutional Neural Network (CNN) model within a class.
# This class should include a constructor function __init__(...) and a forward(...) function that is used to carry out forward propagation.
class CNN(nn.Module):
    """Skeleton of the convolutional network to be completed for Problem 2."""

    def __init__(self):
        # nn.Module must be initialised before any layer is assigned; without this call
        # even `CNN().to(device)` fails.
        super().__init__()
        ######## YOUR CODE HERE ########
        # Define your layers here (e.g. nn.Conv2d, nn.ReLU, nn.Linear).
        ######## END YOUR CODE ########

    def forward(self, x):
        """Forward propagation; to be implemented by the student."""
        ######## YOUR CODE HERE ########
        # Fail loudly until implemented; a bare `pass` would return None and only break
        # later inside the loss computation.
        raise NotImplementedError("CNN.forward has not been implemented yet")
        ######## END YOUR CODE ########


# 3. Initialize your model and move it to the selected device
model = CNN().to(device)

# 4. TODO: Define the Training Parameters like epochs, batch_size, learning_rate, optimizer, criterion_loss, etc.
######## YOUR CODE HERE ########
epochs = ...
batch_size = ...
learning_rate = ...
optimizer = ...
criterion = ...
######## END YOUR CODE ########

# 5. TODO: Put the training and validation data into DataLoaders
# you can use torch.utils.data.DataLoader() to complete this step.
######## YOUR CODE HERE ########
train_loader = ... 
validation_loader = ...
######## END YOUR CODE ########

# 6. Training
# During the training process, append the per-epoch training accuracy, validation accuracy, training loss, and validation loss to the lists below.
# The plotting cells that follow read these four lists, so they let you visualize the metrics and assess whether the model is overfitting.
train_acc = []
valid_acc = []
train_loss = []
valid_loss = []

model.train()  # Put model in training mode
for epoch in range(epochs):
    # TODO: Please mimic the training example of logistic regression above to write the training code here.
    ######## YOUR CODE HERE ########
    pass
    ######## END YOUR CODE ########

In [None]:
# Training vs. validation loss per epoch, drawn on one set of axes.
loss_fig, loss_ax = plt.subplots(figsize=(5, 5))
for history, legend_name in ((train_loss, 'Training_Loss'), (valid_loss, 'Validation_Loss')):
    loss_ax.plot(history, label=legend_name)
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title("Loss with epoches")
loss_ax.legend()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch, drawn on one set of axes.
acc_fig, acc_ax = plt.subplots(figsize=(5, 5))
for history, legend_name in ((train_acc, 'Training_Acc'), (valid_acc, 'Validation_Acc')):
    acc_ax.plot(history, label=legend_name)
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title("Accuracy with epoches")
acc_ax.legend()
plt.show()

## Testing

In [None]:
def evaluate_model(model, test_data, device='cuda', batch_size=128):
    """Evaluate a classifier on `test_data` and print accuracy plus a per-class report.

    Args:
        model: module mapping a batch of images to per-class scores of shape (batch, classes).
        test_data: dataset yielding (image, label) pairs; its `classes` attribute supplies
            the class names shown in the report.
        device: device to evaluate on. If a CUDA device is requested but CUDA is not
            available, evaluation falls back to the CPU with a warning instead of crashing.
        batch_size: number of samples per evaluation batch.

    Side effects:
        Puts `model` in eval mode, moves it to the evaluation device, and prints the
        results. Nothing is returned.
    """
    import warnings  # local import: only needed for the fallback notice

    device = torch.device(device)
    if device.type == 'cuda' and not torch.cuda.is_available():
        warnings.warn("CUDA is not available; evaluating on the CPU instead.")
        device = torch.device('cpu')

    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)
    model.eval()
    model.to(device)
    correct = 0
    total = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            # Predicted class = index of the largest score in each row.
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    accuracy = 100 * correct / total

    print(f'Test Accuracy: {accuracy:.2f}%')
    print(f'Correct/Total: {correct}/{total}')
    print('\nClassification Report:')
    print(classification_report(all_labels, all_preds, target_names=test_data.classes))

In [None]:
# Evaluate on the device selected earlier in the notebook, so the cell also runs on
# CPU-only machines (the function's default device is 'cuda').
evaluate_model(model, test_data, device=device)