In [None]:
! pip install --quiet "tabulate" "ipython[notebook]>=8.0.0, <8.12.0" "pytorch-lightning>=1.4, <2.0.0" "torchvision" "setuptools==67.4.0" "seaborn" "torch>=1.8.1, <1.14.0" "matplotlib" "torchmetrics>=0.7, <0.12" "lightning>=2.0.0rc0"

### Why CNNs
* ANNs (fully connected networks) are not designed to process data such as images, which carry spatial information: the image has to be flattened first, and flattening leads to a loss of that spatial information
* CNNs are translation-equivariant, meaning that the feature map obtained by first shifting the image and then performing convolution is the same as the one obtained by first performing convolution and then shifting the result. Combined with pooling, this makes them approximately invariant to small spatial shifts of the input

### How they work
* Feature extractor - this component is responsible for performing convolution on images and extracting spatial information(edges, contour, eyes, nose)
    * Convolution Layer
        * Performs the convolution operation on the image to extract spatial information utilizing
            * Kernel - the filter that is convolved with the image. It is randomly initialized at first and later updated through backpropagation so that it extracts the spatial features more accurately. At each position we take the element-wise product of the filter matrix and the image patch under it, then sum the result
            * Stride - Number of pixels by which the kernel moves while performing convolution
            * Padding - Number of pixels by which the image is padded before performing convolution. Since convolution on an image leads to a reduction in the image size, we pad to preserve the image size
            * Kernel size - This is the size of the kernel which is used to perform the convolution on images. A larger kernel covers more of the image at each position, but it has more weights to learn and tends to lose fine details in the image
            * Number of filters - this is the number of convolution filters which are convolved with the image. The feature map produced by each filter is stacked along the channel dimension
        
    * Activation Function
        * introduces non-linearity in the feature maps. This is done to enhance the learning process enabling the nn to learn more complex patterns
 
    * Pooling Layer
        * Pooling is done to shrink the feature maps, which keeps the computation and the number of downstream params from growing due to repeated convolutions, while preserving as much information as possible
        * Achieved by sliding a window (usually a 2×2 window) over the resultant feature map and picking one pooled value from each window
        * Variants include:
            * Max Pooling - we pick up the pixel which has the maximum value in each pooling window. Throws away all the other, less useful info
            * Min Pooling - We pick up the pixel which has the minimum value in each pooling window
            * Average Pooling - We average all the pixels in each pooling window. Retains much info about the less important elements of a block or pool 
        
* Flattening - this component is responsible for converting the multi dimensional feature map to a 1D array which can later be processed by the classification head
* Classification Head - this component is composed of several fully connected layers which perform classification


### How do CNNs Learn
* They preserve the spatial information of the image by extracting feature maps. By performing convolution using multiple filters, the CNN learns a variety of spatial information like edges, shapes, objects, etc., which is then flattened and passed to the classification head to predict the classes, or to the regression head to predict the desired quantity
    * Convolution + ReLU
    * max_pooling
    * fully_connected + ReLU
    * softmax



In [None]:
import torch.nn as nn

class ConvBlock(nn.Module):
    """Convolution -> ReLU -> max-pooling building block.

    Args:
        in_channels: Number of channels in the input feature map.
        out_channels: Number of convolution filters, i.e. output channels.
        kernel_size: Convolution kernel size (int or ``(h, w)`` tuple).
        pool_size: Window size of the max-pooling layer. ``nn.MaxPool2d``
            requires this argument; the default of 2 halves height and width.
    """

    def __init__(self, in_channels, out_channels, kernel_size, pool_size=2):
        super().__init__()
        self.conv_layer = nn.Conv2d(in_channels, out_channels, kernel_size)
        # Built once here instead of on every forward pass; ReLU has no
        # parameters, so the state_dict is unaffected.
        self.activation = nn.ReLU()
        self.pooling_layer = nn.MaxPool2d(pool_size)

    def forward(self, x):
        """Apply convolution, ReLU and max-pooling to ``x`` in that order."""
        x = self.conv_layer(x)
        x = self.activation(x)
        return self.pooling_layer(x)
    
    
class MyCNN(nn.Module):
    """Four-stage convolutional image classifier built from ``ConvBlock`` layers.

    The hard-coded size of the first linear layer (12 * 12 * 512) assumes a
    3-channel 224x224 input, no padding, and each ``ConvBlock`` halving the
    spatial size with 2x2 pooling.

    Args:
        num_classes: Number of output classes, i.e. the width of the final
            linear layer. Defaults to 10.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        # Assuming input shape = (3, 224, 224) and padding = 0, the spatial
        # size after each block is 224 -> 111 -> 54 -> 26 -> 12.
        self.feature_extractor = nn.Sequential(
            ConvBlock(3, 64, (3, 3)),     # RGB image in, 64 feature maps out, 3x3 kernel
            ConvBlock(64, 128, (3, 3)),   # 64 feature maps in, 128 out, 3x3 kernel
            ConvBlock(128, 256, (3, 3)),  # 128 channels in, 256 out, 3x3 kernel
            ConvBlock(256, 512, (3, 3)),  # 256 channels in, 512 out -> (512, 12, 12)
        )
        self.flatten = nn.Flatten()  # 12 * 12 * 512 features per image
        self.classification_head = nn.Sequential(
            nn.Linear(in_features=12 * 12 * 512, out_features=64),
            # A comma (not a period) must follow this layer; with a period the
            # next line is parsed as attribute access on the ReLU instance.
            nn.ReLU(),
            nn.Linear(in_features=64, out_features=32),
            nn.ReLU(),
            nn.Linear(in_features=32, out_features=num_classes),
        )

    def forward(self, x):
        """Return the unnormalized class scores for a batch of images ``x``."""
        x = self.feature_extractor(x)
        x = self.flatten(x)
        x = self.classification_head(x)
        return x
        
        
    


### Loss Functions in CNNs
* Image classification
    * Hinge Loss
    * Cross Entropy Loss
    
* Image regression:
    * MSE
    * MAE
    * Huber Loss
    
### Tricks to speed up training
* Use batch normalization
    * speeds up training
    * allows for a large learning rate
    * makes training much less sensitive to weight initialization
    
* Use Global Average Pooling instead of Flatten to convert output to 1D 
    * captures the information of the feature map in each channel by taking the average
    * Reduces the number of params required to train
    * So from (channels, width, height) the average is taken over each channel, so the resultant shape becomes (channels, 1, 1), which can be reshaped into (channels, 1), and a classification head can be attached afterwards
   

### Let's build a CNN network

In [None]:
import torch
import torchvision

#### Setup and Data Preparation

In [None]:
# --- Training schedule ---
n_epochs = 3          # number of passes over the training set
log_interval = 10     # log/checkpoint every this many training batches

# --- Mini-batch sizes ---
batch_size_train, batch_size_test = 64, 1000

# --- SGD hyperparameters ---
learning_rate, momentum = 0.01, 0.5

# --- Reproducibility ---
random_seed = 1
# cuDNN is switched off, presumably to avoid its nondeterministic kernels.
torch.backends.cudnn.enabled = False
torch.manual_seed(random_seed)

In [None]:
# Both splits use the same preprocessing: convert to a tensor, then normalize
# with mean 0.1307 / std 0.3081 (presumably the MNIST training-set statistics).
# The dataset is downloaded into '/files/' when it is not already there.
train_loader = torch.utils.data.DataLoader(
    torchvision.datasets.MNIST(
        '/files/',
        train=True,
        download=True,
        transform=torchvision.transforms.Compose([
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize((0.1307,), (0.3081,)),
        ]),
    ),
    batch_size=batch_size_train,
    shuffle=True,
)

test_loader = torch.utils.data.DataLoader(
    torchvision.datasets.MNIST(
        '/files/',
        train=False,
        download=True,
        transform=torchvision.transforms.Compose([
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize((0.1307,), (0.3081,)),
        ]),
    ),
    batch_size=batch_size_test,
    shuffle=True,
)

In [None]:
# Pull one batch from the test loader to use as example images later on.
examples = enumerate(test_loader)
batch_idx, (example_data, example_targets) = next(examples)
# Last expression of the cell, so the notebook displays the batch shape
# (expected to be (batch_size_test, 1, 28, 28) for MNIST — TODO confirm).
example_data.shape

In [None]:
import matplotlib.pyplot as plt

# Show the first six example images in a 2x3 grid, each titled with its label.
fig = plt.figure()
for i, (image, label) in enumerate(zip(example_data[:6], example_targets[:6])):
    plt.subplot(2, 3, i + 1)
    plt.tight_layout()
    # Index 0 selects the first channel plane of the image.
    plt.imshow(image[0], cmap='gray', interpolation='none')
    plt.title("Ground Truth: {}".format(label))
    # Hide the axis ticks.
    plt.xticks([])
    plt.yticks([])
fig

#### Building the Network

In [None]:
import torch.nn as nn #torch.nn layers contain trainable params
import torch.nn.functional as F#these ones are purely functional
import torch.optim as optim

#For readability
class MyCNN(nn.Module):
    """Small CNN that maps single-channel 28x28 images to 10 log-probabilities.

    Two conv/ReLU/max-pool stages extract features, which are flattened and
    passed through a two-layer classification head with dropout.
    """

    def __init__(self):
        super().__init__()

        # Shapes for a 1x28x28 input:
        # conv5 -> 10x24x24 -> pool2 -> 10x12x12 -> conv5 -> 20x8x8 -> pool2 -> 20x4x4
        self.feature_extractor = nn.Sequential(
            nn.Conv2d(1, 10, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(10, 20, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )
        self.flatten = nn.Flatten()

        self.classification_head = nn.Sequential(
            nn.Linear(320, 50),  # 320 = 20 channels * 4 * 4
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(50, 10),
        )

    def forward(self, x):
        """Return per-class log-probabilities of shape (batch, 10) for ``x``."""
        # Pass the batch through the feature extractor.
        x = self.feature_extractor(x)
        # Flatten everything except the batch dimension.
        x = self.flatten(x)
        # Forward pass through the classification layers.
        x = self.classification_head(x)
        # Normalize over the class dimension explicitly; calling log_softmax
        # without ``dim`` is deprecated and emits a warning.
        return F.log_softmax(x, dim=1)


#### Initialize the network and the optimizer

In [None]:
# Instantiate the model and an SGD optimizer over its parameters, using the
# module-level learning_rate and momentum values.
network = MyCNN()
optimizer = optim.SGD(network.parameters(),lr = learning_rate, momentum = momentum)

#### Train the Network

In [None]:
# Histories filled in by train() and test() and used by the loss plots.
train_losses = []   # training loss at each logged batch
train_counter = []  # number of training examples seen at each logged batch
test_losses = []    # average test loss, one entry per test() call
# x-positions for the test losses: one before training plus one after each epoch.
test_counter = [i*len(train_loader.dataset) for i in range(n_epochs + 1)]

def train(epoch):
    """Train the global ``network`` for one pass over ``train_loader``.

    Every ``log_interval`` batches this prints the progress, appends the loss
    and the number of examples seen so far to ``train_losses`` and
    ``train_counter``, and checkpoints the network and optimizer state to
    'model.pth' and 'optimizer.pth'.

    Args:
        epoch: 1-based epoch number, used for logging and for placing the
            logged points on the "examples seen" axis.
    """
    network.train()
    dataset_size = len(train_loader.dataset)
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = network(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval != 0:
            continue
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, batch_idx * len(data), dataset_size,
            100. * batch_idx / len(train_loader), loss.item()))
        train_losses.append(loss.item())
        # Use the loader's own batch size rather than a hard-coded 64 so the
        # counter stays right if the training batch size is changed.
        train_counter.append(
            batch_idx * train_loader.batch_size + (epoch - 1) * dataset_size)
        torch.save(network.state_dict(), 'model.pth')
        torch.save(optimizer.state_dict(), 'optimizer.pth')
    
def test():
    """Evaluate the global ``network`` on ``test_loader``.

    Appends the average per-example loss to ``test_losses`` and prints that
    loss together with the classification accuracy.
    """
    network.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            output = network(data)
            # Sum (not mean) per batch so that dividing by the dataset size
            # below gives the exact per-example average. ``reduction='sum'``
            # replaces the deprecated ``size_average=False``.
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # Index of the highest log-probability is the predicted class.
            pred = output.max(1, keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).sum().item()
    num_examples = len(test_loader.dataset)
    test_loss /= num_examples
    test_losses.append(test_loss)
    print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, num_examples,
        100. * correct / num_examples))
            
            

In [None]:
# Evaluate once before any training so test_losses gets its first entry
# (test_counter was created with n_epochs + 1 positions), then alternate one
# epoch of training with one evaluation.
test()
for epoch in range(1, n_epochs + 1):
  train(epoch)
  test()

### Evaluating the Model's Performance

In [None]:
# Training loss as a blue line, with the test-set evaluations as red dots.
fig = plt.figure()
plt.xlabel('number of training examples seen')
plt.ylabel('negative log likelihood loss')
plt.plot(train_counter, train_losses, color='blue')
plt.scatter(test_counter, test_losses, color='red')
# Legend entries follow the order in which the artists were drawn above.
plt.legend(['Train Loss', 'Test Loss'], loc='upper right')
fig

#### Sample Prediction on our example dataset

In [None]:
# Run the example batch through the network without tracking gradients.
# NOTE(review): this relies on the network already being in eval mode (test()
# calls network.eval()); confirm the cells are run in order.
with torch.no_grad():
    output = network(example_data)

In [None]:
# Show the first six example images in a 2x3 grid, each titled with the class
# the network scored highest.
fig = plt.figure()
for i, image in enumerate(example_data[:6]):
    plt.subplot(2, 3, i + 1)
    plt.tight_layout()
    # Index 0 selects the first channel plane of the image.
    plt.imshow(image[0], cmap='gray', interpolation='none')
    plt.title("Prediction: {}".format(
        output.data.max(1, keepdim=True)[1][i].item()))
    # Hide the axis ticks.
    plt.xticks([])
    plt.yticks([])
fig

#### Load model from checkpoint and continue training your model further

In [None]:
# Fresh model instance that will receive the checkpointed weights.
continued_network = MyCNN()
# The optimizer has to track the parameters of the model it is meant to
# update, i.e. continued_network rather than the earlier `network`.
continued_optimizer = optim.SGD(continued_network.parameters(), lr=learning_rate,
                                momentum=momentum)

In [None]:
# Restore the model and optimizer state from the checkpoints written by
# train(). train() saves to 'model.pth' / 'optimizer.pth' relative to the
# working directory, so load from the same relative paths instead of a
# Kaggle-specific absolute path.
network_state_dict = torch.load("model.pth")
continued_network.load_state_dict(network_state_dict)

optimizer_state_dict = torch.load("optimizer.pth")
continued_optimizer.load_state_dict(optimizer_state_dict)

In [None]:
# Run five more epochs (numbered 4 through 8), evaluating after each one.
# NOTE(review): train() and test() operate on the module-level `network` and
# `optimizer`, so this loop never touches `continued_network` or
# `continued_optimizer` — confirm which model is meant to be trained here.
for i in range(4,9):
  # Add the x-position for the test loss recorded at the end of this epoch.
  test_counter.append(i*len(train_loader.dataset))
  train(i)
  test()

In [None]:
# Redraw the loss curves, now including the additional epochs: training loss
# as a blue line, test-set evaluations as red dots.
fig = plt.figure()
plt.xlabel('number of training examples seen')
plt.ylabel('negative log likelihood loss')
plt.plot(train_counter, train_losses, color='blue')
plt.scatter(test_counter, test_losses, color='red')
# Legend entries follow the order in which the artists were drawn above.
plt.legend(['Train Loss', 'Test Loss'], loc='upper right')
fig