<a href="https://colab.research.google.com/github/inbalh1/ML/blob/main/DL_ex1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
from sklearn.svm import SVC

import pickle
import os
from google.colab import files
import itertools


# Pick the compute backend: CUDA when present, otherwise Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

# Experiment-wide constants shared by all the cells below.
BATCH_SIZE = 64              # mini-batch size assumed by the training/eval loops
TRAIN_SIZE = 5000            # number of training samples used per run
TEST_SIZE = 1000             # number of test samples used per run
DEFAULT_NUM_OF_EPOCHS = 100  # epochs used when a caller does not specify

Using cuda device


# Question 1: setup and baseline

### setup

In [2]:
# CIFAR10 train split, stored under ./data and converted to tensors on access.
# download=True fetches the archive on the first run; later runs only verify it.
training_data = datasets.CIFAR10(root="data", train=True, download=True, transform=ToTensor())

# CIFAR10 test split, same storage location and transform.
test_data = datasets.CIFAR10(root="data", train=False, download=True, transform=ToTensor())

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:05<00:00, 33706005.77it/s]


Extracting data/cifar-10-python.tar.gz to data
Files already downloaded and verified


### Baseline - svm

In [3]:
# HW part 1 - run svm as baseline

def svm_by_type(kernel , X , y , X_test , y_test):
  """Fit an SVM with the given kernel and print its train and test accuracy.

  Args:
    kernel: kernel name passed to sklearn's SVC (e.g. 'linear', 'rbf').
    X, y: training images and labels; X is flattened to one row per sample.
    X_test, y_test: test images and labels, flattened the same way.
  """
  # Flatten once and take the row count from the data itself, so the function
  # works for any subset size rather than only TRAIN_SIZE / TEST_SIZE samples.
  X_flat = X.reshape(len(X), -1)
  X_test_flat = X_test.reshape(len(X_test), -1)

  clf_SVC = SVC(kernel=kernel , decision_function_shape='ovr')
  clf_SVC.fit(X_flat, y)
  print ("kernel type - " + kernel)
  print("train acc: ", clf_SVC.score(X_flat, y))
  print("test acc: ", clf_SVC.score(X_test_flat, y_test))
 
# This is the main function for this part 
def svm(train_sampler, test_sampler):
  """Run the SVM baselines (linear and rbf kernels) on the sampled subsets.

  The batch size equals the subset size, so the first batch of each loader
  carries every sample the SVM is fitted / scored on.
  """
  X , y = next(iter(
      DataLoader(training_data, batch_size=TRAIN_SIZE , sampler = train_sampler , num_workers=2)))
  X_test , y_test = next(iter(
      DataLoader(test_data, batch_size=TEST_SIZE, sampler = test_sampler , num_workers=2)))

  # Linear SVM first, then the rbf-kernel SVM.
  for kernel in ('linear', 'rbf'):
    svm_by_type(kernel , X , y , X_test , y_test)


# Question 2 - Feed Forward Neural Network

In [4]:
class NeuralNetwork(nn.Module):
    """Fully connected baseline: flatten -> Linear(3072, 256) -> ReLU -> Linear(256, 10)."""

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        hidden_units = 256
        layers = [
            nn.Linear(3*32*32, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the 10 class logits for a batch of images."""
        return self.linear_relu_stack(self.flatten(x))


def init_weights_wrapper(deviation):
  """Build an initializer suitable for `model.apply`.

  The returned function draws the weights of every Linear / Conv2d layer from
  a zero-mean normal distribution with standard deviation `deviation` and
  zeroes the biases. All other module types are left untouched.
  """
  def init_weights(m):
    if isinstance(m, (nn.Linear, nn.Conv2d)):
      torch.nn.init.normal_(m.weight , std=deviation)
      # Layers created with bias=False have m.bias set to None; skip them
      # instead of failing on the attribute access.
      if m.bias is not None:
        torch.nn.init.zeros_(m.bias)
  return init_weights

class GridSearch():
  """Grid search over SGD step size, momentum and weight-init deviation.

  Every evaluated combination is appended (pickled) to `file_path`, so an
  interrupted search can be resumed: combinations already present in the file
  are not re-trained when `use_history` is true.
  """
  def __init__(self, file_path, use_history=True):
    self.best_params = {}
    self.best_acc = 0
    self.file_path = file_path
    self.use_history = use_history

  def update_params(self, step_size, momentum, deviation):
    """Record the given combination as the best one found so far."""
    self.best_params['step_size'] = step_size
    self.best_params['momentum'] = momentum
    self.best_params['deviation'] = deviation

  def write_to_file(self, accuracy, loss, time, *args):
    """Append one result to the history file, keyed by the parameter tuple `args`."""
    obj = {args: (accuracy, loss, time)}
    with open(self.file_path, "ab") as f:
      pickle.dump(obj, f)

  def get_history(self):
    """Return {params_tuple: (accuracy, loss, time)} read from the history file.

    Returns an empty dict when history is disabled or the file does not exist.
    """
    history_res = {}
    if self.use_history and os.path.exists(self.file_path):
      # NOTE: pickle.load can execute arbitrary code; only read history files
      # that were written by this class.
      with open(self.file_path, "rb") as f:
        # The file is a sequence of independently pickled dicts; read until EOF.
        while True:
          try:
            history_res.update(pickle.load(f))
          except EOFError:
            break
    return history_res

  def _consider(self, accuracy, step_size, momentum, deviation):
    """Promote the combination to best-so-far when it beats the current best accuracy."""
    if accuracy > self.best_acc:
      self.update_params(step_size, momentum, deviation)
      self.best_acc = accuracy

  # Config is a dict of lists that tells the values of the parameters to run
  def run(self, train_dataloader, test_dataloader, config, num_of_epochs=DEFAULT_NUM_OF_EPOCHS, verbose=False,
          model_factory=None):
    """Evaluate every combination in `config` and return the best parameters.

    Args:
      train_dataloader, test_dataloader: loaders handed to `train_and_test`.
      config: dict with the lists "step_size", "momentum" and "deviation".
      num_of_epochs: epochs to train each combination.
      verbose: forwarded to the training loop.
      model_factory: zero-argument callable that builds a fresh model for each
        combination. Defaults to `NeuralNetwork`; pass e.g. `CNN` to search
        another architecture.

    Returns:
      Dict with the best 'step_size', 'momentum' and 'deviation'. The ranking
      uses the last-epoch TRAIN accuracy; the dict is empty if no combination
      reached an accuracy above 0.
    """
    import time
    if model_factory is None:
      model_factory = NeuralNetwork
    history_res = self.get_history()
    options = itertools.product(config["step_size"], config["momentum"], config["deviation"])
    for step_size, momentum, deviation in options:
      # Don't run if we already know the result
      if self.use_history and ((step_size, momentum, deviation) in history_res):
        self._consider(history_res[(step_size, momentum, deviation)][0], step_size, momentum, deviation)
        continue
      print("Starting gs for ({},{},{})".format(step_size, momentum, deviation))
      start = time.time()
      model = model_factory().to(device)
      model.apply(init_weights_wrapper(deviation))

      loss_fn = nn.CrossEntropyLoss()
      optimizer = torch.optim.SGD(model.parameters(), lr=step_size , momentum=momentum)
      # num_of_test_batches=0 makes the per-epoch test pass stop before its
      # first batch: the test metrics are not used for the ranking.
      train_accuracy, train_loss , __, ___ = train_and_test(train_dataloader, test_dataloader, model,
                                                    loss_fn, optimizer,
                                                    num_of_test_batches=0, num_of_epochs=num_of_epochs, verbose=verbose)
      tot_time = time.time() - start
      accuracy = train_accuracy[-1] # Take last accuracy
      loss = train_loss[-1]
      self.write_to_file(accuracy, loss, tot_time, step_size, momentum, deviation)
      self._consider(accuracy, step_size, momentum, deviation)
    return self.best_params

# Train the model for a single epoch
# Returns accuracy and loss
def epoch_train(dataloader, model, loss_fn, optimizer, num_of_batches=-1, verbose=False):
    """Train `model` for one epoch and return (accuracy, average batch loss).

    Args:
      dataloader: iterable of (X, y) batches.
      model, loss_fn, optimizer: the network, its loss and its optimizer.
      num_of_batches: stop after this many batches; -1 means run all of them.
      verbose: print the loss every 100 batches.

    Returns:
      (accuracy, loss): fraction of correctly classified samples and the mean
      of the per-batch losses, both computed over the batches actually
      processed. (0.0, 0.0) when no batch was processed.
    """
    model.train()

    # Expected number of samples; only used for the verbose progress line.
    if num_of_batches > 0:
      size = num_of_batches * BATCH_SIZE
    else:
      size = TRAIN_SIZE

    # Running totals for the whole epoch
    epoch_loss, epoch_correct = 0, 0
    samples_seen, batches_run = 0, 0
    for batch, (X, y) in enumerate(dataloader):
      # Dont run more than num_of_batches batches (-1 means to run all)
      if batch == num_of_batches:
        break
      X, y = X.to(device), y.to(device)

      # Compute prediction error
      pred = model(X)
      loss = loss_fn(pred, y)

      # Backpropagation
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      if (batch % 100 == 0) and verbose:
          current = (batch + 1) * len(X)
          print(f"Train loss: {loss.item():>7f}  [{current:>5d}/{size:>5d}]")
      epoch_loss += loss.item()
      epoch_correct += (pred.argmax(1) == y).type(torch.float).sum().item()
      samples_seen += len(y)
      batches_run += 1

    if batches_run == 0:
      return 0.0, 0.0
    # Normalise by what was really processed: the last batch may be smaller
    # than BATCH_SIZE and the loader may hold fewer batches than requested.
    return epoch_correct / samples_seen, epoch_loss / batches_run

def test(dataloader, model, loss_fn, num_of_batches=-1, verbose=False):
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for batch, (X, y) in enumerate(dataloader):
          # Dont run more than num_of_batches batches (-1 means to run all)
          if batch == num_of_batches:
            break
          X, y = X.to(device), y.to(device)
          pred = model(X)
          test_loss += loss_fn(pred, y).item()
          correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    # Calculate the total loss and accuracy
    if num_of_batches > 0:
      size = num_of_batches * BATCH_SIZE
      test_loss = test_loss / num_of_batches
    else:
      size = TEST_SIZE #len(dataloader.dataset)
      test_loss = test_loss / len(dataloader)
    accuracy = correct / size
    if verbose:
      print(f"Test Error: \n Accuracy: {(100*accuracy):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return accuracy, test_loss

# Run train and test for num_of_epochs
# TODO: what should be default for num_of_test_batches?
def train_and_test(train_dataloader, test_dataloader, model, loss_fn, optimizer,
                   num_of_train_batches=-1, num_of_test_batches=-1, num_of_epochs=DEFAULT_NUM_OF_EPOCHS, verbose=False):
    """Alternate one training epoch and one evaluation pass, `num_of_epochs` times.

    Returns four per-epoch lists, in this order:
    train accuracy, train loss, test accuracy, test loss.
    """
    train_acc_curve, train_loss_curve = [], []
    test_acc_curve, test_loss_curve = [], []
    for _ in range(num_of_epochs):
      train_acc, train_loss = epoch_train(
          train_dataloader, model, loss_fn, optimizer,
          num_of_batches=num_of_train_batches, verbose=verbose)
      test_acc, test_loss = test(
          test_dataloader, model, loss_fn,
          num_of_batches=num_of_test_batches, verbose=verbose)
      train_acc_curve.append(train_acc)
      train_loss_curve.append(train_loss)
      test_acc_curve.append(test_acc)
      test_loss_curve.append(test_loss)
    return train_acc_curve, train_loss_curve, test_acc_curve, test_loss_curve

# Plot train and test accuracy on the same graph,
# and train and test losses on the same graph.
def plot_graphs(num_of_epochs, train_accuracy, train_loss, test_accuracy, test_loss, suptitle):
  """Show a two-panel figure: accuracy (top) and loss (bottom) per epoch.

  Each panel draws the train curve first and the test curve second, with
  epochs numbered from 1 on the x axis.
  """
  epochs = list(range(1, num_of_epochs + 1))
  fig, axs = plt.subplots(2)
  fig.suptitle(suptitle)
  fig.tight_layout()

  panels = (
      (axs[0], train_accuracy, test_accuracy, 'accuracy'),
      (axs[1], train_loss, test_loss, 'loss'),
  )
  for ax, train_curve, test_curve, ylabel in panels:
    ax.plot(epochs, train_curve)
    ax.plot(epochs, test_curve)
    ax.set(xlabel='epochs', ylabel=ylabel)
    ax.grid()
    ax.legend(["train", "test"], loc="best")

  plt.show()

def pretty_print_results(train_accuracy, train_loss, test_accuracy, test_loss):
  """Print the last-epoch accuracy (as a percentage) and loss for train and test."""
  final_train_acc, final_train_loss = train_accuracy[-1], train_loss[-1]
  final_test_acc, final_test_loss = test_accuracy[-1], test_loss[-1]
  print(f"Train Error: \n Accuracy: {(100*final_train_acc):>0.1f}%, Avg loss: {final_train_loss:>8f} \n")
  print(f"Test Error: \n Accuracy: {(100*final_test_acc):>0.1f}%, Avg loss: {final_test_loss:>8f} \n")


### Part 2.1.  Baseline

In [5]:
# Part 2.1
# Run the baseline of this part (train and test) and plot the results
# config is a dict of the parameters to run the network with
def run_baseline(config, train_dataloader, test_dataloader, num_of_epochs=DEFAULT_NUM_OF_EPOCHS, verbose=False):
  """Train and evaluate the baseline feed-forward network, then plot and print the results.

  `config` supplies "deviation" (weight-init std), "step_size" and "momentum".
  """
  net = NeuralNetwork().to(device)
  net.apply(init_weights_wrapper(config["deviation"]))
  # Dump the freshly initialised parameters.
  for name, param in net.named_parameters():
    print(name , param)

  criterion = nn.CrossEntropyLoss()
  sgd = torch.optim.SGD(net.parameters(), lr=config["step_size"] , momentum=config["momentum"])
  # curves = (train accuracy, train loss, test accuracy, test loss)
  curves = train_and_test(train_dataloader, test_dataloader, net, criterion, sgd,
                          num_of_epochs=num_of_epochs, verbose=verbose)

  plot_graphs(num_of_epochs, *curves, "2.1 baseline")

  print("Baseline results: ")
  pretty_print_results(*curves)

### Part 2.2. Optimization

In [6]:
# Part 2.2
def change_optimization(config, train_dataloader, test_dataloader, betas, num_of_epochs=60, verbose=False, cnn=False):
  """Train with Adam (given `betas`) instead of SGD, then plot and print the results.

  Uses the CNN when `cnn` is true, otherwise the feed-forward network.
  `config` supplies "deviation" and "step_size".
  """
  net = (CNN() if cnn else NeuralNetwork()).to(device)
  net.apply(init_weights_wrapper(config["deviation"]))

  criterion = nn.CrossEntropyLoss()
  adam = torch.optim.Adam(net.parameters(), lr=config["step_size"] , betas=betas)
  # curves = (train accuracy, train loss, test accuracy, test loss)
  curves = train_and_test(train_dataloader, test_dataloader, net, criterion, adam,
                          num_of_epochs=num_of_epochs, verbose=verbose)

  plot_graphs(num_of_epochs, *curves, "2.2 optimization")

  print("Adam optimization results for: " + str(betas))
  pretty_print_results(*curves)

def beta_gs(config, train_dataloader, test_dataloader, num_of_epochs=20, verbose=False):
  """Run the Adam experiment for every (beta1, beta2) pair of a fixed 5x5 grid.

  Each pair is trained for `num_of_epochs` epochs via `change_optimization`;
  `verbose` is forwarded to the training loop.
  """
  for b1 in [0.9 , 0.8 , 0.7 , 0.6 , 0.5]:
    for b2 in [0.999 ,  0.888 , 0.777 , 0.666 , 0.555]:
      change_optimization(config, train_dataloader, test_dataloader, num_of_epochs=num_of_epochs,
                          verbose=verbose, betas=(b1 , b2))

### Part 2.3. Initialization

In [7]:
# Part 2.3

def init_weights_with_xavier(m):
    """Initializer for `model.apply`: Xavier-uniform weights and zero biases.

    Only Linear and Conv2d layers are touched.
    """
    if isinstance(m, (nn.Linear, nn.Conv2d)):
        torch.nn.init.xavier_uniform_(m.weight)
        # Layers created with bias=False have m.bias set to None; skip them
        # instead of failing on the attribute access.
        if m.bias is not None:
            torch.nn.init.zeros_(m.bias)

def change_initialization(config, train_dataloader, test_dataloader, num_of_epochs=DEFAULT_NUM_OF_EPOCHS, verbose=False , cnn=False):
  """Train with Xavier-initialised weights (SGD), then plot and print the results.

  Uses the CNN when `cnn` is true, otherwise the feed-forward network.
  `config` supplies "step_size" and "momentum".
  """
  net = (CNN() if cnn else NeuralNetwork()).to(device)
  net.apply(init_weights_with_xavier)

  criterion = nn.CrossEntropyLoss()
  sgd = torch.optim.SGD(net.parameters(), lr=config["step_size"] , momentum=config["momentum"])
  # curves = (train accuracy, train loss, test accuracy, test loss)
  curves = train_and_test(train_dataloader, test_dataloader, net, criterion, sgd,
                          num_of_epochs=num_of_epochs, verbose=verbose)

  plot_graphs(num_of_epochs, *curves, "2.3 initialization")

  print("Xavier initialization results: ")
  pretty_print_results(*curves)

### Part 2.4. Regularization
Add dropout and weight decay to the network

In [8]:
# Part 2.4

class DropoutNeuralNetwork(nn.Module):
    """Baseline feed-forward network with dropout applied to the hidden layer.

    Layout: flatten -> Linear(3072, 256) -> Dropout(p) -> ReLU -> Linear(256, 10).
    """

    def __init__(self, dropout):
        super().__init__()
        self.flatten = nn.Flatten()
        layers = [
            nn.Linear(3*32*32, 256),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(256, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the 10 class logits for a batch of images."""
        return self.linear_relu_stack(self.flatten(x))


def regularization(config, train_dataloader, test_dataloader, dropout=0.05, weight_decay=0.1, num_of_epochs=DEFAULT_NUM_OF_EPOCHS, verbose=False, cnn=False):
  """Train with dropout and SGD weight decay, then plot and print the results.

  Uses the dropout CNN when `cnn` is true, otherwise the dropout feed-forward
  network. `config` supplies "deviation", "step_size" and "momentum".
  """
  net = (CNNWithDO(dropout) if cnn else DropoutNeuralNetwork(dropout)).to(device)
  net.apply(init_weights_wrapper(config["deviation"]))

  criterion = nn.CrossEntropyLoss()
  sgd = torch.optim.SGD(net.parameters(), lr=config["step_size"] , momentum=config["momentum"] ,
                        weight_decay=weight_decay)
  # curves = (train accuracy, train loss, test accuracy, test loss)
  curves = train_and_test(train_dataloader, test_dataloader, net, criterion, sgd,
                          num_of_epochs=num_of_epochs, verbose=verbose)

  plot_graphs(num_of_epochs, *curves, "2.4 regularization")

  print("Regularization results: ")
  pretty_print_results(*curves)
  print("Weight decay is: ", weight_decay, " dropout is: ", dropout)

### Part 2.5. Preprocessing
Add PCA whitening

In [9]:
# Part 2.5

from sklearn.decomposition import PCA
#torch.cat([x[0] for x in training_data])

class NeuralNetworkWithPreprocess(nn.Module):
    """Feed-forward network that whitens its input with a fitted PCA first.

    Layout: flatten -> pca.transform -> Linear(k, 256) -> ReLU -> Linear(256, 10),
    where k is the number of PCA components.
    """

    def __init__(self, pca):
        super().__init__()
        self.pca = pca
        self.flatten = nn.Flatten()
        # The first layer consumes the PCA output, so its width is the number
        # of PCA components (not the batch size). Prefer the fitted attribute;
        # fall back to the constructor argument.
        in_features = getattr(pca, "n_components_", None)
        if in_features is None:
            in_features = pca.n_components
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(int(in_features), 256),
            nn.ReLU(),
            nn.Linear(256, 10)
        )

    def whitening(self, x):
        """Project a batch of images onto the PCA components; returns a float32 tensor."""
        # The PCA object works on host (CPU) arrays.
        flat = self.flatten(x).detach().cpu().numpy()
        whitened = self.pca.transform(flat)
        # Put the result where the network's weights live.
        target = self.linear_relu_stack[0].weight
        return torch.as_tensor(whitened, dtype=torch.float32, device=target.device)

    def forward(self, x):
        """Return the 10 class logits for a batch of images."""
        # Apply preprocessing
        x = self.whitening(x)
        # Apply the network
        logits = self.linear_relu_stack(x)
        return logits


def get_pca(train_sampler, n_components=64):
  """Fit a whitening PCA on the sampled training images and return it.

  Args:
    train_sampler: sampler selecting the training subset.
    n_components: number of principal components to keep (default 64).
  """
  # batch_size=TRAIN_SIZE so the first batch carries the whole sampled subset.
  train_dataloader = DataLoader(training_data, batch_size=TRAIN_SIZE, sampler=train_sampler)
  pca = PCA(whiten=True, n_components=n_components)
  x, y = next(iter(train_dataloader))
  # One row per image; the row count comes from the batch itself so a subset
  # with fewer than TRAIN_SIZE samples also works.
  x = x.reshape((len(x), -1))
  pca.fit(x)
  return pca

def add_preprocessing(config, train_sampler, train_dataloader, test_dataloader, num_of_epochs=DEFAULT_NUM_OF_EPOCHS, verbose=False, cnn=False):
  """Train a network that PCA-whitens its input, then plot and print the results.

  The PCA is fitted on the samples selected by `train_sampler`. Uses the CNN
  variant when `cnn` is true, otherwise the feed-forward variant.
  """
  pca = get_pca(train_sampler)
  net = (CnnWithPreprocess(pca) if cnn else NeuralNetworkWithPreprocess(pca)).to(device)
  net.apply(init_weights_wrapper(config["deviation"]))

  criterion = nn.CrossEntropyLoss()
  sgd = torch.optim.SGD(net.parameters(), lr=config["step_size"] , momentum=config["momentum"])
  # curves = (train accuracy, train loss, test accuracy, test loss)
  curves = train_and_test(train_dataloader, test_dataloader, net, criterion, sgd,
                          num_of_epochs=num_of_epochs, verbose=verbose)
  plot_graphs(num_of_epochs, *curves, "2.5 preprocessing")

  print("Preprocessing results: ")
  pretty_print_results(*curves)


### Part 2.6. Network width

In [10]:
# Part 2.6

class NeuralNetworkChangingWidth(nn.Module):
    """Baseline feed-forward network whose single hidden layer has `width` units."""

    def __init__(self, width):
        super().__init__()
        self.flatten = nn.Flatten()
        layers = [
            nn.Linear(3*32*32, width),
            nn.ReLU(),
            nn.Linear(width, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the 10 class logits for a batch of images."""
        return self.linear_relu_stack(self.flatten(x))

# Create a fig for plotting train and test accuracy and loss
def prepare_fig(suptitle):
  """Create an empty two-panel figure (accuracy on top, loss below) and return (fig, axs).

  Curves are added afterwards, e.g. with `plot`.
  """
  fig, axs = plt.subplots(2)
  fig.suptitle(suptitle)
  fig.tight_layout()
  for ax, ylabel in zip(axs, ('accuracy', 'loss')):
    ax.set(xlabel='epochs', ylabel=ylabel)
    ax.grid()
  return fig, axs

def plot(axs, num_of_epochs, train_accuracy,
         train_loss, test_accuracy, test_loss):
  """Add train/test curves to an existing pair of axes.

  axs[0] receives the accuracy curves and axs[1] the loss curves; on each
  axis the train curve is drawn before the test curve.
  """
  epochs = list(range(1, num_of_epochs + 1))
  accuracy_ax, loss_ax = axs[0], axs[1]
  for curve in (train_accuracy, test_accuracy):
    accuracy_ax.plot(epochs, curve)
  for curve in (train_loss, test_loss):
    loss_ax.plot(epochs, curve)
  

def change_width(config, train_dataloader, test_dataloader, num_of_epochs=DEFAULT_NUM_OF_EPOCHS, verbose=False , cnn=False):
  """Train one model per width setting and draw all the curves on a shared figure.

  Feed-forward runs use hidden sizes 2**6, 2**10 and 2**12; CNN runs use the
  filter-count pairs (256, 64) and (512, 256). Final metrics are printed for
  each setting.
  """
  fig, axs = prepare_fig("2.6 width")
  legend = []
  if cnn:
    values = [(256 , 64) , (512 , 256)]
    build_model = lambda w: CNNChangingWidth(w)
  else:
    values = [2**6, 2**10, 2**12]
    build_model = lambda w: NeuralNetworkChangingWidth(w)

  for width in values:
    net = build_model(width).to(device)
    net.apply(init_weights_wrapper(config["deviation"]))

    criterion = nn.CrossEntropyLoss()
    sgd = torch.optim.SGD(net.parameters(), lr=config["step_size"] , momentum=config["momentum"])
    # curves = (train accuracy, train loss, test accuracy, test loss)
    curves = train_and_test(train_dataloader, test_dataloader, net, criterion, sgd,
                            num_of_epochs=num_of_epochs, verbose=verbose)

    plot(axs, num_of_epochs, *curves)
    legend += ["train width {}".format(width), "test width {}".format(width)]

    print("Changing width results, with width", width, ":")
    pretty_print_results(*curves)
  fig.legend(legend, loc="right")
  plt.show()


### Part 2.7. Network Depth

In [11]:
# part 2.7

class NeuralNetworkChangingDepth(nn.Module):
    """Feed-forward network of fixed width 64 and adjustable depth.

    Layout: Linear(3072, 64) + ReLU, then (depth - 2) blocks of
    Linear(64, 64) + ReLU, then the Linear(64, 10) output layer.
    """

    def __init__(self, depth): # width set to 64 as requested and depth is adjustable
        super().__init__()
        self.flatten = nn.Flatten()
        layers = [nn.Linear(3*32*32, 64), nn.ReLU()]
        for _ in range(depth - 2):
            layers += [nn.Linear(64, 64), nn.ReLU()]
        layers.append(nn.Linear(64, 10))
        self.stack = nn.ModuleList(layers)

    def forward(self, x):
      """Return the 10 class logits for a batch of images."""
      out = self.flatten(x)
      for layer in self.stack:
          out = layer(out)
      return out

def try_with_different_depth(config, train_dataloader, test_dataloader, num_of_epochs=DEFAULT_NUM_OF_EPOCHS, verbose=False):
  """Train the adjustable-depth network for depths 3, 4 and 10.

  Each depth gets its own figure and its own printed final metrics.
  `num_of_epochs` is honoured as passed by the caller.
  `config` supplies "deviation", "step_size" and "momentum".
  """
  for depth in [3, 4 , 10]:
    fig, axs = prepare_fig("2.7 depth " + str(depth))
    legend = []
    model = NeuralNetworkChangingDepth(depth).to(device)
    model.apply(init_weights_wrapper(config["deviation"]))

    # Dump the freshly initialised parameters.
    for name, param in model.named_parameters():
          print(name , param)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=config["step_size"] , momentum=config["momentum"])
    train_accuracy, train_loss, test_accuracy, test_loss =\
            train_and_test(train_dataloader, test_dataloader, model, loss_fn, optimizer, num_of_epochs=num_of_epochs, verbose=verbose)

    plot(axs, num_of_epochs, train_accuracy, train_loss, test_accuracy, test_loss)
    legend.append("train depth {}".format(depth))
    legend.append("test depth {}".format(depth))

    print("Changing depth results, with depth", depth, ":")
    pretty_print_results(train_accuracy, train_loss, test_accuracy, test_loss)
    fig.legend(legend, loc="right")
    plt.show()

# Question 3 - Convolutional Neural Network




In [12]:
# Hyper-parameter grid for the CNN: each key lists the values to try.
CNNconfig = dict(
    step_size=[5e-4, 7e-4, 1e-3, 1e-2],
    momentum=[0.45, 0.5, 0.55, 0.7, 0.8],
    deviation=[1, 1.5, 2, 2.25, 2.5, 3],
)

class CNN(nn.Module):
    """Baseline CNN: two conv/ReLU/max-pool stages followed by a two-layer classifier.

    Layout: Conv(3->64, 3x3) -> ReLU -> MaxPool(2) -> Conv(64->16, 3x3) -> ReLU
    -> MaxPool(2) -> Flatten -> Linear(16*6*6, 784) -> ReLU -> Linear(784, 10).
    """

    def __init__(self):
        super().__init__()
        feature_layers = [
            nn.Conv2d(3, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 16, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        classifier_layers = [
            nn.Flatten(),
            nn.Linear(16*6*6, 784),
            nn.ReLU(),
            nn.Linear(784, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*feature_layers, *classifier_layers)

    def forward(self, x):
        """Return the 10 class logits for a batch of images."""
        return self.linear_relu_stack(x)



### Part 3.1. Baseline

In [13]:
# Part 3.1

def cnn_run_baseline(config, train_dataloader, test_dataloader, num_of_epochs=DEFAULT_NUM_OF_EPOCHS, verbose=False):
  """Train and evaluate the baseline CNN with SGD, then plot and print the results.

  `config` supplies "deviation" (weight-init std), "step_size" and "momentum".
  """
  net = CNN().to(device)
  net.apply(init_weights_wrapper(config["deviation"]))

  criterion = nn.CrossEntropyLoss()
  sgd = torch.optim.SGD(net.parameters(), lr=config["step_size"] , momentum=config["momentum"])
  # curves = (train accuracy, train loss, test accuracy, test loss)
  curves = train_and_test(train_dataloader, test_dataloader, net, criterion, sgd,
                          num_of_epochs=num_of_epochs, verbose=verbose)

  plot_graphs(num_of_epochs, *curves, "3.1 baseline")

  print("Baseline results: ")
  pretty_print_results(*curves)

### Part 3.2 - Optimization


In [14]:
#Part 3.2 - optimization
def cnn_change_optimization(config, train_dataloader, test_dataloader, betas, num_of_epochs=60, verbose=False):
  """Run the Adam optimisation experiment on the CNN (delegates to `change_optimization` with cnn=True)."""
  change_optimization(
      config,
      train_dataloader,
      test_dataloader,
      betas,
      num_of_epochs=num_of_epochs,
      verbose=verbose,
      cnn=True,
  )

### Part 3.3 - Initialization

In [15]:
#part 3.3 init
def cnn_with_initialization(config, train_dataloader, test_dataloader, num_of_epochs=DEFAULT_NUM_OF_EPOCHS, verbose=False):
  """Run the Xavier initialisation experiment on the CNN (delegates to `change_initialization` with cnn=True)."""
  change_initialization(
      config,
      train_dataloader,
      test_dataloader,
      num_of_epochs=num_of_epochs,
      verbose=verbose,
      cnn=True,
  )


### Part 3.4 - Regularization

In [16]:
# Part 3.4 - regularization
class CNNWithDO(nn.Module):
    """Baseline CNN with a Dropout layer after each max-pool stage.

    Layout: [Conv(3->64) -> ReLU -> MaxPool -> Dropout] ->
    [Conv(64->16) -> ReLU -> MaxPool -> Dropout] -> Flatten ->
    Linear(16*6*6, 784) -> ReLU -> Linear(784, 10).
    """

    def __init__(self, dropout):
        super().__init__()
        layers = []
        # Two feature stages that differ only in their channel counts.
        for in_channels, out_channels in ((3, 64), (64, 16)):
            layers += [
                nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2, stride=2),
                nn.Dropout(dropout),
            ]
        layers += [
            nn.Flatten(),
            nn.Linear(16*6*6, 784),
            nn.ReLU(),
            nn.Linear(784, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the 10 class logits for a batch of images."""
        return self.linear_relu_stack(x)

def cnn_with_regularization(config, train_dataloader, test_dataloader, dropout=0.05, weight_decay=0.1, num_of_epochs=DEFAULT_NUM_OF_EPOCHS, verbose=False):
  """Run the dropout + weight-decay experiment on the CNN (delegates to `regularization` with cnn=True)."""
  regularization(
      config,
      train_dataloader,
      test_dataloader,
      dropout=dropout,
      weight_decay=weight_decay,
      num_of_epochs=num_of_epochs,
      verbose=verbose,
      cnn=True,
  )

### Part 3.5 - Preprocessing

In [17]:
class CnnWithPreprocess(nn.Module):
    """Baseline CNN that PCA-whitens its input images before the convolutions.

    The convolutions need image-shaped input, so the whitened PCA coefficients
    are projected back to pixel space with the PCA components and reshaped to
    the original (N, C, H, W) layout before they enter the network.
    """

    def __init__(self, pca):
        super().__init__()
        self.pca = pca
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(in_channels=64, out_channels=16, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Flatten(),
            nn.Linear(16*6*6, 784),
            nn.ReLU(),
            nn.Linear(784, 10)
        )

    def whitening(self, x):
        """Whiten a batch of images and return it with the same shape as `x` (float32)."""
        # The PCA object works on host (CPU) arrays, one flattened row per image.
        flat = self.flatten(x).detach().cpu().numpy()
        coeffs = self.pca.transform(flat)              # (N, k) PCA coefficients
        # Rotate the coefficients back to pixel space: (N, k) @ (k, C*H*W).
        pixels = coeffs @ self.pca.components_
        target = self.linear_relu_stack[0].weight
        whitened = torch.as_tensor(pixels, dtype=torch.float32, device=target.device)
        return whitened.reshape(x.shape)

    def forward(self, x):
        """Return the 10 class logits for a batch of images."""
        # Apply preprocessing
        x = self.whitening(x)
        # Apply the network
        logits = self.linear_relu_stack(x)
        return logits


def cnn_add_preprocessing(config, train_sampler, train_dataloader, test_dataloader, num_of_epochs=DEFAULT_NUM_OF_EPOCHS, verbose=False):
  """Run the PCA-whitening experiment on the CNN (delegates to `add_preprocessing` with cnn=True)."""
  add_preprocessing(
      config,
      train_sampler,
      train_dataloader,
      test_dataloader,
      num_of_epochs=num_of_epochs,
      verbose=verbose,
      cnn=True,
  )

### Part 3.6 - Width

In [26]:
#part 3.6 - width

class CNNChangingWidth(nn.Module):
    """Baseline CNN whose two convolutions use configurable filter counts.

    `filter_sizes` is a pair: (filters of the first conv, filters of the second conv).
    """

    def __init__(self , filter_sizes):
        super().__init__()
        self.filter_sizes = filter_sizes
        first_filters, second_filters = filter_sizes[0], filter_sizes[1]
        layers = [
            nn.Conv2d(3, first_filters, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(first_filters, second_filters, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Flatten(),
            nn.Linear(second_filters*6*6, 784),
            nn.ReLU(),
            nn.Linear(784, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the 10 class logits for a batch of images."""
        return self.linear_relu_stack(x)

def cnn_change_width(config, train_dataloader, test_dataloader, num_of_epochs=DEFAULT_NUM_OF_EPOCHS, verbose=False):
  """Run the width experiment on the CNN (same driver as part 2.6, with cnn=True).

  `num_of_epochs` and `verbose` are forwarded as given by the caller.
  """
  change_width(config, train_dataloader, test_dataloader, num_of_epochs=num_of_epochs, verbose=verbose , cnn=True)


### Part 3.7 - Depth

In [19]:
#part 3.7 - depth

class CNNChangingDepth(nn.Module):
    """CNN with `depth` convolutional stages followed by a two-layer classifier.

    Stages (each is Conv -> ReLU -> MaxPool(2)):
      * first:  Conv(3->64, 3x3, no padding)             32x32 -> 30x30 -> 15x15
      * middle: (depth - 2) x Conv(64->64, 3x3, "same")  each pool halves the size (floor)
      * last:   Conv(64->16, 3x3, no padding)            s -> s - 2 -> (s - 2) // 2
    depth == 2 has the same layout as the baseline CNN. For 32x32 inputs only
    depths 2 and 3 leave a non-empty feature map.

    Raises:
      ValueError: if `depth` leaves no spatial extent for the classifier.
    """

    def __init__(self, depth):
        super().__init__()
        if depth < 2:
            raise ValueError("depth must be at least 2, got {}".format(depth))
        # Side length of the feature map that reaches the Flatten layer.
        size = ((15 // (2 ** (depth - 2))) - 2) // 2
        if size < 1:
            raise ValueError("depth {} is too large for 32x32 inputs".format(depth))

        layers = [
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)]
        for _ in range(depth - 2):
            layers.extend(
                [nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding="same"),
                 nn.ReLU(),
                 nn.MaxPool2d(kernel_size=2, stride=2)]
            )
        layers.extend([
            nn.Conv2d(in_channels=64, out_channels=16, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Flatten(),
            nn.Linear(16 * size * size, 784),
            nn.ReLU(),
            nn.Linear(784, 10)])
        self.stack = nn.ModuleList(layers)

    def forward(self, x):
        """Return the 10 class logits, running the input through the depth-dependent stack."""
        for layer in self.stack:
            x = layer(x)
        return x



# NOTE: a class with this name and the same layers is already defined in the
# Part 2.7 cell; running this cell rebinds the name to this definition.
class NeuralNetworkChangingDepth(nn.Module):
    """Feed-forward network of fixed width 64 and adjustable depth.

    Layout: Linear(3072, 64) + ReLU, then (depth - 2) blocks of
    Linear(64, 64) + ReLU, then the Linear(64, 10) output layer.
    """

    def __init__(self, depth): # width set to 64 as requested and depth is adjustable
        super().__init__()
        self.flatten = nn.Flatten()
        self.stack = nn.ModuleList()
        self.stack.extend([nn.Linear(3*32*32, 64), nn.ReLU()])
        hidden_blocks = depth - 2
        for _ in range(hidden_blocks):
            self.stack.extend([nn.Linear(64, 64), nn.ReLU()])
        self.stack.append(nn.Linear(64, 10))

    def forward(self, x):
      """Return the 10 class logits for a batch of images."""
      activations = self.flatten(x)
      for module in self.stack:
          activations = module(activations)
      return activations

def cnn_try_with_different_depth():
  """Placeholder for the CNN depth experiment (part 3.7); does nothing yet.

  TODO: to write
  """
  return None

# Main
Run all the code of the exercise

In [23]:
# Call all the functions of question 2
def Q2(config, train_dataloader, test_dataloader):
  """Driver for question 2: run the selected feed-forward experiments.

  Args:
    config: hyper-parameter grid (dict of lists), used only when the grid
      search is enabled.
    train_dataloader, test_dataloader: loaders passed to every experiment.

  Each part is toggled by the `should_run_*` flags below. Without the grid
  search, a fixed hyper-parameter set is used.
  """
  # Choose which parts to run
  should_run_gs = False
  should_run_baseline = False
  should_run_change_optimization = False
  should_run_change_initialization = False
  should_run_regularization = False
  should_run_preprocessing = False
  should_run_width = False
  should_run_depth = False

  # Choose best parameters
  #TODO: run grid search with zero biases
  best_config = {}
  if should_run_gs:
    gs = GridSearch("grid_search_q2")
    best_config = gs.run(train_dataloader, test_dataloader, config, verbose=False)
    files.download('grid_search_q2') # TODO: should delete?
    print("Best config is: ")
    print(best_config)
  else:
    # Fixed hyper-parameters used when the grid search is skipped
    best_config["step_size"] = 0.01
    best_config["momentum"] = 0.8
    best_config["deviation"] =  0.01

  if should_run_baseline:
    print("Run baseline")
    run_baseline(best_config, train_dataloader, test_dataloader, verbose=False)
    print()
  if should_run_change_optimization:
    print("Run change optimization")
    # Adam runs with a smaller step size; the SGD value is restored afterwards
    best_config["step_size"] = 0.001
    change_optimization(best_config, train_dataloader, test_dataloader, verbose=False , betas=(0.7, 0.555))
    best_config["step_size"] = 0.01
    print()
  if should_run_change_initialization:
    print("Run change initialization")
    change_initialization(best_config, train_dataloader, test_dataloader, verbose=False)
    print()
  if should_run_regularization:
    print("Run regularization")
    regularization(best_config, train_dataloader, test_dataloader, num_of_epochs=100, verbose=False)
    print()
  if should_run_preprocessing:
    print("run preprocessing")
    # NOTE(review): train_sampler is not a parameter of Q2; it is read from the
    # notebook's global scope and must be defined before this branch runs.
    add_preprocessing(best_config, train_sampler, train_dataloader, test_dataloader, verbose=False)
    print()
  if should_run_width:
    print("Run network width")
    change_width(best_config, train_dataloader, test_dataloader, verbose=False)
    print()
  if should_run_depth:
    print("Run network depth")
    try_with_different_depth(best_config, train_dataloader, test_dataloader, verbose=False)
    print()


In [24]:
# Call all the functions of question 3
def Q3(config, train_dataloader, test_dataloader, train_sampler=None):
  """Run the selected question-3 (CNN) experiments.

  Which experiments run is controlled by the ``should_run_*`` flags at the
  top of the body; edit them to choose what to execute.

  Args:
    config: Hyper-parameter search space. It is only forwarded to
      ``GridSearch.run`` and is therefore unused unless ``should_run_gs``
      is enabled.
    train_dataloader: Loader over the training samples, passed to every
      experiment.
    test_dataloader: Loader over the test samples, passed to every
      experiment.
    train_sampler: Sampler handed to ``cnn_add_preprocessing``. When omitted,
      the sampler attached to ``train_dataloader`` is used, so the function
      no longer depends on a module-level ``train_sampler`` being defined.

  Returns:
    None. Results are reported by the individual experiment functions.
  """
  should_run_gs = False
  should_run_baseline = False
  should_run_change_optimization = False
  should_run_change_initialization = False
  should_run_regularization = False
  should_run_preprocessing = False
  should_run_width = True
  should_run_depth = False

  if should_run_gs:
    gs = GridSearch("grid_search_q3")
    best_config = gs.run(train_dataloader, test_dataloader, config, verbose=False)
    files.download('grid_search_q3')
    print("Best config is: ")
    print(best_config)
  else:
    # The best config is the result of previous run of grid search
    best_config = {
      "step_size": 0.01,
      "momentum": 0.8,
      "deviation": 0.1,
    }

  if should_run_baseline:
    print("Run baseline")
    cnn_run_baseline(best_config, train_dataloader, test_dataloader, verbose=False , num_of_epochs=60) #TODO: change to 100
    print()
  if should_run_change_optimization:
    print("Run change optimization")
    # This experiment uses a smaller step size. Pass a modified copy so that
    # best_config keeps whatever step size was selected above, instead of
    # being reset to a hard-coded value afterwards.
    # TODO: should gs the betas
    optimization_config = dict(best_config, step_size=0.001)
    cnn_change_optimization(optimization_config, train_dataloader, test_dataloader, verbose=False , betas=(0.7, 0.555))
    print()
  if should_run_change_initialization:
    print("Run change initialization")
    cnn_with_initialization(best_config, train_dataloader, test_dataloader, verbose=False)
    print()
  if should_run_regularization:
    print("Run regularization")
    cnn_with_regularization(best_config, train_dataloader, test_dataloader, num_of_epochs=100, verbose=False)
    print()
  if should_run_preprocessing:
    print("run preprocessing")
    # Resolve the sampler lazily: only this experiment needs it.
    if train_sampler is None:
      train_sampler = train_dataloader.sampler
    cnn_add_preprocessing(best_config, train_sampler, train_dataloader, test_dataloader, verbose=False)
    print()
  if should_run_width:
    print("Run network width")
    cnn_change_width(best_config, train_dataloader, test_dataloader, num_of_epochs=DEFAULT_NUM_OF_EPOCHS, verbose=False)
    print()
  if should_run_depth:
    print("Run network depth")
    cnn_try_with_different_depth(best_config, train_dataloader, test_dataloader, num_of_epochs=DEFAULT_NUM_OF_EPOCHS, verbose=False)
    print()


In [None]:


# Toggle for the question-1 SVM baseline; while False the svm() call below is skipped.
should_run_svm = False

# Notebook driver: builds the samplers and loaders, then runs Q2 and Q3.
if __name__ == "__main__":
  # Question 1
  # Each sampler draws TRAIN_SIZE / TEST_SIZE indices with replacement from the
  # CIFAR-10 train / test datasets. No dedicated generator is supplied.
  # NOTE(review): RandomSampler presumably redraws its indices on every pass, so
  # each epoch may see a different subset, and runs are not seeded here - confirm
  # that this is intended.
  # NOTE(review): these names live at module level; functions defined in earlier
  # cells may refer to train_sampler by name, so do not rename it without checking.
  train_sampler = torch.utils.data.RandomSampler(training_data, replacement=True, num_samples=TRAIN_SIZE, generator=None)
  test_sampler = torch.utils.data.RandomSampler(test_data, replacement=True, num_samples=TEST_SIZE, generator=None)

  if should_run_svm:
    svm(train_sampler, test_sampler)

  # Loaders for NN: mini-batches of BATCH_SIZE drawn through the samplers above.
  num_workers = 1
  train_dataloader = DataLoader(training_data, batch_size=BATCH_SIZE, sampler=train_sampler , num_workers=num_workers)
  test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, sampler=test_sampler , num_workers=num_workers)

  # Candidate values per hyper-parameter, handed to Q2 and Q3. Q3 forwards them
  # to its grid search only when its should_run_gs flag is enabled.
  # TODO: should choose parameters here
  config = {
    "step_size": [5e-4, 1e-3, 1e-2, 1e-1],
    "momentum": [0.2, 0.5, 0.8, 0.9],
    "deviation": [1e-2,1e-1, 1, 2, 2.5]
  }

  # Run the experiments of question 2, then those of question 3, on the same loaders.
  Q2(config, train_dataloader, test_dataloader)
  Q3(config, train_dataloader, test_dataloader)

  # Marker that the whole cell finished.
  print("end")

Run network width
Changing width results, with width (256, 64) :
Train Error: 
 Accuracy: 9.5%, Avg loss: 2.303068 

Test Error: 
 Accuracy: 10.7%, Avg loss: 2.302239 

