#Assignment 4 - Reproducability of AlexNet

We perform a reproduction study for the ImageNet Classification with Deep Convolutional Neural Network paper, more commonly known as AlexNet.  

In [None]:
#dependencies
import torch
import torch.nn as nn
from torch.utils.model_zoo import load_url as load_state_dict_from_url
from typing import Any
from PIL import Image
import torchvision
from torchvision import datasets, models, transforms
from cv2 import imread
from __future__ import print_function
from __future__ import division
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import copy


from google.colab import drive 
drive.mount('/content/drive/')

path_jacob = '/content/drive/My Drive/Assignments/Assignment4/'
path_theo = '/content/drive/My Drive/COMP 551/Assignments/Assignment4/'
path = path_theo

To simplify the removal of convolutional layers and other tests we performed, the PyTorch implementation is copied directly from the open source code: https://github.com/pytorch/vision/blob/master/torchvision/models/alexnet.py



In [3]:
__all__ = ['AlexNet', 'alexnet']

model_urls = {
    'alexnet': 'https://download.pytorch.org/models/alexnet-owt-7be5be79.pth',
}

num_classes = 1000
#This is the base architecture of the model
base_features =   [nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2)]
base_pool = nn.AdaptiveAvgPool2d((6, 6))
base_classifier = [nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes)]

class AlexNet(nn.Module):
    def __init__(self, features, pool, classifier, num_classes: int = 1000,): 
        super(AlexNet, self).__init__()
        self.features = nn.Sequential(*features)
        self.avgpool = pool
        self.classifier = nn.Sequential(*classifier)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

def alexnet(pretrained: bool = True, load_ablation = None, progress: bool = True,
            features = base_features, pool=base_pool, classifier = base_classifier, **kwargs: Any) -> AlexNet:
    r"""AlexNet model architecture from the
    `"One weird trick..." <https://arxiv.org/abs/1404.5997>`_ paper.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    model = AlexNet(features, pool, classifier, **kwargs)
    if pretrained and load_ablation is None:
        state_dict = load_state_dict_from_url(model_urls['alexnet'],
                                              progress=progress)
        model.load_state_dict(state_dict)
        print('Loaded Base Model...')
    elif pretrained and load_ablation != None:
        model.load_state_dict(load_ablation)
    return model


## Loading the Dataset

In [4]:
def preprocess(input_image):
  #reshaping the images for compatibility with PyTorch AlexNet
  #normalizes the images like PyTorch AlexNet implementation 
  preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
  input_tensor = preprocess(input_image)
  
  input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model
  return input_batch



Let's read in some images from the ILSVRC2012 ImageNet Dataset on which to test the pretrained model. Note that due to the impractically large size of the training set (~138 GB), and the lack of labels in the test set, all training and testing was done using the ILSVRC2012 ImageNet validation set. 

In [None]:
def load_images(img_count = 100, path = path):
  read_file=open(path+"ILSVRC2012_validation_ground_truth.txt",'r')
  ground_truth=[int(s.strip()) for s in read_file.readlines()]
  image_list=[]
  for i in range (1,img_count):
    var= str(i).zfill(8)
    string_to_read= path+'ILSVRC2012_img_val/ILSVRC2012_val_'+var+'.JPEG'
    image=Image.open(string_to_read)
    image_list.append(image.convert('RGB'))
    if i%1000 == 0:
      print(i)
  return image_list, ground_truth

images, ground_truth = load_images(img_count = 5000) 

Note that the model predicts a category for an image. The output of this prediction is a string naming that category. For the purposes of evaluating the performance of the model, the model must output a numerical label. These values are obtained below:  

In [None]:
# Download ImageNet labels
def load_labels(ground_truth = ground_truth):
  !wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt
  # Read the categories
  with open("imagenet_classes.txt", "r") as f:
      categories = [s.strip() for s in f.readlines()]
  label_names=open(path+"map_clsloc.txt",'r')
  lines= label_names.readlines()
  split_lines=list(map(lambda x: x.split(),lines))
  #learn the names of the ground truths 
  conversion_dict_truth_to_name={}
  for split_line in split_lines:
    list_of_words= split_line[2].split("_")
    lower_case= list(map(lambda x: x.lower(),list_of_words))
    conversion_dict_truth_to_name[int(split_line[1])]=" ".join(lower_case)
  #learn how to turn the names into the categories 
  name_to_int={}
  for i in range(len(categories)):
    name= categories[i]
    name_to_int[name.lower()]=i #do i need to adjust from starting at zero or one???????
  ####FINALLY CONVERT####
  new_ground_truth=[]
  for truth in ground_truth:
    new_ground_truth.append(name_to_int[conversion_dict_truth_to_name[truth]])
  return new_ground_truth
  
labels = load_labels()

## 1 Reproducing Results

In [7]:
def calculate_accuracy_top_one(top1_categories, new_ground_truth = labels):
  #calculate the accuracy using the top 1 prediction
  sum=0
  for i in range(len(top1_categories)):
    prob, cat= top1_categories[i]
    #print(categories[ground_truth[i]])
    if cat==new_ground_truth[i]:
      sum=sum+1
  return sum/len(top1_categories)


def calculate_accuracy_top_five(top5_categories,new_ground_truth = labels):
  #calculate the accuracy using the top 5 prediction (ie: if the true label is 
  #contained in top 5 out of 1000 predicted categories)
  sum=0
  for i in range(len(top5_categories)):
    prob, cat= top5_categories[i]
    #print(categories[ground_truth[i]])
    if new_ground_truth[i] in cat:
      sum=sum+1
  return sum/len(top5_categories)

def run_model(model, input_batch):
  #forward propagates for testing 
  if torch.cuda.is_available():
    input_batch = input_batch.to('cuda')
    model.to('cuda')
  with torch.no_grad():
    output = model(input_batch)
  return output

def find_probabilities(output_scores): 
  #convert model output to probabilities using Softmax
  probabilities = torch.nn.functional.softmax(output_scores[0], dim=0)
  return probabilities

def find_top_one(probabilities):
  #return label and probability of top 1 prediction
  top1_prob, top1_catid = torch.topk(probabilities,1)
  return top1_prob[0], top1_catid[0]

def find_top_five(probabilities):
  #return label and probability of top 5 prediction
  top5_prob, top5_catid = torch.topk(probabilities,5)
  return top5_prob, top5_catid

def simple_test(model, data = images):
  #performs a simple test of a model using a list of images. 
  model.eval()
  xtest=list(map(lambda x: preprocess(x),data))
  output=list(map(lambda x: run_model(model, x), xtest))
  probabilities= list(map(lambda x: find_probabilities(x),output))
  top1_catids=list(map(lambda x: find_top_one(x),probabilities))# find_top_one(probabilities)
  top5_catids=list(map(lambda x: find_top_five(x),probabilities))
  print("Top 5 accuracy")
  print(calculate_accuracy_top_five(top5_catids))
  print("Top 1 accuracy")
  print(calculate_accuracy_top_one(top1_catids))


In [33]:
model = alexnet(pretrained = True)
simple_test(model)


Loaded Base Model...
Top 5 accuracy
0.7899579915983197
Top 1 accuracy
0.5717143428685737


In [11]:
weights = model.state_dict()

##Ablation Studies
We performed two successful ablation studies by removing the last two convolutional layers and their respective ReLU activations. We also tried to remove middle layers, but were unable to sufficiently retrain the models to produce a tolerable accuracy. Finally, we perfomed a series of other tests. The initialization of all of these models are listed at the bottom of the Notebook. 

In [12]:
def view_layers(model):
  #display the model's layers
  for name, param in model.named_parameters():
      print('name: ', name)
      print(type(param))
      print('param.shape: ', param.shape)
      print('param.requires_grad: ', param.requires_grad)
      print('=====')

###Freeze model weights
We attempted to remove middle layers by replacing them with various Identity Conv2d layers, which would simply output that layer's inputs. In order to finetune the model, that identity layer's weights would have to be frozen, so that it remains an identity, and does not become updated by stochastic gradient descent. 

In [48]:
def freeze_layers(model, unfrozen_param = output):
  for name, param in model.named_parameters():
    if name in unfrozen_param:
        param.requires_grad = True
    else:
        param.requires_grad = False

## Let's try to remove a layer: 

In [None]:
from collections import OrderedDict

def remove_layer(weights, *layers):
  #Removes layer weights from the ordered dictionary of network weights
  #by removing the corresponding key value pair
  weights = weights.copy()
  print('Removing Layer:')
  for key in layers:
    print(f'{key}: {weights[key].shape}')
    del weights[key]
  print('\n')
  return weights

def show_new_dims(weights):
  #list the model's architecture
  print('\nRetained Weights:')
  for key in weights.keys():
    print(f'{key}: {weights[key].shape}')

def rename_key(dictionary, old_key,new_key):
  #renames a key in the ordered dictionary of weights
  #necessary when completely removing a middle layer 
  #from Alexnet
  dictionary[new_key] = dictionary[old_key]
  del dictionary[old_key]
  return dictionary

def reorder(dictionary, new_order):
  #after renaming the keys, reorders the dictionary
  d = dict(dictionary)
  ordered = OrderedDict()
  for k in new_order:
    ordered[k] = d[k]
  return ordered

def make_layer_dumb(model,layer, shape = (256,384,3,3)):
  #makes a layer dumb by creating an Identity layer. 
  #replaces  layer's weights with a tensor composed of (kernel size x kernel size)
  #matrices of 0s with a 1 at their center
  weights = model.state_dict().copy()
  new_tensor= np.zeros(shape)
  for x in range(shape[0]):
    for y in range(shape[1]):
      new_tensor[x][y][1][1]=1
  weights[layer] = torch.tensor(new_tensor)
  return weights

new_model = alexnet(load_ablation=weights)
new_weights = remove_layer(new_model.state_dict(), 'features.10.weight', 'features.10.bias')
show_new_dims(new_weights)

# new_weights = rename_key(new_weights, 'features.10.weight','features.8.weight')
# new_weights = rename_key(new_weights, 'features.10.bias','features.8.bias')
# # new_weights = reorder(new_weights, new_order)
# show_new_dims(new_weights)

# # new_weights = make_layer_dumb(new_model,'classifier.6.weight','classifier.6.bias')
# show_new_dims(new_weights)

# layer = ['features.8.weight', 'features.8.bias']
# new_model = alexnet(load_ablation=weights)
# new_weights = make_layer_dumb2(new_model, 12, layer[0])
# freeze_layers(new_model, layer)
# show_new_dims(new_weights)

##Initialize our New Model
New model architecture found below

In [None]:
num_classes = 1000
features_ =  [nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2)]
            
pooling_ = nn.AdaptiveAvgPool2d((6, 6))

classifier_ = [nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace = True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace = True),
            nn.Linear(4096, num_classes)]


changed_weights = True
if changed_weights == True:
  #set weights to new weights obtained above (for removing layer) 
  weights = new_weights 

model_abl = alexnet(pretrained = True, load_ablation = weights, features = features_, pool = pooling_, classifier = classifier_ )
simple_test(model_abl)


##Fine-Tuning New Model

Train the model using the new architecture. Optimization is done using SGD against a cross entropy loss criteria. Code was obtained from tutorial below, and was edited to suit the purposes of this study. 

https://pytorch.org/tutorials/beginner/finetuning_torchvision_models_tutorial.html

In [25]:
def train_model(model, dataloaders, criterion, optimizer, num_epochs=5):
    since = time.time()
    val_acc_history = []
    #copy the weights
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    
    #iterate over epochs
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode
            running_loss = 0.0
            running_corrects = 0
            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Get model outputs and calculate loss
                    # Special case for inception because in training it has an auxiliary output. In train
                    #   mode we calculate the loss by summing the final output and the auxiliary output
                    #   but in testing we only consider the final output.
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs, 1)
                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'val':
                val_acc_history.append(epoch_acc)
        print()
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, val_acc_history



Load New Data for Training

In [26]:
input_size=224

data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(input_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(input_size),
        transforms.CenterCrop(input_size),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])}


In [27]:
#Default Hyperparameters
batch_size = 128
lr_ = 0.05
momentum_ = 0.9



In [28]:
data_dir=path
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms[x]) for x in ['train', 'val']}
print(image_datasets)
# Create training and validation dataloaders
dataloaders_dict = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size, shuffle=True, num_workers=2) for x in ['train', 'val']}

{'train': Dataset ImageFolder
    Number of datapoints: 35003
    Root location: /content/drive/My Drive/COMP 551/Assignments/Assignment4/train
    StandardTransform
Transform: Compose(
               RandomResizedCrop(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bilinear)
               RandomHorizontalFlip(p=0.5)
               ToTensor()
               Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
           ), 'val': Dataset ImageFolder
    Number of datapoints: 5000
    Root location: /content/drive/My Drive/COMP 551/Assignments/Assignment4/val
    StandardTransform
Transform: Compose(
               Resize(size=224, interpolation=bilinear)
               CenterCrop(size=(224, 224))
               ToTensor()
               Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
           )}


Train and Test a Model

In [None]:
device = torch.device("cuda:0")
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Setup the loss fxn
criterion = nn.CrossEntropyLoss()
# Train and evaluate

#HYPER PARAMETERS
lr_ = 0.05
momentum_ = 0.9

model_ft = model_abl
model_ft = model_ft.to(device)

params_to_update = model_ft.parameters()
optimizer_ft = optim.SGD(params_to_update, lr=lr_, momentum=momentum_)

model_ft, hist = train_model(model_ft, dataloaders_dict, criterion, optimizer_ft)
simple_test(model_ft)


##Hyperparameter Grid Search

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Setup the loss fxn
criterion = nn.CrossEntropyLoss()
# Train and evaluate

#default from paper
batch_size = 128
lr_ = 0.05
momentum_ = 0.9

momentum = [0.09, 0.9, 9]
lr = [0.0005, 0.05, 0.5]
batch_sizes = [8, 128, 500]

#GRID_SEARCH OVER HYPER PARAMETERS
hyperparameters  = momentum 
results = []

for h in hyperparameters:
  dataloaders_dict = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size, shuffle=True, num_workers=2) for x in ['train', 'val']}
  model_ft = alexnet(pretrained = True)
  model_ft = model_ft.to(device)

  params_to_update = model_ft.parameters()
  optimizer_ft = optim.SGD(params_to_update, lr=lr_, momentum=h)

  model_ft, hist = train_model(model_ft, dataloaders_dict, criterion, optimizer_ft)
  results.append(hist)


##The architectures we used

In [None]:
#ablation last layer
features_ = [nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2)]
            
pooling_ = nn.AdaptiveAvgPool2d((6, 6))

classifier_ = [nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes)]

[]

In [None]:
#ablation of last two convolutional layers 
features_ = [nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2)]
            
pooling_ = nn.AdaptiveAvgPool2d((4, 6))

classifier_ = [nn.Dropout(),
            nn.Linear(384 * 4 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes)]

In [None]:
#replace max pool with average pool
num_classes = 1000
features_ = [nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.AvgPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.AvgPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.AvgPool2d(kernel_size=3, stride=2),
            
pooling_ = nn.AdaptiveAvgPool2d((4, 6))

classifier_ = [nn.Dropout(),
            nn.Linear(384 * 4 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes)]

In [None]:
#non-overlapping max pool
num_classes = 1000
features_ = [nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(192, 384, kernel_size=2, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=2, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=2, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            
pooling_ = nn.AdaptiveAvgPool2d((4, 6))

classifier_ = [nn.Dropout(),
            nn.Linear(384 * 4 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes)]

In [None]:
#sigmoid activation
features_ = [nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.Simoid(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.Simoid(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.Simoid(),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.Simoid(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.Sigmoid(),
            nn.MaxPool2d(kernel_size=3, stride=2)]
            
pooling_ = nn.AdaptiveAvgPool2d((6, 6))

classifier_ = [nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.Simoid(),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.Simoid(),
            nn.Linear(4096, num_classes)]

In [None]:
#tanh activation
features_ = [nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.Tanh(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.Tanh(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.Tanh(),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.Tanh(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.Tanh(), 
            nn.MaxPool2d(kernel_size=3, stride=2)]
            
pooling_ = nn.AdaptiveAvgPool2d((6, 6))

classifier_ = [nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.Tanh(),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.Tanh(),
            nn.Linear(4096, num_classes)]

In [None]:
 #remove dropout
 features_ = [nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2)]

pooling_ = nn.AdaptiveAvgPool2d((4, 6))

classifier_ = [
            nn.Linear(256 * 4 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes)]