In [1]:
import torch as th
from load_mnist import load_mnist
import matplotlib.pyplot as plt
from functools import reduce
import numpy as np

##b) Vanishing Gradients and Initialization (9 points)

In [2]:

class Dataset():
    def __init__(self, dataset, subset=None, sorted=False, path="data/FashionMNIST/raw"):
        # Load data
        images, labels = load_mnist(path=path, dataset=dataset)
        self.x = images
        self.y = labels.long()  

        # Include only specified labels
        if subset is not None:
            indices = th.nonzero(reduce(lambda x, y: x | y, map(lambda x: labels == x, subset)))[:,0]
            self.x = self.x[indices]
            self.y = self.y[indices]

        # Sort labels
        if sorted is True:
            indices = th.argsort(self.y)
            self.x = self.x[indices]
            self.y = self.y[indices]

    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self, index):
        # Preprocess the image
        image = self.x[index]
        image = 2.0*(image/255.0)-1.0
        #image = (image-self.mean/self.std)
        return image, self.y[index]



class FashionMNISTClassifier(th.nn.Module):

    def __init__(self, num_neurons=[50, 20], activation=th.nn.ReLU):
        super().__init__()
        num_neurons = [784] + num_neurons

        self.layers = th.nn.ModuleList()
        self.layers.append(th.nn.Flatten())
        for i, in_neurons in enumerate(num_neurons[:-1]):
            out_neurons = num_neurons[i+1]
            self.layers.append(th.nn.Linear(in_neurons, out_neurons, True))
            self.layers.append(activation())
        self.layers.append(th.nn.Linear(num_neurons[-1], 10, True))

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

class Swish(th.nn.Module):

    def __init__(self, beta=1.0) -> None:
        super().__init__()
        self.beta  = th.nn.Parameter(th.tensor(beta))

    def forward(self, x):
        return x * th.sigmoid(self.beta * x)

def train(model, criterion, dataloader, optimizer):
    avgLoss = 0
    norms=[]
    for index, (images, labels) in enumerate(dataloader):
        # print(index)
        optimizer.zero_grad()
        prediction = model(images)
        loss = criterion(prediction, labels)
        loss.backward()
        optimizer.step()

        with th.no_grad():
            avgLoss += loss.numpy()/len(dataloader)
            
    for name, param in model.named_parameters():
        if param.requires_grad:
            norms.append(param.data.norm(2).item())
            
         
    return avgLoss,norms

def accuracy(predicted, true):
    return th.mean((predicted == true).type(th.float32))

@th.no_grad()
def evaluate(model, images, labels):
    model.eval()
    predictions = model(images)
    _, predictions = th.max(predictions, dim=1)
    return accuracy(predictions, labels)
    

def run_configuration(activation=th.nn.Sigmoid):
    configurations=[[50],[50,30],[50,30,30],[50,30,30,30]]
    # configurations=[[50,30]]
    for configuration in configurations:
        criterion = th.nn.CrossEntropyLoss()
        testset = Dataset('testing')
        
        model = FashionMNISTClassifier(num_neurons=configuration,activation=activation)
        optimizer = th.optim.SGD(model.parameters(), lr=0.005)

        losses_a = [float('nan')]*5
        accuarcies_a = [float('nan')]*5
        norms_a=[]
        

        dataset = Dataset('training')
        dataloader = th.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)

        for epoch in range(5):
                loss,norms = train(model, criterion, dataloader, optimizer)
                accuarcy = evaluate(model, testset.x, testset.y)
                losses_a[epoch] = loss
                accuarcies_a[epoch] = accuarcy
                norms_a.append(norms)
                print('accuracy :{}'.format(accuarcy.data))
                print('norms :{}'.format(norms))




    

### b 1
Take your FashionMNIST classifier from the last exercise sheet and train
it for 20 epochs with the following layer configurations: (50,), (50, 30), (50, 30,
30), (50, 30, 30, 30). Your net should have torch.nn.Sigmoid as the activation
function between all hidden layers. Use the torch.optim.SGD optimizer with a learning
rate of η = 0:005. Record the norms of the gradients (per layer) of the last batch each
iteration as well as the test accuracy

In [3]:
run_configuration(activation=th.nn.Sigmoid)

accuracy :0.328900009393692
norms :[4.360305309295654, 0.12342795729637146, 2.4818596839904785, 0.19532305002212524]
accuracy :0.47380000352859497
norms :[4.677216529846191, 0.1265179067850113, 3.367706298828125, 0.19616451859474182]
accuracy :0.5703999996185303
norms :[4.944694995880127, 0.12945617735385895, 4.050527572631836, 0.1970328688621521]
accuracy :0.6114000082015991
norms :[5.168910026550293, 0.13169535994529724, 4.581873893737793, 0.19733576476573944]
accuracy :0.6284999847412109
norms :[5.3591437339782715, 0.13351227343082428, 5.0078864097595215, 0.2006649523973465]
accuracy :0.25519999861717224
norms :[4.1063385009765625, 0.1466086506843567, 3.159876823425293, 0.45604872703552246, 1.8135693073272705, 0.40482398867607117]
accuracy :0.31139999628067017
norms :[4.138560771942139, 0.14627180993556976, 3.2075109481811523, 0.45394542813301086, 1.8992550373077393, 0.4069543778896332]
accuracy :0.3066999912261963
norms :[4.2032575607299805, 0.1457388550043106, 3.3157589435577393, 

### b 3
Rerun the above experiment, this time using torch.nn.ReLU as the activation
function. Also plot the results like above. What do you observe?

In [4]:
run_configuration(activation=th.nn.ReLU)

accuracy :0.6085000038146973
norms :[4.467549800872803, 0.15378272533416748, 2.586721181869507, 0.26340410113334656]
accuracy :0.6384000182151794
norms :[4.654330253601074, 0.1561964452266693, 2.8958637714385986, 0.26427319645881653]
accuracy :0.6601999998092651
norms :[4.788599491119385, 0.1575445532798767, 3.104959011077881, 0.2659487724304199]
accuracy :0.6690999865531921
norms :[4.896921157836914, 0.15873202681541443, 3.266948699951172, 0.26688724756240845]
accuracy :0.6833000183105469
norms :[4.987635612487793, 0.15988647937774658, 3.3984928131103516, 0.2691066265106201]
accuracy :0.5361999869346619
norms :[4.419488906860352, 0.16360044479370117, 3.5687403678894043, 0.42819544672966003, 2.490389347076416, 0.44874975085258484]
accuracy :0.5953999757766724
norms :[4.6322407722473145, 0.16714532673358917, 3.828632354736328, 0.4302278161048889, 2.8529419898986816, 0.46018487215042114]
accuracy :0.6150000095367432
norms :[4.746358394622803, 0.16782069206237793, 3.9650533199310303, 0.43