In [1]:
import vugrad as vg
import numpy as np

### Question 5

In [2]:
# Two leaf nodes holding random 2x2 arrays (a is drawn first, then b), and the node for their sum.
a, b = (vg.TensorNode(np.random.randn(2, 2)) for _ in range(2))
c = a + b

In [4]:
c.value # the array stored in node c, i.e. the result computed for a + b

array([[-0.62734419,  0.96965596],
       [-0.20857508, -0.71375401]])

In [9]:
c.source # the operation node that c came from (its repr is a vugrad.core.OpNode with a memory address)

<vugrad.core.OpNode at 0x1f9cf3bbfd0>

In [11]:
c.source.inputs[0].value # value of the first input of that operation, i.e. the value of a

array([[ 0.89399539, -0.37864434],
       [-0.87517414,  0.53242794]])

In [12]:
a.grad # gradient stored on a; all zeros here, presumably because no backward pass has been run yet

array([[0., 0.],
       [0., 0.]])

### Question 9

In [2]:
import plotly.express as px
import pandas as pd

In [2]:
# !python experiments/train_mlp.py -D mnist -l 0.0001

In [3]:
# (xtrain, ytrain), (xval, yval), num_classes = vg.load_mnist(final=False, flatten=True)
# (xtrain, ytrain), (xval, yval), num_classes = vg.load_synth()
# num_instances, num_features = xtrain.shape

In [4]:
# Create a simple neural network.
# This is a `Module` consisting of other modules representing linear layers, provided by the vugrad library.
class MLP(vg.Module):
    """
    A small MLP: linear layer -> hidden non-linearity -> linear layer -> log-softmax.

    The hidden non-linearity is ReLU when `activation == 'relu'` and sigmoid for any other value.
    """

    def __init__(self, input_size, output_size, hidden_mult=4, activation='sigmoid'):
        """
        :param input_size: Number of input features of the first linear layer.
        :param output_size: Number of outputs of the second linear layer.
        :param hidden_mult: The hidden layer has `hidden_mult * input_size` units.
        :param activation: 'relu' selects `vg.relu`; every other value selects `vg.sigmoid`.
        """
        super().__init__()

        # -- The hidden layer is a multiple of the input width; there is no common wisdom on the right size
        #    beyond making it bigger than the input where possible.
        width = hidden_mult * input_size

        # -- The linear layers (without activation) come from vugrad; we only instantiate them here.
        self.layer1 = vg.Linear(input_size, width)
        self.layer2 = vg.Linear(width, output_size)

        # -- Anything other than 'relu' falls back to the sigmoid.
        self.activation = vg.relu if activation == 'relu' else vg.sigmoid

    def forward(self, input):
        """
        Run the network on `input` and return the log-softmax of the second layer's output.

        `input.size()` must have length 2, otherwise the assertion fails.
        """
        assert len(input.size()) == 2

        # first layer, followed by the configured non-linearity
        # -- `self.activation` is a utility function, mimicking how this is usually done in pytorch.
        hidden = self.activation(self.layer1(input))

        # second layer, followed by the log-softmax
        # -- logsoftmax gives the _logarithm_ of the softmax probabilities, which keeps the CE loss stable
        #    when probabilities get close to 0.
        return vg.logsoftmax(self.layer2(hidden))

    def parameters(self):
        """Return the parameters of the first layer followed by those of the second layer."""
        first, second = self.layer1.parameters(), self.layer2.parameters()
        return first + second

In [5]:
def train_MLP(args, data, activation):
    """
    Train an `MLP` with mini-batch gradient descent and record its learning curves.

    :param args: Dict of hyperparameters; the keys read are 'batch_size', 'epochs' and 'lr'.
    :param data: Tuple `((xtrain, ytrain), (xval, yval), num_classes)`.
    :param activation: Passed to `MLP` to choose the hidden non-linearity; also printed as the prefix of
        each epoch's log line.
    :return: Tuple `(losses, accuracies, epochs, batch_losses)` where
        `losses` holds, per epoch, the summed batch losses divided by the number of training rows,
        `accuracies` holds the validation accuracy measured at the start of each epoch,
        `epochs` holds the epoch indices, and
        `batch_losses` holds the loss value of every batch in training order.
    """
    (xtrain, ytrain), (xval, yval), num_classes = data
    n, num_features = xtrain.shape

    # Curves collected during training
    losses, accuracies, epochs, batch_losses = [], [], [], []

    ## Instantiate the model
    mlp = MLP(input_size=num_features, output_size=num_classes, activation=activation)

    batch_size = args['batch_size']

    print('\n## Starting training')
    for epoch in range(args['epochs']):

        print(f'{activation} epoch {epoch:03}')

        ## Validation accuracy, measured before this epoch's updates
        val_out = mlp(vg.TensorNode(xval))
        predicted = np.argmax(val_out.value, axis=1)
        acc = (predicted == yval).sum() / yval.shape[0]

        val_out.clear() # gc the computation graph
        print(f'       accuracy: {acc:.4}')

        accuracies.append(acc)
        epochs.append(epoch)

        epoch_loss = 0.0 # running sum of the training loss

        # Walk over the training data in consecutive slices of at most `batch_size` rows
        for start in range(0, n, batch_size):
            stop = min(start + batch_size, n)

            # Wrap the input slice in a node; the targets stay a raw slice
            inputs = vg.TensorNode(value=xtrain[start:stop, :])
            targets = ytrain[start:stop]

            outputs = mlp(inputs)
            loss = vg.logceloss(outputs, targets)
            # -- `loss` is the TensorNode at the top of the computation graph (the MLP plus the loss), so it is
            #    the handle for backpropagation, zeroing the gradients and clearing the graph.
            # -- The MLP produces log probabilities, hence the CE loss variant that takes log probabilities.

            # -- Accumulate the _raw_ value, not the TensorNode itself.
            epoch_loss += loss.value
            batch_losses.append(loss.value)

            # Start the backpropagation
            loss.backward()

            # Apply gradient descent
            # -- This manipulates the members of each parameter node directly, so no computation graph is
            #    built for the update. In Pytorch this step is abstracted away into an Optimizer.
            for parm in mlp.parameters():
                parm.value -= args['lr'] * parm.grad

            # Reset the gradients to zero ...
            loss.zero_grad()
            # ... and delete the parts of the computation graph we don't need to remember.
            loss.clear()

        mean_loss = epoch_loss / n
        losses.append(mean_loss)
        print(f'   running loss: {mean_loss:.4}')

    return losses, accuracies, epochs, batch_losses

In [13]:
# Hyperparameters shared by both training runs.
# -- 'lr' is set to 0.0001 for vg.load_mnist(final=False, flatten=True); 0.01 is the alternative noted for
#    vg.load_synth().
args = dict(batch_size=128, lr=0.0001, epochs=10)

# Train once per activation; each run loads its own copy of the MNIST split.
losses_sigmoid, accuracies_sigmoid, epochs_sigmoid, batch_losses_sigmoid = train_MLP(
    args=args,
    data=vg.load_mnist(final=False, flatten=True),
    activation='sigmoid',
)
losses_relu, accuracies_relu, epochs_relu, batch_losses_relu = train_MLP(
    args=args,
    data=vg.load_mnist(final=False, flatten=True),
    activation='relu',
)


## Starting training
sigmoid epoch 000
       accuracy: 0.1022
   running loss: 0.3984
sigmoid epoch 001
       accuracy: 0.947
   running loss: 0.2123
sigmoid epoch 002
       accuracy: 0.957
   running loss: 0.1602
sigmoid epoch 003
       accuracy: 0.9612
   running loss: 0.1268
sigmoid epoch 004
       accuracy: 0.964
   running loss: 0.1025
sigmoid epoch 005
       accuracy: 0.9664
   running loss: 0.08388
sigmoid epoch 006
       accuracy: 0.9678
   running loss: 0.0697
sigmoid epoch 007
       accuracy: 0.9676
   running loss: 0.05876
sigmoid epoch 008
       accuracy: 0.9688
   running loss: 0.05015
sigmoid epoch 009
       accuracy: 0.9694
   running loss: 0.04342

## Starting training
relu epoch 000
       accuracy: 0.042
   running loss: 1.864e+03
relu epoch 001
       accuracy: 0.794
   running loss: 0.6548
relu epoch 002
       accuracy: 0.909
   running loss: 0.4874
relu epoch 003
       accuracy: 0.916
   running loss: 0.4606
relu epoch 004
       accuracy: 0.9434
   ru

In [7]:
def normalize_data(dataset):
    """
    Min-max scale `dataset` so that its minimum maps to 0 and its maximum maps to 1.

    :param dataset: Array-like of numbers (list, tuple or numpy array).
    :return: Numpy array with the same shape as the input. An empty input gives an empty float array, and an
        input whose values are all equal gives all zeros (instead of the NaNs a 0/0 division would produce).
    """
    values = np.asarray(dataset)

    # np.min / np.max raise on empty input, so there is nothing to scale.
    if values.size == 0:
        return values.astype(float)

    low = np.min(values)
    span = np.max(values) - low

    # All values equal: the range is zero, so map everything to the lower bound of [0, 1].
    if span == 0:
        return np.zeros(values.shape, dtype=float)

    return (values - low) / span

In [14]:
def _curve_frame(relu_values, sigmoid_values, batch_values, value_column, total_epochs):
    """Build the long-format frame for one figure: one row per plotted point, tagged with its curve name."""
    relu_values = list(relu_values)
    sigmoid_values = list(sigmoid_values)
    batch_values = [] if batch_values is None else list(batch_values)

    # Without an explicit span, stretch the batch curve over the longest per-epoch curve.
    if total_epochs is None:
        total_epochs = max(len(relu_values), len(sigmoid_values))

    # Per-epoch curves sit at x = 1..len; each curve uses its own length.
    x_values = list(range(1, len(relu_values) + 1)) + list(range(1, len(sigmoid_values) + 1))
    # Per-batch values are spread evenly over [0, total_epochs] so they share the epoch axis.
    x_values += np.linspace(0, total_epochs, num=len(batch_values)).tolist()

    names = (['relu'] * len(relu_values)
             + ['sigmoid'] * len(sigmoid_values)
             + ['batch'] * len(batch_values))

    return pd.DataFrame({
        'epochs': x_values,
        value_column: relu_values + sigmoid_values + batch_values,
        'name': names,
    })


def plot_diagrams(epochs_loss_relu, epochs_acc_relu,
                  epochs_loss_sigmoid, epochs_acc_sigmoid,
                  epochs_batch_loss=None, epochs_batch_acc=None, total_epochs=None):
    """
    Show two line charts, loss and accuracy, comparing the relu and sigmoid runs.

    Each figure is built from its own series, so batch losses can be given without batch accuracies (and the
    other way around), and the relu and sigmoid curves may differ in length.

    :param epochs_loss_relu: Per-epoch losses of the relu run.
    :param epochs_acc_relu: Per-epoch accuracies of the relu run.
    :param epochs_loss_sigmoid: Per-epoch losses of the sigmoid run.
    :param epochs_acc_sigmoid: Per-epoch accuracies of the sigmoid run.
    :param epochs_batch_loss: Optional per-batch losses, drawn as a third curve named 'batch' in the loss figure.
    :param epochs_batch_acc: Optional per-batch accuracies, drawn as a third curve named 'batch' in the
        accuracy figure.
    :param total_epochs: Width of the x-range the per-batch values are spread over. Defaults to the length of
        the longest per-epoch curve in the same figure.
    """
    figures = (
        ('loss', epochs_loss_relu, epochs_loss_sigmoid, epochs_batch_loss),
        ('Accuracy', epochs_acc_relu, epochs_acc_sigmoid, epochs_batch_acc),
    )
    for value_column, relu_values, sigmoid_values, batch_values in figures:
        df = _curve_frame(relu_values, sigmoid_values, batch_values, value_column, total_epochs)
        fig = px.line(df, x='epochs', y=value_column, color='name', markers=False, width=600, height=400)
        fig.show()

In [15]:
# Compare the two runs per epoch; no per-batch curves are drawn.
plot_diagrams(
    epochs_loss_relu=losses_relu,
    epochs_acc_relu=accuracies_relu,
    epochs_loss_sigmoid=losses_sigmoid,
    epochs_acc_sigmoid=accuracies_sigmoid,
    epochs_batch_loss=[],
    epochs_batch_acc=[],
    total_epochs=len(losses_relu),
)