# Computing gradient

In this notebook your task will be to implement forward and backward pass of Linear and backward pass of Conv1d modules.

Resources:

* Backprop with focus on PyTorch: https://www.youtube.com/watch?v=ma2KXWblllc (see also other lectures from this series)

* Lecture on backpropagation https://www.cs.ox.ac.uk/people/nando.defreitas/machinelearning/ (Lecture 8)

# Setup

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import tqdm
import json

import torch
import torch.nn.functional as F

from torch import optim
from torch import nn
from torch.autograd import Variable

from keras.datasets import fashion_mnist
from keras.utils import np_utils

%matplotlib inline
import matplotlib.pylab as plt
import matplotlib as mpl

from torch.autograd import gradcheck

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Linear

* Your task is to implement backward and forward pass of a Linear module. 

* **You cannot use for loops inside backward.**

* Hint: try to implement first using for loops, and then transform to matrices.

In [31]:
class Linear(torch.autograd.Function):

    def forward(self, input, weight, bias=None):
        self.save_for_backward(input, weight, bias)
        output = input.mm(weight.t())
        if bias is not None:
            output += bias.unsqueeze(0).expand_as(output)
        return output

    def backward(self, grad_output):
        input, weight, bias = self.saved_tensors
        grad_input = grad_weight = grad_bias = None

        print(input.shape, weight.shape, bias.shape, grad_output.shape)
        # gradoutput (n_examples, dim_out)
        
        # Hint: start with bias, use for loops if that's easier
        if self.needs_input_grad[0]:
            # Hint: grad_input should have same shape as input, (20,20)
            grad_input = grad_output.mm(weight) 
            print(grad_input.shape)
            #print(input + grad_input)
        if self.needs_input_grad[1]:
            # Hint: grad_weight should have same shape as weight (20, 15)
            grad_weight = torch.zeros((20,15))
        if bias is not None and self.needs_input_grad[2]: 
            # Hint: grad_bias should have same shape as bias (15,)
            grad_bias = torch.zeros(15)

        print(grad_input.shape, grad_weight.shape, grad_bias.shape)
    
        return grad_input, grad_weight, grad_bias

In [32]:
input = (Variable(torch.randn(30,20).double(), requires_grad=True),  # x
         Variable(torch.randn(15,20).double(), requires_grad=True),  # w
         Variable(torch.randn(15,).double(), requires_grad=True))    # b
test = gradcheck(Linear(), input, eps=1e-6, atol=1e-4)
print(test)

torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]

torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]

torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]

torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]

torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]

torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]

torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]) torch.Size([15])
torch.Size([30, 20]) torch.Size([15, 20]) torch.Size([15]) torch.Size([30, 15])
torch.Size([30, 20])
torch.Size([30, 20]) torch.Size([20, 15]

RuntimeError: for output no. 1,
 numerical:(
-0.0495 -0.9564 -0.7562  ...   0.0000  0.0000  0.0000
-0.3081 -1.1119  0.3934  ...   0.0000  0.0000  0.0000
 0.8872 -1.1274  1.0528  ...   0.0000  0.0000  0.0000
          ...             ⋱             ...          
 0.0000  0.0000  0.0000  ...   0.8873  0.4861 -0.9228
 0.0000  0.0000  0.0000  ...   2.0070 -0.4886  0.1345
 0.0000  0.0000  0.0000  ...   0.0754 -2.1655 -0.5208
[torch.FloatTensor of size 600x450]
, 
 0.5641  0.0000  0.0000  ...   0.0000  0.0000  0.0000
 0.2371  0.0000  0.0000  ...   0.0000  0.0000  0.0000
 1.0974  0.0000  0.0000  ...   0.0000  0.0000  0.0000
          ...             ⋱             ...          
 0.0000  0.0000  0.0000  ...   0.0000  0.0000  1.2727
 0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.5230
 0.0000  0.0000  0.0000  ...   0.0000  0.0000 -1.2859
[torch.FloatTensor of size 300x450]
, 
    1     0     0  ...      0     0     0
    0     1     0  ...      0     0     0
    0     0     1  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      1     0     0
    0     0     0  ...      0     1     0
    0     0     0  ...      0     0     1
[torch.FloatTensor of size 15x450]
)
analytical:(
-0.0495 -0.9564 -0.7562  ...   0.0000  0.0000  0.0000
-0.3081 -1.1119  0.3934  ...   0.0000  0.0000  0.0000
 0.8872 -1.1274  1.0528  ...   0.0000  0.0000  0.0000
          ...             ⋱             ...          
 0.0000  0.0000  0.0000  ...   0.8873  0.4861 -0.9228
 0.0000  0.0000  0.0000  ...   2.0070 -0.4886  0.1345
 0.0000  0.0000  0.0000  ...   0.0754 -2.1655 -0.5208
[torch.FloatTensor of size 600x450]
, 
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
[torch.FloatTensor of size 300x450]
, 
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
[torch.FloatTensor of size 15x450]
)


# Conv1d

Your task will be to implement part of the backward pass of 1d convolutional layer.

We will have separate lab on convolutions. A crash course on CNNs:

<img width=300 src=http://cs231n.github.io/assets/nn1/neural_net2.jpeg>

<img width=400 src=http://cs231n.github.io/assets/cnn/stride.jpeg>

## Example of forward

Note that test uses same input as

<img width=400 src=http://cs231n.github.io/assets/cnn/stride.jpeg>

In [97]:
## Example with a single input channel

# Input
ex1 = [[0, 1, 2, -1, 1, -3, 0]]
ex2 = [[0, 1, 2, -1, 1, 3, 0]]
input = torch.autograd.Variable(torch.Tensor([ex1, ex2]))

# Define conv1d
weight = torch.autograd.Variable(torch.Tensor([[[1, 0, -1]]]) )
bias = torch.autograd.Variable(torch.Tensor([0]))
conv1d = torch.nn.Conv1d(in_channels=1, out_channels=1, kernel_size=3)

# Compute output. Note that ex1 is same as in the figure
F.conv1d(input, weight=weight, bias=bias)

Variable containing:
(0 ,.,.) = 
 -2  2  1  2  1

(1 ,.,.) = 
 -2  2  1 -4  1
[torch.FloatTensor of size 2x1x5]

In [98]:
## Example with two input channels

ex1 = [[0, 1, 2, -1, 1, -3, 0], [0, 1, 3, -1, 2, -3, 1]]
ex2 = [[0, 1, 2, -1, 1, 3, 0], [0, 1, 3, -1, 2, 3, 1]]
input = torch.autograd.Variable(torch.Tensor([ex1, ex2]))

weight = torch.autograd.Variable(torch.Tensor([[[1, 0, -1], [1, 1, 1]]]) )
bias = torch.autograd.Variable(torch.Tensor([0]))

conv1d = torch.nn.Conv1d(in_channels=2, out_channels=1, kernel_size=3)

F.conv1d(input, weight=weight, bias=bias)

Variable containing:
(0 ,.,.) = 
  2  5  5  0  1

(1 ,.,.) = 
  2  5  5  0  7
[torch.FloatTensor of size 2x1x5]

## Implement backward 

* Implement only gradient with respect to bias

In [140]:
# Inherit from Function
class Conv1D(torch.autograd.Function):
    # bias is an optional argument
    def forward(self, input, weight, bias=None):
        input = input
        self.save_for_backward(input, weight, bias)
        output = F.conv1d(torch.autograd.Variable(input), 
                          weight=torch.autograd.Variable(weight), 
                          bias=torch.autograd.Variable(bias))
        return output.data

    # This function has only a single output, so it gets only one gradient
    def backward(self, grad_output):
        input, weight, bias = self.saved_tensors
        grad_input = grad_weight = grad_bias = None

        if self.needs_input_grad[0]:
            # You don't have to implement this
            grad_input = None
        if self.needs_input_grad[1]:
            # You don't have to implement this
            grad_weight = None
        if bias is not None and self.needs_input_grad[2]:
            # Hint: first write solution using for loop
            grad_bias = ??
        return grad_input, grad_weight, grad_bias

In [141]:
input = (Variable(torch.randn(2, 4, 5).double(), requires_grad=False), # 2 ex, 4 channels, 5 length
         Variable(torch.randn(3, 4, 2).double(), requires_grad=False), # 3 out channels, 4 in channels
         Variable(torch.randn(3).double(), requires_grad=True)) # 1 bias for each out channel
test = gradcheck(Conv1D(), input, eps=1e-6, atol=1e-4)
print(test)

True


# Tests

In [92]:
result = {}

In [142]:
## Test linear
input = (Variable(torch.randn(20,20).double(), requires_grad=True), 
         Variable(torch.randn(15,20).double(), requires_grad=True))
result['linear'] = 0.5*int(gradcheck(Linear(), input, eps=1e-6, atol=1e-4))

In [143]:
## Test conv1d
input = (Variable(torch.randn(2, 4, 5).double(), requires_grad=False), # 2 ex, 4 channels, 5 length
         Variable(torch.randn(3, 4, 2).double(), requires_grad=False), # 3 out channels, 4 in channels
         Variable(torch.randn(3).double(), requires_grad=True)) # 1 bias for each out channel
result['conv1d'] = 0.5*int(gradcheck(Conv1D(), input, eps=1e-6, atol=1e-4))

In [145]:
result

{'conv1d': 0.5, 'linear': 0.5}

In [144]:
json.dump(result, open("5_computing_gradient_backprop.json", "w"))