## PyTorch Tutorial

IFT6135 – Representation Learning

A Deep Learning Course, January 2018

By Chin-Wei Huang 

(Adapted from Sandeep Subramanian's 2017 MILA tutorial)

## Creating Your Own Modules

### `torch.nn.Module`

In [17]:
import numpy as np
from __future__ import print_function

In [33]:
import torch
import torch.nn as nn
from torch.nn.parameter import Parameter
from torch.autograd import Variable

import math
import numpy as np

``nn.Module`` is the base class for all neural network modules.

You should also write your own modules as subclasses of ``nn.Module``, so that they inherit the following properties:

* *Recursive structure*: you can wrap an instance of a Module class inside another one, which registers the inner one as its child (sub-module)

* *Cudafiability*: you can easily move the whole hierarchy of modules to the GPU using `model.cuda()`

* *Serializable*: you can save your trained model (checkpoint, early stopping ...) using ``torch.save``, ``torch.load``

etc. 


In [None]:
# http://pytorch.org/docs/master/nn.html#linear-layers
class Linear(nn.Module):
    r"""Applies a linear transformation to the incoming data: :math:`y = xA^T + b`

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        bias: If set to False, the layer will not learn an additive bias. Default: True

    Shape:
        - Input: :math:`(N, in\_features)`
        - Output: :math:`(N, out\_features)`

    Attributes:
        weight: the learnable weights of the module of shape (out_features x in_features)
        bias:   the learnable bias of the module of shape (out_features),
                or None when the layer was built with ``bias=False``

    Examples::

        >>> m = Linear(20, 30)
        >>> input = Variable(torch.randn(128, 20))
        >>> output = m(input)
        >>> print(output.size())
        torch.Size([128, 30])
    """

    def __init__(self, in_features, out_features, bias=True):
        super(Linear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        # torch.Tensor(...) only allocates storage; the values are filled in
        # by reset_parameters() below.
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            # Register the name so that ``self.bias`` exists and is None.
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw weight and bias uniformly from ``[-1/sqrt(in_features), 1/sqrt(in_features)]``."""
        # weight.size(1) is in_features (the fan-in of each output unit).
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, input):
        """Return ``input @ weight.T (+ bias)`` for an input of shape ``(N, in_features)``."""
        # Use the public functional API instead of the private ``_backend``
        # hook; it accepts ``bias=None``, so no branching is needed.
        return nn.functional.linear(input, self.weight, self.bias)

    def __repr__(self):
        return '{} ({} -> {})'.format(
            self.__class__.__name__, self.in_features, self.out_features)


In [28]:
class MyLinear(nn.Module):
    """Hand-written fully-connected layer computing ``input.mm(weight) + bias``.

    Unlike ``nn.Linear``, the weight is stored as ``(in_features, out_features)``,
    so no transpose is needed in the forward pass.

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        bias: If set to False, the layer has no additive bias. Default: True

    Shape:
        - Input: ``(N, in_features)`` (2-D, as required by ``torch.mm``)
        - Output: ``(N, out_features)``
    """

    def __init__(self, in_features, out_features, bias=True):
        super(MyLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        # torch.Tensor(...) only allocates storage; without the call to
        # reset_parameters() below the parameters would hold arbitrary values.
        self.weight = Parameter(torch.Tensor(in_features, out_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            # Register the name so that ``self.bias`` exists and is None.
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        """Draw weight and bias uniformly from ``[-1/sqrt(in_features), 1/sqrt(in_features)]``."""
        stdv = 1. / math.sqrt(self.in_features)
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, input):
        """Return ``input.mm(weight)``, plus ``bias`` when the layer has one."""
        if self.bias is None:
            return torch.mm(input, self.weight)
        else:
            return torch.mm(input, self.weight) + self.bias

        

In [45]:
# Random 2x3 NumPy array wrapped in a Variable and cast to float.
x = Variable(torch.from_numpy(np.random.randn(2, 3))).float()


# The built-in layer and the hand-written one, both mapping 3 -> 4 features.
linear1 = nn.Linear(3,4)
linear2 = MyLinear(3,4)

# set the weight and bias of linear2 to be the same as linear1's
# (nn.Linear keeps its weight as (out, in) while MyLinear keeps (in, out),
# hence the transpose)
linear2.weight.data = linear1.weight.data.transpose(1,0)
linear2.bias.data = linear1.bias.data

# Element-wise comparison of the two outputs.
# NOTE(review): torch.eq tests exact floating-point equality, which may be
# fragile if the two layers take different numerical paths - confirm.
print(torch.eq(linear1(x), linear2(x)))



Variable containing:
 1  1  1  1
 1  1  1  1
[torch.ByteTensor of size 2x4]



### Resnet example

* Resnet blocks let the gradient flow through the hidden unit more directly and at the same time increase expressiveness

Res(x) = F(x, {W}) + x

Res(x) = F(x, {W1}) + W2 x



In [76]:
class ResLinear(nn.Module):
    """Residual fully-connected block: ``activation(linear(x)) + skip(x)``.

    When ``in_features == out_features`` the skip connection is the identity
    (``Res(x) = F(x, {W}) + x``); otherwise the input is projected by a second
    linear layer so the shapes match (``Res(x) = F(x, {W1}) + W2 x``).

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        activation: module/callable applied to the output of the inner linear
            layer. Default: a new ``nn.ReLU()`` created for this instance.

    Shape:
        - Input: ``(N, in_features)``
        - Output: ``(N, out_features)``
    """

    def __init__(self, in_features, out_features, activation=None):
        super(ResLinear, self).__init__()

        self.in_features = in_features
        self.out_features = out_features
        # Build the default here rather than in the signature: a default
        # argument is evaluated once, so every block would otherwise register
        # the very same module object as its sub-module.
        self.activation = nn.ReLU() if activation is None else activation

        self.linear = nn.Linear(in_features, out_features)
        if in_features != out_features:
            # Only needed when the identity skip would have the wrong width.
            self.project_linear = nn.Linear(in_features, out_features)

    def forward(self, x):
        """Return the activated linear branch plus the (possibly projected) input."""
        inner = self.activation(self.linear(x))
        if self.in_features != self.out_features:
            skip = self.project_linear(x)
        else:
            skip = x
        return inner + skip
        
        

In [77]:
# Random 2x3 NumPy array wrapped in a Variable and cast to float.
x = Variable(torch.from_numpy(np.random.randn(2, 3))).float()


# Exercise the residual block in both configurations:
#   3 -> 3 uses the identity skip connection,
#   3 -> 5 uses the extra projection layer on the skip path.
res1 = ResLinear(3,3)
res2 = ResLinear(3,5)

print(res1(x).size())
print(res2(x).size())

torch.Size([2, 3])
torch.Size([2, 5])


### Putting things all together, Sequential, Parameter updates

In [104]:
class MyModel(nn.Module):
    """Three-block residual MLP classifier (784 -> 328 -> 328 -> 10).

    ``predict_`` maps a batch of inputs to unnormalised class scores (logits);
    the other methods all take the raw input batch ``x`` and run it through
    ``predict_`` first.
    """

    def __init__(self):
        super(MyModel, self).__init__()

        # Network producing one logit per class.
        self.predict_ = nn.Sequential(
            ResLinear(784, 328),
            nn.ReLU(),
            ResLinear(328, 328),
            nn.ReLU(),
            ResLinear(328, 10),
        )

        # CrossEntropyLoss expects pre-softmax scores, so ``loss`` feeds it
        # the logits directly.
        self.criterion = nn.CrossEntropyLoss()

    def predict_proba(self, x):
        """Return class probabilities of shape ``(N, n_classes)`` for inputs ``x``."""
        # Run the network first, then normalise over the class dimension.
        return nn.Softmax(dim=1)(self.predict_(x))

    def predict(self, x):
        """Return the index of the most probable class for each row of ``x``."""
        # torch.max with a dim returns (values, indices); keep the indices.
        return torch.max(self.predict_proba(x), 1)[1]

    def loss(self, x, target):
        """Return the cross-entropy between the network's logits for ``x`` and ``target``."""
        logits = self.predict_(x)
        return self.criterion(logits, target)
        
        







Caveat: ``CrossEntropyLoss`` versus ``NLLLoss``


* ``CrossEntropyLoss`` takes *pre-softmax* scores (logits) as input

* ``NLLLoss`` takes *log-softmax* outputs (log-probabilities) as input


In [118]:
# One row of 10 normally-distributed scores, used as pre-softmax input.
y = Variable(torch.Tensor(1,10).normal_())
# One random target class index drawn from {0, ..., 9}.
t = Variable(torch.from_numpy(np.random.choice(10, size=1)))

loss1 = nn.CrossEntropyLoss()
loss2 = nn.NLLLoss()

# loss1 is fed the raw scores; loss2 is fed their log-softmax over dim 1.
# The two printed values are expected to match.
print(loss1(y, t))
print(loss2(nn.LogSoftmax(dim=1)(y), t))


Variable containing:
 2.8393
[torch.FloatTensor of size 1]

Variable containing:
 2.8393
[torch.FloatTensor of size 1]



In [129]:
# Random batch: 64 samples with 784 features each, cast to float
# (presumably standing in for flattened 28x28 images - confirm).
x = Variable(torch.from_numpy(np.random.randn(64, 784))).float()
# 64 random target class indices drawn from {0, ..., 9}.
t = Variable(torch.from_numpy(np.random.choice(10, size=64)))

# Loss of a freshly constructed (untrained) model on the random batch.
model = MyModel()
print(model.loss(x, t))

Variable containing:
 2.4260
[torch.FloatTensor of size 1]



### Updating Parameters (Manually)

In [139]:
# Random batch of 64 samples x 784 features with random targets in {0, ..., 9}.
x = Variable(torch.from_numpy(np.random.randn(64, 784))).float()
t = Variable(torch.from_numpy(np.random.choice(10, size=64)))
model = MyModel()

# Step size of the hand-written gradient-descent update.
lr = 0.1

# Ten update steps on the same fixed batch.
for i in range(10):
    loss = model.loss(x, t)
    # Populate param.grad for every parameter of the model.
    loss.backward()
    
    for param in model.parameters():
        # Out-of-place equivalent of the in-place update below:
        #param.data = param.data - lr*param.grad.data
        param.data.sub_(param.grad.data*lr)
        # Clear the gradient so the next backward() does not add to it.
        param.grad.data.zero_()
        
    # This is the loss computed before the update of this iteration.
    print(loss)
    

Variable containing:
 2.4742
[torch.FloatTensor of size 1]

Variable containing:
 1.5233
[torch.FloatTensor of size 1]

Variable containing:
 1.1643
[torch.FloatTensor of size 1]

Variable containing:
 0.7926
[torch.FloatTensor of size 1]

Variable containing:
 0.4097
[torch.FloatTensor of size 1]

Variable containing:
 0.2453
[torch.FloatTensor of size 1]

Variable containing:
 0.1749
[torch.FloatTensor of size 1]

Variable containing:
 0.1344
[torch.FloatTensor of size 1]

Variable containing:
 0.1071
[torch.FloatTensor of size 1]

Variable containing:
1.00000e-02 *
  8.7845
[torch.FloatTensor of size 1]



### Updating Parameters (``torch.optim``)

In [149]:
# Random batch of 64 samples x 784 features with random targets in {0, ..., 9}.
x = Variable(torch.from_numpy(np.random.randn(64, 784))).float()
t = Variable(torch.from_numpy(np.random.choice(10, size=64)))
model = MyModel()
# SGD without momentum and with the same learning rate as the manual loop.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.0)


# Ten update steps on the same fixed batch.
for i in range(10):
    # Clear gradients left over from the previous iteration.
    optimizer.zero_grad()
    
    loss = model.loss(x, t)
    loss.backward()
    
    # Apply the parameter update using the gradients just computed.
    optimizer.step()
        
    # This is the loss computed before the update of this iteration.
    print(loss)
    

Variable containing:
 2.4319
[torch.FloatTensor of size 1]

Variable containing:
 1.6166
[torch.FloatTensor of size 1]

Variable containing:
 1.2979
[torch.FloatTensor of size 1]

Variable containing:
 0.7428
[torch.FloatTensor of size 1]

Variable containing:
 0.4813
[torch.FloatTensor of size 1]

Variable containing:
 0.3368
[torch.FloatTensor of size 1]

Variable containing:
 0.2404
[torch.FloatTensor of size 1]

Variable containing:
 0.1836
[torch.FloatTensor of size 1]

Variable containing:
 0.1464
[torch.FloatTensor of size 1]

Variable containing:
 0.1203
[torch.FloatTensor of size 1]

