In [1]:
import numpy as np

import torch
import torch.nn as nn

## Chain Rule and Computational Graph

In [2]:
# Build the computational graph for z = (x + y)^2.
# Only x is created with requires_grad=True; y is a plain tensor.
x = torch.randn(32, requires_grad=True)
y = torch.randn(32)

t = torch.add(x, y)
z = torch.pow(t, 2)

In [3]:
x.grad is None

True

In [4]:
z.backward(torch.ones_like(z))

In [5]:
x.grad

tensor([ 2.2231, -0.8313,  1.5170,  0.2799,  2.9198,  1.9845,  1.1557,  3.0177,
        -1.2327, -3.6421,  0.1778, -5.1457,  0.9701,  0.1705, -0.6928,  0.1140,
        -2.3534, -0.6161, -0.3250,  4.0826, -0.7653, -3.4585,  0.0458,  2.0011,
         0.3645, -1.6698, -2.1550,  2.1985,  3.5416, -1.1568, -0.5758,  0.5724])

In [6]:
2 * (x + y) # partial derivative z over x is same as 2(x+y)*1

tensor([ 2.2231, -0.8313,  1.5170,  0.2799,  2.9198,  1.9845,  1.1557,  3.0177,
        -1.2327, -3.6421,  0.1778, -5.1457,  0.9701,  0.1705, -0.6928,  0.1140,
        -2.3534, -0.6161, -0.3250,  4.0826, -0.7653, -3.4585,  0.0458,  2.0011,
         0.3645, -1.6698, -2.1550,  2.1985,  3.5416, -1.1568, -0.5758,  0.5724],
       grad_fn=<MulBackward0>)

In [7]:
t.grad_fn, z.grad_fn

(<AddBackward0 at 0x20d01f077f0>, <PowBackward0 at 0x20d01f076a0>)

## PyTorch Implementation

- Add

In [8]:
class Add2(torch.autograd.Function):
    """Element-wise addition ``i + j`` with a hand-written backward pass."""

    @staticmethod
    def forward(ctx, i, j):
        """Return ``i + j``.

        Nothing is saved on ``ctx``: the local derivative of a sum with
        respect to each operand is 1, so backward needs no forward values.
        """
        return i + j

    @staticmethod
    def backward(ctx, grad_output):
        """Return the upstream gradient unchanged for both ``i`` and ``j``."""
        return grad_output, grad_output

In [9]:
x.grad = None
# Make an independent, gradient-tracking copy of y. detach().clone() is used
# instead of the legacy `.data` attribute so y2 does not share storage with y.
y2 = y.detach().clone().requires_grad_(True)

In [10]:
t = Add2.apply(x, y2)
t.grad_fn

<torch.autograd.function.Add2Backward at 0x20d01e7eb40>

In [11]:
t.backward(torch.ones_like(t))

In [12]:
x.grad # the backward pass of an addition node passes the upstream gradient through unchanged

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [13]:
y2.grad

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

- Multiplication

In [14]:
class Mul2(torch.autograd.Function):
    """Element-wise multiplication ``i * j`` with a hand-written backward pass."""

    @staticmethod
    def forward(ctx, i, j):
        """Return ``i * j`` and save both operands for the backward pass."""
        ctx.save_for_backward(i, j)
        return i * j

    @staticmethod
    def backward(ctx, grad_output):
        """Apply the chain rule for a product.

        d(i*j)/di = j and d(i*j)/dj = i, so each operand receives the
        upstream gradient scaled by the *other* operand. Multiplying by
        ``grad_output`` is required for the result to be correct when the
        upstream gradient is anything other than all ones.
        """
        i, j = ctx.saved_tensors
        return grad_output * j, grad_output * i

In [15]:
x.grad = None
y2.grad = None

In [16]:
t = Mul2.apply(x, y2)
t.grad_fn

<torch.autograd.function.Mul2Backward at 0x20d01e7ec18>

In [17]:
t.backward(torch.ones_like(t))

In [18]:
x.grad # is same as y2

tensor([ 0.0350, -0.3540,  0.8244, -0.4111,  1.0865,  0.7121,  0.6865,  0.8284,
        -0.0700, -1.6359,  0.7256, -0.6371,  0.3203, -0.5709,  0.2887, -0.2612,
        -0.6102, -1.4872,  0.7437,  1.3832,  0.8530, -0.2996,  0.1710, -0.7266,
         0.4247, -0.5761, -1.4798,  0.9976,  1.4002,  0.1949,  0.8476,  1.2577])

In [19]:
y2.grad # is same as x

tensor([ 1.0766, -0.0617, -0.0658,  0.5510,  0.3734,  0.2801, -0.1086,  0.6804,
        -0.5463, -0.1851, -0.6367, -1.9357,  0.1647,  0.6562, -0.6351,  0.3182,
        -0.5665,  1.1792, -0.9062,  0.6581, -1.2357, -1.4297, -0.1481,  1.7272,
        -0.2424, -0.2588,  0.4023,  0.1016,  0.3706, -0.7733, -1.1355, -0.9715])

## 사과 쇼핑의 예

In [20]:
# Leaf tensors for the apple-shopping graph: unit price, quantity, tax rate.
apple = torch.tensor([100.0]).requires_grad_()
num = torch.tensor([2.0]).requires_grad_()
ctax = torch.tensor([1.1]).requires_grad_()

In [21]:
# Subtotal first, then the taxed total. retain_grad() is called on both
# intermediate tensors so their .grad can be read after backward().
price = torch.mul(apple, num)
price.retain_grad()
result = torch.mul(price, ctax)
result.retain_grad()

In [22]:
result.backward(torch.ones(1,))

In [23]:
apple.grad, num.grad, ctax.grad, price.grad, result.grad

(tensor([2.2000]),
 tensor([110.]),
 tensor([200.]),
 tensor([1.1000]),
 tensor([1.]))

## 사과와 귤 쇼핑의 역전파

In [24]:
# Leaf tensors for the apple + tangerine graph: unit prices, quantities, tax rate.
apple = torch.tensor([100.0]).requires_grad_()
tangerine = torch.tensor([150.0]).requires_grad_()
num_apple = torch.tensor([2.0]).requires_grad_()
num_tangerine = torch.tensor([3.0]).requires_grad_()
ctax = torch.tensor([1.1]).requires_grad_()

In [25]:
# Per-fruit subtotals (unit price * quantity).
apple_price = torch.mul(apple, num_apple)
tangerine_price = torch.mul(tangerine, num_tangerine)

# Combined subtotal, then the taxed total.
price = torch.add(apple_price, tangerine_price)
result = torch.mul(price, ctax)

# Keep .grad on every intermediate tensor so it can be inspected after backward().
apple_price.retain_grad()
tangerine_price.retain_grad()
price.retain_grad()
result.retain_grad()

In [26]:
result.backward(torch.ones(1))

In [27]:
# Map each label to its tensor explicitly instead of resolving variable names
# with eval(): a misspelled label then fails here, at definition time.
tracked = {
    'apple': apple,
    'tangerine': tangerine,
    'num_apple': num_apple,
    'num_tangerine': num_tangerine,
    'ctax': ctax,
    'apple_price': apple_price,
    'tangerine_price': tangerine_price,
    'price': price,
    'result': result,
}
items = list(tracked)  # same label list as before, kept for later cells

for item in items:
    print(f"{item:>15s}.grad = {tracked[item].grad.item():.2f}")

          apple.grad = 2.20
      tangerine.grad = 3.30
      num_apple.grad = 110.00
  num_tangerine.grad = 165.00
           ctax.grad = 650.00
    apple_price.grad = 1.10
tangerine_price.grad = 1.10
          price.grad = 1.10
         result.grad = 1.00


## Implement Activations, Affine and Softmax

### ReLU

In [28]:
x.grad = None
z = torch.relu(x)
z

tensor([1.0766, 0.0000, 0.0000, 0.5510, 0.3734, 0.2801, 0.0000, 0.6804, 0.0000,
        0.0000, 0.0000, 0.0000, 0.1647, 0.6562, 0.0000, 0.3182, 0.0000, 1.1792,
        0.0000, 0.6581, 0.0000, 0.0000, 0.0000, 1.7272, 0.0000, 0.0000, 0.4023,
        0.1016, 0.3706, 0.0000, 0.0000, 0.0000], grad_fn=<ReluBackward0>)

In [29]:
z.backward(torch.ones_like(z))

In [30]:
x.grad

tensor([1., 0., 0., 1., 1., 1., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1.,
        0., 1., 0., 0., 0., 1., 0., 0., 1., 1., 1., 0., 0., 0.])

In [31]:
class Relu2(torch.autograd.Function):
    """ReLU with a hand-written backward pass."""

    @staticmethod
    def forward(ctx, i):
        """Return a copy of ``i`` with every strictly negative entry set to 0.

        The boolean mask ``i < 0`` is saved so backward can zero the same
        positions. The input tensor itself is not modified.
        """
        negative = i < 0
        ctx.save_for_backward(negative)
        return i.masked_fill(negative, 0)

    @staticmethod
    def backward(ctx, grad_output):
        """Zero the gradient wherever the input was negative.

        A new tensor is returned; ``grad_output`` must not be modified in
        place because the caller (or other graph nodes) may still hold it.
        """
        negative, = ctx.saved_tensors
        return grad_output.masked_fill(negative, 0)

In [32]:
x.grad = None
z = Relu2.apply(x)
z

tensor([1.0766, 0.0000, 0.0000, 0.5510, 0.3734, 0.2801, 0.0000, 0.6804, 0.0000,
        0.0000, 0.0000, 0.0000, 0.1647, 0.6562, 0.0000, 0.3182, 0.0000, 1.1792,
        0.0000, 0.6581, 0.0000, 0.0000, 0.0000, 1.7272, 0.0000, 0.0000, 0.4023,
        0.1016, 0.3706, 0.0000, 0.0000, 0.0000], grad_fn=<Relu2Backward>)

In [33]:
z.backward(torch.ones_like(z))

In [34]:
x.grad

tensor([1., 0., 0., 1., 1., 1., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1.,
        0., 1., 0., 0., 0., 1., 0., 0., 1., 1., 1., 0., 0., 0.])

### sigmoid

In [35]:
class Sigmoid(torch.autograd.Function):
    """Logistic sigmoid with a hand-written backward pass."""

    @staticmethod
    def forward(ctx, i):
        """Return ``1 / (1 + exp(-i))`` and save it for the backward pass.

        ``torch.exp`` is used (not ``np.exp``) so the computation stays
        entirely in PyTorch and never relies on converting the tensor to a
        NumPy array.
        """
        result = 1 / (1 + torch.exp(-i))
        ctx.save_for_backward(result)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        """Chain rule with the sigmoid derivative ``s * (1 - s)``, where ``s`` is the saved output."""
        result, = ctx.saved_tensors
        return grad_output * result * (1 - result)

In [36]:
x.grad = None
z = Sigmoid.apply(x)
z

tensor([0.7458, 0.4846, 0.4835, 0.6344, 0.5923, 0.5696, 0.4729, 0.6638, 0.3667,
        0.4538, 0.3460, 0.1261, 0.5411, 0.6584, 0.3464, 0.5789, 0.3620, 0.7648,
        0.2878, 0.6588, 0.2252, 0.1931, 0.4630, 0.8491, 0.4397, 0.4357, 0.5992,
        0.5254, 0.5916, 0.3158, 0.2431, 0.2746], grad_fn=<SigmoidBackward>)

In [37]:
z.backward(torch.ones_like(z))

In [38]:
x.grad # is same as z * (1-z)

tensor([0.1896, 0.2498, 0.2497, 0.2319, 0.2415, 0.2452, 0.2493, 0.2232, 0.2322,
        0.2479, 0.2263, 0.1102, 0.2483, 0.2249, 0.2264, 0.2438, 0.2310, 0.1799,
        0.2050, 0.2248, 0.1745, 0.1558, 0.2486, 0.1282, 0.2464, 0.2459, 0.2402,
        0.2494, 0.2416, 0.2161, 0.1840, 0.1992])

In [39]:
z * (1-z)

tensor([0.1896, 0.2498, 0.2497, 0.2319, 0.2415, 0.2452, 0.2493, 0.2232, 0.2322,
        0.2479, 0.2263, 0.1102, 0.2483, 0.2249, 0.2264, 0.2438, 0.2310, 0.1799,
        0.2050, 0.2248, 0.1745, 0.1558, 0.2486, 0.1282, 0.2464, 0.2459, 0.2402,
        0.2494, 0.2416, 0.2161, 0.1840, 0.1992], grad_fn=<MulBackward0>)

### Affine

In [40]:
# Leaf tensors for the affine map x @ W + b.
x = torch.randn(32, 128, requires_grad=True)
W = torch.randn(128, 10, requires_grad=True)
b = torch.randn(10, requires_grad=True)

In [41]:
# Affine forward pass split into two nodes: the matrix product, then the bias add.
Wx = torch.matmul(x, W)
result = torch.add(Wx, b)

In [42]:
Wx.grad_fn, result.grad_fn

(<MmBackward at 0x20d01f57400>, <AddBackward0 at 0x20d01f57048>)

In [43]:
Wx.retain_grad()
result.retain_grad()

In [44]:
result.backward(torch.ones_like(result))

In [45]:
x.grad # Wx.grad @ W.T

tensor([[ 4.5338,  3.1817, -1.2500,  ...,  2.8115, -1.0819, -2.6347],
        [ 4.5338,  3.1817, -1.2500,  ...,  2.8115, -1.0819, -2.6347],
        [ 4.5338,  3.1817, -1.2500,  ...,  2.8115, -1.0819, -2.6347],
        ...,
        [ 4.5338,  3.1817, -1.2500,  ...,  2.8115, -1.0819, -2.6347],
        [ 4.5338,  3.1817, -1.2500,  ...,  2.8115, -1.0819, -2.6347],
        [ 4.5338,  3.1817, -1.2500,  ...,  2.8115, -1.0819, -2.6347]])

In [46]:
# Compare with a tolerance: exact float equality between two differently
# computed matrix products is not guaranteed.
torch.allclose(Wx.grad @ W.T, x.grad)

True

In [47]:
W.grad # x.T @ Wx.grad

tensor([[-2.2547, -2.2547, -2.2547,  ..., -2.2547, -2.2547, -2.2547],
        [ 0.5833,  0.5833,  0.5833,  ...,  0.5833,  0.5833,  0.5833],
        [ 0.3981,  0.3981,  0.3981,  ...,  0.3981,  0.3981,  0.3981],
        ...,
        [-0.4636, -0.4636, -0.4636,  ..., -0.4636, -0.4636, -0.4636],
        [ 8.7022,  8.7022,  8.7022,  ...,  8.7022,  8.7022,  8.7022],
        [10.7459, 10.7459, 10.7459,  ..., 10.7459, 10.7459, 10.7459]])

In [48]:
# Compare with a tolerance: exact float equality between two differently
# computed matrix products is not guaranteed.
torch.allclose(x.T @ Wx.grad, W.grad)

True

In [49]:
b.grad # same as Wx's gradient, but summed over dim=0

tensor([32., 32., 32., 32., 32., 32., 32., 32., 32., 32.])

In [50]:
torch.eq(Wx.grad, torch.ones_like(result)).all().item()

True

In [51]:
class Mm2(torch.autograd.Function):
    """Matrix product ``x @ W`` with a hand-written backward pass."""

    @staticmethod
    def forward(ctx, x, W):
        """Compute ``x @ W`` and save both operands for the backward pass."""
        product = torch.matmul(x, W)
        ctx.save_for_backward(x, W)
        return product

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``(grad_output @ W.T, x.T @ grad_output)``, the gradients for ``x`` and ``W``."""
        lhs, rhs = ctx.saved_tensors
        grad_lhs = torch.matmul(grad_output, rhs.T)
        grad_rhs = torch.matmul(lhs.T, grad_output)
        return grad_lhs, grad_rhs

In [52]:
# Drop the gradients stored on x, W and b so the next backward() starts
# from empty .grad fields instead of adding to the earlier values.
x.grad = None
W.grad = None
b.grad = None

In [53]:
Wx = Mm2.apply(x, W)
result = Add2.apply(Wx, b)

In [54]:
result.backward(torch.ones_like(result))

In [55]:
x.grad

tensor([[ 4.5338,  3.1817, -1.2500,  ...,  2.8115, -1.0819, -2.6347],
        [ 4.5338,  3.1817, -1.2500,  ...,  2.8115, -1.0819, -2.6347],
        [ 4.5338,  3.1817, -1.2500,  ...,  2.8115, -1.0819, -2.6347],
        ...,
        [ 4.5338,  3.1817, -1.2500,  ...,  2.8115, -1.0819, -2.6347],
        [ 4.5338,  3.1817, -1.2500,  ...,  2.8115, -1.0819, -2.6347],
        [ 4.5338,  3.1817, -1.2500,  ...,  2.8115, -1.0819, -2.6347]])

In [56]:
W.grad

tensor([[-2.2547, -2.2547, -2.2547,  ..., -2.2547, -2.2547, -2.2547],
        [ 0.5833,  0.5833,  0.5833,  ...,  0.5833,  0.5833,  0.5833],
        [ 0.3981,  0.3981,  0.3981,  ...,  0.3981,  0.3981,  0.3981],
        ...,
        [-0.4636, -0.4636, -0.4636,  ..., -0.4636, -0.4636, -0.4636],
        [ 8.7022,  8.7022,  8.7022,  ...,  8.7022,  8.7022,  8.7022],
        [10.7459, 10.7459, 10.7459,  ..., 10.7459, 10.7459, 10.7459]])

In [57]:
b.grad

tensor([32., 32., 32., 32., 32., 32., 32., 32., 32., 32.])

### Cross Entropy(LogSoftmax + NLL)

**주의할 점!!**

```python
class CrossEntropyLoss(_WeightedLoss):
    
    __constants__ = ['ignore_index', 'reduction']
    ignore_index: int
    
    def __init__(self, weight: Optional[Tensor] = None, size_average=None, ignore_index: int = -100,
                 reduce=None, reduction: str = 'mean') -> None:
        super(CrossEntropyLoss, self).__init__(weight, size_average, reduce, reduction)
        self.ignore_index = ignore_index
        
    def forward(self, input: Tensor, target: Tensor) -> Tensor:
        return F.cross_entropy(input, target, weight=self.weight,
                               ignore_index=self.ignore_index, reduction=self.reduction)
```

```python
def cross_entropy(input, target, weight=None, size_average=None, ignore_index=-100,
                  reduce=None, reduction='mean'):
    # if not torch.jit.is_scripting() 부분 패스
    # size_average, reduce 중 하나가 None일 때 reduction setting
    return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
```

- Cross Entropy = LogSoftmax + Negative Log Likelihood
- 코드로 보면 알겠지만, softmax function 값을 취할 때 dim=1로 들어감.
- 이 부분 기억해서 코드짤 것!

In [58]:
# Random logits for 32 samples over 10 classes, plus random integer class labels.
output = torch.randn(32, 10, requires_grad=True)
# torch.randint draws int64 labels in [0, 10) directly, replacing the legacy
# torch.LongTensor(...) constructor that first allocates uninitialized memory.
target = torch.randint(0, 10, (32,))

loss_fn = nn.CrossEntropyLoss()

In [59]:
loss_fn(output, target)

tensor(2.6035, grad_fn=<NllLossBackward>)

In [60]:
# LogSoftmax over the class dimension. The fused op avoids taking log() of a
# softmax value that has underflowed to 0 (which would give -inf).
log_prob = torch.log_softmax(output, dim=1)

In [61]:
# NLL: pick each row's log-probability at its target class and average.
# gather reads exactly one entry per row instead of building an N x N matrix
# and taking its diagonal.
-torch.mean(log_prob.gather(1, target.unsqueeze(1)))

tensor(2.6035, grad_fn=<NegBackward>)

In [62]:
nn.NLLLoss()(log_prob, target)

tensor(2.6035, grad_fn=<NllLossBackward>)

In [63]:
# TODO: code implementation to follow later

## Gradient check
- https://pytorch.org/docs/master/notes/extending.html

In [66]:
from torch.autograd import Function, gradcheck

In [74]:
# Inherit from Function
class LinearFunction(Function):
    """Linear map ``input @ weight.T (+ bias)`` as a custom autograd Function.

    Both ``forward`` and ``backward`` are static methods; call the function
    through ``LinearFunction.apply``.
    """

    @staticmethod
    def forward(ctx, input, weight, bias=None):
        """Compute ``input.mm(weight.t())`` and add ``bias`` to every row when given.

        All three arguments (``bias`` may be None) are saved for backward.
        """
        ctx.save_for_backward(input, weight, bias)
        output = input.mm(weight.t())
        if bias is None:
            return output
        output += bias.unsqueeze(0).expand_as(output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Return gradients for ``(input, weight, bias)``.

        There is a single forward output, hence a single incoming gradient.
        A slot is None when ``ctx.needs_input_grad`` says it is not required
        or, for the bias, when no bias was passed to forward.
        """
        input, weight, bias = ctx.saved_tensors

        grad_input = grad_output.mm(weight) if ctx.needs_input_grad[0] else None
        grad_weight = grad_output.t().mm(input) if ctx.needs_input_grad[1] else None
        # The bias check comes first: needs_input_grad[2] is only read when a
        # bias was actually supplied.
        wants_bias_grad = bias is not None and ctx.needs_input_grad[2]
        grad_bias = grad_output.sum(0) if wants_bias_grad else None

        return grad_input, grad_weight, grad_bias

In [75]:
linear = LinearFunction.apply

In [76]:
# gradcheck takes a tuple of tensors as input, check if your gradient
# evaluated with these tensors are close enough to numerical
# approximations and returns True if they all verify this condition.

# Double-precision inputs for the numerical check: a 20x20 input matrix and a
# 30x20 weight matrix, both tracked by autograd.
input = tuple(
    torch.randn(rows, 20, dtype=torch.double, requires_grad=True)
    for rows in (20, 30)
)
test = gradcheck(linear, input, eps=1e-6, atol=1e-4)
print(test)

True


In [77]:
class MulConstant(Function):
    """Multiply a tensor by a non-tensor constant."""

    @staticmethod
    def forward(ctx, tensor, constant):
        """Return ``tensor * constant``.

        ``set_materialize_grads(False)`` asks autograd not to expand undefined
        incoming gradients into zero tensors, so backward may receive None.
        The constant is not a tensor, so it is stored directly on ``ctx``.
        """
        ctx.set_materialize_grads(False)
        ctx.constant = constant
        return tensor * constant

    @staticmethod
    def backward(ctx, grad_output):
        """Scale the upstream gradient by the constant.

        The second return slot is None because the constant gets no gradient.
        With materialization disabled, ``grad_output`` can be None; in that
        case there is nothing to propagate.
        """
        if grad_output is None:
            return None, None
        return grad_output * ctx.constant, None