In [1]:
import numpy as np

import torch
import torch.nn as nn

## Chain Rule and Computational Graph

In [2]:
# Build a two-node computational graph for z = (x + y)^2.

x = torch.randn(32, requires_grad=True)  # leaf tensor tracked by autograd
y = torch.randn(32)                      # plain tensor, not tracked

t = torch.add(x, y)   # intermediate node: t = x + y
z = torch.pow(t, 2)   # output node: z = t^2

In [3]:
x.grad is None

True

In [4]:
z.backward(torch.ones_like(z))

In [5]:
x.grad

tensor([-4.9022, -1.3994, -3.6113, -0.1243, -2.7523, -0.2463,  1.6660, -3.5591,
        -3.9942, -3.9819,  1.6452, -5.6255,  1.4447, -0.3180, -5.8219, -2.0622,
         4.0085,  0.0107, -0.7885,  5.2320,  1.9871,  5.0611,  0.0436,  5.3610,
        -0.4770,  0.3224,  2.1578, -0.9394,  0.2355,  2.9944,  0.6156, -0.6121])

In [6]:
2 * (x + y) # analytic dz/dx for z = (x + y)^2 via the chain rule: 2 * (x + y) * 1; compare with x.grad

tensor([-4.9022, -1.3994, -3.6113, -0.1243, -2.7523, -0.2463,  1.6660, -3.5591,
        -3.9942, -3.9819,  1.6452, -5.6255,  1.4447, -0.3180, -5.8219, -2.0622,
         4.0085,  0.0107, -0.7885,  5.2320,  1.9871,  5.0611,  0.0436,  5.3610,
        -0.4770,  0.3224,  2.1578, -0.9394,  0.2355,  2.9944,  0.6156, -0.6121],
       grad_fn=<MulBackward0>)

In [7]:
t.grad_fn, z.grad_fn

(<AddBackward0 at 0x19759f7dc88>, <PowBackward0 at 0x19759f7df60>)

## PyTorch Implementation

- Add

In [8]:
class Add2(torch.autograd.Function):
    """Addition ``i + j`` with a hand-written backward pass.

    An add node forwards the upstream gradient unchanged to both inputs.
    If an input was broadcast in the forward pass, its gradient is summed
    back down to that input's original shape.
    """

    @staticmethod
    def forward(ctx, i, j):
        """Return ``i + j`` and remember the input shapes for backward.

        Args:
            ctx: autograd context used to pass state to ``backward``.
            i: first operand.
            j: second operand (may broadcast against ``i``).

        Returns:
            The sum ``i + j``.
        """
        # Backward only needs the input shapes, so no tensor is saved.
        # Non-tensor operands are recorded as None (they receive no gradient).
        ctx.input_shapes = tuple(
            v.shape if torch.is_tensor(v) else None for v in (i, j)
        )
        return i + j

    @staticmethod
    def backward(ctx, grad_output):
        """Pass ``grad_output`` to both inputs, reduced to each input's shape.

        Args:
            ctx: autograd context populated by ``forward``.
            grad_output: gradient of the loss w.r.t. the output of ``forward``.

        Returns:
            A pair ``(grad_i, grad_j)``; an entry is ``None`` for a
            non-tensor operand.
        """
        return tuple(
            None if shape is None else grad_output.sum_to_size(shape)
            for shape in ctx.input_shapes
        )

In [9]:
x.grad = None  # clear the gradient accumulated by the previous backward pass
# Independent leaf copy of y that tracks gradients. detach().clone() replaces
# the legacy `.data` attribute and does not share storage with y.
y2 = y.detach().clone().requires_grad_(True)

In [10]:
t = Add2.apply(x, y2)
t.grad_fn

<torch.autograd.function.Add2Backward at 0x19759f0eb40>

In [11]:
t.backward(torch.ones_like(t))

In [12]:
x.grad # the backward pass of an addition node passes the upstream gradient through unchanged

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [13]:
y2.grad

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

- Multiplication

In [14]:
class Mul2(torch.autograd.Function):
    """Element-wise multiplication ``i * j`` with a hand-written backward pass.

    A multiply node scales the upstream gradient by the *other* operand:
    ``d(i*j)/di = j`` and ``d(i*j)/dj = i``.
    """

    @staticmethod
    def forward(ctx, i, j):
        """Return ``i * j`` and save both operands for the backward pass.

        Args:
            ctx: autograd context used to pass state to ``backward``.
            i: first operand tensor.
            j: second operand tensor.

        Returns:
            The element-wise product ``i * j``.
        """
        ctx.save_for_backward(i, j)
        return i * j

    @staticmethod
    def backward(ctx, grad_output):
        """Apply the chain rule through the product.

        Args:
            ctx: autograd context holding the saved operands.
            grad_output: gradient of the loss w.r.t. the output of ``forward``.

        Returns:
            A pair ``(grad_i, grad_j)``.
        """
        x, y = ctx.saved_tensors
        # The upstream gradient must be multiplied in. Returning the bare
        # operands is only correct when grad_output is all ones.
        # sum_to_size undoes any broadcasting done in forward.
        grad_x = (grad_output * y).sum_to_size(x.shape)
        grad_y = (grad_output * x).sum_to_size(y.shape)
        return grad_x, grad_y

In [15]:
x.grad = None
y2.grad = None

In [16]:
t = Mul2.apply(x, y2)
t.grad_fn

<torch.autograd.function.Mul2Backward at 0x19759f0ec18>

In [17]:
t.backward(torch.ones_like(t))

In [18]:
x.grad # is same as y2

tensor([-1.5021, -0.1684, -2.0597, -0.1771, -0.6056, -0.3150,  0.6844, -1.7798,
        -1.4690, -1.6745,  0.8560, -2.0431,  0.2906,  0.4620, -1.9062, -0.2509,
         0.6938, -0.5153, -0.0825,  1.7021,  1.8963,  1.9593, -0.4827,  1.6668,
        -0.8638,  0.4666,  1.7789, -0.2838, -0.1452,  1.3351,  1.0301, -0.0459])

In [19]:
y2.grad # is same as x

tensor([-9.4899e-01, -5.3129e-01,  2.5405e-01,  1.1490e-01, -7.7055e-01,
         1.9188e-01,  1.4856e-01,  2.6861e-04, -5.2811e-01, -3.1647e-01,
        -3.3334e-02, -7.6965e-01,  4.3182e-01, -6.2099e-01, -1.0047e+00,
        -7.8020e-01,  1.3105e+00,  5.2063e-01, -3.1181e-01,  9.1392e-01,
        -9.0276e-01,  5.7119e-01,  5.0452e-01,  1.0137e+00,  6.2528e-01,
        -3.0534e-01, -6.9995e-01, -1.8595e-01,  2.6299e-01,  1.6210e-01,
        -7.2231e-01, -2.6012e-01])

## 사과 쇼핑의 예

In [20]:
# Leaf inputs of the apple-shopping graph: one-element tensors tracked by autograd.
apple = torch.tensor([100.0], requires_grad=True)  # multiplied by `num` in the next cell
num = torch.tensor([2.0], requires_grad=True)      # quantity factor
ctax = torch.tensor([1.1], requires_grad=True)     # multiplier applied to the subtotal

In [21]:
# Forward pass: subtotal first, then the taxed total.
# retain_grad() keeps .grad on these non-leaf tensors so it can be inspected after backward().
price = torch.mul(apple, num)
price.retain_grad()
result = torch.mul(price, ctax)
result.retain_grad()

In [22]:
result.backward(torch.ones(1,))

In [23]:
apple.grad, num.grad, ctax.grad, price.grad, result.grad

(tensor([2.2000]),
 tensor([110.]),
 tensor([200.]),
 tensor([1.1000]),
 tensor([1.]))

## 사과와 귤 쇼핑의 역전파

In [24]:
# Leaf inputs of the apple-and-tangerine graph: one-element tensors tracked by autograd.
apple = torch.tensor([100.0], requires_grad=True)
tangerine = torch.tensor([150.0], requires_grad=True)
num_apple = torch.tensor([2.0], requires_grad=True)
num_tangerine = torch.tensor([3.0], requires_grad=True)
ctax = torch.tensor([1.1], requires_grad=True)  # multiplier applied to the combined subtotal

In [25]:
# Per-item subtotals. retain_grad() keeps .grad available on each non-leaf tensor.
apple_price = torch.mul(apple, num_apple)
apple_price.retain_grad()

tangerine_price = torch.mul(tangerine, num_tangerine)
tangerine_price.retain_grad()

# Combined subtotal (add node).
price = torch.add(apple_price, tangerine_price)
price.retain_grad()

# Taxed total (multiply node).
result = torch.mul(price, ctax)
result.retain_grad()

In [26]:
result.backward(torch.ones(1))

In [27]:
# Map each display name to its tensor explicitly rather than resolving the
# name with eval(); the printed text is unchanged.
grad_report = {
    'apple': apple,
    'tangerine': tangerine,
    'num_apple': num_apple,
    'num_tangerine': num_tangerine,
    'ctax': ctax,
    'apple_price': apple_price,
    'tangerine_price': tangerine_price,
    'price': price,
    'result': result,
}
items = list(grad_report)  # kept so the list of names is still available

for item, tensor in grad_report.items():
    print(f"{item:>15s}.grad = {tensor.grad.item():.2f}")

          apple.grad = 2.20
      tangerine.grad = 3.30
      num_apple.grad = 110.00
  num_tangerine.grad = 165.00
           ctax.grad = 650.00
    apple_price.grad = 1.10
tangerine_price.grad = 1.10
          price.grad = 1.10
         result.grad = 1.00


## Implement Activations, Affine and Softmax

- ReLU

In [28]:
x.grad = None
z = torch.relu(x)
z

tensor([0.0000e+00, 0.0000e+00, 2.5405e-01, 1.1490e-01, 0.0000e+00, 1.9188e-01,
        1.4856e-01, 2.6861e-04, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        4.3182e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.3105e+00, 5.2063e-01,
        0.0000e+00, 9.1392e-01, 0.0000e+00, 5.7119e-01, 5.0452e-01, 1.0137e+00,
        6.2528e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 2.6299e-01, 1.6210e-01,
        0.0000e+00, 0.0000e+00], grad_fn=<ReluBackward0>)

In [29]:
z.backward(torch.ones_like(z))

In [30]:
x.grad

tensor([0., 0., 1., 1., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1.,
        0., 1., 0., 1., 1., 1., 1., 0., 0., 0., 1., 1., 0., 0.])

In [31]:
class Relu2(torch.autograd.Function):
    """ReLU with a hand-written backward pass.

    Forward zeroes every strictly negative element. Backward zeroes the
    gradient at those same positions and passes it through elsewhere.
    """

    @staticmethod
    def forward(ctx, i):
        """Return a copy of ``i`` with negative entries replaced by zero.

        Args:
            ctx: autograd context used to pass state to ``backward``.
            i: input tensor (left unmodified).

        Returns:
            A new tensor equal to ``i`` where ``i >= 0`` and 0 elsewhere.
        """
        negative = i < 0  # computed once, reused for output and backward
        ctx.save_for_backward(negative)
        return i.masked_fill(negative, 0)

    @staticmethod
    def backward(ctx, grad_output):
        """Block the gradient where the forward input was negative.

        Args:
            ctx: autograd context holding the saved boolean mask.
            grad_output: gradient of the loss w.r.t. the output of ``forward``.

        Returns:
            A new tensor: ``grad_output`` with masked positions set to zero.
        """
        negative, = ctx.saved_tensors
        # Out-of-place on purpose: grad_output belongs to the caller/autograd
        # engine and must not be modified in place.
        return grad_output.masked_fill(negative, 0)

In [32]:
x.grad = None
z = Relu2.apply(x)
z

tensor([0.0000e+00, 0.0000e+00, 2.5405e-01, 1.1490e-01, 0.0000e+00, 1.9188e-01,
        1.4856e-01, 2.6861e-04, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        4.3182e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.3105e+00, 5.2063e-01,
        0.0000e+00, 9.1392e-01, 0.0000e+00, 5.7119e-01, 5.0452e-01, 1.0137e+00,
        6.2528e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 2.6299e-01, 1.6210e-01,
        0.0000e+00, 0.0000e+00], grad_fn=<Relu2Backward>)

In [33]:
z.backward(torch.ones_like(z))

In [34]:
x.grad

tensor([0., 0., 1., 1., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1.,
        0., 1., 0., 1., 1., 1., 1., 0., 0., 0., 1., 1., 0., 0.])

- Sigmoid

In [35]:
class Sigmoid(torch.autograd.Function):
    """Logistic sigmoid ``1 / (1 + exp(-i))`` with a hand-written backward pass."""

    @staticmethod
    def forward(ctx, i):
        """Return ``sigmoid(i)`` and save it for the backward pass.

        Args:
            ctx: autograd context used to pass state to ``backward``.
            i: input tensor.

        Returns:
            Tensor of the same shape as ``i`` holding ``sigmoid(i)``.
        """
        # Stay in torch: NumPy ufuncs on a tensor force a NumPy conversion,
        # which is rejected for tensors that require grad and for CUDA tensors.
        result = torch.sigmoid(i)
        ctx.save_for_backward(result)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        """Apply the chain rule using ``d sigmoid / di = s * (1 - s)``.

        Args:
            ctx: autograd context holding the saved forward output ``s``.
            grad_output: gradient of the loss w.r.t. the output of ``forward``.

        Returns:
            Gradient of the loss w.r.t. the input ``i``.
        """
        result, = ctx.saved_tensors
        return grad_output * result * (1 - result)

In [36]:
x.grad = None
z = Sigmoid.apply(x)
z

tensor([0.2791, 0.3702, 0.5632, 0.5287, 0.3164, 0.5478, 0.5371, 0.5001, 0.3710,
        0.4215, 0.4917, 0.3166, 0.6063, 0.3496, 0.2680, 0.3143, 0.7876, 0.6273,
        0.4227, 0.7138, 0.2885, 0.6390, 0.6235, 0.7337, 0.6514, 0.4243, 0.3318,
        0.4536, 0.5654, 0.5404, 0.3269, 0.4353], grad_fn=<SigmoidBackward>)

In [37]:
z.backward(torch.ones_like(z))

In [38]:
x.grad # is same as z * (1-z)

tensor([0.2012, 0.2332, 0.2460, 0.2492, 0.2163, 0.2477, 0.2486, 0.2500, 0.2333,
        0.2438, 0.2499, 0.2163, 0.2387, 0.2274, 0.1962, 0.2155, 0.1673, 0.2338,
        0.2440, 0.2043, 0.2053, 0.2307, 0.2347, 0.1954, 0.2271, 0.2443, 0.2217,
        0.2479, 0.2457, 0.2484, 0.2200, 0.2458])

In [39]:
z * (1-z)

tensor([0.2012, 0.2332, 0.2460, 0.2492, 0.2163, 0.2477, 0.2486, 0.2500, 0.2333,
        0.2438, 0.2499, 0.2163, 0.2387, 0.2274, 0.1962, 0.2155, 0.1673, 0.2338,
        0.2440, 0.2043, 0.2053, 0.2307, 0.2347, 0.1954, 0.2271, 0.2443, 0.2217,
        0.2479, 0.2457, 0.2484, 0.2200, 0.2458], grad_fn=<MulBackward0>)

- Affine

In [71]:
# Inputs of the affine layer result = x @ W + b; all three are leaves tracked by autograd.
x = torch.randn(32, 128, requires_grad=True)
W = torch.randn(128, 10, requires_grad=True)
b = torch.randn(10, requires_grad=True)

In [72]:
Wx = x @ W
result = Wx + b

In [73]:
Wx.grad_fn, result.grad_fn

(<MmBackward at 0x19759fae6a0>, <AddBackward0 at 0x19759fae8d0>)

In [74]:
Wx.retain_grad()
result.retain_grad()

In [75]:
result.backward(torch.ones_like(result))

In [76]:
x.grad # Wx.grad @ W.T

tensor([[ 2.8950,  1.8922, -5.9358,  ..., -1.9769, -4.1586, -2.2412],
        [ 2.8950,  1.8922, -5.9358,  ..., -1.9769, -4.1586, -2.2412],
        [ 2.8950,  1.8922, -5.9358,  ..., -1.9769, -4.1586, -2.2412],
        ...,
        [ 2.8950,  1.8922, -5.9358,  ..., -1.9769, -4.1586, -2.2412],
        [ 2.8950,  1.8922, -5.9358,  ..., -1.9769, -4.1586, -2.2412],
        [ 2.8950,  1.8922, -5.9358,  ..., -1.9769, -4.1586, -2.2412]])

In [77]:
# Tolerance-based comparison: the two sides are separate floating-point
# matmuls, so bit-exact equality is not guaranteed.
torch.allclose(Wx.grad @ W.T, x.grad)

True

In [78]:
W.grad # x.T @ Wx.grad

tensor([[  1.6591,   1.6591,   1.6591,  ...,   1.6591,   1.6591,   1.6591],
        [  5.8406,   5.8406,   5.8406,  ...,   5.8406,   5.8406,   5.8406],
        [-14.8959, -14.8959, -14.8959,  ..., -14.8959, -14.8959, -14.8959],
        ...,
        [  3.8534,   3.8534,   3.8534,  ...,   3.8534,   3.8534,   3.8534],
        [ -0.2523,  -0.2523,  -0.2523,  ...,  -0.2523,  -0.2523,  -0.2523],
        [ 11.3444,  11.3444,  11.3444,  ...,  11.3444,  11.3444,  11.3444]])

In [79]:
# Tolerance-based comparison: the two sides are separate floating-point
# matmuls, so bit-exact equality is not guaranteed.
torch.allclose(x.T @ Wx.grad, W.grad)

True

In [80]:
b.grad # same as the gradient of Wx, but summed over dim=0

tensor([32., 32., 32., 32., 32., 32., 32., 32., 32., 32.])

In [81]:
torch.eq(Wx.grad, torch.ones_like(result)).all().item()

True

In [96]:
class Mm2(torch.autograd.Function):
    """Matrix product ``x @ W`` with a hand-written backward pass."""

    @staticmethod
    def forward(ctx, x, W):
        """Return ``x @ W`` and save both operands for the backward pass."""
        product = torch.matmul(x, W)
        ctx.save_for_backward(x, W)
        return product

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``(grad_output @ W.T, x.T @ grad_output)``.

        The first entry is the gradient w.r.t. ``x``, the second w.r.t. ``W``.
        """
        inputs, weight = ctx.saved_tensors
        grad_inputs = torch.matmul(grad_output, weight.T)
        grad_weight = torch.matmul(inputs.T, grad_output)
        return grad_inputs, grad_weight

In [97]:
x.grad = None
W.grad = None
b.grad = None

In [98]:
Wx = Mm2.apply(x, W)
result = Add2.apply(Wx, b)

In [99]:
result.backward(torch.ones_like(result))

In [100]:
x.grad

tensor([[ 2.8950,  1.8922, -5.9358,  ..., -1.9769, -4.1586, -2.2412],
        [ 2.8950,  1.8922, -5.9358,  ..., -1.9769, -4.1586, -2.2412],
        [ 2.8950,  1.8922, -5.9358,  ..., -1.9769, -4.1586, -2.2412],
        ...,
        [ 2.8950,  1.8922, -5.9358,  ..., -1.9769, -4.1586, -2.2412],
        [ 2.8950,  1.8922, -5.9358,  ..., -1.9769, -4.1586, -2.2412],
        [ 2.8950,  1.8922, -5.9358,  ..., -1.9769, -4.1586, -2.2412]])

In [101]:
W.grad

tensor([[  1.6591,   1.6591,   1.6591,  ...,   1.6591,   1.6591,   1.6591],
        [  5.8406,   5.8406,   5.8406,  ...,   5.8406,   5.8406,   5.8406],
        [-14.8959, -14.8959, -14.8959,  ..., -14.8959, -14.8959, -14.8959],
        ...,
        [  3.8534,   3.8534,   3.8534,  ...,   3.8534,   3.8534,   3.8534],
        [ -0.2523,  -0.2523,  -0.2523,  ...,  -0.2523,  -0.2523,  -0.2523],
        [ 11.3444,  11.3444,  11.3444,  ...,  11.3444,  11.3444,  11.3444]])

In [102]:
b.grad

tensor([32., 32., 32., 32., 32., 32., 32., 32., 32., 32.])

## Gradient check