# Autograd
- automatic differentiation engine

## backward()
- Computes the sum of gradients of given tensors w.r.t. the leaves of  computation graphs. 주어진 텐서의 gradients 를 graph leaves 로 계산한다.

=> gradient 계산 과정을 줄일 수 있음.

## Example

- input
    - $\mathbf{w}, \mathbf{x}$
- model
    - $a = \mathbf{w}^T\mathbf{x}$

### 1. Initialize

In [5]:
import numpy as np
import torch

In [6]:
# 모델 파라미터 생성, 입력값 생성
w = torch.randn(2, requires_grad = True)
x = torch.Tensor([1, 2])

### 2. Predict output

In [88]:
# model output 계산
y_hat = torch.inner(w, x)

### 2-2. Intermediate results

In [89]:
print(x)
print(w)
print(y_hat)

tensor([1., 2.])
tensor([-0.3628, -0.8129], requires_grad=True)
tensor(-1.9887, grad_fn=<ReshapeAliasBackward0>)


### 3. Compute loss

In [90]:
loss = (x.mean() - y_hat)**2
print(loss)

tensor(12.1709, grad_fn=<PowBackward0>)


### 4. Backpropagation

In [91]:
loss.backward()

### 4-2. Accessing the gradient

In [92]:
w.grad
# 업데이트 된 것을 확인 가능!

tensor([ -6.9774, -13.9547])

### Update parameters

In [93]:
lr = 0.1
with torch.no_grad():
    w = w - lr * w.grad
    print(w.grad)
print(w.requires_grad)
w.requires_grad = True
print(w.grad)

None
False
None


## Avoiding in-place operations

In [101]:
# 1. A = A + X
# 2.
mask = torch.ones_like(t)
mask[1:, :] = 0
print(mask)
t = t*mask
print(t)

tensor([[[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]],

        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]]])
tensor([[[1., 2., 3.],
         [4., 5., 6.],
         [7., 8., 9.]],

        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]]])


# Implement a Shallow NN with PyTorch autograd

## Data preparation & Import

In [19]:
import torch
import numpy as np

### XOR data (numpy)

In [30]:
x_seeds = np.array([(0, 0), (1, 0), (0, 1), (1, 1)], dtype = np.float32)
y_seeds = np.array([0, 1, 1, 0])

N = 1000
idxs = np.random.randint(0, 4, N)

X = x_seeds[idxs]
Y = y_seeds[idxs]

X += np.random.normal(scale = 0.25, size = X.shape)

## Model

### Model (torch)

In [26]:
class shallow_neural_network():
    def __init__(self, num_input_features, num_hiddens):
        self.num_input_features = num_input_features
        self.num_hiddens = num_hiddens
        
        self.W1 = torch.randn((num_hiddens, num_input_features), requires_grad = True)
        self.b1 = torch.randn(num_hiddens, requires_grad = True)
        self.W2 = torch.randn(num_hiddens, requires_grad = True)
        self.b2 = torch.randn(1, requires_grad = True)
        
        self.tanh = torch.nn.Tanh()
        self.sigmoid = torch.nn.Sigmoid()
        
    def predict(self, x):
        z1 = torch.matmul(self.W1, x) + self.b1
        a1 = self.tanh(z1)
        z2 = torch.matmul(self.W2, a1) + self.b2
        a2 = self.sigmoid(z2)
        return a2

In [31]:
model_ag = shallow_neural_network(2, 3)

## Training

In [28]:
def train(X, Y, model, lr = 0.1):
    m = len(X)
    
    cost = 0.0
    for x, y in zip(X, Y):
        x_torch = torch.from_numpy(x) # tensor 로 바꿔줌.
        
        a2 = model.predict(x_torch)
        if y == 1:
            loss = -torch.log(a2+0.0001)
        else:
            loss = -torch.log(1.0001-a2)
        
        loss.backward() # compute gradients
        cost += loss.item()
        
    with torch.no_grad(): # parameter update
        model.W1 -= lr * model.W1.grad / m
        model.b1 -= lr * model.b1.grad / m
        model.W2 -= lr * model.W2.grad / m
        model.b2 -= lr * model.b2.grad / m
        
    model.W1.requires_grad = True # parameter tracking
    model.b1.requires_grad = True
    model.W2.requires_grad = True
    model.b2.requires_grad = True
    
    return cost/m

In [None]:
for epoch in range(100):
    cost = train(X, Y, model_ag, 1.0)
    if epoch % 10 == 0:
        print(epoch, cost)

## Testing

In [36]:
print(model_ag.predict(torch.Tensor((0, 0))))
print(model_ag.predict(torch.Tensor((0, 1))))
print(model_ag.predict(torch.Tensor((1, 0))))
print(model_ag.predict(torch.Tensor((1, 1))))

tensor([3.1551e-17], grad_fn=<SigmoidBackward0>)
tensor([1.], grad_fn=<SigmoidBackward0>)
tensor([0.8453], grad_fn=<SigmoidBackward0>)
tensor([8.6571e-18], grad_fn=<SigmoidBackward0>)


# nn.Module

## A simple custom module

In [124]:
import torch
from torch import nn

class MyLinear(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_features, out_features))
        self.bias = nn.Parameter(torch.randn(out_features))
        
    def forward(self, input):
        return (input @ self.weight) + self.bias
    
m = MyLinear(4, 3)
sample_input = torch.randn(4)
m(sample_input)

tensor([-1.0721,  3.5980, -1.3363], grad_fn=<AddBackward0>)

## Modules as Building Blocks

In [126]:
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.l0 = nn.Linear(4, 3)
        self.l1 = nn.Linear(3, 1)
    def forward(self, x):
        x = self.l0(x)
        x = F.relu(x)
        x = self.l1(x)
        return x

1. Intialize

`model = YourModel()`

`optimizer = torch.optim.SGD(model.parameters(), lr = <learning_rate>`
-> parameter 자동으로 셀렉. 알아서 optimize.

2. Forward (= predict)

`y_hat = model(input)`

3. Backward (= update parameter)

`loss = compute_loss(y, y_hat)`

`model.zero_grad()` -> 누적 없앰.

`loss.backward()`

`optimizer.step()`


# Implementing a Shallow NN with autograd and nn.Module

In [59]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [60]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

In [68]:
x_seeds = np.array([(0, 0), (1, 0), (0, 1), (1, 1)], dtype = np.float32)
y_seeds = np.array([0, 1, 1, 0])

N = 1000
idxs = np.random.randint(0, 4, N)

X = x_seeds[idxs]
Y = y_seeds[idxs]

X += np.random.normal(scale = 0.25, size = X.shape)

## Model (torch.nn.Module)

In [62]:
class shallow_neural_network_nn(nn.Module):
    def __init__(self, num_input_features, num_hiddens):
        super().__init__()
        self.num_input_features = num_input_features
        self.num_hiddens = num_hiddens
        
        self.linear1 = nn.Linear(num_input_features, num_hiddens)
        self.linear2 = nn.Linear(num_hiddens, 1)
        
        self.tanh = torch.nn.Tanh()
        self.sigmoid = torch.nn.Sigmoid()
        
    def forward(self, x):
        z1 = self.linear1(x)
        a1 = self.tanh(z1)
        z2 = self.linear2(a1)
        a2 = self.sigmoid(z2)
        return a2

## Training

In [69]:
num_epochs = 100
lr = 1.0
num_hiddens = 3

model_nn = shallow_neural_network_nn(2, num_hiddens)
optimizer_nn = optim.SGD(model_nn.parameters(), lr = lr)
loss_nn = nn.BCELoss()

In [70]:
for epoch in range(num_epochs):
    optimizer_nn.zero_grad()
    
    cost_nn = 0.0
    for x, y in zip(X, Y):
        x_torch = torch.FloatTensor(x)
        y_torch = torch.FloatTensor([y])
        
        y_hat = model_nn(x_torch)
        
        loss_val = loss_nn(y_hat, y_torch)
        cost_nn += loss_val
        
    cost_nn = cost_nn / len(X)
    cost_nn.backward()
    optimizer_nn.step()
    
    if epoch %10 == 0:
        print(epoch, cost_nn)

0 tensor(0.6929, grad_fn=<DivBackward0>)
10 tensor(0.6872, grad_fn=<DivBackward0>)
20 tensor(0.6806, grad_fn=<DivBackward0>)
30 tensor(0.6674, grad_fn=<DivBackward0>)
40 tensor(0.6427, grad_fn=<DivBackward0>)
50 tensor(0.6075, grad_fn=<DivBackward0>)
60 tensor(0.5704, grad_fn=<DivBackward0>)
70 tensor(0.5355, grad_fn=<DivBackward0>)
80 tensor(0.4919, grad_fn=<DivBackward0>)
90 tensor(0.4247, grad_fn=<DivBackward0>)


## Test

In [71]:
for x, y in zip(x_seeds, y_seeds):
    print(x)
    x_torch = torch.FloatTensor(x)
    y_hat = model_nn(x_torch)
    print(y, y_hat.item())

[0. 0.]
0 0.11121822893619537
[1. 0.]
1 0.7172693014144897
[0. 1.]
1 0.8491338491439819
[1. 1.]
0 0.3307785987854004
