# Perceptron

In [1]:
# Import libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## XOR

In [3]:
X = torch.FloatTensor([[0, 0], [0, 1], [1, 0], [1, 1]]).to(device)
Y = torch.FloatTensor([[0], [1], [1], [0]]).to(device)

# nn layers
linear = torch.nn.Linear(2,1, bias = True)
sigmoid = torch.nn.Sigmoid()
model = torch.nn.Sequential(linear, sigmoid).to(device)

# define cost/loss & optimizer
criterion = torch.nn.BCELoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1)

for step in range(10001):
    optimizer.zero_grad()
    hypothesis = model(X)
    
    # cost/loss function
    cost = criterion(hypothesis, Y)
    cost.backward()
    optimizer.step()
    
    if step % 100 == 0:
        print("Step: {}\tCost: {}".format(step, cost.item()))

Step: 0	Cost: 0.708491325378418
Step: 100	Cost: 0.6931474804878235
Step: 200	Cost: 0.6931471824645996
Step: 300	Cost: 0.6931471824645996
Step: 400	Cost: 0.6931471824645996
Step: 500	Cost: 0.6931471824645996
Step: 600	Cost: 0.6931471824645996
Step: 700	Cost: 0.6931471824645996
Step: 800	Cost: 0.6931471824645996
Step: 900	Cost: 0.6931471824645996
Step: 1000	Cost: 0.6931471824645996
Step: 1100	Cost: 0.6931471824645996
Step: 1200	Cost: 0.6931471824645996
Step: 1300	Cost: 0.6931471824645996
Step: 1400	Cost: 0.6931471824645996
Step: 1500	Cost: 0.6931471824645996
Step: 1600	Cost: 0.6931471824645996
Step: 1700	Cost: 0.6931471824645996
Step: 1800	Cost: 0.6931471824645996
Step: 1900	Cost: 0.6931471824645996
Step: 2000	Cost: 0.6931471824645996
Step: 2100	Cost: 0.6931471824645996
Step: 2200	Cost: 0.6931471824645996
Step: 2300	Cost: 0.6931471824645996
Step: 2400	Cost: 0.6931471824645996
Step: 2500	Cost: 0.6931471824645996
Step: 2600	Cost: 0.6931471824645996
Step: 2700	Cost: 0.6931471824645996
Step:

In [4]:
print(hypothesis)

tensor([[0.5000],
        [0.5000],
        [0.5000],
        [0.5000]], device='cuda:0', grad_fn=<SigmoidBackward>)


# Multi Layer Perceptron

## Backpropagation

YouTube Link: [LINK](https://www.youtube.com/watch?v=573EZkzfnZ0&list=PLlMkM4tgfjnLSOjrEJN31gZATbcj_MpUm&index=27) (Season 1)

In [28]:
# Code Implementation

# Data
X = torch.FloatTensor([[0, 0], [0, 1], [1, 0], [1, 1]]).to(device)
Y = torch.FloatTensor([[0], [1], [1], [0]]).to(device)

# nn layers
w1 = torch.Tensor(2, 2).to(device)
b1 = torch.Tensor(2).to(device)
w2 = torch.Tensor(2, 1).to(device)
b2 = torch.Tensor(1).to(device)

def sigmoid(x):
    # Sigmoid function
    return 1.0 / (1.0 + torch.exp(-x))
    # return torch.div(torch.tensor(1), torch.add(torch.tensor(1.0), torch.exp(-x)))
    
def sigmoid_prime(x):
    # Derivative of the sigmoid function
    return sigmoid(x) * (1 - sigmoid(x))

In [29]:
learning_rate = 3e-2

In [30]:
for step in range(10001):
    # forward
    l1 = torch.add(torch.matmul(X, w1), b1)
    a1 = sigmoid(l1)
    l2 = torch.add(torch.matmul(a1, w2), b2)
    Y_pred = sigmoid(l2)
    
    cost = -torch.mean(Y * torch.log(Y_pred) + (1 - Y) * torch.log(1 - Y_pred))
    
    
    # Backprop (chain rule)
    # Loss derivative
    d_Y_pred = (Y_pred - Y) / (Y_pred * (1.0 - Y_pred) + 1e-7)
    
    # Layer 2
    d_l2 = d_Y_pred * sigmoid_prime(l2)
    d_b2 = d_l2
    d_w2 = torch.matmul(torch.transpose(a1, 0, 1), d_b2)
    
    # Layer 1
    d_a1 = torch.matmul(d_b2, torch.transpose(w2, 0, 1))
    d_l1 = d_a1 * sigmoid_prime(l1)
    d_b1 = d_l1
    d_w1 = torch.matmul(torch.transpose(X, 0, 1), d_b1)
    
    
    # Weight update
    w1 = w1 - learning_rate * d_w1
    b1 = b1 - learning_rate * torch.mean(d_b1, 0)
    w2 = w2 - learning_rate * d_w2
    b2 = b2 - learning_rate * torch.mean(d_b2, 0)
    
    
    if step % 100 == 0:
        print("Step: {} \t Cost: {}".format(step, cost.item()))

Step: 0 	 Cost: 0.6931471824645996
Step: 100 	 Cost: 0.6931471824645996
Step: 200 	 Cost: 0.6931471824645996
Step: 300 	 Cost: 0.6931471824645996
Step: 400 	 Cost: 0.6931471824645996
Step: 500 	 Cost: 0.6931471824645996
Step: 600 	 Cost: 0.6931471824645996
Step: 700 	 Cost: 0.6931471824645996
Step: 800 	 Cost: 0.6931471824645996
Step: 900 	 Cost: 0.6931471824645996
Step: 1000 	 Cost: 0.6931471824645996
Step: 1100 	 Cost: 0.6931471824645996
Step: 1200 	 Cost: 0.6931471824645996
Step: 1300 	 Cost: 0.6931471824645996
Step: 1400 	 Cost: 0.6931471824645996
Step: 1500 	 Cost: 0.6931471824645996
Step: 1600 	 Cost: 0.6931471824645996
Step: 1700 	 Cost: 0.6931471824645996
Step: 1800 	 Cost: 0.6931471824645996
Step: 1900 	 Cost: 0.6931471824645996
Step: 2000 	 Cost: 0.6931471824645996
Step: 2100 	 Cost: 0.6931471824645996
Step: 2200 	 Cost: 0.6931471824645996
Step: 2300 	 Cost: 0.6931471824645996
Step: 2400 	 Cost: 0.6931471824645996
Step: 2500 	 Cost: 0.6931471228599548
Step: 2600 	 Cost: 0.693

## Code: XOR-nn

In [32]:
X = torch.FloatTensor([[0, 0], [0, 1], [1, 0], [1, 1]]).to(device)
Y = torch.FloatTensor([[0], [1], [1], [0]]).to(device)

# nn layers
linear1 = nn.Linear(2, 2, bias=True)
linear2 = nn.Linear(2, 1, bias=True)
sigmoid = nn.Sigmoid()
model = nn.Sequential(linear1, sigmoid, linear2, sigmoid).to(device)

# define cost/loss & optimizer
criterion = nn.BCELoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1)
for step in range(10001):
    optimizer.zero_grad()
    hypothesis = model(X)
    
    # cost/loss function
    cost = criterion(hypothesis, Y)
    cost.backward()
    optimizer.step()
    
    if step % 100 == 0:
        print("Step: {} \t Cost: {}".format(step, cost.item()))

Step: 0 	 Cost: 0.7174476385116577
Step: 100 	 Cost: 0.6931344270706177
Step: 200 	 Cost: 0.6929052472114563
Step: 300 	 Cost: 0.6922950744628906
Step: 400 	 Cost: 0.688478946685791
Step: 500 	 Cost: 0.6552730202674866
Step: 600 	 Cost: 0.5433056354522705
Step: 700 	 Cost: 0.3511308431625366
Step: 800 	 Cost: 0.12058607488870621
Step: 900 	 Cost: 0.05995449423789978
Step: 1000 	 Cost: 0.03859895467758179
Step: 1100 	 Cost: 0.028160858899354935
Step: 1200 	 Cost: 0.022059038281440735
Step: 1300 	 Cost: 0.01808222010731697
Step: 1400 	 Cost: 0.015295113436877728
Step: 1500 	 Cost: 0.013238020241260529
Step: 1600 	 Cost: 0.011659765616059303
Step: 1700 	 Cost: 0.010411925613880157
Step: 1800 	 Cost: 0.009401406161487103
Step: 1900 	 Cost: 0.008566854521632195
Step: 2000 	 Cost: 0.007866371423006058
Step: 2100 	 Cost: 0.0072702402248978615
Step: 2200 	 Cost: 0.006756922230124474
Step: 2300 	 Cost: 0.006310423836112022
Step: 2400 	 Cost: 0.005918555893003941
Step: 2500 	 Cost: 0.00557193439

## Code: XOR-nn Wide&Deep

In [33]:
X = torch.FloatTensor([[0, 0], [0, 1], [1, 0], [1, 1]]).to(device)
Y = torch.FloatTensor([[0], [1], [1], [0]]).to(device)

# nn layers
linear1 = nn.Linear(2, 10, bias=True)
linear2 = nn.Linear(10, 10, bias=True)
linear3 = nn.Linear(10, 10, bias=True)
linear4 = nn.Linear(10, 1, bias=True)
sigmoid = nn.Sigmoid()
model = nn.Sequential(linear1, sigmoid,
                      linear2, sigmoid,
                      linear3, sigmoid,
                      linear4, sigmoid).to(device)

# define cost/loss & optimizer
criterion = nn.BCELoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1)
for step in range(10001):
    optimizer.zero_grad()
    hypothesis = model(X)
    
    # cost/loss function
    cost = criterion(hypothesis, Y)
    cost.backward()
    optimizer.step()
    
    if step % 100 == 0:
        print("Step: {} \t Cost: {}".format(step, cost.item()))

Step: 0 	 Cost: 0.6932129859924316
Step: 100 	 Cost: 0.6931434273719788
Step: 200 	 Cost: 0.6931430697441101
Step: 300 	 Cost: 0.6931426525115967
Step: 400 	 Cost: 0.693142294883728
Step: 500 	 Cost: 0.6931418180465698
Step: 600 	 Cost: 0.6931414604187012
Step: 700 	 Cost: 0.6931411027908325
Step: 800 	 Cost: 0.6931406855583191
Step: 900 	 Cost: 0.6931402683258057
Step: 1000 	 Cost: 0.6931398510932922
Step: 1100 	 Cost: 0.6931394338607788
Step: 1200 	 Cost: 0.6931390166282654
Step: 1300 	 Cost: 0.693138599395752
Step: 1400 	 Cost: 0.6931381225585938
Step: 1500 	 Cost: 0.6931377053260803
Step: 1600 	 Cost: 0.6931371688842773
Step: 1700 	 Cost: 0.6931366920471191
Step: 1800 	 Cost: 0.6931362152099609
Step: 1900 	 Cost: 0.693135678768158
Step: 2000 	 Cost: 0.693135142326355
Step: 2100 	 Cost: 0.6931344270706177
Step: 2200 	 Cost: 0.6931338310241699
Step: 2300 	 Cost: 0.6931332349777222
Step: 2400 	 Cost: 0.6931325793266296
Step: 2500 	 Cost: 0.6931318640708923
Step: 2600 	 Cost: 0.6931310

# ReLU

별도의 내용이 없어서 생략 (ReLU와 Optimizer에 대해서 소개)

# Weight Initialization

## Code: mnist_nn_xavier

$$ a = \text{gain} \times \sqrt{\frac{6}{\text{fan_in} + \text{fan_out}}} $$

In [34]:
from torch.nn.init import _calculate_fan_in_and_fan_out

In [35]:
def xavier_uniform_(tensor, gain=1):
    '''
    Also known as glorot initialization.
    
    Args:
        tensor: a n-dimensional `torch.Tensor`
        gain: an optional scaling factor
        
    Examples:
        >>> w = torch.empty(3, 5)
        >>> nn.init.xavier_uniform_(w, gain=nn.init.calculate_gain('relu'))
    '''
    # Check: https://github.com/pytorch/pytorch/blob/8e9692df2787b64f879e83db617745b810bd7ef2/torch/nn/init.py#L209
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
    std = gain * math.sqrt(2.0/(fan_in + fan_out))
    a = math.sqrt(3.0) * std  # Calculate uniform bounds from standard deviation
    
    with torch.no_grad():
        return tensor.uniform_(-a, a)

# Dropout

따로 추가할 내용이 없어 생략
- 요약 하자면, torch.nn.Dropout(p=drop_prob)를 쓰는 이야기
- layer → activation → dropout순으로 진행

# Batch Normalization

- Gradient Vanishing / Exploding
- Internal Covariate Shift
    - Covariate Shift
        - Training과 Test셋의 분포에 차이가 있을 수 있으며, 이로 인해 문제가 발생할 수 있음
        - `입력과 출력의 분포가 다르다`라고도 생각할 수 있음
    - Internal Covariate Shift
        - 입력과 출력이 아닌 `Layer`간의 차이 때문에 Covariate Shift가 발생할 수 있다.
        - 이는 학습이 진행될 수록 각 Layer의 parameter의 분포가 다를 수 있기 때문에 생기는 문제이다.
- Batch Normalization
    - Layer마다 Normalization을 해서 변형된 분포가 나오지 않도록 하고자 하는 과정
    - 주의: 입력에 대한 분포를 Normalize하는 과정이다.
    - Normalize가 끝난 결과에 Gamma를 곱해주고 Beta를 더해주는 연산이 Batch Normalization이다.
        - Gamma와 Beta는 Trainable이다.
    - Sample mean/variance: Batch-size별 입력에 대한 mean/variancec
    - Running mean/variance: 모든 batch에 대한 sample mean/variance를 평균한 value
    - Layer → BN → Activation → ... 순으로 사용하는 것이 일반적인 사용법이다.
    - PyTorch에서 BatchNorm이나 Dropout을 사용할 시에는 `model.train()`을 이용해 학습 모드를 꼭 지정하도록 하자.
        - `model.eval()`은 Validation/Evaluation시 사용하도록 하자.

## Code: MNIST_batchnorm

In [36]:
# nn layers
linear1 = torch.nn.Linear(784, 32, bias=True)
linear2 = torch.nn.Linear(32, 32, bias=True)
linear3 = torch.nn.Linear(32, 10, bias=True)
relu = torch.nn.ReLU()
bn1 = torch.nn.BatchNorm1d(32)    # 1D Batch-normalization
bn2 = torch.nn.BatchNorm1d(32)

nn_linear1 = torch.nn.Linear(784, 32, bias=True)
nn_linear2 = torch.nn.Linear(32, 32, bias=True)
nn_linear3 = torch.nn.Linear(32, 10, bias=True)

# model
bn_model = torch.nn.Sequential(linear1, bn1, relu,
                               linear2, bn2, relu,
                               linear3).to(device)
nn_model = torch.nn.Sequential(nn_linear1, relu,
                               nn_linear2, relu,
                               nn_linear3).to(device)