## Cost Function


In [17]:
import torch
import torch.optim as optim

In [18]:
# Toy regression data: three samples with one feature each; targets follow y = 2x.
X_train = torch.tensor([[1.0], [2.0], [3.0]], dtype=torch.float32)
y_train = torch.tensor([[2.0], [4.0], [6.0]], dtype=torch.float32)

# Learnable weight and bias, both starting at zero and tracked by autograd.
W = torch.zeros(1).requires_grad_(True)
b = torch.zeros(1).requires_grad_(True)

# Linear hypothesis; W and b broadcast across the three rows of X_train.
hype = W * X_train + b

`requires_grad=True`로 설정하면 해당 tensor에 적용되는 모든 연산을 추적한 후, `backward()` 호출 시 gradient를 자동으로 계산함  
`detach()`를 사용하면 tensor를 연산 기록으로부터 분리할 수 있음  
혹은 validation set처럼 gradient가 필요 없는 구간에서는 `with torch.no_grad():`를 사용할 수 있음  

In [19]:
# Mean squared error between the current predictions and the targets.
MSE_cost = (hype - y_train).pow(2).mean()

In [20]:
# Plain SGD over the two learnable tensors.
optimizer = optim.SGD(params=[W, b], lr=0.01)

# One manual update step: reset gradients, backpropagate the MSE, apply the update.
optimizer.zero_grad()
MSE_cost.backward()
optimizer.step()

In [21]:
n_epochs = 1000
for epoch in range(1, n_epochs + 1):
    # Forward pass: recompute the predictions and their mean squared error.
    hype = W * X_train + b
    cost = (hype - y_train).pow(2).mean()

    # Backward pass and parameter update.
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()

    # Report the cost once every 100 epochs.
    if epoch % 100 != 0:
        continue
    print(f"cost : {cost}")

cost : 0.04817060008645058
cost : 0.029766537249088287
cost : 0.018393920734524727
cost : 0.011366334743797779
cost : 0.007023668382316828
cost : 0.00434019835665822
cost : 0.0026819754857569933
cost : 0.0016572937602177262
cost : 0.0010241125710308552
cost : 0.0006328357267193496


## GD

In [48]:
# Toy data following y = 2x; only a weight is learned in this section (no bias).
X_train = torch.tensor([[1.0], [2.0], [3.0]], dtype=torch.float32)
y_train = torch.tensor([[2.0], [4.0], [6.0]], dtype=torch.float32)

# Autograd is left off: the gradient for W is computed by hand.
W = torch.zeros(1, dtype=torch.float32)

In [49]:
# Step size for the analytic MSE gradient; the torch.optim version of this loop uses the same value.
lr = 0.15
n_epoch = 10

for epoch in range(1, n_epoch + 1):
    # Forward pass: bias-free hypothesis scored with mean squared error.
    hype = X_train * W
    cost = torch.mean((hype - y_train) ** 2)

    # Analytic derivative of the cost above:
    #   d/dW mean((W*x - y)^2) = 2 * mean((W*x - y) * x)
    # A sum(...) without the factor 2 is not the derivative of the mean-based cost;
    # with three samples, sum(...) and lr = 0.1 takes the same step (0.1 * 3 == 0.15 * 2),
    # so the printed trajectory is unchanged.
    gradient = 2 * torch.mean((hype - y_train) * X_train)

    print(f"Epoch : {epoch:4d}/{n_epoch}, W : {W.item():.3f}, Cost : {cost.item():.3f}")

    # Gradient-descent update, applied in place after logging the pre-update state.
    W -= lr * gradient

Epoch :    1/10, W : 0.000, Cost : 18.667
Epoch :    2/10, W : 2.800, Cost : 2.987
Epoch :    3/10, W : 1.680, Cost : 0.478
Epoch :    4/10, W : 2.128, Cost : 0.076
Epoch :    5/10, W : 1.949, Cost : 0.012
Epoch :    6/10, W : 2.020, Cost : 0.002
Epoch :    7/10, W : 1.992, Cost : 0.000
Epoch :    8/10, W : 2.003, Cost : 0.000
Epoch :    9/10, W : 1.999, Cost : 0.000
Epoch :   10/10, W : 2.001, Cost : 0.000


### torch.optim

In [47]:
lr = 0.15
W = torch.zeros(1).requires_grad_(True)
optimizer = optim.SGD(params=[W], lr=lr)

n_epoch = 10

for epoch in range(1, n_epoch + 1):
    # Forward pass: bias-free hypothesis and its mean squared error.
    hype = W * X_train
    cost = (hype - y_train).pow(2).mean()
    print(f"Epoch : {epoch:4d}/{n_epoch}, W : {W.item():.3f}, Cost : {cost.item():.3f}")

    # Let autograd compute d(cost)/dW and have SGD apply the step.
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()

Epoch :    1/10, W : 0.000, Cost : 18.667
Epoch :    2/10, W : 2.800, Cost : 2.987
Epoch :    3/10, W : 1.680, Cost : 0.478
Epoch :    4/10, W : 2.128, Cost : 0.076
Epoch :    5/10, W : 1.949, Cost : 0.012
Epoch :    6/10, W : 2.020, Cost : 0.002
Epoch :    7/10, W : 1.992, Cost : 0.000
Epoch :    8/10, W : 2.003, Cost : 0.000
Epoch :    9/10, W : 1.999, Cost : 0.000
Epoch :   10/10, W : 2.001, Cost : 0.000


## requires_grad, zero_grad, no_grad
  
- requires_grad
    - True
        - Tensor의 gradient를 자동으로 계산하여 Tensor의 `.grad`에 저장
        - `backward()`를 호출하면 gradient가 계산됨
    - False
        - Tensor의 변화도(gradient)를 계산하지 않음
- zero_grad
    - gradient를 0으로 만듦
    - backward를 호출할 때마다 gradient가 누적되는 것을 방지
    - 매 학습 step 전에 gradient를 초기화
- no_grad
    - history를 트래킹하지 않음 (gradient 계산 비활성화)

요즘은 RAdam, AdamW를 많이 사용  
AdamP는 네이버에서 만듦  

## Minibatch Gradient Descent
- 전체 데이터가 아니라 minibatch에 있는 데이터만 사용
    - 전체 데이터를 쓰지 않아 잘못된 방향으로 업데이트할 수 있음

## Softmax Classification

- logistic regression의 연장선
    - 이진 분류에 쓰는 sigmoid를 다중 클래스 분류로 확장한 것이 softmax

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# Fix the global RNG seed for reproducibility; the returned Generator is shown as the cell output.
torch.manual_seed(seed=1)

<torch._C.Generator at 0x2e13ac63e90>

In [4]:
# Raw scores (logits) for three classes.
z = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float32)

# Softmax over the only axis turns the scores into probabilities that sum to 1.
y_pred = z.softmax(dim=0)
y_pred

tensor([0.0900, 0.2447, 0.6652])

## Cross Entropy
$H(P, Q) = -\mathbb{E}_{x \sim P}[\log Q(x)] = -\sum_{x} P(x) \log Q(x)$