# Lab 08-1 Perceptron

In [1]:
import torch
import numpy as np

## XOR

**BCE Loss** : Binary Cross Entropy

$$ BCE(x) = -\frac{1}{N} \sum_{i=1}^{N} \left[ y_i \log\left(h(x_i;\theta)\right) + (1 - y_i) \log\left(1 - h(x_i;\theta)\right) \right] $$


In [18]:
# XOR with a single-layer perceptron (one linear unit followed by a sigmoid)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# the four XOR input pairs and their targets, as float32 tensors on `device`
X = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32, device=device)
Y = torch.tensor([[0], [1], [1], [0]], dtype=torch.float32, device=device)

# model: 2 inputs -> 1 output -> sigmoid
linear = torch.nn.Linear(in_features=2, out_features=1, bias=True)
sigmoid = torch.nn.Sigmoid()
model = torch.nn.Sequential(linear, sigmoid).to(device)

# binary classification -> binary cross entropy, optimized with plain SGD
criterion = torch.nn.BCELoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1)

for step in range(10001):
  # forward pass and loss
  hypothesis = model(X)
  cost = criterion(hypothesis, Y)

  # backward pass and parameter update
  optimizer.zero_grad()
  cost.backward()
  optimizer.step()

  # report the loss every 100 steps
  if not step % 100:
    print(step, cost.item())

0 0.7297654151916504
100 0.6931477189064026
200 0.6931471824645996
300 0.6931471824645996
400 0.6931471824645996
500 0.6931471824645996
600 0.6931471824645996
700 0.6931471824645996
800 0.6931471824645996
900 0.6931471824645996
1000 0.6931471824645996
1100 0.6931471824645996
1200 0.6931471824645996
1300 0.6931471824645996
1400 0.6931471824645996
1500 0.6931471824645996
1600 0.6931471824645996
1700 0.6931471824645996
1800 0.6931471824645996
1900 0.6931471824645996
2000 0.6931471824645996
2100 0.6931471824645996
2200 0.6931471824645996
2300 0.6931471824645996
2400 0.6931471824645996
2500 0.6931471824645996
2600 0.6931471824645996
2700 0.6931471824645996
2800 0.6931471824645996
2900 0.6931471824645996
3000 0.6931471824645996
3100 0.6931471824645996
3200 0.6931471824645996
3300 0.6931471824645996
3400 0.6931471824645996
3500 0.6931471824645996
3600 0.6931471824645996
3700 0.6931471824645996
3800 0.6931471824645996
3900 0.6931471824645996
4000 0.6931471824645996
4100 0.6931471824645996
4200

# Lab 08-2 Multi Layer Perceptron

A single perceptron cannot represent XOR. To solve this problem, we stack perceptrons into a multi-layer network and train it with "**backpropagation**".

In [5]:
# Backpropagation

# Parameters of a 2-2-1 network, equivalent to two nn.Linear layers.
# Weights are stored as (in_features, out_features) because the forward pass
# computes matmul(X, w) + b.
# torch.Tensor(*sizes) only allocates memory and leaves it uninitialized, so the
# starting values would be arbitrary (possibly NaN/inf). Draw them from a
# standard normal distribution instead, which gives a well-defined starting
# point and breaks the symmetry between the hidden units.
w1 = torch.randn(2, 2, device=device)
b1 = torch.randn(2, device=device)
w2 = torch.randn(2, 1, device=device)
b2 = torch.randn(1, device=device)

In [6]:
def sigmoid(x):
  """Element-wise logistic function 1 / (1 + exp(-x)) of the tensor `x`."""
  exp_neg = torch.exp(-x)
  return 1.0 / (1.0 + exp_neg)
  

In [7]:
def sigmoid_prime(x):
  """Derivative of the sigmoid evaluated at `x`: sigmoid(x) * (1 - sigmoid(x))."""
  s = sigmoid(x)  # evaluate once and reuse for both factors
  return s * (1 - s)

In [19]:
# Step size used by the hand-written gradient-descent updates of w1, b1, w2, b2
learning_rate = 1

In [None]:
# learning

# Manual training loop for the 2-2-1 network: forward pass, hand-derived
# backward pass (chain rule) and a plain gradient-descent update.

# `cost` is a mean over the samples, so the parameter gradients must be
# averaged over the samples as well.
n_samples = X.shape[0]

for step in range(10001):
  # forward
  l1 = torch.add(torch.matmul(X, w1), b1)   # hidden pre-activation
  a1 = sigmoid(l1)                          # hidden activation
  l2 = torch.add(torch.matmul(a1, w2), b2)  # output pre-activation
  Y_pred = sigmoid(l2)                      # predicted probability

  # binary cross entropy, averaged over the samples
  cost = -torch.mean(Y * torch.log(Y_pred) + (1 - Y) * torch.log(1 - Y_pred))

  # back prop (chain rule)

  # Loss derivative: per-sample d(BCE)/d(Y_pred); the 1e-7 keeps the
  # denominator away from zero when Y_pred saturates at 0 or 1
  d_Y_pred = (Y_pred - Y) / (Y_pred * (1.0 - Y_pred) + 1e-7)

  # layer 2
  d_l2 = d_Y_pred * sigmoid_prime(l2)
  d_b2 = d_l2
  # the matmul sums the per-sample gradients; dividing by the sample count
  # turns it into a mean, consistent with the bias update and the mean cost
  d_w2 = torch.matmul(torch.transpose(a1, 0, 1), d_b2) / n_samples

  # layer 1
  d_a1 = torch.matmul(d_b2, torch.transpose(w2, 0, 1))
  d_l1 = d_a1 * sigmoid_prime(l1)
  d_b1 = d_l1
  d_w1 = torch.matmul(torch.transpose(X, 0, 1), d_b1) / n_samples

  # weight update (d_b1 / d_b2 are still per-sample, hence the mean over dim 0)
  w1 = w1 - learning_rate * d_w1
  b1 = b1 - learning_rate * torch.mean(d_b1, 0)
  w2 = w2 - learning_rate * d_w2
  b2 = b2 - learning_rate * torch.mean(d_b2, 0)

  if step % 100 == 0:
    print(step, cost.item())

In [21]:
# Backpropagation using torch (autograd computes the gradients)

# nn layers
# MLP: 2 -> 2 -> 1, with a sigmoid after each linear layer
linear1 = torch.nn.Linear(in_features=2, out_features=2, bias=True)
linear2 = torch.nn.Linear(in_features=2, out_features=1, bias=True)
sigmoid = torch.nn.Sigmoid()
model = torch.nn.Sequential(
  linear1, sigmoid,
  linear2, sigmoid,
).to(device)

# binary cross entropy loss, optimized with plain SGD
criterion = torch.nn.BCELoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1)

for step in range(10001):
  # forward pass and loss
  hypothesis = model(X)
  cost = criterion(hypothesis, Y)

  # backward pass and parameter update
  optimizer.zero_grad()
  cost.backward()
  optimizer.step()

  # report the loss every 100 steps
  if not step % 100:
    print(step, cost.item())

0 0.6931954026222229
100 0.6930721998214722
200 0.6929640173912048
300 0.6926223039627075
400 0.690690815448761
500 0.6712223887443542
600 0.5781338810920715
700 0.5087843537330627
800 0.3558744788169861
900 0.12581370770931244
1000 0.061768241226673126
1100 0.03949028253555298
1200 0.028697241097688675
1300 0.02242206409573555
1400 0.018346931785345078
1500 0.015498388558626175
1600 0.013400016352534294
1700 0.01179259829223156
1800 0.010523222386837006
1900 0.009496381506323814
2000 0.00864909403026104
2100 0.007938425987958908
2200 0.007334042806178331
2300 0.006813946180045605
2400 0.0063617597334086895
2500 0.005965052638202906
2600 0.005614331923425198
2700 0.005302071571350098
2800 0.005022276192903519
2900 0.00477019976824522
3000 0.004541928879916668
3100 0.0043343049474060535
3200 0.004144606180489063
3300 0.003970608115196228
3400 0.003810522612184286
3500 0.0036626821383833885
3600 0.003525795415043831
3700 0.0033986461348831654
3800 0.0032802748028188944
3900 0.00316979549

The loss has reduced with the wide-deep MLP!

In [22]:
# XOR-nn-wide-deep

# nn layers

# four linear layers (2 -> 10 -> 10 -> 10 -> 1), each followed by a sigmoid
linear1 = torch.nn.Linear(in_features=2, out_features=10, bias=True)
linear2 = torch.nn.Linear(in_features=10, out_features=10, bias=True)
linear3 = torch.nn.Linear(in_features=10, out_features=10, bias=True)
linear4 = torch.nn.Linear(in_features=10, out_features=1, bias=True)
sigmoid = torch.nn.Sigmoid()

model = torch.nn.Sequential(
  linear1, sigmoid,
  linear2, sigmoid,
  linear3, sigmoid,
  linear4, sigmoid,
).to(device)

# binary cross entropy loss, optimized with plain SGD
criterion = torch.nn.BCELoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1)

for step in range(10001):
  # forward pass and loss
  hypothesis = model(X)
  cost = criterion(hypothesis, Y)

  # backward pass and parameter update
  optimizer.zero_grad()
  cost.backward()
  optimizer.step()

  # report the loss every 100 steps
  if not step % 100:
    print(step, cost.item())

0 0.6957021951675415
100 0.6931525468826294
200 0.6931518316268921
300 0.6931511163711548
400 0.6931504011154175
500 0.6931496858596802
600 0.6931490302085876
700 0.6931482553482056
800 0.6931476593017578
900 0.6931469440460205
1000 0.6931463479995728
1100 0.6931456327438354
1200 0.6931450366973877
1300 0.6931443214416504
1400 0.6931436061859131
1500 0.6931430101394653
1600 0.693142294883728
1700 0.6931415796279907
1800 0.6931408643722534
1900 0.6931401491165161
2000 0.6931394934654236
2100 0.6931387186050415
2200 0.6931378841400146
2300 0.6931371688842773
2400 0.6931363344192505
2500 0.6931354403495789
2600 0.6931345462799072
2700 0.6931335926055908
2800 0.6931326985359192
2900 0.693131685256958
3000 0.693130612373352
3100 0.6931294202804565
3200 0.693128228187561
3300 0.6931269764900208
3400 0.6931256055831909
3500 0.6931241750717163
3600 0.6931227445602417
3700 0.6931210160255432
3800 0.6931192874908447
3900 0.6931174397468567
4000 0.6931154131889343
4100 0.6931131482124329
4200 0.6