### XOR : 두개의 입력이 다른경우에 1을, 같은 경우에 0을 출력하는 논리 연산
* 단일 퍼셉트론으로는 계산이 불가능하기 때문에 Multi layer perceptrons을 사용한다.
* 역전파를 사용해서 MLP를 사용이 가능

In [36]:
import torch

In [37]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# for reproducibility
torch.manual_seed(777)
if device == 'cuda':
    torch.cuda.manual_seed_all(777)

In [38]:
# XOR
X = torch.FloatTensor([[0, 0], [0, 1], [1, 0], [1, 1]]).to(device)
Y = torch.FloatTensor([[0], [1], [1], [0]]).to(device) # 정답

In [39]:
# 학습을 하기 위해 Layer를 구성하는 weight와 bias 
# nn Layers
w1 = torch.Tensor(2,2).to(device)
b1 = torch.Tensor(2).to(device)
w2 = torch.Tensor(2,1).to(device)
b2 = torch.Tensor(1).to(device)

# nn.Linear 두개로 이루어 진것과 같음 

In [40]:
# sigmoid함수 구현
def sigmoid(x) :
    # sigmoid function
    return 1.0 / (1.0 + torch.exp(-x))

# sigmoid의 미분함수 
def sigmoid_prime(x) :
    return sigmoid(x)*(1-sigmoid(x))

In [42]:
# 학습 코드 

learning_rate = 0.1
for step in range(10001) :
    # forward
    l1 = torch.add(torch.matmul(X, w1), b1)
    a1 = sigmoid(l1)
    l2 = torch.add(torch.matmul(a1, w2), b2)
    Y_pred = sigmoid(l2)
    
    cost = -torch.mean(Y * torch.log(Y_pred) + (1-Y)* torch.log(1 - Y_pred))
    
    # Backprop
    # loss(binary cross entropy loss)에 대한 미분 
    d_Y_pred = (Y_pred - Y) / (Y_pred * (1.0 - Y_pred) + 1e-7)
    
    # Layer 2
    d_l2 = d_Y_pred * sigmoid_prime(l2)
    d_b2 = d_l2 # bias
    d_w2 = torch.matmul(torch.transpose(a1, 0, 1), d_b2) # transpose : a1 의 차원 (n x m) 을 (m x n) 으로 바꿈 
    
    # Layer 1 
    d_a1 = torch.matmul(d_b2, torch.transpose(w2, 0, 1))
    d_l1 = d_a1 * sigmoid_prime(l1)
    d_b1 = d_l1 # bias
    d_w1 = torch.matmul(torch.transpose(X, 0 ,1), d_b1)
    
    # Weight update (gradient descent)
    w1 = w1 - learning_rate * d_w1
    b1 = b1 - learning_rate * torch.mean(d_b1, 0)
    w2 = w2 - learning_rate * d_w2
    b2 = b2 - learning_rate * torch.mean(d_b2, 0)
    
    if step % 100 == 0 :
        print(step, cost)

0 tensor(nan)
100 tensor(nan)
200 tensor(nan)
300 tensor(nan)
400 tensor(nan)
500 tensor(nan)
600 tensor(nan)
700 tensor(nan)
800 tensor(nan)
900 tensor(nan)
1000 tensor(nan)
1100 tensor(nan)
1200 tensor(nan)
1300 tensor(nan)
1400 tensor(nan)
1500 tensor(nan)
1600 tensor(nan)
1700 tensor(nan)
1800 tensor(nan)
1900 tensor(nan)
2000 tensor(nan)
2100 tensor(nan)
2200 tensor(nan)
2300 tensor(nan)
2400 tensor(nan)
2500 tensor(nan)
2600 tensor(nan)
2700 tensor(nan)
2800 tensor(nan)
2900 tensor(nan)
3000 tensor(nan)
3100 tensor(nan)
3200 tensor(nan)
3300 tensor(nan)
3400 tensor(nan)
3500 tensor(nan)
3600 tensor(nan)
3700 tensor(nan)
3800 tensor(nan)
3900 tensor(nan)
4000 tensor(nan)
4100 tensor(nan)
4200 tensor(nan)
4300 tensor(nan)
4400 tensor(nan)
4500 tensor(nan)
4600 tensor(nan)
4700 tensor(nan)
4800 tensor(nan)
4900 tensor(nan)
5000 tensor(nan)
5100 tensor(nan)
5200 tensor(nan)
5300 tensor(nan)
5400 tensor(nan)
5500 tensor(nan)
5600 tensor(nan)
5700 tensor(nan)
5800 tensor(nan)
5900 tens

## Xor -nn

In [43]:
# XOR
X = torch.FloatTensor([[0, 0], [0, 1], [1, 0], [1, 1]]).to(device)
Y = torch.FloatTensor([[0], [1], [1], [0]]).to(device) # 정답

In [44]:
# nn layers
linear1 = torch.nn.Linear(2, 2, bias = True)
linear2 = torch.nn.Linear(2, 1, bias = True)
sigmoid = torch.nn.Sigmoid()

In [45]:
# model
model = torch.nn.Sequential(linear1, sigmoid, linear2, sigmoid).to(device)

In [46]:
# define cost/loss & optimizer
criterion = torch.nn.BCELoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1)

In [47]:
# training
for step in range(10001):
    optimizer.zero_grad()
    hypothesis = model(X)

    # cost/loss function
    cost = criterion(hypothesis, Y)
    cost.backward()
    optimizer.step()

    if step % 100 == 0:
        print("step:", step,"Cost:", cost.item())

step: 0 Cost: 0.7434073090553284
step: 100 Cost: 0.693165123462677
step: 200 Cost: 0.6931577920913696
step: 300 Cost: 0.6931517124176025
step: 400 Cost: 0.6931463479995728
step: 500 Cost: 0.6931411027908325
step: 600 Cost: 0.6931357383728027
step: 700 Cost: 0.6931294798851013
step: 800 Cost: 0.6931220889091492
step: 900 Cost: 0.6931126117706299
step: 1000 Cost: 0.6930999755859375
step: 1100 Cost: 0.693082332611084
step: 1200 Cost: 0.6930569410324097
step: 1300 Cost: 0.6930190324783325
step: 1400 Cost: 0.6929606199264526
step: 1500 Cost: 0.6928660273551941
step: 1600 Cost: 0.6927032470703125
step: 1700 Cost: 0.6923960447311401
step: 1800 Cost: 0.6917301416397095
step: 1900 Cost: 0.6899653673171997
step: 2000 Cost: 0.683831512928009
step: 2100 Cost: 0.6561665534973145
step: 2200 Cost: 0.431100070476532
step: 2300 Cost: 0.1348930448293686
step: 2400 Cost: 0.0663042739033699
step: 2500 Cost: 0.04216815158724785
step: 2600 Cost: 0.03045385330915451
step: 2700 Cost: 0.0236658975481987
step: 

In [10]:
# Accuracy computation
# True if hypothesis>0.5 else False
with torch.no_grad():
    hypothesis = model(X)
    predicted = (hypothesis > 0.5).float()
    accuracy = (predicted == Y).float().mean()
    print('\nHypothesis: ', hypothesis.detach().cpu().numpy(), '\nCorrect: ', predicted.detach().cpu().numpy(), '\nAccuracy: ', accuracy.item())


Hypothesis:  [[0.5]
 [0.5]
 [0.5]
 [0.5]] 
Correct:  [[0.]
 [0.]
 [0.]
 [0.]] 
Accuracy:  0.5


### 층을 늘림

In [16]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# for reproducibility
torch.manual_seed(777)
if device == 'cuda':
    torch.cuda.manual_seed_all(777)

In [17]:
# nn layers
linear1 = torch.nn.Linear(2, 2, bias=True)
linear2 = torch.nn.Linear(2, 1, bias=True)
sigmoid = torch.nn.Sigmoid()

In [18]:
# model
model2 = torch.nn.Sequential(linear1, sigmoid, linear2, sigmoid).to(device)

In [19]:
# define cost/loss & optimizer
criterion = torch.nn.BCELoss().to(device)
optimizer = torch.optim.SGD(model2.parameters(), lr=1)  # modified learning rate from 0.1 to 1

In [20]:
for step in range(10001):
    optimizer.zero_grad() # 파라미터 초기화 
    hypothesis = model2(X) 

    # cost/loss function
    cost = criterion(hypothesis, Y)
    cost.backward()
    optimizer.step()

    if step % 100 == 0:
        print(step, cost.item())

0 0.7434073090553284
100 0.693165123462677
200 0.6931577920913696
300 0.6931517124176025
400 0.6931463479995728
500 0.6931411027908325
600 0.6931357383728027
700 0.6931294798851013
800 0.6931220889091492
900 0.6931126117706299
1000 0.6930999755859375
1100 0.693082332611084
1200 0.6930569410324097
1300 0.6930190324783325
1400 0.6929606199264526
1500 0.6928660273551941
1600 0.6927032470703125
1700 0.6923960447311401
1800 0.6917301416397095
1900 0.6899653673171997
2000 0.683831512928009
2100 0.6561665534973145
2200 0.431100070476532
2300 0.1348930448293686
2400 0.0663042739033699
2500 0.04216815158724785
2600 0.03045385330915451
2700 0.0236658975481987
2800 0.01927773468196392
2900 0.01622403785586357
3000 0.01398373395204544
3100 0.01227390207350254
3200 0.010928118601441383
3300 0.009842472150921822
3400 0.008949032984673977
3500 0.008201336488127708
3600 0.0075667379423975945
3700 0.007021686062216759
3800 0.006548595614731312
3900 0.006134253926575184
4000 0.005768344737589359
4100 0.

In [21]:
# Accuracy computation
# 정확도 증가 
# True if hypothesis>0.5 else False
with torch.no_grad():  # update된 gradient
    hypothesis = model2(X)
    predicted = (hypothesis > 0.5).float() # 확률이 0.5 보다 클때 1 , 작을때 0
    accuracy = (predicted == Y).float().mean()
    print('\nHypothesis: ', hypothesis.detach().cpu().numpy(), '\nCorrect: ', predicted.detach().cpu().numpy(), '\nAccuracy: ', accuracy.item())


Hypothesis:  [[0.00106364]
 [0.99889404]
 [0.99889404]
 [0.00165861]] 
Correct:  [[0.]
 [1.]
 [1.]
 [0.]] 
Accuracy:  1.0


### nn layers

In [49]:
X = torch.FloatTensor([[0, 0], [0, 1], [1, 0], [1, 1]]).to(device)
Y = torch.FloatTensor([[0], [1], [1], [0]]).to(device)

In [50]:
# nn layers
linear1 = torch.nn.Linear(2, 10, bias=True)
linear2 = torch.nn.Linear(10, 10, bias=True)
linear3 = torch.nn.Linear(10, 10, bias=True)
linear4 = torch.nn.Linear(10, 1, bias=True)
sigmoid = torch.nn.Sigmoid()

In [51]:
# model
model3 = torch.nn.Sequential(linear1, sigmoid, linear2, sigmoid, linear3, sigmoid, linear4, sigmoid).to(device)

In [52]:
# define cost/loss & optimizer
criterion = torch.nn.BCELoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1)  # modified learning rate from 0.1 to 1

In [53]:
for step in range(10001):
    optimizer.zero_grad()
    hypothesis = model(X)

    # cost/loss function
    cost = criterion(hypothesis, Y)
    cost.backward()
    optimizer.step()

    if step % 100 == 0:
        print(step, cost.item())

0 0.0012343396665528417
100 0.0012182408245280385
200 0.0012025302276015282
300 0.0011871629394590855
400 0.0011722433846443892
500 0.0011576821561902761
600 0.0011434644693508744
700 0.0011295899748802185
800 0.0011160289868712425
900 0.0011027962900698185
1000 0.0010899071348831058
1100 0.001077286433428526
1200 0.0010649494361132383
1300 0.0010528811253607273
1400 0.0010410815011709929
1500 0.0010295654647052288
1600 0.0010182731784880161
1700 0.0010072496952489018
1800 0.0009964201599359512
1900 0.000985888997092843
2000 0.0009755371138453484
2100 0.0009653493762016296
2200 0.0009554600110277534
2300 0.0009457049891352654
2400 0.0009361589327454567
2500 0.0009268366266041994
2600 0.0009177083848044276
2700 0.0009086995269171894
2800 0.0008998995763249695
2900 0.0008912637713365257
3000 0.0008827920537441969
3100 0.0008744547376409173
3200 0.0008662814507260919
3300 0.0008582425070926547
3400 0.000850397627800703
3500 0.0008426720160059631
3600 0.0008350807474926114
3700 0.000827608

In [54]:
# Accuracy computation
# True if hypothesis>0.5 else False
with torch.no_grad():
    hypothesis = model(X)
    predicted = (hypothesis > 0.5).float()
    accuracy = (predicted == Y).float().mean()
    print('\nHypothesis: ', hypothesis.detach().cpu().numpy(), '\nCorrect: ', predicted.detach().cpu().numpy(), '\nAccuracy: ', accuracy.item())


Hypothesis:  [[4.5869316e-04]
 [9.9952638e-01]
 [9.9952638e-01]
 [7.1005197e-04]] 
Correct:  [[0.]
 [1.]
 [1.]
 [0.]] 
Accuracy:  1.0
