In [2]:
import torch

# Scalar computation graph: price = apple * apple_num * tax.
# Every input tracks gradients so backward() fills in d(price)/d(input).
apple      = torch.tensor(100.0, requires_grad=True)
apple_num  = torch.tensor(2.0,   requires_grad=True)
tax        = torch.tensor(1.1,   requires_grad=True)

price = apple * apple_num * tax
# Use .item() rather than float(): converting a tensor that requires grad
# with float() emits a "Consider using tensor.detach() first" UserWarning.
print(price.item())


# Backpropagate from the scalar result to populate .grad on each leaf tensor.
price.backward()

print("dapple     =", apple.grad.item())      # apple_num * tax
print("dapple_num =", apple_num.grad.item())  # apple * tax
print("dtax       =", tax.grad.item())        # apple * apple_num


220.0
dapple     = 2.200000047683716
dapple_num = 110.0
dtax       = 200.0


Consider using tensor.detach() first. (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\pytorch\torch\csrc\autograd\generated\python_variable_methods.cpp:837.)
  print(float(price))


In [None]:
# Scalar computation graph:
#   price = (apple * apple_num + orange * orange_num) * tax
# Every input tracks gradients so backward() fills in d(price)/d(input).
apple       = torch.tensor(100.0, requires_grad=True)
apple_num   = torch.tensor(2.0,   requires_grad=True)
orange      = torch.tensor(150.0, requires_grad=True)
orange_num  = torch.tensor(3.0,   requires_grad=True)
tax         = torch.tensor(1.1,   requires_grad=True)

apple_price   = apple * apple_num
orange_price  = orange * orange_num
all_price     = apple_price + orange_price
price         = all_price * tax

# Use .item() rather than float(): converting a tensor that requires grad
# with float() emits a "Consider using tensor.detach() first" UserWarning.
print("price =", price.item())

# Backpropagate from the scalar result to populate .grad on each leaf tensor.
price.backward()

print("dapple      =", apple.grad.item())       # apple_num * tax
print("dapple_num  =", apple_num.grad.item())   # apple * tax
print("dorange     =", orange.grad.item())      # orange_num * tax
print("dorange_num =", orange_num.grad.item())  # orange * tax
print("dtax        =", tax.grad.item())         # apple_price + orange_price


price = 715.0
dapple      = 2.200000047683716
dapple_num  = 110.0
dorange     = 3.3000001907348633
dorange_num = 165.0
dtax        = 650.0


In [8]:
import torch.nn as nn

class ReLU(nn.Module):
    """Module wrapper around the element-wise ReLU activation."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` with ReLU applied element-wise (same shape as ``x``)."""
        return x.relu()
    
class Sigmoid(nn.Module):
    """Module wrapper around the element-wise sigmoid activation."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` with the sigmoid applied element-wise (same shape as ``x``)."""
        return x.sigmoid()
    


In [9]:
# 2x2 sample input containing both positive and negative entries.
x = torch.tensor([
    [1.0, -0.5],
    [-2.0, 3.0],
])
print("x =\n", x)

# Boolean mask that is True wherever the entry is zero or negative.
mask = x.le(0)
print("mask(x<=0) =\n", mask)

x =
 tensor([[ 1.0000, -0.5000],
        [-2.0000,  3.0000]])
mask(x<=0) =
 tensor([[False,  True],
        [ True, False]])


In [10]:
# Build one instance of each activation module.
relu, sigm = ReLU(), Sigmoid()

# Forward pass: apply each activation to the sample input x.
y_relu, y_sigm = relu(x), sigm(x)

print("ReLU(x) =\n", y_relu)
print("Sigmoid(x) =\n", y_sigm)


ReLU(x) =
 tensor([[1., 0.],
        [0., 3.]])
Sigmoid(x) =
 tensor([[0.7311, 0.3775],
        [0.1192, 0.9526]])


In [None]:
# Random operands for one affine transform: X is (2,), W is (2, 3), B is (3,).
X = torch.rand(2)
W = torch.rand(2, 3)
B = torch.rand(3)
print(X.shape, W.shape, B.shape)

Y = torch.matmul(X, W) + B

# Batched case: the bias vector is broadcast across every row of X_dot_W.
X_dot_W = torch.tensor([
    [0., 0., 0.],
    [10., 10., 10.],
])
B = torch.tensor([1., 2., 3.])
print(X_dot_W)
print(X_dot_W + B)

# Upstream gradient with one row per sample.
dY = torch.tensor([
    [1., 2., 3.],
    [4., 5., 6.],
])
print(dY)

# Because B was broadcast over rows, its gradient is the sum over the row axis.
dB = torch.sum(dY, dim=0)
print(dB)


torch.Size([2]) torch.Size([2, 3]) torch.Size([3])
tensor([[ 0.,  0.,  0.],
        [10., 10., 10.]])
tensor([[ 1.,  2.,  3.],
        [11., 12., 13.]])
tensor([[1., 2., 3.],
        [4., 5., 6.]])
tensor([5., 7., 9.])


In [None]:
import torch.nn.functional as F

class Affine(nn.Module):
    """Fully connected layer computing ``x @ W + b`` with trainable ``W`` and ``b``."""

    def __init__(self, W: torch.Tensor, b: torch.Tensor):
        """Store float32 copies of ``W`` and ``b`` as parameters.

        The inputs are detached and cloned, so the layer's parameters do not
        share storage or autograd history with the tensors passed in.
        """
        super().__init__()

        weight = W.detach().clone().float()
        bias = b.detach().clone().float()
        self.W = nn.Parameter(weight)
        self.b = nn.Parameter(bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the affine map to ``x`` (matrix product with W, then add b)."""
        return torch.matmul(x, self.W) + self.b

class SoftmaxWithLoss(nn.Module):
    """Softmax followed by cross-entropy loss.

    Accepts targets either as one-hot / probability rows with the same shape
    as the logits, or as class indices.
    """

    def __init__(self, reduction: str = "mean"):
        """Store the reduction mode.

        Args:
            reduction: "mean", "sum", or "none". In the one-hot branch any
                value other than "mean" or "sum" returns the per-sample loss.
        """
        super().__init__()
        self.reduction = reduction

    def forward(self, x: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
        """Compute the cross-entropy between logits ``x`` and targets ``t``.

        Args:
            x: logits; the class axis is dim 1 in the one-hot branch.
            t: either a 2-D tensor with the same shape as ``x`` (one-hot or
                class probabilities), or class indices.

        Returns:
            The reduced loss, or the per-sample loss when reduction is "none".
        """
        # (1) t has the same 2-D shape as x: treat rows as one-hot / probabilities.
        if t.dim() == 2 and t.shape == x.shape:
            log_probs = F.log_softmax(x, dim=1)
            loss_vec = -(t * log_probs).sum(dim=1)
            if self.reduction == "mean":
                return loss_vec.mean()
            if self.reduction == "sum":
                return loss_vec.sum()
            return loss_vec  # 'none'

        # (2) t holds class indices: delegate to PyTorch's cross_entropy.
        # The hand-written backward of the original implementation,
        #   if one-hot: dx = (y - t) / N
        #   else:       dx = (y - one_hot(t)) / N
        # is handled internally by autograd.
        # cross_entropy requires integer (long) class indices. Labels that
        # arrive as floats (e.g. after a blanket .float() conversion) are cast
        # when their shape is index-like, i.e. one dimension fewer than x.
        if t.is_floating_point() and t.dim() == x.dim() - 1:
            t = t.long()
        return F.cross_entropy(x, t, reduction=self.reduction)



In [None]:
import sys, os
sys.path.append(os.path.join(os.getcwd(), 'common'))
from collections import OrderedDict

import torch.nn.functional as F

class TwoLayerNet:
    """Two-layer classifier: Affine -> ReLU -> Affine, with a softmax cross-entropy loss.

    Attributes:
        device: device string/object on which parameters and inputs live.
        layers: OrderedDict of the forward layers, applied in insertion order.
        lastLayer: SoftmaxWithLoss module used by ``loss``.
        params: dict mapping 'W1', 'b1', 'W2', 'b2' to the Affine layers' parameters.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01, device=None):
        """Create the layers with Gaussian-initialised weights and zero biases.

        Args:
            input_size: number of input features.
            hidden_size: width of the hidden layer.
            output_size: number of output classes.
            weight_init_std: scale applied to the standard-normal weight samples.
            device: target device; defaults to "cuda" when available, else "cpu".
        """
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")

        W1 = weight_init_std * torch.randn(input_size, hidden_size, device=self.device)
        b1 = torch.zeros(hidden_size, device=self.device)
        W2 = weight_init_std * torch.randn(hidden_size, output_size, device=self.device)
        b2 = torch.zeros(output_size, device=self.device)

        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(W1, b1).to(self.device)
        # Instantiate the ReLU module class. The lowercase name `relu` is a
        # ReLU *instance* created in an earlier cell, so calling `relu()` would
        # invoke forward() without an input and fail.
        self.layers['Relu1']   = ReLU().to(self.device)
        self.layers['Affine2'] = Affine(W2, b2).to(self.device)

        self.lastLayer = SoftmaxWithLoss().to(self.device)
        self.params = {
            'W1': self.layers['Affine1'].W,
            'b1': self.layers['Affine1'].b,
            'W2': self.layers['Affine2'].W,
            'b2': self.layers['Affine2'].b,
        }

    def _to_tensor(self, x):
        """Return ``x`` as a float32 tensor on ``self.device``.

        torch.as_tensor accepts tensors, NumPy arrays and nested sequences, so
        this does not depend on a module-level ``np`` import.
        """
        return torch.as_tensor(x).to(self.device).float()

    def _to_target(self, t):
        """Return labels ``t`` on ``self.device`` with a dtype suited to the loss.

        1-D labels are treated as class indices and cast to long (required by
        cross_entropy); any other shape is cast to float32 (one-hot rows).
        """
        t = torch.as_tensor(t).to(self.device)
        return t.long() if t.dim() == 1 else t.float()

    def predict(self, x):
        """Run ``x`` through every layer in order and return the output scores."""
        x = self._to_tensor(x)
        for layer in self.layers.values():
            x = layer(x)
        return x

    def loss(self, x, t):
        """Return the softmax cross-entropy loss of the prediction for ``x`` against ``t``."""
        y = self.predict(x)
        return self.lastLayer(y, self._to_target(t))

    @torch.no_grad()  # evaluation only: no autograd graph is needed
    def accuracy(self, x, t):
        """Return the fraction of samples whose argmax prediction matches the label.

        ``t`` may be 1-D class indices or one-hot rows (reduced with argmax).
        """
        y = self.predict(x).argmax(dim=1)
        t = self._to_target(t)
        if t.dim() != 1:
            t = t.argmax(dim=1)
        return (y == t).float().mean().item()

    @torch.no_grad()
    def numerical_gradient(self, x, t, eps=1e-4):
        """Estimate the loss gradient of every parameter by central differences.

        Each parameter element is perturbed by +eps and -eps in turn, the loss
        is re-evaluated, and the element is restored afterwards.

        Returns:
            dict mapping 'W1', 'b1', 'W2', 'b2' to gradient tensors shaped like
            the corresponding parameter.
        """
        x_t = self._to_tensor(x)
        t_t = self._to_target(t)
        grads = {}

        for key in ('W1', 'b1', 'W2', 'b2'):
            p = self.params[key]
            grad = torch.zeros_like(p)
            # Flat views share storage with the parameter / gradient, so
            # element i can be perturbed in place without building index
            # tuples (and without a NumPy round trip).
            flat_p = p.data.view(-1)
            flat_g = grad.view(-1)
            for i in range(flat_p.numel()):
                old = flat_p[i].item()

                flat_p[i] = old + eps
                l1 = self.loss(x_t, t_t).item()

                flat_p[i] = old - eps
                l2 = self.loss(x_t, t_t).item()

                flat_g[i] = (l1 - l2) / (2 * eps)
                flat_p[i] = old  # restore the original value

            grads[key] = grad
        return grads

    def gradient(self, x, t):
        """Compute parameter gradients with autograd.

        Returns:
            dict mapping 'W1', 'b1', 'W2', 'b2' to detached copies of the
            parameters' ``.grad`` tensors.
        """
        # Backpropagation via autograd (replaces the original hand-written
        # backward pass). Clear stale gradients first, since backward()
        # accumulates into .grad.
        for p in self.params.values():
            if p.grad is not None:
                p.grad.zero_()

        loss = self.loss(x, t)
        loss.backward()

        grads = {key: p.grad.detach().clone() for key, p in self.params.items()}

        # Compatibility with the original implementation: also expose the
        # gradients as dW/db attributes on the Affine layers.
        self.layers['Affine1'].dW = grads['W1']
        self.layers['Affine1'].db = grads['b1']
        self.layers['Affine2'].dW = grads['W2']
        self.layers['Affine2'].db = grads['b2']

        return grads


In [15]:
import sys, os
sys.path.append(os.path.join(os.getcwd(), 'dataset'))
import numpy as np
from dataset.mnist import load_mnist
from two_layer_net import TwoLayerNet

# Load MNIST (normalize=True, one_hot_label=True).
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Gradient check on the first three samples only.
x_batch, t_batch = x_train[:3], t_train[:3]

grad_numerical = network.numerical_gradient(x_batch, t_batch)
grad_backprop = network.gradient(x_batch, t_batch)

# Report the mean absolute difference between the two gradients per parameter.
for key in grad_numerical:
    diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(f"{key}:{diff}")

W1:1.3743592392214974e-06
b1:2.103025242342722e-05
W2:4.907778803998354e-09
b2:1.3919942052492695e-07


In [None]:
import sys, os
sys.path.append(os.path.join(os.getcwd(), 'dataset'))
import numpy as np
from dataset.mnist import load_mnist
from two_layer_net import TwoLayerNet  # ← NumPy 버전

# NOTE: this cell trains the NumPy TwoLayerNet imported from two_layer_net,
# not the PyTorch class defined earlier in this notebook.
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Training hyperparameters.
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list, train_acc_list, test_acc_list = [], [], []
# Number of iterations between accuracy reports (at least 1).
iter_per_epoch = max(train_size // batch_size, 1)

for i in range(iters_num):
    # Draw a random mini-batch (indices sampled with replacement).
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Gradient-descent step on every parameter. The former "placeholder" line,
    # which subtracted an expression that always evaluates to 0, was removed.
    grad = network.gradient(x_batch, t_batch)
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Evaluate on the full train and test sets every iter_per_epoch iterations.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc))


train acc, test acc | 0.10863333333333333, 0.1056
train acc, test acc | 0.9072666666666667, 0.9093
train acc, test acc | 0.92115, 0.924
train acc, test acc | 0.9384833333333333, 0.938
train acc, test acc | 0.9459666666666666, 0.9433
train acc, test acc | 0.9518666666666666, 0.9486
train acc, test acc | 0.9572166666666667, 0.9548
train acc, test acc | 0.9612166666666667, 0.9578
train acc, test acc | 0.9638833333333333, 0.9596
train acc, test acc | 0.9681833333333333, 0.9643
train acc, test acc | 0.9701333333333333, 0.9641
train acc, test acc | 0.9715333333333334, 0.9661
train acc, test acc | 0.97375, 0.9682
train acc, test acc | 0.9752333333333333, 0.9673
train acc, test acc | 0.9736333333333334, 0.9666
train acc, test acc | 0.9774833333333334, 0.9699
train acc, test acc | 0.9789833333333333, 0.9708
