在此笔记本中，我们对原先提供的、基于TensorFlow框架并用NumPy实现前馈神经网络的tutorial_minist_fnn-numpy-exercise.ipynb文件进行了简单修改，将其中用到的框架改为PyTorch。实现方法和代码形式尽可能与原代码保持一致。

环境：python 3.9.21, pytorch 2.0.0, CUDA 11.8

## Data Preparation

In [1]:
import numpy as np
from torchvision import datasets, transforms

def mnist_dataset():
    """Load MNIST through torchvision and hand it back as NumPy arrays.

    Both splits are created with ``download=True`` under ``./data``.

    Returns:
        ``((x, y), (x_test, y_test))``. The image arrays are each dataset's
        raw ``.data`` tensor converted to ``float32`` and divided by 255;
        the label arrays are the ``.targets`` tensors converted to NumPy.
    """
    to_tensor = transforms.ToTensor()

    def as_arrays(split):
        # Scale the raw pixel values by 1/255. The original author notes the
        # training images have shape (60000, 28, 28).
        # NOTE(review): `.data` is read directly, so the `transform` given to
        # the dataset presumably plays no role in these arrays - confirm.
        images = split.data.numpy().astype(np.float32) / 255.0
        labels = split.targets.numpy()
        return images, labels

    train_split = datasets.MNIST(root='./data', train=True, download=True, transform=to_tensor)
    test_split = datasets.MNIST(root='./data', train=False, download=True, transform=to_tensor)

    return as_arrays(train_split), as_arrays(test_split)

## Demo numpy based auto differentiation

In [2]:
class Matmul:
    """Matrix product ``h = x @ W`` with a hand-written backward pass."""

    def __init__(self):
        # Holds the operands of the most recent forward() call.
        self.mem = {}

    def forward(self, x, W):
        """Return ``np.matmul(x, W)`` and remember both operands.

        Args:
            x: left operand, documented by the original author as (N, d).
            W: right operand, documented by the original author as (d, d').
        """
        product = np.matmul(x, W)
        self.mem = dict(x=x, W=W)
        return product

    def backward(self, grad_y):
        """Propagate ``grad_y`` (shape (N, d')) back to both operands.

        Returns:
            ``(grad_x, grad_W)`` where ``grad_x = grad_y @ W.T`` and
            ``grad_W = x.T @ grad_y``, using the operands cached by forward().
        """
        inputs, weights = self.mem['x'], self.mem['W']

        d_inputs = np.matmul(grad_y, weights.T)
        d_weights = np.matmul(inputs.T, grad_y)
        return d_inputs, d_weights


class Relu:
    """Element-wise rectified linear unit with a hand-written backward pass."""

    def __init__(self):
        # Holds the input of the most recent forward() call.
        self.mem = {}

    def forward(self, x):
        """Return ``x`` where it is strictly positive and zero elsewhere."""
        self.mem['x'] = x
        is_positive = x > 0
        return np.where(is_positive, x, np.zeros_like(x))

    def backward(self, grad_y):
        """Pass ``grad_y`` through where the cached input was positive.

        Args:
            grad_y: upstream gradient, same shape as the forward input.
        """
        cached = self.mem['x']
        # 0/1 mask as float32: gradient flows only through positive inputs.
        gate = (cached > 0).astype(np.float32)
        return grad_y * gate
    


class Softmax:
    '''
    Softmax over the last dimension of a 2-D input, with a manual backward.
    '''
    def __init__(self):
        # Added to the normaliser so the division can never be by zero.
        self.epsilon = 1e-12
        self.mem = {}

    def forward(self, x):
        '''
        x: shape(N, c)

        Returns the row-wise softmax of x, shape (N, c).
        '''
        # Softmax is invariant to adding a constant to every entry of a row,
        # so subtract the row maximum first: the largest exponent becomes
        # exp(0) = 1 and np.exp can no longer overflow to inf (which would
        # turn the output into NaN).
        shifted = x - np.max(x, axis=1, keepdims=True)
        x_exp = np.exp(shifted)
        partition = np.sum(x_exp, axis=1, keepdims=True)
        out = x_exp / (partition + self.epsilon)

        self.mem['out'] = out
        # Exponentials of the max-shifted input (not of the raw input).
        self.mem['x_exp'] = x_exp
        return out

    def backward(self, grad_y):
        '''
        grad_y: same shape as x

        Returns the gradient w.r.t. the forward input, same shape as x.
        '''
        s = self.mem['out']
        # The softmax Jacobian is diag(s) - s s^T, hence
        #   grad_x[j] = s[j] * (grad_y[j] - sum_i grad_y[i] * s[i]).
        # This closed form avoids materialising an (N, c, c) tensor.
        weighted_sum = np.sum(grad_y * s, axis=1, keepdims=True)
        return s * (grad_y - weighted_sum)
    
class Log:
    '''
    Element-wise natural logarithm, log(x + epsilon), with a manual backward.
    '''
    def __init__(self):
        # Small offset that keeps log() and the reciprocal finite at x == 0.
        self.epsilon = 1e-12
        self.mem = {}

    def forward(self, x):
        '''
        x: shape(N, c)

        Returns log(x + epsilon), same shape as x.
        '''
        out = np.log(x + self.epsilon)

        self.mem['x'] = x
        return out

    def backward(self, grad_y):
        '''
        grad_y: same shape as x

        Returns grad_y / (x + epsilon) for the x cached by forward().
        '''
        x = self.mem['x']

        # Use the same epsilon as forward() so the gradient is the exact
        # derivative of the function that was actually evaluated.
        return 1. / (x + self.epsilon) * grad_y

## Gradient check

在这一部分，我们将使用pytorch框架来检查我们的numpy计算结果是否正确

### Matmul

In [3]:
import torch

# Gradient check for Matmul: compare the hand-written backward pass with
# PyTorch autograd on the same random float32 operands.
x = np.random.normal(size=[5, 6]).astype(np.float32)
W = np.random.normal(size=[6, 4]).astype(np.float32)

# NumPy: the upstream gradient is all ones, i.e. the gradient of sum(out).
aa = Matmul()
out = aa.forward(x, W)
grad = aa.backward(np.ones_like(out))
# grad is the tuple (grad_x, grad_W); only grad_x is printed.
print("NumPy backward:", grad[0])

# PyTorch: same operands, with gradients tracked.
x_torch = torch.tensor(x, requires_grad=True)
W_torch = torch.tensor(W, requires_grad=True)

y = torch.matmul(x_torch, W_torch)
# Summing the output matches the all-ones upstream gradient used above.
loss = y.sum()
loss.backward()

print("PyTorch grad x:", x_torch.grad.numpy())

NumPy backward: [[-0.45607203 -2.1363819  -1.7792773  -0.06573141  2.3203516   1.1278021 ]
 [-0.45607203 -2.1363819  -1.7792773  -0.06573141  2.3203516   1.1278021 ]
 [-0.45607203 -2.1363819  -1.7792773  -0.06573141  2.3203516   1.1278021 ]
 [-0.45607203 -2.1363819  -1.7792773  -0.06573141  2.3203516   1.1278021 ]
 [-0.45607203 -2.1363819  -1.7792773  -0.06573141  2.3203516   1.1278021 ]]
PyTorch grad x: [[-0.45607203 -2.1363819  -1.7792772  -0.06573141  2.3203516   1.1278021 ]
 [-0.45607203 -2.1363819  -1.7792772  -0.06573141  2.3203516   1.1278021 ]
 [-0.45607203 -2.1363819  -1.7792772  -0.06573141  2.3203516   1.1278021 ]
 [-0.45607203 -2.1363819  -1.7792772  -0.06573141  2.3203516   1.1278021 ]
 [-0.45607203 -2.1363819  -1.7792772  -0.06573141  2.3203516   1.1278021 ]]


### ReLU

In [4]:
# Gradient check for Relu: compare the hand-written backward pass with
# PyTorch autograd on the same random float32 input.
x = np.random.normal(size=[5, 6]).astype(np.float32)

# NumPy: the upstream gradient is all ones, i.e. the gradient of sum(out).
aa = Relu()
out = aa.forward(x)
grad = aa.backward(np.ones_like(out))
print("NumPy backward:", grad)

# PyTorch: summing the output matches the all-ones upstream gradient.
x_torch = torch.tensor(x, requires_grad=True)
y = torch.nn.functional.relu(x_torch)
loss = y.sum()
loss.backward()
print("PyTorch grad x:", x_torch.grad.numpy())


NumPy backward: [[1. 1. 0. 0. 1. 0.]
 [1. 1. 0. 1. 0. 1.]
 [1. 0. 1. 1. 0. 0.]
 [1. 1. 0. 1. 1. 1.]
 [1. 0. 0. 0. 0. 0.]]
PyTorch grad x: [[1. 1. 0. 0. 1. 0.]
 [1. 1. 0. 1. 0. 1.]
 [1. 0. 1. 1. 0. 0.]
 [1. 1. 0. 1. 1. 1.]
 [1. 0. 0. 0. 0. 0.]]


### Softmax

In [5]:
# Gradient check for Softmax: compare the hand-written backward pass with
# PyTorch autograd. Inputs are drawn with scale=5.0 and loc=1.
x = np.random.normal(size=[5, 6], scale=5.0, loc=1).astype(np.float32)
# `label` serves as the upstream gradient. Row 1 has two entries set, so it
# is a 0/1 mask rather than a strict one-hot encoding.
label = np.zeros_like(x)
label[0, 1] = 1.
label[1, 0] = 1
label[1, 1] = 1
label[2, 3] = 1
label[3, 5] = 1
label[4, 0] = 1
print("Label:\n", label)

# NumPy
aa = Softmax()
out = aa.forward(x)
grad = aa.backward(label)
print("NumPy backward:\n", grad)

# PyTorch
x_torch = torch.tensor(x, requires_grad=True)
label_torch = torch.tensor(label)

y = torch.nn.functional.softmax(x_torch, dim=1)
# d(sum(y * label))/dy == label, matching the upstream gradient used above.
loss = (y * label_torch).sum()
loss.backward()

print("PyTorch grad x:\n", x_torch.grad.numpy())

Label:
 [[0. 1. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0.]]
NumPy backward:
 [[-2.1389827e-08  3.4520195e-05 -2.4974264e-05 -6.2897861e-06
  -3.3625986e-07 -2.8984966e-06]
 [ 7.3657334e-02  3.0794363e-03 -2.8492774e-08 -2.7063288e-04
  -5.6033173e-10 -7.6466113e-02]
 [-3.1466282e-04 -7.5810052e-02 -2.9963639e-04  7.6425523e-02
  -5.4997638e-09 -1.1705509e-06]
 [-3.1959944e-06 -1.3568686e-01 -1.0404501e-06 -1.3019294e-07
  -3.3106278e-09  1.3569123e-01]
 [ 1.9025691e-05 -2.5172149e-09 -1.2155481e-05 -4.9309087e-06
  -1.5866855e-11 -1.9367687e-06]]
PyTorch grad x:
 [[-2.1389827e-08  3.4520202e-05 -2.4974268e-05 -6.2897884e-06
  -3.3625992e-07 -2.8984973e-06]
 [ 7.3657334e-02  3.0794370e-03 -2.8492778e-08 -2.7063285e-04
  -5.6033173e-10 -7.6466113e-02]
 [-3.1466270e-04 -7.5810038e-02 -2.9963622e-04  7.6425515e-02
  -5.4997567e-09 -1.1705499e-06]
 [-3.1959935e-06 -1.3568686e-01 -1.0404498e-06 -1.3019292e-07
  -3.3106269e-09  1.3569123e-

### Log

In [6]:
# Gradient check for Log: compare the hand-written backward pass with
# PyTorch autograd. Inputs are drawn from [0.1, 1.0) so log() is well defined.
x = np.random.uniform(low=0.1, high=1.0, size=[5, 6]).astype(np.float32)

# Upstream gradient for this check. It is built here (with the same 0/1
# pattern as the Softmax check) so the cell does not depend on whichever
# cell last assigned the global `label`; the final gradient check rebinds
# `label` to a different pattern.
label = np.zeros_like(x)
label[0, 1] = 1.
label[1, 0] = 1
label[1, 1] = 1
label[2, 3] = 1
label[3, 5] = 1
label[4, 0] = 1

# NumPy
aa = Log()
out = aa.forward(x)
grad = aa.backward(label)
print("NumPy backward:\n", grad)

# PyTorch
x_torch = torch.tensor(x, requires_grad=True)
label_torch = torch.tensor(label)

y = torch.log(x_torch)
# d(sum(y * label))/dy == label, matching the upstream gradient used above.
loss = (y * label_torch).sum()
loss.backward()

print("PyTorch grad x:\n", x_torch.grad.numpy())

NumPy backward:
 [[0.        1.1788638 0.        0.        0.        0.       ]
 [1.0030893 1.7262231 0.        0.        0.        0.       ]
 [0.        0.        0.        1.0950545 0.        0.       ]
 [0.        0.        0.        0.        0.        2.039355 ]
 [1.4992834 0.        0.        0.        0.        0.       ]]
PyTorch grad x:
 [[0.        1.1788638 0.        0.        0.        0.       ]
 [1.0030893 1.7262231 0.        0.        0.        0.       ]
 [0.        0.        0.        1.0950545 0.        0.       ]
 [0.        0.        0.        0.        0.        2.039355 ]
 [1.4992834 0.        0.        0.        0.        0.       ]]


## Final Gradient Check

In [7]:
# End-to-end gradient check: run x -> Matmul -> Relu -> Matmul -> Softmax -> Log
# with the NumPy operators, then repeat the same computation in PyTorch.
x = np.random.normal(size=[5, 6]).astype(np.float32)
W1 = np.random.normal(size=[6, 5]).astype(np.float32)
W2 = np.random.normal(size=[5, 6]).astype(np.float32)

# One-hot targets, one class per row.
label = np.zeros_like(x, dtype=np.float32)
label[0, 1] = 1.
label[1, 0] = 1
label[2, 3] = 1
label[3, 5] = 1
label[4, 0] = 1

# Each operator caches its own forward inputs, so the two matrix products
# need separate Matmul instances.
mul_h1 = Matmul()
mul_h2 = Matmul()
relu = Relu()
softmax = Softmax()
log = Log()

# NumPy forward pass.
h1 = mul_h1.forward(x, W1)
h1_relu = relu.forward(h1)
h2 = mul_h2.forward(h1_relu, W2)
h2_soft = softmax.forward(h2)
h2_log = log.forward(h2_soft)

# NumPy backward pass in reverse order. Seeding with `label` corresponds to
# the objective sum(label * log_prob) used on the PyTorch side below.
h2_log_grad = log.backward(label)
h2_soft_grad = softmax.backward(h2_log_grad)
h2_grad, W2_grad = mul_h2.backward(h2_soft_grad)
h1_relu_grad = relu.backward(h2_grad)
h1_grad, W1_grad = mul_h1.backward(h1_relu_grad)

#print("NumPy grad W1:\n", W1_grad)
#print("NumPy grad W2:\n", W2_grad)
#print("NumPy grad x:\n", h1_grad)

print('--' * 20)

# PyTorch reference on the same values, with gradients tracked.
x_torch = torch.tensor(x, requires_grad=True)
W1_torch = torch.tensor(W1, requires_grad=True)
W2_torch = torch.tensor(W2, requires_grad=True)
label_torch = torch.tensor(label)

# NOTE: from here on h1, h1_relu and h2 are rebound to torch tensors.
with torch.autograd.set_grad_enabled(True):
    h1 = torch.matmul(x_torch, W1_torch)
    h1_relu = torch.relu(h1)
    h2 = torch.matmul(h1_relu, W2_torch)
    prob = torch.softmax(h2, dim=1)
    # Same 1e-12 offset as the NumPy Log operator.
    log_prob = torch.log(prob + 1e-12)
    loss = torch.sum(label_torch * log_prob)

    loss.backward()

#print("PyTorch grad W1:\n", W1_torch.grad.numpy())
#print("PyTorch grad W2:\n", W2_torch.grad.numpy())
#print("PyTorch grad x:\n", x_torch.grad.numpy())

# Compare the two sets of gradients
def compare_numpy_pytorch(np_grad, torch_grad, tol=1e-5):
    """Return True when both gradient arrays agree element-wise.

    Args:
        np_grad: gradient from the NumPy implementation.
        torch_grad: reference gradient as a NumPy array.
        tol: absolute tolerance. NumPy's default relative tolerance is
            applied on top of it.
    """
    elementwise_close = np.isclose(np_grad, torch_grad, atol=tol)
    return bool(elementwise_close.all())

# Report whether each NumPy gradient matches the PyTorch reference.
# h1_grad is the gradient w.r.t. the input x of the first Matmul.
print("W1 grad match:", compare_numpy_pytorch(W1_grad, W1_torch.grad.numpy()))
print("W2 grad match:", compare_numpy_pytorch(W2_grad, W2_torch.grad.numpy()))
print("x grad match:", compare_numpy_pytorch(h1_grad, x_torch.grad.numpy()))

----------------------------------------
W1 grad match: True
W2 grad match: True
x grad match: True


## Setup Model

In [8]:
class myModel:
    """Two-layer feed-forward classifier built from Matmul, Relu, Softmax and Log.

    forward() and backward() return nothing; every intermediate activation
    and gradient is stored as an attribute on the instance instead.
    """

    def __init__(self):
        # W1 has 28*28 + 1 rows: the extra row multiplies the constant-1
        # column that forward() appends, so it acts as the hidden-layer bias.
        self.W1 = np.random.normal(size=[28*28+1, 100])
        self.W2 = np.random.normal(size=[100, 10])

        # One operator instance per use, because each caches its own inputs.
        self.mul_h1, self.mul_h2 = Matmul(), Matmul()
        self.relu = Relu()
        self.softmax = Softmax()
        self.log = Log()

    def forward(self, x):
        """Run the network on ``x`` and store the activations on ``self``.

        ``x`` is reshaped to (-1, 28*28) and a column of ones is appended
        before the first matrix product. The final log-probabilities end up
        in ``self.h2_log``.
        """
        flat = x.reshape(-1, 28*28)
        ones_column = np.ones(shape=[flat.shape[0], 1])
        augmented = np.concatenate([flat, ones_column], axis=1)

        self.h1 = self.mul_h1.forward(augmented, self.W1)
        self.h1_relu = self.relu.forward(self.h1)
        self.h2 = self.mul_h2.forward(self.h1_relu, self.W2)
        self.h2_soft = self.softmax.forward(self.h2)
        self.h2_log = self.log.forward(self.h2_soft)

    def backward(self, label):
        """Back-propagate from ``label`` and store the gradients on ``self``.

        The chain is seeded with ``-label``, i.e. the gradient of
        ``sum(-label * log_prob)`` with respect to the log-probabilities.
        The weight gradients end up in ``self.W1_grad`` and ``self.W2_grad``.
        """
        self.h2_log_grad = self.log.backward(-label)
        self.h2_soft_grad = self.softmax.backward(self.h2_log_grad)
        self.h2_grad, self.W2_grad = self.mul_h2.backward(self.h2_soft_grad)
        self.h1_relu_grad = self.relu.backward(self.h2_grad)
        self.h1_grad, self.W1_grad = self.mul_h1.backward(self.h1_relu_grad)
        
# Module-level model instance; the training loop below updates it in place.
model = myModel()


## Calculate Loss

In [9]:
def compute_loss(log_prob, labels):
    """Return the mean over rows of ``sum(-log_prob * labels)`` per row.

    Args:
        log_prob: array of log-probabilities, one row per sample.
        labels: array of the same shape weighting each entry.
    """
    weighted = -log_prob * labels
    per_sample = np.sum(weighted, axis=1)
    return np.mean(per_sample)
    

def compute_accuracy(log_prob, labels):
    """Return the fraction of rows whose argmax agrees in both arrays.

    Args:
        log_prob: per-class scores, one row per sample.
        labels: array of the same shape; its row-wise argmax is the target.
    """
    hits = np.argmax(log_prob, axis=1) == np.argmax(labels, axis=1)
    return np.mean(hits)

def train_one_step(model, x, y, lr=1e-5):
    """Run one gradient-descent step on ``model`` and report its metrics.

    Args:
        model: object exposing ``forward``, ``backward``, the weights
            ``W1``/``W2``, their gradients ``W1_grad``/``W2_grad`` and the
            log-probabilities ``h2_log``.
        x: input batch passed to ``model.forward``.
        y: targets passed to ``model.backward`` and to the metric functions.
        lr: step size applied to both weight matrices. Defaults to 1e-5,
            the value that was previously hard-coded.

    Returns:
        ``(loss, accuracy)`` computed from the forward pass made before the
        weights were updated.
    """
    model.forward(x)
    model.backward(y)
    # In-place updates: the weight arrays themselves are modified.
    model.W1 -= lr * model.W1_grad
    model.W2 -= lr * model.W2_grad
    loss = compute_loss(model.h2_log, y)
    accuracy = compute_accuracy(model.h2_log, y)
    return loss, accuracy

def test(model, x, y):
    """Evaluate ``model`` on ``(x, y)`` without touching its weights.

    Returns:
        ``(loss, accuracy)`` computed from ``model.h2_log`` after a single
        forward pass.
    """
    model.forward(x)
    log_prob = model.h2_log
    return compute_loss(log_prob, y), compute_accuracy(log_prob, y)

## Train

In [10]:
# Load MNIST; each of train_data / test_data is an (images, labels) pair.
train_data, test_data = mnist_dataset()
# One-hot encode the integer labels into 10 columns.
train_label = np.zeros(shape=[train_data[0].shape[0], 10])
test_label = np.zeros(shape=[test_data[0].shape[0], 10])
train_label[np.arange(train_data[0].shape[0]), np.array(train_data[1])] = 1.
test_label[np.arange(test_data[0].shape[0]), np.array(test_data[1])] = 1.

# Every iteration is one full-batch step over the whole training set.
for epoch in range(100):
    loss, accuracy = train_one_step(model, train_data[0], train_label)
    print('epoch', epoch, ': loss', loss, '; accuracy', accuracy)
# Final evaluation on the test split.
loss, accuracy = test(model, test_data[0], test_label)

print('test loss', loss, '; accuracy', accuracy)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data\MNIST\raw\train-images-idx3-ubyte.gz


100.0%


Extracting ./data\MNIST\raw\train-images-idx3-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data\MNIST\raw\train-labels-idx1-ubyte.gz


100.0%


Extracting ./data\MNIST\raw\train-labels-idx1-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data\MNIST\raw\t10k-images-idx3-ubyte.gz


100.0%


Extracting ./data\MNIST\raw\t10k-images-idx3-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz


100.0%


Extracting ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw

epoch 0 : loss 25.07529562208099 ; accuracy 0.0532
epoch 1 : loss 23.546406583857838 ; accuracy 0.10006666666666666
epoch 2 : loss 22.244975015471027 ; accuracy 0.15436666666666668
epoch 3 : loss 21.10062362819336 ; accuracy 0.18028333333333332
epoch 4 : loss 19.61581548155453 ; accuracy 0.21685
epoch 5 : loss 17.123079843926767 ; accuracy 0.29435
epoch 6 : loss 15.22914206470931 ; accuracy 0.3670333333333333
epoch 7 : loss 13.919294499803174 ; accuracy 0.41823333333333335
epoch 8 : loss 12.976987604760408 ; accuracy 0.4532333333333333
epoch 9 : loss 12.142250591706368 ; accuracy 0.48135
epoch 10 : loss 11.374820719240628 ; accuracy 0.5088166666666667
epoch 11 : loss 10.63563835985899 ; accuracy 0.5333833333333333
epoch 12 : loss 9.92838713988776 ; accuracy 0.5600666666666667
epoch 13 : loss 9.351579870154616 ; accuracy 0.58135
epoch 14 : loss 8.89253089440596 ; accuracy 0.60305
epoch 15 : loss 8.722740528482035