In [3]:
import numpy as np
from mynn import op as nn

batchsize = 16

# test 1 # 
# input = np.random.randn(batchsize, 10)
# l1 = nn.Linear(10, 20)
# l1(input).shape
# grad = np.random.randn(batchsize, 20)
# l1.backward(grad).shape

# test 2 #
def test2(verbose=False):
    iter_time = 10000
    N = 16*iter_time
    X_data = np.random.randn(N, 10)
    W_gt = np.random.randn(10, 20)
    b_gt = np.random.randn(20)
    Y_data = X_data@W_gt + b_gt
    l2 = nn.Linear(10, 20)
    for i in range(iter_time):
        X = X_data[i*16:(i+1)*16, :]
        Y_gt = Y_data[i*16:(i+1)*16, :]
        Y_pred = l2(X)
        loss = np.linalg.norm(Y_gt-Y_pred)
        if verbose:
            print(loss)
        grad = -(Y_gt-Y_pred)
        l2.backward(grad)
        for key in l2.grads.keys():
            l2.params[key] -= 1*l2.grads[key]
            if np.any(np.isnan(l2.params[key])):
                print(l2.grads[key])
                print(grad)
                raise ValueError("Need to break!!!") 
        # print([np.linalg.norm(l2.params[key]) for key in l2.grads.keys()])
    print(f"the residual norm is {np.linalg.norm(l2.W-W_gt), np.linalg.norm(l2.b-b_gt)}")

def test3(verbose=False):
    iter_time = 200000
    N = 16*iter_time
    X_data = np.random.randn(N, 10)
    W_gt = np.random.randn(10, 20)
    b_gt = np.random.randn(20)
    Y_data = X_data@W_gt + b_gt
    l1 = nn.Linear(10, 10)
    relu = nn.ReLU()
    l2 = nn.Linear(10, 20)
    for i in range(iter_time):
        X = X_data[i*16:(i+1)*16, :]
        Y_gt = Y_data[i*16:(i+1)*16, :]
        Y_pred = l2(relu(l1(X)))
        loss = np.linalg.norm(Y_gt-Y_pred)
        if verbose:
            print(loss)
        grad = -(Y_gt-Y_pred)

        # passing the grad to l2!
        passing_grad = l2.backward(grad)
        print(f"norm of the grad is {np.linalg.norm(grad)}")
        for key in l2.grads.keys():
            l2.params[key] -= 0.01*l2.grads[key]
            if np.any(np.isnan(l2.params[key])):
                print(l2.grads[key])
                print(f"grad is {grad}")
                raise ValueError("l2 Need to break!!!") 
            
        # passing the grad to relu!
        passing_grad = relu.backward(passing_grad)
        # no params to optimize for relu!

        # passing the grad to l1!
        l1.backward(passing_grad)
        print(np.linalg.norm(passing_grad))
        for key in l1.grads.keys():
            l1.params[key] -= 0.01*l1.grads[key]
            if np.any(np.isnan(l1.params[key])):
                print(l1.grads[key])
                print(grad)
                raise ValueError("l1 Need to break!!!")        

def test4():
    con1 = nn.Conv2D(in_channels=3, out_channels=6, kernel_size=5)
    con2 = nn.Conv2D(in_channels=6, out_channels=12, kernel_size=5, stride=2)
    X = np.random.rand(16, 3, 28, 28)
    print(con2(con1(X)).shape)
    # grad = np.zeros((16, 12, 13, 13))
    # con2.backward(grad)

def test4_backward(verbose=False):
    iter_time = 1000
    con1 = nn.Conv2D(in_channels=3, out_channels=6, kernel_size=4)
    con2 = nn.Conv2D(in_channels=3, out_channels=6, kernel_size=4)
    print(np.linalg.norm(con1.kernel-con2.kernel))
    print(np.linalg.norm(con1.b-con2.b))
    for i in range(iter_time):
        X = np.random.rand(16, 3, 32, 32)
        gt = con1(X)
        pred = con2(X)
        grad = -(gt-pred)
        con2.backward(grad)
        for key in con2.grads.keys():
            con2.params[key] -= 0.1*con2.grads[key]
        if verbose and i%100==0:
            print(np.linalg.norm(con1.b-con2.b))
    print(np.linalg.norm(con1.kernel-con2.kernel))
    print(np.linalg.norm(con1.b-con2.b))



def test5():
    predict = np.random.rand(5, 10)
    lable = np.array([2, 1, 4, 3, 6])
    loss = nn.MultiCrossEntropyLoss()
    print(loss(predicts=predict, labels=lable))
    loss.backward()
    print(loss.grads)

if __name__ == "__main__":
    # test4_backward(verbose=True)
    # test2()
    test4()

(16, 12, 10, 10)


In [8]:
import mynn as nn
import numpy as np

model = nn.models.Model_CNN([(1, 6, 5), (2,), (6, 12, 5), (2,),("reshape"),(12*4*4, 120),(120, 10)], "ReLU", [1e-4, 1e-4, 1e-4, 1e-4])
linear_model = nn.models.Model_MLP([28*28, 600, 10], 'ReLU', [1e-4, 1e-4])
print(model)
X = np.random.randn(16, 1, 28, 28)
Y = model(X)
print(Y.shape)
print(model.backward(Y).shape)

A CNN Model With Whose Sublayer is as below:
                    ['A Conv2d Layer with fan_in:1, fan_out:6, kernel_size:5', 'A Max Pooling with kernel size:2', 'A Relu', 'A Conv2d Layer with fan_in:6, fan_out:12, kernel_size:5', 'A Max Pooling with kernel size:2', 'A Relu', 'A reshape layer', 'A Linear Layer With Size (192, 120)', 'A Relu', 'A Linear Layer With Size (120, 10)']
                
(16, 10)
(16, 1, 28, 28)


In [3]:
import mynn as nn
import numpy as np
maxpool = nn.op.MaxPool(3)
X = np.random.randn(1, 1, 5, 5)
Y = maxpool(X)
print(X)
print(Y)
grad = maxpool.backward(Y)
print(grad)

[[[[-0.29796387  0.04534184 -0.96975248  0.34969088 -1.68670888]
   [-1.38366456  0.77444898 -1.24438968 -0.38563044 -0.46548582]
   [ 1.4563753   0.63595711  1.18212638  1.13751325  0.20759483]
   [-1.65763602 -0.45781169  0.18384728 -1.15566621  1.10085234]
   [ 0.64629089 -0.66509908 -0.35892644  0.32152349 -0.77909116]]]]
[[[[1.4563753]]]]
[[[[0.        0.        0.        0.        0.       ]
   [0.        0.        0.        0.        0.       ]
   [1.4563753 0.        0.        0.        0.       ]
   [0.        0.        0.        0.        0.       ]
   [0.        0.        0.        0.        0.       ]]]]


In [4]:
import numpy as np
import time
A = np.random.randn(10240, 1024)
B = np.random.randn(1024, 10240)
s = time.time()
A@B
print(time.time()-s)

1.8124003410339355


In [38]:
import torch

# 构造测试数据
batchsize = 4
t = torch.randn(batchsize, 1)
t = t.repeat(1, 6) # 扩展为 (batchsize, 6)
t.requires_grad = True  # 设置 t 为需要梯度的变量
new_t = t.sum(dim=1, keepdim=True)/6
xt = new_t**2 + 3*new_t  # 假设 xt = t^2 + 3t
xt = xt.expand(4, 6)  # 扩展为 (batchsize, 6)
xt = torch.zeros_like(xt, requires_grad=True)  # 假设 xt = t^2 + 3t
xt = new_t**2 + 3*new_t  # 假设 xt = t^2 + 3t
# t = t.expand(4, 6)  # 扩展为 (batchsize, 6)

# 计算 dxt_dt
dxt_dt = torch.autograd.grad(
    outputs=xt,
    inputs=t,
    grad_outputs=torch.ones_like(xt),
    # is_grads_batched=True,
    retain_graph=True
)[0]

print("t:", t)
print("xt:", xt)
print("dxt_dt:", dxt_dt)

# 理论值：dxt_dt 应为 2*t + 3
expected = 2 * t + 3
# expected = expected.repeat(1, 6)
print("Expected dxt_dt:", expected)

t: tensor([[ 0.3642,  0.3642,  0.3642,  0.3642,  0.3642,  0.3642],
        [-2.0319, -2.0319, -2.0319, -2.0319, -2.0319, -2.0319],
        [ 0.5698,  0.5698,  0.5698,  0.5698,  0.5698,  0.5698],
        [-1.1332, -1.1332, -1.1332, -1.1332, -1.1332, -1.1332]],
       requires_grad=True)
xt: tensor([[ 1.2254],
        [-1.9671],
        [ 2.0343],
        [-2.1155]], grad_fn=<AddBackward0>)
dxt_dt: tensor([[ 0.6214,  0.6214,  0.6214,  0.6214,  0.6214,  0.6214],
        [-0.1773, -0.1773, -0.1773, -0.1773, -0.1773, -0.1773],
        [ 0.6899,  0.6899,  0.6899,  0.6899,  0.6899,  0.6899],
        [ 0.1223,  0.1223,  0.1223,  0.1223,  0.1223,  0.1223]])
Expected dxt_dt: tensor([[ 3.7285,  3.7285,  3.7285,  3.7285,  3.7285,  3.7285],
        [-1.0638, -1.0638, -1.0638, -1.0638, -1.0638, -1.0638],
        [ 4.1397,  4.1397,  4.1397,  4.1397,  4.1397,  4.1397],
        [ 0.7335,  0.7335,  0.7335,  0.7335,  0.7335,  0.7335]],
       grad_fn=<AddBackward0>)


In [2]:
class HHL():
    def __init__(self):
        self.a =1

hhh = HHL()

print(hhh.__class__.__name__)

HHL
