In [1]:
#卷积层
#用3个3x3的过滤器去扫描6x6的图像,每个过滤器都会得到4x4的扫描结果,每个像素等于如下
#z = w1*x1 + w2*x2 + ... + w9*x9 + b
#a = a(z)

#池化层
#用一个2x2的最大池去扫描4x4的扫描结果,会得到一个2x2的池化结果,每个像素等于如下
#池化层不存在学习参数
#z = max(x1,x2,x3,x4)
#a = z

#输出层
#用3个神经元做one hot输出.这里是全连接网络
#z = w1*x1 + w2*x2 + w3*x3 + w4*x4 + b
#a = a(z)

#误差函数
#C = 1/2 * [(y1 - a1)^2 + (y2 - a2)^2 + (y3 - a3)^2] <- 对所有x求和

#定义符号delta = dC/dz

#sigmoid函数求导
#da/dz = a(z) * (1 - a(z))

#计算输出层的delta
#delta = dC/dz = dC/da * da/dz
#因为 C = 1/2 * [(y1 - a1)^2 + (y2 - a2)^2 + (y3 - a3)^2] <- 对所有x求和
#所以 dC/da = a - y
#所以 delta = (a - y) * a(z) * (1 - a(z))

#计算卷积层的delta
#delta = dC/dz = [dC/dz * dz/dx][输出层] * [da/dz * dz/dx][池化层] * [da/dz][卷积层] <- 对3个过滤器求和
#delta = {[dC/dz * dz/dx][输出层] <- 对3个神经元求和} * [da/dz * dz/dx][池化层] * [da/dz][卷积层]

#因为 输出层 z = w1*x1 + w2*x2 + w3*x3 + w4*x4 + b
#所以 输出层 dz/dx = w

#因为 池化层 a = z = max(x1,x2,x3,x4)
#所以 池化层 da/dz = 1
#所以 池化层 dz/dx = 1:如果x最大 0:如果x不是最大

#所以 delta = {[delta(输出层) * w][输出层] <- 对3个神经元求和} * [1:x是最大 0:x不是最大][池化层] * [a(z) * (1 - a(z))][卷积层]

#求梯度
#dC/dw = delta * x <- 对所有x求和
#dC/db = delta <- 对所有x求和

In [2]:
import numpy as np


#加载数据
def load_data(path='数字123.txt'):
    """Load the digit images and their one-hot labels from a comma-separated text file.

    Every non-blank line must hold at least 37 comma-separated fields: 36 integer
    pixel values followed by the class label '1', '2' or '3'.

    Args:
        path: File to read. Defaults to the data file used by this notebook.

    Returns:
        A tuple (x, y): x is an int array of shape (n, 36) with the pixel values,
        y is an int array of shape (n, 3) with the one-hot encoded label.

    Raises:
        ValueError: If a line has fewer than 37 fields, a pixel is not an integer,
            or the label is not one of '1', '2', '3'.
    """
    one_hot = {'1': [1, 0, 0], '2': [0, 1, 0], '3': [0, 0, 1]}

    with open(path) as fr:
        # Blank lines (e.g. a trailing newline at the end of the file) carry no sample.
        rows = [line.strip().split(',') for line in fr if line.strip()]

    x = np.empty((len(rows), 36), dtype=int)
    y = np.empty((len(rows), 3), dtype=int)

    for i, fields in enumerate(rows):
        if len(fields) < 37:
            raise ValueError('sample %d: expected at least 37 fields, got %d' %
                             (i, len(fields)))

        label = fields[36].strip()
        if label not in one_hot:
            # Without this check the row of y would keep whatever np.empty left in it.
            raise ValueError('sample %d: unknown label %r' % (i, label))

        x[i] = [int(value) for value in fields[:36]]
        y[i] = one_hot[label]

    return x, y


# Load the data set once; the later cells read x and y as module-level names.
x, y = load_data()
# Display the array shapes, the first sample viewed as a 6x6 image, and its label.
x.shape, y.shape, x[0].reshape(6, 6), y[0]

((96, 36),
 (96, 3),
 array([[0, 0, 0, 1, 0, 0],
        [0, 0, 0, 1, 0, 0],
        [0, 0, 0, 1, 0, 0],
        [0, 0, 0, 1, 0, 0],
        [0, 0, 0, 1, 0, 0],
        [0, 0, 0, 1, 0, 0]]),
 array([1, 0, 0]))

In [3]:
# Constants
# N and M are the two dimensions of x (rows, columns).
N, M = x.shape

# Learning rate: update() multiplies every gradient by this before subtracting it.
lr = 0.2

In [4]:
#定义卷积层
class Conv:
    """One 3x3 convolution filter followed by a sigmoid activation."""

    def __init__(self, w, b):
        # w is multiplied element-wise with every 3x3 window; b is added to each sum.
        self.w = w
        self.b = b

    def run(self, xi):
        """Scan one sample with the filter and keep the result on the instance.

        xi is reshaped to 6x6. Afterwards self.z holds the 4x4 pre-activations
        and self.a holds sigmoid(self.z).
        """
        image = xi.reshape((6, 6))
        pre_activation = np.zeros((4, 4))

        # Visit every top-left corner at which a 3x3 window fits inside the image.
        for row, col in np.ndindex(4, 4):
            patch = image[row:row + 3, col:col + 3]
            pre_activation[row, col] = (patch * self.w).sum() + self.b

        self.z = pre_activation

        # Sigmoid activation.
        self.a = 1 / (1 + np.exp(-self.z))

In [5]:
#定义池化层
class Pool:
    """2x2 max pooling over a 4x4 input; the layer has no learnable parameters."""

    def __init__(self):
        pass

    def run(self, xi):
        """Store in self.a the 2x2 array of maxima of the four 2x2 quadrants of xi."""
        pooled = np.zeros((2, 2))

        for row in range(2):
            for col in range(2):
                quadrant = xi[2 * row:2 * row + 2, 2 * col:2 * col + 2]
                pooled[row, col] = quadrant.max()

        self.a = pooled

In [6]:
#定义输出层
class Out:
    """One fully connected output neuron with a sigmoid activation."""

    def __init__(self, w, b):
        # w is indexed by input: w[0], w[1], w[2] weight x1, x2, x3 respectively.
        self.w = w
        self.b = b

    def run(self, x1, x2, x3):
        """Combine the three inputs into self.z and store sigmoid(self.z) in self.a.

        Each input is flattened and multiplied element-wise with its row of w.
        """
        # Weighted sum of each input with its own weight row.
        partial = [(feature.reshape(1, -1) * self.w[k]).sum()
                   for k, feature in enumerate((x1, x2, x3))]

        # Accumulate left to right, bias last.
        self.z = partial[0] + partial[1] + partial[2] + self.b

        # Sigmoid activation.
        self.a = 1 / (1 + np.exp(-self.z))

In [7]:
# Initialise the convolution layer: three filters, each a 3x3 weight matrix plus a bias.
cs = [
    Conv(np.array(weights), b=bias)
    for weights, bias in (
        ([[-1.277, -0.454, 0.358], [1.138, -2.398, -1.664],
          [-0.794, 0.899, 0.675]], -3.363),
        ([[-1.274, 2.338, 2.301], [0.649, -0.339, -2.054],
          [-1.022, -1.204, -1.900]], -3.176),
        ([[-1.869, 2.044, -1.290], [-1.710, -2.091, -2.946],
          [0.201, -1.323, 0.207]], -1.739),
    )
]

In [8]:
# Initialise the pooling layer.
# The three pools are interchangeable and have nothing to learn; there is one
# per filter only so that each can hold its own output in .a.
ps = [Pool() for _ in range(3)]

In [9]:
# Initialise the output layer: three neurons, each with a 3x4 weight matrix
# (one row per pooled feature map) plus a bias.
# NOTE(review): the name `os` would shadow the standard-library module of the
# same name if that were ever imported in this notebook.
os = [
    Out(np.array(weights), b=bias)
    for weights, bias in (
        ([[-0.276, 0.124, -0.961, 0.718], [-3.680, -0.594, 0.280, -0.782],
          [-1.475, -2.010, -1.085, -0.188]], 2.060),
        ([[0.010, 0.661, -1.591, 2.189], [1.728, 0.003, -0.250, 1.898],
          [0.238, 1.589, 2.246, -0.093]], -2.746),
        ([[-1.322, -0.218, 3.527, 0.061], [0.613, 0.218, -2.130, -1.678],
          [1.236, -0.486, -0.144, -1.235]], -1.818),
    )
]

In [10]:
#运算函数
def run(xi):
    """Forward pass for one sample through the module-level layers cs, ps and os.

    Nothing is returned; each layer keeps its own result in its .a attribute.
    """
    # Each filter feeds only its own pool, so the two stages can run filter by filter.
    for k in range(3):
        cs[k].run(xi)
        ps[k].run(cs[k].a)

    # Every output neuron sees all three pooled feature maps.
    pooled = [ps[k].a for k in range(3)]
    for k in range(3):
        os[k].run(*pooled)


# Forward pass on the first sample, then display the three output activations.
run(x[0])
os[0].a, os[1].a, os[2].a

(0.7859501531761532, 0.10857818037340734, 0.1369293947699326)

In [11]:
#计算输出层的delta
#delta = dC/dz = dC/da * da/dz
#因为 C = 1/2 * [(y1 - a1)^2 + (y2 - a2)^2 + (y3 - a3)^2] <- 对所有x求和
#所以 dC/da = a - y
#所以 delta = (a - y) * a(z) * (1 - a(z))
def get_delta_out():
    """Compute the output-layer delta (dC/dz) for every sample.

    Runs the forward pass on each sample of the module-level x and applies
    delta = (a - y) * a * (1 - a) to each of the three output neurons.

    Returns:
        Array of shape (N, 3): one row per sample, one column per output neuron.
    """
    deltas = np.zeros((N, 3))

    for n in range(N):
        run(x[n])

        for k in range(3):
            activation = os[k].a
            deltas[n, k] = (activation - y[n, k]) * activation * (1 - activation)

    return deltas


# Stored at module level because get_delta_conv() and get_gradient_out() read delta_out.
delta_out = get_delta_out()
# Display the first five rows.
delta_out[:5]

array([[-0.03601014,  0.01050917,  0.01618228],
       [-0.08526375,  0.01883231,  0.01726081],
       [-0.09935389,  0.02124917,  0.02006905],
       [-0.03910314,  0.01076829,  0.01868235],
       [-0.03850947,  0.01151791,  0.01831856]])

In [12]:
#计算卷积层的delta
#delta = dC/dz = [dC/dz * dz/dx][输出层] * [da/dz * dz/dx][池化层] * [da/dz][卷积层] <- 对3个过滤器求和
#delta = {[dC/dz * dz/dx][输出层] <- 对3个神经元求和} * [da/dz * dz/dx][池化层] * [da/dz][卷积层]

#因为 输出层 z = w1*x1 + w2*x2 + w3*x3 + w4*x4 + b
#所以 输出层 dz/dx = w

#因为 池化层 a = z = max(x1,x2,x3,x4)
#所以 池化层 da/dz = 1
#所以 池化层 dz/dx = 1:如果x最大 0:如果x不是最大


#所以 delta = {[delta(输出层) * w][输出层] <- 对3个神经元求和} * [1:x是最大 0:x不是最大][池化层] * [a(z) * (1 - a(z))][卷积层]
def get_delta_conv():
    """Compute the convolution-layer delta (dC/dz) for every sample and filter.

    For each cell of each filter's 4x4 output the delta is the product of
      * the output-layer deltas weighted by the matching output weights,
        summed over the three output neurons,
      * a 0/1 gate that is 1 when the cell's activation equals the value of
        the pooled cell covering it,
      * the sigmoid derivative a * (1 - a) of the cell.

    Reads the module-level delta_out, so that must be current before calling.

    Returns:
        Array of shape (N, 3, 4, 4): sample, filter, row, column.
    """
    deltas = np.zeros((N, 3, 4, 4))

    # Maps a cell of the 4x4 conv output to the column (0..3) of the output-layer
    # weight row that receives its 2x2 pooled value.
    pooled_column = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 3, 3],
                              [2, 2, 3, 3]])

    # Every sample.
    for n in range(N):
        run(x[n])

        # Every filter; row f of each output weight matrix belongs to filter f.
        for f in range(3):
            conv_act = cs[f].a
            pool_act = ps[f].a

            # Every cell of the filter's 4x4 output.
            for row in range(4):
                for col in range(4):

                    # Output deltas carried back through the weights, summed over
                    # the three output neurons.
                    w_pos = (f, pooled_column[row, col])
                    carried = (delta_out[n, 0] * os[0].w[w_pos] +
                               delta_out[n, 1] * os[1].w[w_pos] +
                               delta_out[n, 2] * os[2].w[w_pos])

                    # Max-pool gate: every cell equal to the pooled value passes,
                    # so tied cells in one window all receive the gradient.
                    gate = int(conv_act[row, col] == pool_act[row // 2, col // 2])

                    # Sigmoid derivative at this cell.
                    slope = conv_act[row, col] * (1 - conv_act[row, col])

                    deltas[n, f, row, col] = carried * gate * slope

    return deltas


# Stored at module level because get_gradient_conv() reads delta_conv.
delta_conv = get_delta_conv()
# Display the 4x4 delta of the second filter for the second sample.
delta_conv[1, 1]

array([[ 0.        ,  0.        ,  0.00695707,  0.        ],
       [ 0.07410988,  0.        ,  0.        ,  0.        ],
       [-0.00251409, -0.        ,  0.00569069,  0.        ],
       [-0.00251409, -0.        ,  0.00569069,  0.        ]])

In [13]:
def get_gradient_conv():
    """Accumulate the convolution-layer gradients over all samples.

    dC/dw for a filter weight is the sum of delta * x over the input cells that
    weight touched; dC/db is the sum of the filter's deltas. Reads the
    module-level delta_conv, so that must be current before calling.

    Returns:
        (gradient_w, gradient_b) with shapes (3, 3, 3) and (3,): one 3x3 weight
        gradient and one bias gradient per filter.
    """
    grad_w = np.zeros((3, 3, 3))
    grad_b = np.zeros(3)

    # Every sample.
    for n in range(N):
        image = x[n].reshape((6, 6))

        # Every filter.
        for f in range(3):
            delta = delta_conv[n, f]

            # Weight (dr, dc) was multiplied with the 4x4 patch of the image that
            # starts at (dr, dc), one input cell per cell of the filter output.
            for dr, dc in np.ndindex(3, 3):
                patch = image[dr:dr + 4, dc:dc + 4]
                grad_w[f, dr, dc] += (patch * delta).sum()

            # The bias takes part in every cell of the filter output.
            grad_b[f] += delta.sum()

    return grad_w, grad_b


# Stored at module level because update() reads both names.
gradient_conv_w, gradient_conv_b = get_gradient_conv()
gradient_conv_w, gradient_conv_b

(array([[[-0.01717084, -0.22068176, -2.3031552 ],
         [-3.46326777,  0.03506276,  0.07279098],
         [-1.72280208, -3.67698433, -3.09158066]],
 
        [[-0.14762899, -1.66071179, -0.05313807],
         [-1.60017334,  0.4337165 ,  0.32155536],
         [ 0.18894448,  0.92748115, -0.34197684]],
 
        [[-0.04400932, -1.21537877, -0.02442836],
         [ 0.03122772, -0.22773512,  0.02196469],
         [-0.16531662,  0.17641142, -0.64013626]]]),
 array([-2.98935125, -0.80548239, -1.15646339]))

In [14]:
def get_gradient_out():
    """Accumulate the output-layer gradients over all samples.

    dC/dw is delta * x, where x is the pooled feature map feeding the weight;
    dC/db is delta. Reads the module-level delta_out, so that must be current
    before calling.

    Returns:
        (gradient_w, gradient_b) with shapes (3, 3, 4) and (3,): for each output
        neuron a 3x4 weight gradient (one row per pool) and a bias gradient.
    """
    grad_w = np.zeros((3, 3, 4))
    grad_b = np.zeros(3)

    # Every sample.
    for n in range(N):
        run(x[n])

        # Flatten each pooled 2x2 map once; it is shared by all three neurons.
        pooled = [ps[k].a.reshape(-1) for k in range(3)]

        # Every output neuron.
        for j in range(3):
            delta = delta_out[n, j]

            # Every pool, i.e. every row of the neuron's weight matrix.
            for k in range(3):
                grad_w[j, k] += pooled[k] * delta

            grad_b[j] += delta

    return grad_w, grad_b


# Stored at module level because update() reads both names.
gradient_out_w, gradient_out_b = get_gradient_out()
gradient_out_w, gradient_out_b

(array([[[ 0.05746021, -0.04133349,  0.1514548 , -0.0116005 ],
         [ 0.23565552, -0.07645565, -0.0514156 ,  0.03793918],
         [ 0.17841863, -0.11448917, -0.11509322, -0.12559874]],
 
        [[-0.067052  ,  0.00551467,  0.19821577, -0.3021513 ],
         [-1.51468394, -0.164552  ,  0.04691571, -0.9632686 ],
         [-1.00859612, -0.32069493, -0.29414387, -0.40720453]],
 
        [[-0.29095766, -0.11754116, -1.15545481,  0.02907475],
         [-2.00549104, -0.21869132, -0.18069517, -0.30321599],
         [-1.24136192,  0.00422127, -0.04509259, -0.00628449]]]),
 array([-0.57808623, -1.57259232, -2.5003205 ]))

In [15]:
#更新参数
def update():
    """Apply one gradient-descent step to every conv filter and output neuron.

    Subtracts lr times the module-level gradients (gradient_conv_w/b and
    gradient_out_w/b) from the matching parameters; the weight arrays are
    modified in place.
    """
    layers = zip(cs, gradient_conv_w, gradient_conv_b, os, gradient_out_w,
                 gradient_out_b)

    for conv, conv_gw, conv_gb, out, out_gw, out_gb in layers:
        conv.w -= conv_gw * lr
        conv.b -= conv_gb * lr

        out.w -= out_gw * lr
        out.b -= out_gb * lr


# First training step, using the deltas and gradients computed in the cells above.
update()

# Display the first filter's and first output neuron's parameters after the step.
cs[0].w, cs[0].b, os[0].w, os[0].b

(array([[-1.27356583, -0.40986365,  0.81863104],
        [ 1.83065355, -2.40501255, -1.6785582 ],
        [-0.44943958,  1.63439687,  1.29331613]]),
 -2.765129749407543,
 array([[-0.28749204,  0.1322667 , -0.99129096,  0.7203201 ],
        [-3.7271311 , -0.57870887,  0.29028312, -0.78958784],
        [-1.51068373, -1.98710217, -1.06198136, -0.16288025]]),
 2.17561724570285)

In [16]:
# Batch training: 49 more full-batch steps on top of the single update() above.
# The loop variable is named rather than `_`, because assigning `_` at the top
# level of a notebook stops IPython from updating its last-output variable `_`.
for epoch in range(49):
    # Order matters: each get_* function reads the module-level result of the
    # one before it, and update() reads the four gradient names.
    delta_out = get_delta_out()
    delta_conv = get_delta_conv()
    gradient_conv_w, gradient_conv_b = get_gradient_conv()
    gradient_out_w, gradient_out_b = get_gradient_out()
    update()

# Display the first filter's and first output neuron's parameters after training.
cs[0].w, cs[0].b, os[0].w, os[0].b

(array([[-0.64597966, -0.78274848,  0.34882081],
        [ 2.39503468, -3.77907531, -2.73777482],
        [ 0.72335571,  1.98043273,  1.3402626 ]]),
 -3.468490587928705,
 array([[-0.52670233,  0.73697115, -1.78487478,  0.54287261],
        [-4.97646234,  0.17952161,  0.55181929, -0.92276147],
        [-2.01860614, -1.77628825, -0.8836164 ,  0.03805839]]),
 3.1559393950141774)

In [17]:
# Test: fraction of samples whose strongest output neuron matches the label.
# The samples are the same x and y that the training cells above iterate over.
correct = 0

for i in range(N):
    run(x[i])

    pred = np.array([os[k].a for k in range(3)])
    correct += int(pred.argmax() == y[i].argmax())

correct / N

1.0