# hands-on implementation of cnn with numpy--------fullyconnected layer

## fc layer 
$$FullyConnected\ layer = Flatten\ layer + perceptron\ layer$$

> NOTE:fc layer 已经可以通过**global_average_pooling**实现代替
所以直接将每一个channel的feature map全局池化降维到一个点得到一个channels长度的列向量，再应用MLP进行维度变换。

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import math
%matplotlib inline
print(np.__version__)

1.16.4


In [23]:
class FullyConnectedLayer(object):
    def __init__(self,shape,output_size):
        self.input_shape = shape
        self.batch_size = shape[0]
        self.output_shape = [self.batch_size,output_size]
        self.output_size = output_size
        
        num = 1
        for i in shape[1:]:
            num *= i
        weight_scaler = np.sqrt(num/2)
        self.weights = np.random.randn(num,output_size)/weight_scaler
        self.bias = np.random.randn(output_size)/weight_scaler
        
        self.w_grad = np.zeros(self.weights.shape)
        self.b_grad = np.zeros(self.bias.shape)
    def forward_propagate(self,x):
        self.x = x.reshape([self.batch_size,-1])
        return np.dot(self.x,self.weights) + self.bias
    def gradient_cal(self,delta):
        # 对于w_grad 对于batch中的每一个sample进行反向传播之后sum
        for i in range(delta.shape[0]):
            x_i = self.x[i][:,np.newaxis]
            delta_i = delta[i][:,np.newaxis].T
            self.w_grad += np.dot(x_i,delta_i)
            self.b_grad += np.reshape(delta_i,self.bias.shape)
        # 如果对于整体进行快速处理 可以使用以下code
        delta_transposed = np.transpose(delta[...,np.newaxis],[0,2,1])
        x_extend = self.x[...,np.newaxis]
        # w_grad_everybatch = np.dot(x_extend,delta_transposed) # 默认忽略第一维度 进行运算
        w_grad_everybatch = np.matmul(x_extend,delta_transposed) # 默认忽略第一维度 进行运算
        # print(x_extend.shape,delta_transposed.shape,w_grad_everybatch.shape)
        w_grad = np.sum(w_grad_everybatch,axis=0)
        b_grad = np.sum(delta,axis=0)
        # demo fpr consistence of calculation
        print(w_grad-self.w_grad)
        print(b_grad-self.b_grad)
        
        # input_delta
        input_delta = np.dot(delta,self.weights.T).reshape(self.input_shape)
        return input_delta
    def backward_propagate(self,delta,learning_rate=1e-5,weight_decay=1e-4):
        # zero gradient
        self.w_grad = np.zeros(self.weights.shape)
        self.b_grad = np.zeros(self.bias.shape)
        
        input_delta = self.gradient_cal(delta)
        # use weight decay -> l2 regularization
        self.weights *= (1-weight_decay)
        self.bias *= (1-weight_decay)
        self.weights -= learning_rate * self.w_grad
        self.bias -= learning_rate * self.b_grad
        return input_delta

In [6]:
from functools import reduce
# 对于可是迭代对象进行某个累积函数操作
reduce(lambda x,y:x*y,[1,2,5,36,4]) 
# 可以替代上面的循环

1440

In [24]:
# test code
if __name__ == "__main__":
    img = np.array([[[1,2],[3,4]],[[1,2],[3,4]]])
    fc = FullyConnectedLayer(img.shape, 2)
    out = fc.forward_propagate(img)
    fc.backward_propagate(np.array([[1, -2],[3,4]]))
    print(img.shape,out.shape,fc.w_grad.shape,fc.bias.shape)
    print(out)
    print(fc.w_grad)
    print(fc.b_grad)
    print(fc.weights)
    print(fc.bias)

[[0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]
[0. 0.]
(2, 2, 2) (2, 2) (4, 2) (2,)
[[-1.51977166 -5.2736936 ]
 [-1.51977166 -5.2736936 ]]
[[ 4.  2.]
 [ 8.  4.]
 [12.  6.]
 [16.  8.]]
[4. 2.]
[[-0.21713341  0.9877772 ]
 [-1.12031796 -1.15371593]
 [-0.24995073 -1.6138691 ]
 [ 0.43325534  0.07739696]]
[-0.04625952  0.57788791]


In [20]:
# dot无效 matmul有效
a = np.ones([2,3,4])
b = np.ones([2,4,5])
print(a.dot(b).shape)
print(np.matmul(a,b).shape)

(2, 3, 2, 5)
(2, 3, 5)
