In [1]:
import numpy as np

input: shape (batch size, number of features)

weight: shape (number of features, number of hidden neurons)

bias: shape (number of hidden neurons,), broadcast across the batch when it is added

### Example: a small fully connected neural network

    input features: 4
    hidden layers: 2 (20 units each)
    output classes: 10
    batch size: 25
    activation function: sigmoid

In [2]:
def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, applied element-wise.

    Evaluated in a numerically stable form: ``exp`` is only ever called on
    non-positive values, so large-magnitude negative inputs do not trigger
    the overflow ``RuntimeWarning`` that the naive formula produces.

    Args:
        x: scalar or array-like of real numbers.

    Returns:
        The sigmoid of ``x`` with the same shape as ``x`` (a NumPy scalar
        when ``x`` is a scalar).
    """
    x = np.asarray(x)
    z = np.exp(-np.abs(x))  # argument is <= 0, so z <= 1 and exp cannot overflow
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- the same value.
    out = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
    return out[()]  # turns a 0-d result into a NumPy scalar; n-d arrays pass through

In [3]:
# Random mini-batch: 25 samples with 4 features each.
x = np.random.standard_normal(size=(25, 4))

# Weight matrices for the three affine maps: 4 -> 20 -> 20 -> 10.
w1 = np.random.standard_normal(size=(4, 20))
w2 = np.random.standard_normal(size=(20, 20))
w3 = np.random.standard_normal(size=(20, 10))

# Bias vectors for the two hidden layers.
b1 = np.random.standard_normal(size=20)
b2 = np.random.standard_normal(size=20)

In [4]:
# Forward pass. Each hidden layer is an affine map followed by the sigmoid
# activation; without the non-linearity the stacked matmuls would collapse
# into a single linear map of x.
h1 = np.matmul(x, w1) + b1   # pre-activation of hidden layer 1
a1 = sigmoid(h1)
h2 = np.matmul(a1, w2) + b2  # pre-activation of hidden layer 2
a2 = sigmoid(h2)
h3 = np.matmul(a2, w3)       # output scores (no bias is defined for this layer)
y = sigmoid(h3)

# Predicted class = index of the largest output for each sample.
output = y.argmax(axis=1)

In [5]:
# Display the predicted class index (argmax over the output columns) for each sample.
output

array([5, 5, 5, 4, 5, 5, 0, 3, 3, 5, 0, 3, 0, 3, 5, 3, 0, 3, 0, 3, 2, 8,
       0, 0, 0])

### TwoLayerNet class

In [6]:
class Sigmoid:
    """Element-wise logistic sigmoid activation layer.

    The layer has no trainable parameters, so ``params`` and ``grads`` are
    both empty lists; they exist so that code iterating over layers can treat
    every layer the same way.
    """

    def __init__(self):
        self.params = list()
        self.grads = list()
        self.out = None  # output of the latest forward(), needed by backward()

    def forward(self, x):
        """Return ``1 / (1 + exp(-x))`` element-wise and cache the result.

        Uses a numerically stable form in which ``exp`` is only called on
        non-positive values, so large negative inputs do not overflow.
        """
        x = np.asarray(x)
        z = np.exp(-np.abs(x))  # argument is <= 0, so exp cannot overflow
        # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- the same value.
        out = np.where(x >= 0, 1 / (1 + z), z / (1 + z))[()]
        self.out = out
        return out

    def backward(self, dout):
        """Propagate the upstream gradient ``dout`` through the sigmoid.

        Uses d(sigmoid)/dx = out * (1 - out), where ``out`` is the value
        cached by the most recent forward() call.
        """
        dx = dout * (1.0 - self.out) * self.out
        return dx
    
class Affine:
    """Fully connected layer computing ``matmul(x, w) + b``.

    ``params`` holds ``[w, b]`` and ``grads`` holds gradient buffers of
    matching shapes, filled in place by ``backward``.
    """

    def __init__(self, w, b):
        self.params = [w, b]
        self.grads = [np.zeros_like(w), np.zeros_like(b)]
        self.x = None  # input of the latest forward(), needed by backward()

    def forward(self, x):
        """Apply the affine map to ``x`` and remember ``x`` for backward()."""
        weight, bias = self.params
        result = np.matmul(x, weight) + bias
        self.x = x
        return result

    def backward(self, dout):
        """Store the parameter gradients and return the gradient w.r.t. the input."""
        weight = self.params[0]
        grad_input = np.matmul(dout, weight.T)
        new_grads = (
            np.matmul(self.x.T, dout),  # gradient w.r.t. w
            np.sum(dout, axis=0),       # gradient w.r.t. b: sum over the batch axis
        )
        # Write into the existing buffers so references held elsewhere stay valid.
        for buffer, grad in zip(self.grads, new_grads):
            buffer[...] = grad
        return grad_input

class TwoLayerNet:
    """Small network: Affine -> Sigmoid -> Affine.

    All weights and biases are drawn from a standard normal distribution.
    ``params`` is a flat list of every layer's parameters, in layer order.
    """

    def __init__(self, input_size, hidden_size, output_size):
        # Parameters are drawn in the order w1, b1, w2, b2.
        w1 = np.random.randn(input_size, hidden_size)
        b1 = np.random.randn(hidden_size)
        w2 = np.random.randn(hidden_size, output_size)
        b2 = np.random.randn(output_size)

        # Layers are applied in list order by predict().
        self.layers = [Affine(w1, b1), Sigmoid(), Affine(w2, b2)]

        # Flatten every layer's parameter list into a single list.
        self.params = [param for layer in self.layers for param in layer.params]

    def predict(self, x):
        """Feed ``x`` through every layer in order and return the final output."""
        activation = x
        for layer in self.layers:
            activation = layer.forward(activation)
        return activation

class MatMul:
    """Bias-free linear layer computing ``matmul(x, w)``.

    ``params`` holds ``[w]`` and ``grads`` holds one gradient buffer with the
    same shape as ``w``, filled in place by ``backward``.
    """

    def __init__(self, w):
        self.params = [w]
        self.grads = [np.zeros_like(w)]
        self.x = None  # input of the latest forward(), needed by backward()

    def forward(self, x):
        """Return ``matmul(x, w)`` and remember ``x`` for backward()."""
        # Unpack the single weight array; binding the list itself would make
        # matmul see a stacked (1, ...) operand and add a spurious leading axis.
        w, = self.params
        out = np.matmul(x, w)
        self.x = x
        return out

    def backward(self, dout):
        """Store the weight gradient and return the gradient w.r.t. the input."""
        w, = self.params
        dx = np.matmul(dout, w.T)
        dw = np.matmul(self.x.T, dout)
        # [...] writes into the existing buffer in place (no rebinding), so
        # references to the gradient array held elsewhere stay valid.
        self.grads[0][...] = dw
        return dx

In [7]:
# Ten random 2-feature samples pushed through a freshly initialised 2-4-3 network.
x = np.random.standard_normal(size=(10, 2))
model = TwoLayerNet(input_size=2, hidden_size=4, output_size=3)
s = model.predict(x)

# Index of the highest score in each row.
np.argmax(s, axis=1)

array([2, 1, 2, 1, 1, 1, 1, 1, 2, 2])

In [8]:
# Display the raw scores returned by model.predict, one row per input sample.
s

array([[-1.23350734,  0.43879021,  0.58573659],
       [-2.39311572,  1.33205717,  0.59469898],
       [ 0.07359665, -0.18569587,  0.6302144 ],
       [-1.77459451,  1.18265017,  0.63969191],
       [-2.46917619,  1.42073371,  0.58501893],
       [-1.90508727,  1.08485231,  0.56693969],
       [-2.17589771,  0.98408014,  0.65438396],
       [-2.11279047,  1.20719844,  0.574844  ],
       [-0.89702147,  0.2054461 ,  0.58852337],
       [-1.55726994,  0.56462495,  0.62330277]])

### Optimizer

In [9]:
class SGD:
    """Plain stochastic gradient descent: ``param <- param - lr * grad``."""

    def __init__(self, lr=0.01):
        self.lr = lr  # learning rate (step size)

    def update(self, params, grads):
        """Take one descent step on every entry of ``params``.

        ``params`` and ``grads`` are indexed by position; entry ``i`` of
        ``grads`` is the gradient for entry ``i`` of ``params``.
        """
        for idx in range(len(params)):
            step = self.lr * grads[idx]
            params[idx] -= step

class Momentum:
    '''
    SGD with momentum.

    Keeps one velocity buffer per parameter and applies
        v     <- momentum * v - lr * grad
        param <- param + v
    '''
    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None  # velocity buffers, created lazily on the first update()

    def update(self, params, grads):
        '''
        Apply one momentum step to every entry of ``params``.

        ``params`` and ``grads`` may be dicts sharing the same keys, or
        sequences aligned by position (the layout ``SGD.update`` uses).
        '''
        # Dicts are walked by key, sequences by index.
        if hasattr(params, "keys"):
            keys = list(params.keys())
        else:
            keys = range(len(params))

        if self.v is None:
            # Start every velocity at zero, shaped like its parameter.
            self.v = {key: np.zeros_like(params[key]) for key in keys}

        for key in keys:
            self.v[key] = self.momentum * self.v[key] - self.lr * grads[key]
            params[key] += self.v[key]