In [1]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [2]:
# Toy batch for quick experiments: 6 samples with 8 features drawn uniformly
# from [-2, 5), plus one integer class label (0, 1 or 2) per sample.
ytrue = np.array([1, 0, 1, 2, 0, 2])
X = np.random.uniform(-2, 5, (6, 8))

In [2]:
def get_data(path):
    """Read a CSV file and split it into features and integer-encoded labels.

    The first CSV column is used as the row index. Of the remaining columns,
    the last one is taken as the target and all others as features.

    Parameters
    ----------
    path : path of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    (features, labels) : tuple of ``np.ndarray``
        ``features`` holds the feature columns; ``labels`` holds the target
        column after ``LabelEncoder().fit_transform``.
    """
    frame = pd.read_csv(path, index_col=0)

    column_names = list(frame.columns)
    target_name = column_names[-1]
    feature_names = column_names[:-1]

    features = frame[feature_names].copy()
    encoded = LabelEncoder().fit_transform(frame[target_name].copy())

    return np.array(features), np.array(encoded)

# Load the dataset; this rebinds the `X` created in the toy-data cell above.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that exact file exists — consider a configurable data directory.
X, y = get_data(r'C:\Users\12482\Desktop\articles\nn_from_scratch\iris.csv')

In [7]:
class DenseLayer:
    """Fully connected layer with a ReLU (hidden) or softmax (output) activation.

    All arrays use a (samples, features) layout: ``forward`` computes
    ``inputs @ weights + b`` with ``weights`` of shape (n_inputs, neurons)
    and ``b`` of shape (1, neurons).
    """

    def __init__(self, neurons):
        """Create a layer with ``neurons`` output units and a zero bias."""
        self.neurons = neurons
        self.b = np.zeros((1, neurons))
        # Weights are created on the first forward pass, once the number of
        # input features is known.
        self.weights = None

    def ReLU(self, inputs):
        """Element-wise max(0, x)."""
        return np.maximum(0, inputs)

    def Softmax(self, inputs):
        """Row-wise softmax.

        The row maximum is subtracted before exponentiating; this leaves the
        result unchanged mathematically but prevents overflow for large logits.
        """
        shifted = inputs - np.max(inputs, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    def relu_derivative(self, dA, Z):
        """Back-propagate ``dA`` through ReLU: zero it wherever ``Z <= 0``.

        ``dA`` is copied, never modified. For backward compatibility a
        gradient that arrives transposed relative to ``Z`` is transposed back;
        the orientation is decided by an explicit shape check rather than by
        catching whatever exception the indexing happens to raise.

        Raises
        ------
        ValueError
            If ``dA`` matches ``Z`` in neither orientation.
        """
        dZ = np.array(dA, copy=True)
        z_shape = np.shape(Z)
        if dZ.shape != z_shape:
            if dZ.T.shape != z_shape:
                raise ValueError(
                    "gradient of shape %s does not match pre-activation of shape %s"
                    % (dZ.shape, z_shape)
                )
            dZ = dZ.T.copy()
        dZ[Z <= 0] = 0
        return dZ

    def forward(self, inputs, last=False):
        """Run the layer on ``inputs`` of shape (samples, n_inputs).

        Stores the pre-activation in ``self.Z``, the activation in ``self.A``
        (softmax when ``last`` is true, ReLU otherwise) and the inputs in
        ``self.inp``.

        Weights are drawn from U(-1, 1) only when they do not exist yet or
        when the number of input features changed. Re-drawing them on every
        call would throw away any update applied between forward passes.
        """
        expected_shape = (inputs.shape[1], self.neurons)
        current = getattr(self, 'weights', None)
        if current is None or current.shape != expected_shape:
            self.weights = np.random.uniform(low=-1, high=1, size=expected_shape)

        self.Z = np.dot(inputs, self.weights) + self.b
        if last:
            self.A = self.Softmax(self.Z)
        else:
            self.A = self.ReLU(self.Z)
        self.inp = inputs

    def backward(self, dA_curr, W_curr, Z_curr, A_prev, last=False):
        """Compute the gradients for this layer.

        Parameters
        ----------
        dA_curr : gradient flowing into this layer. When ``last`` is true it
            is used directly as the gradient w.r.t. ``Z_curr``; otherwise it
            is first passed through ``relu_derivative``.
        W_curr : weights of this layer, shape (n_inputs, neurons).
        Z_curr : pre-activation of this layer, shape (samples, neurons).
        A_prev : inputs this layer received, shape (samples, n_inputs).
        last : whether this is the output layer.

        Returns
        -------
        dA_prev : gradient w.r.t. ``A_prev``, shape (samples, n_inputs).
            Returning it in the same layout as the activations means the
            preceding layer never has to guess its orientation, which is
            impossible when samples == neurons.
        dW_curr : weight gradient, shape (neurons, n_inputs) — the transpose
            of ``W_curr``'s layout, averaged over the batch.
        db_curr : bias gradient, shape (1, neurons), averaged over the batch.
        """
        m = A_prev.shape[0]

        if last:
            dZ_curr = np.asarray(dA_curr)
        else:
            dZ_curr = self.relu_derivative(dA_curr, Z_curr)

        dW_curr = np.dot(dZ_curr.T, A_prev) / m
        db_curr = np.sum(dZ_curr, axis=0, keepdims=True) / m
        dA_prev = np.dot(dZ_curr, W_curr.T)

        return dA_prev, dW_curr, db_curr

class Network:
    """Minimal sequential network trained with full-batch gradient descent.

    Layers are applied in the order they were added; the last layer is
    treated as the (softmax) output layer. Each layer is expected to provide
    ``forward(inputs, last=False)`` and
    ``backward(dA_curr, W_curr, Z_curr, A_prev, last=False)`` and to expose
    ``weights``, ``b``, ``Z``, ``A`` and ``inp`` after a forward pass.
    """

    def __init__(self):
        self.network = [] ## layers
        self.memory = {} ## Z, A
        self.params = {} ## W, b
        self.gradients = {} ## dW, db

    def add(self, layer):
        """Append ``layer`` to the end of the network."""
        self.network.append(layer)

    def _calculate_loss(self, outputs, labels):
        """Mean categorical cross-entropy.

        ``labels`` may be 1-D class indices or a 2-D one-hot matrix.
        Probabilities are clipped to [1e-7, 1 - 1e-7] so the log stays finite.

        Raises
        ------
        ValueError
            If ``labels`` is neither 1-D nor 2-D.
        """
        labels = np.asarray(labels)
        samples = len(labels)

        out_clipped = np.clip(outputs, 1e-7, 1-1e-7)

        if labels.ndim == 1:
            confs = out_clipped[range(samples), labels]
        elif labels.ndim == 2:
            confs = np.sum(out_clipped*labels, axis=1)
        else:
            raise ValueError("labels must be 1-D class indices or a 2-D one-hot matrix")

        return np.mean(-np.log(confs))

    def _get_accuracy(self, predicted, actual):
        """Fraction of rows whose arg-max prediction equals the true class.

        One-hot ``actual`` is reduced to class indices first; comparing it
        directly against the 1-D arg-max would broadcast incorrectly.
        """
        actual = np.asarray(actual)
        if actual.ndim == 2:
            actual = np.argmax(actual, axis=1)
        return np.mean(np.argmax(predicted, axis=1) == actual)

    def _forwardprop(self, data):
        """Run ``data`` through every layer and return the final activation.

        The weights, bias, pre-activation and activation of layer ``i``
        (1-based) are cached in ``self.memory[i]`` for the backward pass.
        Works for any number of layers >= 1, including a single output layer.

        Raises
        ------
        ValueError
            If no layer has been added.
        """
        if not self.network:
            raise ValueError("cannot run a forward pass on a network without layers")

        last_idx = len(self.network) - 1
        activation = data
        for idx, layer in enumerate(self.network):
            layer.forward(activation, last=(idx == last_idx))
            self.memory[idx+1] = {'W':layer.weights, 'Z':layer.Z, 'A':layer.A,
                                  'b':layer.b}
            activation = layer.A

        return activation

    def _backprop(self, actual_y, predicted_y):
        """Back-propagate the softmax cross-entropy gradient through all layers.

        Must be called after ``_forwardprop``. Stores ``{'dW', 'db'}`` for
        layer ``i`` (1-based) in ``self.gradients[i]``.

        ``actual_y`` may be 1-D class indices or a 2-D one-hot matrix.
        ``predicted_y`` is not modified.
        """
        actual_y = np.asarray(actual_y)
        samples = predicted_y.shape[0]

        # Work on a copy: predicted_y is the cached activation of the output
        # layer, and editing it in place would corrupt the forward cache.
        dscores = np.array(predicted_y, dtype=float, copy=True)
        if actual_y.ndim == 1:
            dscores[np.arange(samples), actual_y] -= 1
        elif actual_y.ndim == 2:
            dscores -= actual_y
        else:
            raise ValueError("actual_y must be 1-D class indices or a 2-D one-hot matrix")
        # No division by the batch size here: every layer's backward() already
        # averages dW and db over the batch, so dividing here as well would
        # scale all gradients by 1/m twice.

        dA_prev = dscores
        last_idx = len(self.network) - 1

        for idx in range(last_idx, -1, -1):
            layer = self.network[idx]
            layer_idx_curr = idx + 1

            ## first layer --> no previous activation, inputs are the initial data
            if idx != 0:
                A_prev = self.memory[idx]['A']
            else:
                A_prev = layer.inp

            Z_curr = self.memory[layer_idx_curr]['Z']
            W_curr = self.memory[layer_idx_curr]['W']

            # The output layer sits at index len(network) - 1; it receives the
            # gradient w.r.t. its pre-activation directly (last=True).
            dA_prev, dW_curr, db_curr = layer.backward(
                dA_prev, W_curr, Z_curr, A_prev, last=(idx == last_idx)
            )

            self.gradients[layer_idx_curr] = {'dW':dW_curr, 'db':db_curr}

    def train(self, X, y, epochs, lr=0.03):
        """Train with full-batch gradient descent.

        Parameters
        ----------
        X : input data, shape (samples, features).
        y : class indices (1-D) or one-hot labels (2-D).
        epochs : number of passes over ``X``.
        lr : learning rate.

        The per-epoch loss and accuracy (measured before that epoch's update)
        are appended to ``self.loss`` and ``self.accuracy``.
        """
        self.loss = []
        self.accuracy = []

        for _ in range(epochs):
            yhat = self._forwardprop(X)
            self.loss.append(self._calculate_loss(outputs=yhat, labels=y))
            self.accuracy.append(self._get_accuracy(predicted=yhat, actual=y))

            self._backprop(actual_y=y, predicted_y=yhat)

            for layer_idx, layer in enumerate(self.network, start=1):
                grads = self.gradients[layer_idx]
                # dW comes back as (neurons, n_inputs), hence the transpose.
                # The update is in place, so self.memory, which references the
                # same arrays, stays in sync with the layer.
                layer.weights -= lr * grads['dW'].T
                layer.b -= lr * grads['db']
        

# Assemble the network: the first layer is as wide as the number of rows in X,
# followed by layers of 8 and 10 units and a 3-unit output layer.
model = Network()
for width in (X.shape[0], 8, 10, 3):
    model.add(DenseLayer(width))

# model.train(X=X, y=y, epochs=50)
# A single forward pass only; the cell displays the shape of the output.
out = model._forwardprop(X)
out.shape

(150, 3)

In [12]:
# Width of every layer, in the order the layers were added.
for layer in model.network:
    print(layer.neurons)

150
8
10
3


In [11]:
# Shape of the weight matrix cached for each layer by the last forward pass.
for cache in model.memory.values():
    print(cache['W'].shape)

(4, 150)
(150, 8)
(8, 10)
(10, 3)


In [23]:
# model.memory[2]['W'] -= 0.01 * model.gradients[2]['dW'].T
# model.memory[2]['b'] -= 0.01 * model.gradients[2]['db'].T

# Compare each layer's bias shape with the shape of its bias gradient.
for layer_idx in (1, 2, 3, 4):
    print(model.memory[layer_idx]['b'].shape, model.gradients[layer_idx]['db'].shape)

(1, 6) (1, 6)
(1, 15) (1, 15)
(1, 10) (1, 10)
(1, 3) (1, 3)


In [205]:
# Build a label array with the same shape as the network output `out`.
# NOTE(review): the Network class shown in this notebook defines no `_one_hot`
# method, so this cell depends on a definition that is no longer present —
# confirm where it should come from before re-running.
actual_y = model._one_hot(labels=ytrue)
actual_y = actual_y.reshape(out.shape)

def relu_derivative(dA, Z):
    """Return a copy of ``dA`` with entries zeroed wherever ``Z <= 0``.

    ``dA`` itself is left untouched. ``Z`` must be usable as a boolean mask
    on the copy, i.e. have a matching shape.
    """
    grad = np.array(dA, copy = True)
    inactive = Z <= 0
    grad[inactive] = 0
    return grad

def backward(dA_curr, W_curr, Z_curr, A_prev):
    """Stand-alone backward step through one ReLU layer.

    Masks ``dA_curr`` with ``relu_derivative`` and returns, in this order,
    the gradient handed to the preceding layer (``W_curr @ dZ.T``), the
    weight gradient (``dZ.T @ A_prev``) and the bias gradient, the latter two
    divided by ``A_prev.shape[1]``.

    NOTE(review): the normaliser is ``A_prev.shape[1]`` and the bias gradient
    is summed over ``axis=1``, whereas ``DenseLayer.backward`` uses
    ``shape[0]`` and ``axis=0`` — confirm which array layout this scratch
    version was written for.
    """
    scale = A_prev.shape[1]

    masked = relu_derivative(dA_curr, Z_curr)
    weight_grad = np.dot(masked.T, A_prev) / scale
    bias_grad = np.sum(masked, axis=1, keepdims=True) / scale
    upstream = np.dot(W_curr, masked.T)

    return upstream, weight_grad, bias_grad

# Manual backward step for layer 4 (the last layer).
# The expression below has the form of the binary cross-entropy derivative
# with respect to the predictions `out`.
# NOTE(review): it divides by `out` and `1 - out`, so it is undefined where a
# prediction is exactly 0 or 1.
dA_curr = - (np.divide(actual_y, out) - np.divide(1 - actual_y, 1 - out))
W_curr = model.memory[4]['W']
Z_curr = model.memory[4]['Z']
A_prev = model.memory[3]['A']
    
dA_prev, dW_curr, db_curr = backward(dA_curr, W_curr, Z_curr, A_prev)

In [206]:
# Manual backward step for layer 3, fed with the gradient returned for layer 4.
W_curr = model.memory[3]['W']
Z_curr = model.memory[3]['Z']
A_prev = model.memory[2]['A']

dA_prev1, dW_curr1, db_curr1 = backward(dA_prev, W_curr, Z_curr, A_prev)

In [207]:
# Manual backward step for layer 2.
# NOTE(review): unlike the neighbouring cells, the incoming gradient is passed
# transposed (`dA_prev1.T`) here — confirm the intended orientation.
W_curr = model.memory[2]['W']
Z_curr = model.memory[2]['Z']
A_prev = model.memory[1]['A']

dA_prev2, dW_curr2, db_curr2 = backward(dA_prev1.T, W_curr, Z_curr, A_prev)

In [208]:
# Manual backward step for layer 1; its input is the raw data X rather than a
# cached activation.
W_curr = model.memory[1]['W']
Z_curr = model.memory[1]['Z']
A_prev = X

dA_prev3, dW_curr3, db_curr3 = backward(dA_prev2, W_curr, Z_curr, A_prev)

In [209]:
# Shapes of the manually computed weight gradients, from layer 4 down to layer 1.
for manual_grad in (dW_curr, dW_curr1, dW_curr2, dW_curr3):
    print(manual_grad.shape)

(3, 6)
(6, 10)
(10, 6)
(6, 8)


In [211]:
# Shapes of the weight gradients stored on the model, from layer 4 down to layer 1.
for layer_idx in (4, 3, 2, 1):
    print(model.gradients[layer_idx]['dW'].shape)

(3, 6)
(6, 10)
(10, 6)
(6, 8)
