In [3]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist as dataset

# Fetch the MNIST train/test split through the Keras dataset helper.
(train_inputs, train_labels), (test_inputs, test_labels) = dataset.load_data()

# NOTE: the pixel values are used as loaded; the optional rescaling step
# (`train_inputs.astype("float32") / 255.0`) is deliberately left disabled.

# Flatten every training image into one 784-element row vector.
training_data = np.reshape(train_inputs, (-1, 784))

# One-hot encode the integer class labels: row i of the 10x10 identity
# matrix is the target vector for class i.
label = np.identity(10)[train_labels]

2024-07-09 18:18:46.224464: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The naive formula overflows in ``np.exp(-x)`` for large negative ``x``.
    Here the exponential is only ever taken of a non-positive number, so it
    stays within (0, 1] and no overflow can occur.

    Args:
        x: scalar or array-like of real numbers. Non-floating input is
            converted to float64 first.

    Returns:
        The element-wise sigmoid: a NumPy scalar for scalar input, otherwise
        an array with the same shape as ``x``.
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        x = x.astype(np.float64)

    # z = exp(-|x|) is always in (0, 1], whatever the sign of x.
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- same value.
    out = np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
    # Indexing with () unwraps a 0-d result to a scalar and leaves arrays as-is.
    return out[()]

In [5]:
def loss(yhat, y):
    """Mean categorical cross-entropy between predictions and one-hot targets.

    Args:
        yhat: predicted class probabilities, shape ``(n_classes,)`` or
            ``(batch, n_classes)``.
        y: one-hot targets, shape ``(n_classes,)`` for a single sample or
            ``(batch, n_classes)`` for a batch.

    Returns:
        The cross-entropy summed over classes and averaged over the batch.
        A 1-D ``y`` counts as a batch of one sample, so the sum is no longer
        divided by the number of classes in that case.
    """
    # Promote a single target vector to a (1, n_classes) batch so that
    # shape[0] is always the number of samples.
    y = np.atleast_2d(np.asarray(y, dtype=float))
    m = y.shape[0]

    # Keep probabilities away from 0 and 1 so that log() stays finite.
    epsilon = 1e-7
    yhat = np.clip(np.asarray(yhat, dtype=float), epsilon, 1 - epsilon)

    return -np.sum(y * np.log(yhat)) / m

In [6]:
# Scratch cell: quick check that list indexing is zero-based
# (index 1 selects the second element).
hd = list(range(1, 4))
print(hd[1])

2


In [9]:
class forwardPass(object):
    """Two-layer perceptron (ReLU hidden layer, softmax output) trained by
    a simple evolution strategy instead of back-propagation.

    The parameters live in ``w1``/``b1`` (input -> hidden) and ``w2``/``b2``
    (hidden -> output). ``get_weight``/``set_weight`` convert between these
    four arrays and a single flat parameter vector, which is what ``nevo``
    perturbs and updates.
    """

    def __init__(self, layer):
        """Create the network with small random weights and zero biases.

        Args:
            layer: sequence ``(n_inputs, n_hidden, n_outputs, ...)``. Only the
                first three entries are used to size the weights.
        """
        self.num_layer = len(layer)
        self.layer = layer
        self.w1 = np.random.randn(layer[0], layer[1]) * 0.01
        self.b1 = np.zeros((1, layer[1]))
        self.w2 = np.random.randn(layer[1], layer[2]) * 0.01
        self.b2 = np.zeros((1, layer[2]))

    def softmax(self, x):
        """Softmax over the last axis, so each row is normalised on its own.

        Subtracting the per-row maximum before exponentiating avoids overflow
        without changing the result.
        """
        shifted = x - np.max(x, axis=-1, keepdims=True)
        exp_x = np.exp(shifted)
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

    def relu(self, x):
        """Element-wise rectified linear unit, ``max(0, x)``."""
        return np.maximum(0, x)

    def forward(self, x):
        """Run the network on one sample or on a batch.

        Args:
            x: array of shape ``(n_inputs,)`` or ``(batch, n_inputs)``.

        Returns:
            Class probabilities of shape ``(batch, n_outputs)``; a single 1-D
            sample yields shape ``(1, n_outputs)`` because the biases are
            stored as row vectors.
        """
        r1 = np.dot(x, self.w1) + self.b1
        x1 = self.relu(r1)
        r2 = np.dot(x1, self.w2) + self.b2
        pred = self.softmax(r2)
        return pred

    def get_weight(self):
        """Return all parameters as one flat vector, ordered w1, b1, w2, b2."""
        return np.concatenate([self.w1.ravel(),
                               self.b1.ravel(),
                               self.w2.ravel(),
                               self.b2.ravel()])

    def set_weight(self, weights):
        """Load parameters from a flat vector laid out as in ``get_weight``.

        The values are copied, so later in-place changes to ``weights`` do
        not leak into the network.

        Raises:
            ValueError: if ``weights`` does not contain exactly as many
                values as the network has parameters.
        """
        weights = np.asarray(weights, dtype=float).ravel()
        names = ("w1", "b1", "w2", "b2")
        expected = sum(getattr(self, name).size for name in names)
        if weights.size != expected:
            raise ValueError(
                f"expected {expected} weights, got {weights.size}")

        # Walk through the flat vector, carving out one parameter at a time.
        offset = 0
        for name in names:
            current = getattr(self, name)
            chunk = weights[offset:offset + current.size]
            setattr(self, name, chunk.reshape(current.shape).copy())
            offset += current.size

    def nevo(self, generations=100, population=2500, batch_size=15,
             sigma=1e-3, learning_rate=0.1, inputs=None, targets=None):
        """Train the network with a basic evolution strategy.

        Each generation draws ``population`` Gaussian perturbations of the
        current parameter vector, scores each one by its loss on a random
        mini-batch, and combines the perturbations (weighted by normalised
        loss) into a gradient estimate used for one gradient-descent step.
        When the method returns, the network holds the final parameters.

        Args:
            generations: number of update steps.
            population: number of perturbations evaluated per generation.
            batch_size: number of random samples used to score a perturbation.
            sigma: standard deviation of the perturbations.
            learning_rate: step size of the gradient-descent update.
            inputs: array ``(n_samples, n_inputs)``; defaults to the
                module-level ``training_data``.
            targets: one-hot array ``(n_samples, n_outputs)``; defaults to the
                module-level ``label``.

        Raises:
            ValueError: if ``population`` or ``batch_size`` is smaller than 1.
        """
        if population < 1 or batch_size < 1:
            raise ValueError("population and batch_size must be at least 1")

        inputs = np.asarray(training_data if inputs is None else inputs)
        targets = np.asarray(label if targets is None else targets)
        n_samples = len(inputs)

        theta = self.get_weight()

        for generation in range(generations):
            directions = np.empty((population, theta.size))
            loss_list = np.empty(population)
            accuracy = []

            for individual in range(population):
                # Generate a random perturbation of the current parameters.
                epsilon = np.random.normal(0, sigma, size=theta.shape)

                # Pick a random mini-batch. randint's upper bound is
                # exclusive, so every sample (including the last) can be drawn.
                idx = np.random.randint(0, n_samples, size=batch_size)

                # Set the perturbed weights once, then score the whole batch
                # in a single forward pass.
                self.set_weight(theta + epsilon)
                pred = self.forward(inputs[idx])
                batch_targets = targets[idx]

                directions[individual] = epsilon
                loss_list[individual] = loss(pred, batch_targets)
                accuracy.extend(np.argmax(pred, axis=1)
                                == np.argmax(batch_targets, axis=1))

            # Normalise the losses to the range -0.5 .. 0.5. If every
            # individual scored the same there is no signal: use zeros rather
            # than dividing by zero.
            spread = loss_list.max() - loss_list.min()
            if spread > 0:
                fitness = (loss_list - loss_list.min()) / spread - 0.5
            else:
                fitness = np.zeros_like(loss_list)

            # Estimate the gradient as the fitness-weighted mean of the
            # perturbations. Directions that lowered the loss get a negative
            # weight, so the estimate points towards increasing loss.
            # Dividing by sigma rescales the perturbations to unit variance so
            # that the step size does not depend on the noise scale.
            gradient = np.dot(fitness, directions) / (population * sigma)

            # Use the gradient estimate for a gradient-descent step.
            theta = theta - learning_rate * gradient

            # Keep the network in sync with the learned parameters rather
            # than leaving the last perturbed individual loaded.
            self.set_weight(theta)

            print(f"Iteration: {generation+1}/{generations}, mean accuracy: {np.mean(accuracy):.4f} \n")


        

In [10]:
ls = forwardPass([784, 100, 10])
ls.nevo()

Iteration: 1/100, mean accuracy: 0.0806 

Iteration: 2/100, mean accuracy: 0.0793 

Iteration: 3/100, mean accuracy: 0.0818 

Iteration: 4/100, mean accuracy: 0.0876 

Iteration: 5/100, mean accuracy: 0.0842 

Iteration: 6/100, mean accuracy: 0.0885 

Iteration: 7/100, mean accuracy: 0.0897 

Iteration: 8/100, mean accuracy: 0.1001 

Iteration: 9/100, mean accuracy: 0.1034 

Iteration: 10/100, mean accuracy: 0.1108 

Iteration: 11/100, mean accuracy: 0.1209 

Iteration: 12/100, mean accuracy: 0.1318 



KeyboardInterrupt: 