In [2]:
import sys
sys.path.append("../Coding")
import numpy as np
import keras
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

2024-07-10 08:54:40.692653: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
class MLP:
    def __init__(self, input_size, hidden_size, output_size):
        self.W1 = np.random.randn(input_size, hidden_size) * 0.01
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * 0.01
        self.b2 = np.zeros((1, output_size))

    def relu(self, x):
        return np.maximum(0, x)

    def softmax(self, x):
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def forward(self, x):
        z1 = np.dot(x, self.W1) + self.b1
        a1 = self.relu(z1)
        z2 = np.dot(a1, self.W2) + self.b2
        y_pred = self.softmax(z2)
        return y_pred

    def get_weights(self):
        return np.concatenate([self.W1.ravel(), self.b1.ravel(), self.W2.ravel(), self.b2.ravel()])

    def set_weights(self, weights):
        sizes = [self.W1.size, self.b1.size, self.W2.size, self.b2.size]
        self.W1 = weights[:sizes[0]].reshape(self.W1.shape)
        self.b1 = weights[sizes[0]:sizes[0]+sizes[1]].reshape(self.b1.shape)
        self.W2 = weights[sizes[0]+sizes[1]:sizes[0]+sizes[1]+sizes[2]].reshape(self.W2.shape)
        self.b2 = weights[-sizes[3]:].reshape(self.b2.shape)
        
        
def cross_entropy_loss(y_pred, y_true):
    epsilon = 1e-7
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    loss = -np.sum(y_true * np.log(y_pred))
    return loss / y_true.shape[0]


In [4]:
# load model and data

X = x_train.reshape(-1, 784)
Y = np.eye(10)[y_train]  # one-hot encoding, e.g. label 4 -> [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]


mlp = MLP(input_size=784, hidden_size=16, output_size=10)


In [7]:
# these parameters are not really tuned, feel free to improve!

iterations = 100
population_size = 2500
sigma = 1e-3    # we want the purtubation strength in the same ballpark as the weights
learning_rate = 0.5
batch_size = 16

theta = mlp.get_weights()

print("theta", theta.shape, 'weight magnitude: ', np.mean(np.abs(theta)))

for iteration in range(iterations):
    directions = []
    loss = []
    accuracy = []
    for individual in range(population_size):
        # generate random pertubation
        epsilon =  np.random.normal(0, sigma, size=theta.shape)
        losses = []
        for b in range(batch_size):
            # evaluate on a batch of random images
            idx = np.random.randint(0, len(X) - 1)
            sample = X[idx]
            # set weights an compute forwardpass + loss
            mlp.set_weights(theta + epsilon)
            y_pred = mlp.forward(sample)
            cross_entropy = cross_entropy_loss(y_pred, Y[idx])
            
            losses.append(cross_entropy)
            accuracy.append(np.argmax(y_pred) == y_train[idx])
    
        directions.append(epsilon)
        loss.append(np.mean(losses))

    directions = np.array(directions)
    loss = np.array(loss)
    
    # normalize to -0.5 to 0.5
    fitness = ((loss - loss.min()) / (loss.max() - loss.min())) - 0.5

    # estimate gradient by multiplying the random directions with the found loss decrease
    #  note that this includes the directions that turned out to increase the loss, but
    #  since the values are normalized between -0.5 and +0.5, they will end up with a 
    #  negative sign so everything works out
    gradient = np.mean(np.multiply(directions.T, fitness), axis=1) / sigma
    # bonus question: why devide by sigma? 

    # use the gradient for gradient descent
    theta = theta - learning_rate * gradient

    # print(len(theta))

    print(f"Iteration: {iteration+1}/{iterations}, mean accuracy: {np.mean(accuracy):.4f}, mean loss: {np.mean(loss)}", end='')
    print(f" , gradient magnitude {np.mean(np.abs(gradient))/np.mean(np.abs(theta))}, gradient mean {np.mean(gradient)}")


theta (12730,) weight magnitude:  0.008048773592840132
Iteration: 1/100, mean accuracy: 0.1305, mean loss: 0.2395050756748789 , gradient magnitude 0.2795705629181174, gradient mean -3.1312425846834915e-05
Iteration: 2/100, mean accuracy: 0.1409, mean loss: 0.23519497253207083 , gradient magnitude 0.2592709063556789, gradient mean 1.7426001046042663e-05
Iteration: 3/100, mean accuracy: 0.1463, mean loss: 0.23283764341287097 , gradient magnitude 0.31332735273692586, gradient mean 2.0504755285732186e-05
Iteration: 4/100, mean accuracy: 0.1563, mean loss: 0.2306572989573046 , gradient magnitude 0.24633108931448938, gradient mean -2.2089311668897614e-05
Iteration: 5/100, mean accuracy: 0.1752, mean loss: 0.22679559240611816 , gradient magnitude 0.25532838377707295, gradient mean 3.836595452630927e-05
Iteration: 6/100, mean accuracy: 0.1803, mean loss: 0.22310493152660785 , gradient magnitude 0.26960190379325916, gradient mean -1.1896332252146087e-05
Iteration: 7/100, mean accuracy: 0.2046, 

KeyboardInterrupt: 