# Tema 3 - Multi Layer Perceptron

In [2]:
import math
from venv import create

import numpy as np
from numpy.ma.core import reshape, argmax
from torchvision.datasets import MNIST

def download_mnist(is_train: bool):
    dataset = MNIST(root='./data',
    transform=lambda x: np.array(x).flatten(),
    download=True,
    train=is_train)
    mnist_data = []
    mnist_labels = []
    for image, label in dataset:
        mnist_data.append(image)
        mnist_labels.append(label)
    return mnist_data, mnist_labels

### Initializarea si normalizarea datelor

In [3]:
def normalize_data(v: np.array) -> np.array:
    return v / 256

def one_hot_encode(v: np.array, nr_classes: int) -> np.array:
    return np.array([np.array([int(i == label) for i in range(nr_classes)]) for label in v])

def initialize_data() -> ():
    train_x, train_y = download_mnist(True)
    test_x, test_y = download_mnist(False)
    
    #convertim datele in np.array s
    train_x = np.array(train_x)
    train_y = np.array(train_y)
    test_x = np.array(test_x)
    test_y = np.array(test_y)
    
    train_x, test_x = normalize_data(train_x), normalize_data(test_x)
    train_y, test_y = one_hot_encode(train_y, 10), one_hot_encode(test_y, 10)
    
    return train_x, train_y, test_x, test_y

In [4]:
train_x, train_y, test_x, test_y = initialize_data()

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
<urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1129)>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data\MNIST\raw\train-images-idx3-ubyte.gz


100.0%


Extracting ./data\MNIST\raw\train-images-idx3-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
<urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1129)>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data\MNIST\raw\train-labels-idx1-ubyte.gz


100.0%


Extracting ./data\MNIST\raw\train-labels-idx1-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
<urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1129)>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data\MNIST\raw\t10k-images-idx3-ubyte.gz


100.0%


Extracting ./data\MNIST\raw\t10k-images-idx3-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
<urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1129)>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz


100.0%


Extracting ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw



### Initializarea si antreanarea modelului

In [5]:
def softmax(z: np.array) -> np.array:
    z = z - np.max(z, axis=-1, keepdims=True)
    exp_z = np.exp(z)
    return exp_z / np.sum(exp_z, axis=-1, keepdims=True)

def sigmoid(z: np.array) -> np.array:
    return 1 / (1 + np.exp(-z))

def relu(z: np.array) -> np.array:
    return np.maximum(z, 0)

def relu_derivative(z: np.array) -> np.array:
    return np.where(z > 0, 1, 0)

def cross_entropy_loss(y_true: np.array, y_pred: np.array) -> np.array:
    return -np.sum(y_true * np.log(y_pred + 1e-9)) / y_true.shape[1]  # Added small value to avoid log(0)

Partea de stochastic gradient descent. Se observa partea de forward propagation in care trecem batch-ul prin retea, urmata de partea de backward propagation in care calculam derivatele functii de pierdere in raport cu valorile neuronilor fiecarui strat.<br>
Pentru stratul de iesire avem $delta^2 = y^2 - labels$<br>
Pentru stratul ascuns avem $delta^1 = delta^2 \cdot (w^2)^T \times ReLU'(z^1)$<br>
La sfarsit actualizam ponderile si bias-urile inmultind cu learning rate-ul si impartind la batch size.

In [6]:
losses = []

def train(instances: np.array, labels: np.array, hidden_layer: np.array, output_layer: np.array, alpha: float) -> None:
    #print(hidden_layer[1][0])
    #forward
    instances = np.hstack((np.ones((instances.shape[0], 1)), instances))
     
    z1 = np.dot(instances, hidden_layer)
    y1 = relu(z1)
    
    y1 = np.hstack((np.ones((y1.shape[0], 1)), y1))
    z2 = np.dot(y1, output_layer)
    y2 = softmax(z2) #np.array([softmax(row) for row in z2])
    
    losses.append(cross_entropy_loss(labels, y2))
    
    #backward
    delta2 = y2 - labels
    #delta1 = np.dot(delta2, output_layer[1:, :].T) * relu_derivative(z1) #[1:] because we don't take the bias into consideration when we compute dC/dy1
    delta1 = np.dot(delta2, output_layer[1:, :].T) * relu_derivative(z1)
    #print(instances)
    #gradient descent
    output_layer -= alpha * np.dot(y1.T, delta2) / instances.shape[0]
    hidden_layer -= alpha * np.dot(instances.T, delta1) / instances.shape[0]
    #print(instances.shape, delta1.shape)
    

In [7]:
def create_layer(in_size: int, out_size: int) -> np.array:
    return np.random.randn(in_size, out_size) / np.sqrt(in_size)

def create_model(epochs: int, alpha: float, batch_size: int, hidden_layer: np.array, output_layer: np.array, instances: np.array, labels:np.array) -> None:
    for epoch in range(epochs):
        for i in range(instances.shape[0] // batch_size):
            #print(np.max(hidden_layer[0]), np.max(output_layer[0]))
            train(instances[i: i + batch_size], labels[i: i + batch_size], hidden_layer, output_layer, alpha)
        #alpha *= 0.998

In [8]:
hidden_layer = create_layer(785, 100) # 785 fiindca adaugam si bias-ul aici
output_layer = create_layer(101, 10) # acelasi motiv pentru 101

create_model(30, 0.001, 1, hidden_layer, output_layer, train_x, train_y)
#print(hidden_layer[1][0])

### Testarea modelului

In [10]:
def make_prediction(instance: np.array, hidden_layer: np.array, output_layer: np.array) -> np.array:
    instance = np.hstack((np.ones((instance.shape[0], 1)), instance))
     
    z1 = np.dot(instance, hidden_layer).T
    y1 = relu(z1)
    
    y1 = np.vstack((np.ones((1, y1.shape[1])), y1))
    z2 = np.dot(y1.T, output_layer)
    y2 = softmax(z2) #np.array([softmax(row) for row in z2.T])
    #print(z2, y2)
    
    return y2
    
def test_model(hidden_layer, output_layer, test_x: np.array, test_y: np.array) -> float:
    good_guess = 0
    for instance, label in zip(test_x, test_y):
        #print(make_prediction(np.atleast_2d(instance), hidden_layer, output_layer))
        if np.argmax(make_prediction(np.atleast_2d(instance), hidden_layer, output_layer)) == np.argmax(label):
            good_guess += 1
    return good_guess / len(test_y)

In [11]:
print(test_model(hidden_layer, output_layer, test_x, test_y))
print(test_model(hidden_layer, output_layer, train_x, train_y))

0.9746
0.9920666666666667


In [293]:
params_list = [
    {"Epochs": 10, "Learning Rate": 0.01, "Batch Size": 100},
    {"Epochs": 10, "Learning Rate": 0.001, "Batch Size": 100},
    {"Epochs": 10, "Learning Rate": 0.01, "Batch Size": 10},
    {"Epochs": 10, "Learning Rate": 0.001, "Batch Size": 10},
    {"Epochs": 20, "Learning Rate": 0.01, "Batch Size": 100},
    {"Epochs": 20, "Learning Rate": 0.001, "Batch Size": 100},
    {"Epochs": 20, "Learning Rate": 0.01, "Batch Size": 10},
    {"Epochs": 20, "Learning Rate": 0.001, "Batch Size": 10},
    {"Epochs": 100, "Learning Rate": 0.01, "Batch Size": 100},
    {"Epochs": 100, "Learning Rate": 0.001, "Batch Size": 100},
    {"Epochs": 100, "Learning Rate": 0.01, "Batch Size": 10},
    {"Epochs": 100, "Learning Rate": 0.001, "Batch Size": 10},
]

In [287]:
train_x, train_y, test_x, test_y = initialize_data()

In [294]:
import time

nr_tests_per_param = 1

results = []
for params in params_list:
    for _ in range(nr_tests_per_param):
        print(params)
        hidden_layer = create_layer(785, 100) # 785 fiindca adaugam si bias-ul aici
        output_layer = create_layer(101, 10) # acelasi motiv pentru 101
        create_model(10, 0.1, 100, hidden_layer, output_layer, train_x, train_y)
        start = time.time()
        create_model(params["Epochs"], params["Learning Rate"], params["Batch Size"], hidden_layer, output_layer, train_x, train_y)
        end = time.time()
        
        results.append({"Epochs": params["Epochs"], "Learning Rate": params["Learning Rate"], "Batch Size": params["Batch Size"], "Training Time": end - start, "Accuracy": test_model(hidden_layer, output_layer, test_x, test_y)})

{'Epochs': 10, 'Learning Rate': 0.01, 'Batch Size': 100}
{'Epochs': 10, 'Learning Rate': 0.001, 'Batch Size': 100}
{'Epochs': 10, 'Learning Rate': 0.01, 'Batch Size': 10}
{'Epochs': 10, 'Learning Rate': 0.001, 'Batch Size': 10}
{'Epochs': 20, 'Learning Rate': 0.01, 'Batch Size': 100}
{'Epochs': 20, 'Learning Rate': 0.001, 'Batch Size': 100}
{'Epochs': 20, 'Learning Rate': 0.01, 'Batch Size': 10}
{'Epochs': 20, 'Learning Rate': 0.001, 'Batch Size': 10}
{'Epochs': 100, 'Learning Rate': 0.01, 'Batch Size': 100}
{'Epochs': 100, 'Learning Rate': 0.001, 'Batch Size': 100}
{'Epochs': 100, 'Learning Rate': 0.01, 'Batch Size': 10}
{'Epochs': 100, 'Learning Rate': 0.001, 'Batch Size': 10}


### Rezultate

In [295]:
import pandas as pd

df = pd.DataFrame(results)
df.to_csv("model_training_results.csv", index=False)

In [296]:
averaged_results = df.groupby(["Epochs", "Learning Rate", "Batch Size"]).agg({
    "Accuracy": "mean",
    "Training Time": "mean"
}).reset_index()

print(averaged_results)

    Epochs  Learning Rate  Batch Size  Accuracy  Training Time
0       10          0.001          10    0.9163      30.149229
1       10          0.001         100    0.8632       7.468807
2       10          0.010          10    0.9301      30.596278
3       10          0.010         100    0.8655       6.951658
4       20          0.001          10    0.9246      60.069152
5       20          0.001         100    0.8637      14.971083
6       20          0.010          10    0.9377      60.757613
7       20          0.010         100    0.8616      15.142627
8      100          0.001          10    0.9340     303.825330
9      100          0.001         100    0.8623      75.375756
10     100          0.010          10    0.9376     301.530252
11     100          0.010         100    0.8615      77.504213


### Interpretarea rezultatelor<br>
Comparand cu rezultatele obrinute de perceptron la tema anterioara, pot deduce ca undeva am facut o greseala. Acuratetea modelului acestuia ar fi trebuit sa o depaseasca pe cea a perceptronului, el nemaifiind liniar. 