In [225]:
import pickle
import os
import pandas as pd
import numpy as np
import torch

In [226]:
#TODO: 
# L1, L2 reg
# Droupout on hidden layer
# adaptive learning rate when metrics reach plateau
# data augmentation by cropping the photos and also rotating them by some small degree to obtain more synthetic data

In [227]:
train_file = "fii-nn-2025-homework-2/extended_mnist_train.pkl"
test_file = "fii-nn-2025-homework-2/extended_mnist_test.pkl"

with open(train_file, "rb") as fp:
    train = pickle.load(fp)

with open(test_file, "rb") as fp:
    test = pickle.load(fp)

train_data = []
train_labels = []
for image, label in train:
    train_data.append(image.flatten())
    train_labels.append(label)

test_data = []
for image, label in test:
    test_data.append(image.flatten())


In [228]:

shuffle_idx = np.random.permutation(len(train_data))
train_data = np.array(train_data)[shuffle_idx]
train_labels = np.array(train_labels)[shuffle_idx]

test_data = np.array(test_data)

print("Train samples: ", len(train_data))
print("Test samples: ", len(test_data))
print("Image shape: ", train_data[1].shape)

Train samples:  60000
Test samples:  10000
Image shape:  (784,)


In [229]:
def normalize(data):
    return data / 255.0

train_data = np.array(normalize(train_data)).astype(np.float64)
test_data = np.array(normalize(test_data)).astype(np.float64)

In [230]:
def xavier_init(n_inputs, n_neurons):
    return np.random.normal(0, np.sqrt(1 / n_inputs), (n_inputs, n_neurons)).astype(np.float64)

In [231]:
#Initialization
n_out_neurons = 10
n_hidden_neurons = 100
learning_rate = 0.1
learning_rate_decay = 0.001
step_size = 5
batch_size = 32
epochs = 30
n_inputs = train_data.shape[1]
weights = [xavier_init(n_inputs, n_hidden_neurons), xavier_init(n_hidden_neurons, n_out_neurons)]  # xavier initialization
biases = [np.zeros(100, dtype=np.float64), np.zeros(10, dtype=np.float64)]

print("Weights shape: ", [w.shape for w in weights])
print("Biases shape: ", [b.shape for b in biases])


Weights shape:  [(784, 100), (100, 10)]
Biases shape:  [(100,), (10,)]


In [None]:
def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))  # Numerical stability improvement
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)

def softmax_test(z):
    exp_z = np.exp(z)
    return exp_z / np.sum(exp_z)


def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def relu(x):
    return np.clip(x, min = 0)

def relu_deriv(x):
    return (x > 0).astype(np.float64)


In [233]:
def one_hot(batch, output_size):
    result = np.zeros((len(batch), output_size))
    for i, l in enumerate(batch):
        result[i, l] = 1
    return result

In [234]:
def cross_entropy(target, y):
    return -np.sum(target * np.log(y + 1e-10))


In [235]:
def split(data, batch_size):
    return np.array_split(data, len(data) / batch_size)

In [None]:
def forward(x, w, b):
    # (32, 784) @ (748, 100) = (32, 100)
    # (32, 100) @ (100, 10) = (32, 10)
    x_in = x
    layers = len(w)
    y_list = []
    #ReLU on the hidden layers
    for layer in range(layers - 1):
        #print(f"x_in: {x_in.shape}, w_layer{layer}: {w[layer].shape}")
        z = x_in @ w[layer] + b[layer]
        #print(f"z: {z.shape}")
        y = relu(z)
        y_list.append(y)
        #print(f"y: {y.shape}")
        x_in = y
    
    # Softmax on the last layer (output layer)
    z = x_in @ w[-1] + b[-1]
    y = softmax(z)
    y_list.append(y)
    #print(f"y: {y.shape}")
    return y_list

In [None]:
def gradient_descent(w, b, g):
    pass

def backward(x, y, target, w, b, layer, out_layer):
    # softmax on the final layer
    delta = y[-1] - target
    gradient_w = y[-2].T @ delta
    gradient_b = np.sum(delta, axis=0)

    w[-1] -= gradient_w * learning_rate
    b[-1] = gradient_b * learning_rate

    L = len(w)

    for i in range(L - 1, 0, -1):
        
    




In [238]:
def l1_l2_penalty(w, l1=0.0, l2=0.0):
    pass

In [None]:
# Batch Training
batches = split(train_data, batch_size)
label_batches = split(train_labels, batch_size)
label_batches = [one_hot(batch, n_out_neurons) for batch in label_batches]
print("Batch shape: ", batches[0].shape)
print("Batch label shape: ", label_batches[0].shape)
print("Weights shape: ", [w.shape for w in weights])
print("Biases shape: ", [b.shape for b in biases])
print("Starting training...")

for epoch in range(epochs):

    epoch_loss = np.float64(0.0)
    if epoch and epoch % step_size == 0:
        learning_rate = learning_rate - learning_rate_decay
        print("Lower lr: ", learning_rate)

    for x, target in zip(batches, label_batches):

        #forward
        # z = np.dot(x, weights) + biases
        # y = softmax(z)

        y_list = forward(x, weights, biases)

        loss = torch.nn.functional.cross_entropy(input = torch.tensor(y), target=torch.tensor(target))

        backward(x, y_list, target, weights, biases, layer = 1, out_layer = True)

        #backward
        # gradient_w = x.T @ (y - target)
        # gradient_b = np.sum(y - target, axis = 0)
        
        # weights -= gradient_w * learning_rate
        # biases -= gradient_b * learning_rate

        epoch_loss += loss
    
    print(f"Epoch {epoch + 1} completed. Loss: {epoch_loss / len(batches)}")

Batch shape:  (32, 784)
Batch label shape:  (32, 10)
Weights shape:  [(784, 100), (100, 10)]
Biases shape:  [(100,), (10,)]
Starting training...
x_in: (32, 784), w_layer0: (784, 100)
z: (32, 100)
y: (32, 100)
y: (32, 10)
x_in: (32, 784), w_layer0: (784, 100)
z: (32, 100)
y: (32, 100)
y: (32, 10)
x_in: (32, 784), w_layer0: (784, 100)
z: (32, 100)
y: (32, 100)
y: (32, 10)
x_in: (32, 784), w_layer0: (784, 100)
z: (32, 100)
y: (32, 100)
y: (32, 10)
x_in: (32, 784), w_layer0: (784, 100)
z: (32, 100)
y: (32, 100)
y: (32, 10)
x_in: (32, 784), w_layer0: (784, 100)
z: (32, 100)
y: (32, 100)
y: (32, 10)
x_in: (32, 784), w_layer0: (784, 100)
z: (32, 100)
y: (32, 100)
y: (32, 10)
x_in: (32, 784), w_layer0: (784, 100)
z: (32, 100)
y: (32, 100)
y: (32, 10)
x_in: (32, 784), w_layer0: (784, 100)
z: (32, 100)
y: (32, 100)
y: (32, 10)
x_in: (32, 784), w_layer0: (784, 100)
z: (32, 100)
y: (32, 100)
y: (32, 10)
x_in: (32, 784), w_layer0: (784, 100)
z: (32, 100)
y: (32, 100)
y: (32, 10)
x_in: (32, 784), w_

KeyboardInterrupt: 

In [None]:
def test_network(test_data, weights, biases):
    results = []
    for sample_v in test_data:
        predictions = np.array([])
        for i in range(n_out_neurons):
            prediction = np.dot(weights[:, i], sample_v) + biases[i]
            predictions = np.append(predictions, prediction)
        softmax_preds = softmax_test(predictions)
        argmax_pred = np.argmax(softmax_preds)
        results.append(argmax_pred)
    
    return results

In [None]:
predictions_csv = {
    "ID": [],
    "target": [],
}

predictions = test_network(test_data, weights, biases)

for i, label in enumerate(predictions):
    predictions_csv["ID"].append(i)
    predictions_csv["target"].append(label)

df = pd.DataFrame(predictions_csv)
df.to_csv("submission.csv", index=False)

TypeError: list indices must be integers or slices, not tuple