In [61]:
import numpy as np
import pickle
import pandas as pd
import time

np.random.seed(42)
print("✓ Imports complete")

✓ Imports complete


In [62]:
# Load data
train_file = "./input/extended_mnist_train.pkl"
test_file = "./input/extended_mnist_test.pkl"

with open(train_file, "rb") as fp:
    train = pickle.load(fp)
with open(test_file, "rb") as fp:
    test = pickle.load(fp)


In [63]:
# Preprocess
train_data = []
train_labels = []
for image, label in train:
    train_data.append(image.flatten() / 255.0)
    train_labels.append(label)

test_data = []
for image, label in test:
    test_data.append(image.flatten() / 255.0)

X_train_full = np.array(train_data)
y_train_full = np.array(train_labels)
X_test = np.array(test_data)

# Split
split = int(0.9 * len(X_train_full))
X_train = X_train_full[:split]
y_train = y_train_full[:split]
X_val = X_train_full[split:]
y_val = y_train_full[split:]

In [64]:
# Helper functions
def sigmoid(z):
    z = np.clip(z, -500, 500)
    return 1.0 / (1.0 + np.exp(-z))

def sigmoid_derivative(a):
    return a * (1.0 - a)

def one_hot_encode(y, n_classes=10):
    n = len(y)
    one_hot = np.zeros((n, n_classes))
    one_hot[np.arange(n), y] = 1
    return one_hot

In [65]:
def initialize_weights(input_size, hidden_size, output_size):
    W1 = np.random.randn(input_size, hidden_size) * np.sqrt(2.0 / input_size)
    b1 = np.zeros((1, hidden_size))
    W2 = np.random.randn(hidden_size, output_size) * np.sqrt(2.0 / hidden_size)
    b2 = np.zeros((1, output_size))
    return W1, b1, W2, b2

def forward(X, W1, b1, W2, b2, dropout_rate=0, training=True):
    z1 = np.dot(X, W1) + b1
    a1 = sigmoid(z1)
    
    dropout_mask = None
    if training and dropout_rate > 0:
        dropout_mask = (np.random.rand(*a1.shape) > dropout_rate) / (1 - dropout_rate)
        a1 = a1 * dropout_mask
    
    z2 = np.dot(a1, W2) + b2
    a2 = sigmoid(z2)
    
    return z1, a1, z2, a2, dropout_mask

def backward(X, y, z1, a1, z2, a2, W2, dropout_mask, l2_lambda=0.0001):
    m = X.shape[0]
    y_one_hot = one_hot_encode(y, 10)
    
    delta2 = a2 - y_one_hot
    dW2 = np.dot(a1.T, delta2) / m + l2_lambda * W2
    db2 = np.sum(delta2, axis=0, keepdims=True) / m
    
    delta1 = np.dot(delta2, W2.T) * sigmoid_derivative(a1)
    if dropout_mask is not None:
        delta1 = delta1 * dropout_mask
    
    dW1 = np.dot(X.T, delta1) / m
    db1 = np.sum(delta1, axis=0, keepdims=True) / m
    
    return dW1, db1, dW2, db2

def compute_accuracy(X, y, W1, b1, W2, b2):
    _, _, _, a2, _ = forward(X, W1, b1, W2, b2, dropout_rate=0, training=False)
    predictions = np.argmax(a2, axis=1)
    return np.mean(predictions == y)

In [None]:
HIDDEN_SIZE = 120
EPOCHS = 60
BATCH_SIZE = 128     
LEARNING_RATE = 0.5
DROPOUT_RATE = 0.03
L2_LAMBDA = 0.001     


In [None]:
# Initialize
W1, b1, W2, b2 = initialize_weights(784, HIDDEN_SIZE, 10)

n_samples = len(X_train)
n_batches = n_samples // BATCH_SIZE

# Training
start_time = time.time()
lr = LEARNING_RATE
best_val_acc = 0
patience_counter = 0
for epoch in range(EPOCHS):
    idx = np.random.permutation(n_samples)
    X_shuffled = X_train[idx]
    y_shuffled = y_train[idx]
    
    for i in range(n_batches):
        start = i * BATCH_SIZE
        end = start + BATCH_SIZE
        
        X_batch = X_shuffled[start:end]
        y_batch = y_shuffled[start:end]
        
        z1, a1, z2, a2, dropout_mask = forward(
            X_batch, W1, b1, W2, b2, 
            dropout_rate=DROPOUT_RATE, training=True
        )
        
        dW1, db1, dW2, db2 = backward(
            X_batch, y_batch, z1, a1, z2, a2, W2, dropout_mask, L2_LAMBDA
        )
        
        W1 -= lr * dW1
        b1 -= lr * db1
        W2 -= lr * dW2
        b2 -= lr * db2
    
    if epoch % 10 == 0 or epoch == EPOCHS - 1:
        train_acc = compute_accuracy(X_train, y_train, W1, b1, W2, b2)
        val_acc = compute_accuracy(X_val, y_val, W1, b1, W2, b2)
        
        elapsed = time.time() - start_time
        print(f"epoca {epoch+1:3d}/{EPOCHS} "
              f"training acc: {train_acc:.4f} val: {val_acc:.4f} | "
              f"lr: {lr:.4f}")
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= 3:
                lr *= 0.7
                patience_counter = 0
                print(f"Learning rate redus la {lr:.6f}")

epoca   1/60 training acc: 0.9019 val: 0.9233 | lr: 0.5000
epoca  11/60 training acc: 0.9565 val: 0.9652 | lr: 0.5000
epoca  21/60 training acc: 0.9672 val: 0.9712 | lr: 0.5000
epoca  31/60 training acc: 0.9720 val: 0.9730 | lr: 0.5000
epoca  41/60 training acc: 0.9755 val: 0.9740 | lr: 0.5000
epoca  51/60 training acc: 0.9776 val: 0.9755 | lr: 0.5000
epoca  60/60 training acc: 0.9786 val: 0.9755 | lr: 0.5000


In [68]:
# Final metrics
train_acc_final = compute_accuracy(X_train_full, y_train_full, W1, b1, W2, b2)
val_acc_final = compute_accuracy(X_val, y_val, W1, b1, W2, b2)
print(f"train acc: {train_acc_final*100:.2f}%")
print(f"vall acc: {val_acc_final*100:.2f}%")
print(f"training time {training_time:.2f}s ({training_time/60:.2f}min)")


train acc: 97.83%
vall acc: 97.55%
training time 75.52s (1.26min)


In [69]:
# Create submission
submission = pd.DataFrame({
    'ID': range(len(predictions)),
    'target': predictions.astype(int)
})

submission.to_csv('submission.csv', index=False)
print("submission done")

submission done
