In [29]:
import numpy as np
import pickle
import pandas as pd
import time

np.random.seed(74)
print("✓ Imports complete")

✓ Imports complete


In [None]:
# Load data
train_file = "../input/extended_mnist_train.pkl"
test_file = "../input/extended_mnist_test.pkl"

with open(train_file, "rb") as fp:
    train = pickle.load(fp)
with open(test_file, "rb") as fp:
    test = pickle.load(fp)


In [31]:
# Preprocess
train_data = []
train_labels = []
for image, label in train:
    train_data.append(image.flatten() / 255.0)
    train_labels.append(label)

test_data = []
for image, label in test:
    test_data.append(image.flatten() / 255.0)

X_train_full = np.array(train_data)
y_train_full = np.array(train_labels)
X_test = np.array(test_data)

# Split
split = int(0.9 * len(X_train_full))
X_train = X_train_full[:split]
y_train = y_train_full[:split]
X_val = X_train_full[split:]
y_val = y_train_full[split:]

In [32]:
def sigmoid(z):
    z = np.clip(z, -500, 500)
    return 1.0 / (1.0 + np.exp(-z))

def sigmoid_derivat(a):
    return a * (1.0 - a)

def one_hot_encode(y, n_classes=10):
    n = len(y)
    one_hot = np.zeros((n, n_classes))
    one_hot[np.arange(n), y] = 1
    return one_hot

def softmax(z):
    z = z - np.max(z, axis=1, keepdims=True)
    exp_z = np.exp(z)
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)

def cross_entropy_loss(a2, y, W1, W2, l2_lambda=0.0):
    m = a2.shape[0]
    y_encoded = one_hot_encode(y, a2.shape[1])
    eps = 1e-12
    loss = -np.sum(y_encoded * np.log(a2 + eps)) / m

    if l2_lambda and l2_lambda > 0:
        loss += 0.5 * l2_lambda * (np.sum(W1**2) + np.sum(W2**2))
    return loss

In [None]:
def initialize_weights(input_size, hidden_size, output_size):
    
    W1 = np.random.randn(input_size, hidden_size) * np.sqrt(1.0 / input_size)
    b1 = np.zeros((1, hidden_size))
    W2 = np.random.randn(hidden_size, output_size) * np.sqrt(1.0 / hidden_size)
    b2 = np.zeros((1, output_size))
    return W1, b1, W2, b2

def forward(X, W1, b1, W2, b2, dropout_rate=0, training=True):
    z1 = np.dot(X, W1) + b1
    a1 = sigmoid(z1)
    

    #aplicam un dropout pe stratu 1
    dropout_mask = None
    if training and dropout_rate > 0: # aici verificam daca suntem in starea de training 
        dropout_mask = (np.random.rand(*a1.shape) > dropout_rate) / (1 - dropout_rate)
        a1 = a1 * dropout_mask
    

    #calculeaza output ul de pe stratul 2
    z2 = np.dot(a1, W2) + b2
    a2 = softmax(z2)
    
    return z1, a1, z2, a2, dropout_mask

def backward(X, y, z1, a1, z2, a2, W1, W2, dropout_mask, l2_lambda=0.0001):
    m = X.shape[0]
    y_encoded = one_hot_encode(y, a2.shape[1])
    
    delta2 = a2 - y_encoded #diferenta dintre ce am prezis si ce trebuia prezis

    #regularizarea l2 pentru gradienti folosind stratu 2
    dW2 = np.dot(a1.T, delta2) / m + l2_lambda * W2
    db2 = np.sum(delta2, axis=0, keepdims=True) / m
    

    #propagam eroarea la primul strat 
    delta1 = np.dot(delta2, W2.T) * sigmoid_derivat(a1)
    if dropout_mask is not None:
        delta1 = delta1 * dropout_mask #aplicand si dropout ul

    #same si aici regularizarea l2 folosind stratu 1
    dW1 = np.dot(X.T, delta1) / m + l2_lambda * W1
    db1 = np.sum(delta1, axis=0, keepdims=True) / m
    
    return dW1, db1, dW2, db2

def compute_accuracy(X, y, W1, b1, W2, b2):
    _, _, _, a2, _ = forward(X, W1, b1, W2, b2, dropout_rate=0, training=False)
    predictions = np.argmax(a2, axis=1)
    return np.mean(predictions == y)

In [34]:
#hyperparametri
HIDDEN_SIZE = 100  # conform cerintei
EPOCHS = 60
BATCH_SIZE = 128
LEARNING_RATE = 0.01  
DROPOUT_RATE = 0.3   # 30% din neuroni sunt inchisi 
L2_LAMBDA = 0.0001   #

In [None]:
# Initialize
W1, b1, W2, b2 = initialize_weights(784, HIDDEN_SIZE, 10)

n_samples = len(X_train)
n_batches = (n_samples + BATCH_SIZE - 1) // BATCH_SIZE  # handle last batch

# Training
print(f"incepem training cu {n_samples} si folosim {BATCH_SIZE} per epoca")
print(f"784 (28x28) -> {HIDDEN_SIZE} (marimea hidden layer ului -> 10")
print(f"folosim pentur regularizare : dropout={DROPOUT_RATE} si L2={L2_LAMBDA}")

start_time = time.time()
lr = LEARNING_RATE
best_val_acc = 0
k_ast = 0 # de cate ori schimbam learning rate ul 
best_weights = None

for epoch in range(EPOCHS):
    # dam  un mic shuffle
    idx = np.random.permutation(n_samples)
    X_shuffled = X_train[idx]
    y_shuffled = y_train[idx]
    
    epoch_loss = 0

    #training looo00p
    for i in range(n_batches):
        inceput = i * BATCH_SIZE
        sfarsit = min(inceput + BATCH_SIZE, n_samples)
        
        X_batch = X_shuffled[inceput:sfarsit]
        y_batch = y_shuffled[inceput:sfarsit]
        
        #faza de ghicire
        z1, a1, z2, a2, dropout_mask = forward(
            X_batch, W1, b1, W2, b2, 
            dropout_rate=DROPOUT_RATE, training=True
        )
        
        # calculam lossul pe training
        batch_loss = cross_entropy_loss(a2, y_batch, W1, W2, L2_LAMBDA)
        epoch_loss += batch_loss * len(X_batch)
        
        # faza de invatare
        dW1, db1, dW2, db2 = backward(
            X_batch, y_batch, z1, a1, z2, a2, W1, W2, dropout_mask, L2_LAMBDA
        )
        
        # calculam noile weights 
        W1 -= lr * dW1
        b1 -= lr * db1
        W2 -= lr * dW2
        b2 -= lr * db2
    
    # avg loss pt fiecare epoca
    epoch_loss /= n_samples
    
    # raportam ce avem odata la 5 epoci
    if epoch % 5 == 0 or epoch == EPOCHS - 1:
        train_acc = compute_accuracy(X_train, y_train, W1, b1, W2, b2)
        val_acc = compute_accuracy(X_val, y_val, W1, b1, W2, b2)
        
        
        _, _, _, a2_val, _ = forward(X_val, W1, b1, W2, b2, dropout_rate=0, training=False)
        val_loss = cross_entropy_loss(a2_val, y_val, W1, W2, L2_LAMBDA)
        
        elapsed = time.time() - start_time
        print(f"epoca {epoch+1:3d}/{EPOCHS} | "
              f"train acc: {train_acc:.4f}  val acc: {val_acc:.4f} | "
              f"train loss: {epoch_loss:.4f}  val loss: {val_loss:.4f} | "
              f"lr: {lr:.4f}  "
              f"time: {elapsed:.1f}s")
        
        #verificam daca, cu lr-ul actual modelul s-a imbunatatit sau nu
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_weights = (W1.copy(), b1.copy(), W2.copy(), b2.copy())
            print(f"noua acuratete de validare: {best_val_acc:.4f}")
            k_ast = 0 # daca s-a imbunatatit, resetam contorul
        else:
            k_ast += 1 #altfel crestem contorul 
            if k_ast >= 3: #si aplicam lr decay
                lr *= 0.7 #reducem cu 30%
                k_ast = 0 #resetam contorul pt urmatorul lr 
                print(f"reducem lr la  {lr:.6f}")


#alegem cele mai bune weighturi pt modelul nostru nou 
if best_weights is not None:
    W1, b1, W2, b2 = best_weights
    print("am ales cele mai bune weighturi")

incepem training cu 54000 si folosim 128 per epoca
784 (28x28) -> 100 (marimea hidden layer ului -> 10
folosim pentur regularizare : dropout=0.3 si L2=0.0001
epoca   1/60 | train acc: 0.6178  val acc: 0.6472 | train loss: 2.2438  val loss: 2.0715 | lr: 0.0100  time: 1.2s
noua acuratete de validare: 0.6472
epoca   6/60 | train acc: 0.8040  val acc: 0.8428 | train loss: 1.1823  val loss: 1.0075 | lr: 0.0100  time: 6.3s
noua acuratete de validare: 0.8428
epoca  11/60 | train acc: 0.8508  val acc: 0.8850 | train loss: 0.8054  val loss: 0.6458 | lr: 0.0100  time: 11.4s
noua acuratete de validare: 0.8850
epoca  16/60 | train acc: 0.8701  val acc: 0.8998 | train loss: 0.6575  val loss: 0.5031 | lr: 0.0100  time: 16.5s
noua acuratete de validare: 0.8998
epoca  21/60 | train acc: 0.8815  val acc: 0.9070 | train loss: 0.5787  val loss: 0.4298 | lr: 0.0100  time: 21.6s
noua acuratete de validare: 0.9070
epoca  26/60 | train acc: 0.8882  val acc: 0.9118 | train loss: 0.5296  val loss: 0.3861 | lr:

In [36]:

training_time = time.time() - start_time

train_acc_final = compute_accuracy(X_train_full, y_train_full, W1, b1, W2, b2)
val_acc_final = compute_accuracy(X_val, y_val, W1, b1, W2, b2)

_, _, _, a2_test, _ = forward(X_test, W1, b1, W2, b2, dropout_rate=0, training=False)
predictions = np.argmax(a2_test, axis=1)

_, _, _, a2_train, _ = forward(X_train_full, W1, b1, W2, b2, dropout_rate=0, training=False)
_, _, _, a2_val, _ = forward(X_val, W1, b1, W2, b2, dropout_rate=0, training=False)
train_loss = cross_entropy_loss(a2_train, y_train_full, W1, W2, L2_LAMBDA)
val_loss = cross_entropy_loss(a2_val, y_val, W1, W2, L2_LAMBDA)

print(f"rez")
print(f"acuratetea training: {train_acc_final*100:.2f}%")
print(f"acuratetea de validare modelului: {val_acc_final*100:.2f}%")
print(f"training loss: {train_loss:.4f}")
print(f"validation loss loss: {val_loss:.4f}")
print(f"timp: {training_time:.2f}s ({training_time/60:.2f}min)")

rez
acuratetea training: 90.99%
acuratetea de validare modelului: 92.53%
training loss: 0.3379
validation loss loss: 0.2844
timp: 62.00s (1.03min)


In [37]:
# Create submission
submission = pd.DataFrame({
    'ID': range(len(predictions)),
    'target': predictions.astype(int)
})

submission.to_csv('submission.csv', index=False)
print("submission done")

submission done
