In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
from scipy import ndimage

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every
# file found, so the dataset location can be confirmed before loading.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Chinese MNIST: Two‑Layer MLP with Mini‑Batch Adam + Augmentation + Early‑Stopping
## All using numpy
In this notebook we will:
*  Load and split the data into train / validation / test.
*  Define helper functions (`iterate_minibatches`, `augment_batch_shifts`, `show_random_prediction`, `predict_random_samples`).
*  Implement a mini‑batch Adam optimizer with on‑the‑fly random shifts and early stopping.
*  Train the model and track train/test accuracy.
*  Evaluate on the held‑out test set.
* Visualize a few random predictions.

# Load & preprocess data

In [None]:
# 1.1 Read CSV into NumPy array
df = pd.read_csv('/kaggle/input/chinese-mnist-digit-recognizer/chineseMNIST.csv')
# Later cells index columns 0-4095 as pixels and 4096/4097 as the two label columns.
raw = df.values              # shape should be (m, 4098)
m, n = raw.shape             # m rows (samples), n columns
# Last expression of the cell: display the first rows as a sanity check.
df.head()

In [None]:
# 1.2 Build label mappings
# Column 4096 is read as an integer label (presumably the digit value) and
# column 4097 as the character label.
digit_labels = raw[:, 4096].astype(int)
char_labels  = raw[:, 4097].astype(str)
# Sort the distinct characters so the character -> class-index assignment is deterministic.
unique_chars = sorted(set(char_labels))
char_to_idx  = { ch:i for i,ch in enumerate(unique_chars) }
# Inverse mapping, used to turn predicted class indices back into characters.
idx_to_char  = { i:ch for ch,i in char_to_idx.items() }
# Number of classes; used as the network's output size.
N_CHARS      = len(unique_chars)

print(f"Found {N_CHARS} distinct Chinese symbols: {unique_chars}")

In [None]:
# 1.3 Three‑way split
def load_and_split_three(raw_data, test_size=1000, val_size=1000, seed=42):
    """
    Shuffle the samples once and carve them into test, validation and train sets.

    raw_data:  NumPy array of shape (m, 4098); columns 0-4095 are pixels,
               column 4096 the integer label, column 4097 the character label
    test_size: how many shuffled samples go to the test set (taken first)
    val_size:  how many of the following samples go to the validation set
    seed:      seed for the NumPy Generator that produces the shuffle

    Character labels are converted to class indices through the module-level
    `char_to_idx` mapping, which must already be defined.

    Returns three (X, Y_digit, Y_char) triples in the order train, validation,
    test. Each X has shape (4096, n_samples), i.e. one sample per column.
    """
    # One column per sample, so the transposed pixel matrix is (4096, m).
    features = raw_data[:, :4096].astype(float).T
    digit_targets = raw_data[:, 4096].astype(int)
    char_targets = np.array(
        [char_to_idx[symbol] for symbol in raw_data[:, 4097]], dtype=int
    )

    # A single seeded permutation defines all three partitions.
    order = np.random.default_rng(seed).permutation(features.shape[1])
    held_out = test_size + val_size
    test_sel = order[:test_size]
    val_sel = order[test_size:held_out]
    train_sel = order[held_out:]

    def _subset(selection):
        # Pick the same sample positions from the pixels and both label vectors.
        return features[:, selection], digit_targets[selection], char_targets[selection]

    return _subset(train_sel), _subset(val_sel), _subset(test_sel)

# Split once with the function's default seed so the partition is the same on every run.
(train_set, val_set, test_set) = load_and_split_three(raw, test_size=1000, val_size=1000)
# Unpack each (pixels, digit labels, character labels) triple into the names used by later cells.
X_train, Yd_train, Yc_train = train_set
X_val,   Yd_val,   Yc_val   = val_set
X_test,  Yd_test,  Yc_test  = test_set

# Sanity check: X arrays are (n_pixels, n_samples); label vectors are (n_samples,).
print("X_train:", X_train.shape, "Yc_train:", Yc_train.shape)
print("X_val:  ", X_val.shape,   "Yc_val:  ", Yc_val.shape)
print("X_test: ", X_test.shape,  "Yc_test: ", Yc_test.shape)


## Helper functions

Below are our mini‑batch iterator, augmentation function, and a quick way to visualize a random test prediction.


In [None]:
# helper functions

# function to iterate through mini-batches to help with over-fitting
def iterate_minibatches(X, Y, batch_size=64):
    """
    Yield (X_batch, Y_batch) pairs covering every sample once, in random order.

    - X: numpy array of shape (n_x, m), one sample per column
    - Y: numpy array of shape (m,)
    - batch_size: samples per batch; the final batch may be smaller

    A fresh permutation is drawn from NumPy's global random state each time
    the generator is started.
    """
    order = np.random.permutation(X.shape[1])
    for start in range(0, order.size, batch_size):
        stop = start + batch_size
        chosen = order[start:stop]
        yield X[:, chosen], Y[chosen]

# function to shift pictures to help with over-fitting 
def augment_batch_shifts(Xb, max_shift=3):
    """
    Randomly translate every image in a batch (with wrap-around).

    Xb:        array of shape (n_x, batch_size); each column is a flattened
               square image (e.g. 4096 = 64×64)
    max_shift: maximum shift in pixels; each image is rolled independently by
               an integer in [-max_shift, max_shift] along both image axes

    Returns a new array of the same shape. The input batch is never modified.

    Raises ValueError if n_x is not a perfect square.
    """
    n_x, m = Xb.shape
    side = int(round(np.sqrt(n_x)))
    if side * side != n_x:
        raise ValueError(f"expected flattened square images, got {n_x} pixels per sample")

    # Work on a copy: reshaping a contiguous array returns a view, so writing
    # the rolled images into it would silently shift the caller's data too.
    imgs = Xb.copy().reshape(side, side, m)
    for i in range(m):
        dx = np.random.randint(-max_shift, max_shift + 1)
        dy = np.random.randint(-max_shift, max_shift + 1)
        # np.roll wraps pixels around the border instead of padding.
        imgs[:, :, i] = np.roll(imgs[:, :, i], shift=(dx, dy), axis=(0, 1))
    return imgs.reshape(n_x, m)

# function to show what the current model parameters would predict
def show_random_prediction(W1, b1, W2, b2, X, Y, idx_to_char, forward_prop):
    """
    Pick one random sample from (X, Y), classify it, and display the result.

    W1, b1, W2, b2: network parameters passed straight to `forward_prop`
    X:              array of shape (n_x, m), one flattened square image per column
    Y:              array of shape (m,) with the true class index of each sample
    idx_to_char:    mapping from class index to the character it represents
    forward_prop:   callable(W1, b1, W2, b2, X) returning (Z1, A1, Z2, A2),
                    where A2 holds the class probabilities

    Prints the sample index, the class probabilities, and the predicted and
    true characters, then plots the image. Returns nothing.
    """
    # Choose a random sample index
    idx = np.random.randint(0, X.shape[1])
    image_vec = X[:, idx]
    # Derive the image side from the vector length instead of assuming 64×64.
    side = int(np.sqrt(image_vec.size))
    image = image_vec.reshape(side, side)

    # Forward pass for this single example, as a column vector of shape (n_x, 1)
    _, _, _, A2 = forward_prop(
        W1, b1, W2, b2,
        image_vec.reshape(-1, 1)
    )
    probs = A2.flatten()

    # Determine predicted and true labels
    pred_idx = np.argmax(probs)
    pred_char = idx_to_char[pred_idx]
    true_char = idx_to_char[Y[idx]]

    # Print out details
    print(f"Sample index: {idx}")
    print("Class probabilities:", np.round(probs, 3))
    print(f"Predicted → char: {pred_char}")
    print(f"True      → char: {true_char}")

    # Plot the image
    plt.figure(figsize=(4, 4))
    plt.imshow(image, cmap='gray')
    plt.axis('off')
    plt.show()

# function I use to get prediction samples with the validation set
def predict_random_samples(
    W1, b1, W2, b2,
    X_val, Yc_val, Yd_val=None,
    num_samples=8
):
    """
    Show up to `num_samples` random images from (X_val, Yc_val) in one row,
    each titled with its true (T) and predicted (P) class index.

    W1, b1, W2, b2: network parameters passed to the module-level `forward_prop`
    X_val:          array of shape (n_x, m), one flattened square image per column
    Yc_val:         array of shape (m,) with the true class indices
    Yd_val:         accepted for call compatibility; not used by this function
    num_samples:    how many distinct samples to draw; capped at the number of
                    available samples, because sampling without replacement
                    cannot return more items than exist
    """
    m_val, n_val = X_val.shape[1], X_val.shape[0]
    side = int(np.sqrt(n_val))                   #  sqrt(4096) = 64

    # Sampling without replacement raises if more samples are requested than exist.
    num_samples = min(num_samples, m_val)
    if num_samples <= 0:
        return
    idxs = np.random.choice(m_val, size=num_samples, replace=False)

    # Only the displayed samples need a forward pass.
    _, _, _, A2_sel = forward_prop(W1, b1, W2, b2, X_val[:, idxs])
    preds_c = np.argmax(A2_sel, axis=0)

    plt.figure(figsize=(num_samples*1.5, 2))
    for i, idx in enumerate(idxs):
        img = X_val[:, idx].reshape(side, side)
        true_c = Yc_val[idx]
        pred_c = preds_c[i]

        ax = plt.subplot(1, num_samples, i+1)
        ax.imshow(img, cmap='gray')
        ax.axis('off')
        ax.set_title(f"T:{true_c}\nP:{pred_c}", fontsize=8)
    plt.tight_layout()
    plt.show()


## Model Architecture

* Input layer: 4096 neurons (64×64 pixel values)

* Hidden layer: 128 neurons, ReLU activation

* Output layer: 15 neurons, Softmax activation (amount of characters)

In [None]:
# Layer sizes for the neural network: input pixels, hidden units, and one
# output unit per distinct character (N_CHARS is computed in the label-mapping cell).
INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE = 4096, 128, N_CHARS
 
def init_params():
    """
    Create fresh network parameters.

    Weights are drawn from a standard normal (NumPy global random state) and
    scaled by sqrt(2 / fan_in); biases start at zero. Layer sizes come from
    the module-level INPUT_SIZE, HIDDEN_SIZE and OUTPUT_SIZE constants.

    Returns (W1, b1, W2, b2) with shapes (hidden, input), (hidden, 1),
    (output, hidden) and (output, 1).
    """
    hidden_scale = np.sqrt(2.0/INPUT_SIZE)
    output_scale = np.sqrt(2.0/HIDDEN_SIZE)

    # W1 is drawn before W2 so the random stream is consumed in a fixed order.
    hidden_weights = hidden_scale * np.random.randn(HIDDEN_SIZE, INPUT_SIZE)
    output_weights = output_scale * np.random.randn(OUTPUT_SIZE, HIDDEN_SIZE)

    hidden_bias = np.zeros((HIDDEN_SIZE, 1))
    output_bias = np.zeros((OUTPUT_SIZE, 1))
    return hidden_weights, hidden_bias, output_weights, output_bias

def ReLu(Z, derv=False):
    """
    Rectified linear unit, applied element-wise.

    With derv=False (default) returns max(Z, 0).
    With derv=True returns the derivative instead: 1 where Z > 0, else 0.
    """
    if not derv:
        return np.maximum(Z, 0)
    return np.where(Z > 0, 1, 0)

def softmax(Z):
    """
    Column-wise softmax: each column of Z (one sample's logits) is turned into
    a probability distribution that sums to 1.
    """
    # Subtracting each column's maximum keeps exp() from overflowing and does
    # not change the result.
    exps = np.exp(Z - np.max(Z, axis=0, keepdims=True))
    totals = np.sum(exps, axis=0, keepdims=True)
    return exps / totals

def forward_prop(W1, b1, W2, b2, X):
    """
    Run the two-layer network forward on X (one sample per column).

    Returns (Z1, A1, Z2, A2): the hidden pre-activation, the hidden ReLU
    activation, the output logits, and the softmax class probabilities.
    """
    hidden_pre = np.dot(W1, X) + b1
    hidden_act = ReLu(hidden_pre)
    logits = np.dot(W2, hidden_act) + b2
    probabilities = softmax(logits)
    return hidden_pre, hidden_act, logits, probabilities

def one_hot(Y, n_classes):
    """
    One-hot encode integer labels.

    Y:         integer array of class indices with m elements
    n_classes: number of rows in the result

    Returns a float array of shape (n_classes, m) whose column j has a single
    1 in row Y[j].
    """
    count = Y.size
    columns = np.arange(count)
    encoded = np.zeros((n_classes, count))
    encoded[Y, columns] = 1
    return encoded

def deriv_ReLu(Z):
    """Return the element-wise ReLU derivative as a boolean mask (True where Z > 0)."""
    positive_mask = Z > 0
    return positive_mask
    
def back_prop(W2, Z1, A1, Z2, A2, X, Y):
    """
    Compute the batch-averaged gradients of the two-layer network.

    W2:         output-layer weights
    Z1, A1:     hidden pre-activation and activation from the forward pass
    Z2:         output logits (accepted for signature symmetry; not used)
    A2:         softmax probabilities, shape (n_classes, m)
    X:          input batch, one sample per column
    Y:          integer class labels, shape (m,)

    Returns (dW1, db1, dW2, db2).
    """
    scale = 1 / Y.size
    targets = one_hot(Y, A2.shape[0])

    # Output-layer error: predicted probabilities minus one-hot targets.
    out_err = A2 - targets
    # Propagate through W2 and gate by the ReLU derivative (1 where Z1 > 0).
    hid_err = W2.T.dot(out_err) * (Z1 > 0)

    grad_W2 = scale * out_err.dot(A1.T)
    grad_b2 = scale * np.sum(out_err, axis=1, keepdims=True)
    grad_W1 = scale * hid_err.dot(X.T)
    grad_b1 = scale * np.sum(hid_err, axis=1, keepdims=True)

    return grad_W1, grad_b1, grad_W2, grad_b2

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    """
    Apply one plain gradient-descent step: param -= alpha * gradient.

    The subtraction uses the in-place operator, so NumPy arrays passed in are
    modified directly. Returns the updated (W1, b1, W2, b2) as a tuple.
    """
    current = [W1, b1, W2, b2]
    gradients = (dW1, db1, dW2, db2)
    for position, gradient in enumerate(gradients):
        current[position] -= alpha * gradient
    return tuple(current)

#  Implementation Details

### Key components:

* init_params(): Initialization for weights, zeros for biases

* forward_prop(): Computes layer outputs and activations

* ReLu(): Applies the rectified linear unit activation function

* softmax(): Converts logits into probability distributions over classes

* back_prop(): Derives gradients for weights and biases

* update_params(): Applies gradient descent updates

In [None]:
# Normal gradient descent
def gradient_descent(X, Y, iterations, alpha):
    """
    Train the network with full-batch gradient descent.

    X:          training inputs, one sample per column
    Y:          integer class labels, shape (m,)
    iterations: number of full-batch update steps
    alpha:      learning rate

    Prints training accuracy every 50 iterations, measured on the
    probabilities computed before that iteration's update.
    Returns the trained (W1, b1, W2, b2).
    """
    params = init_params()
    for step in range(iterations):
        Z1, A1, Z2, A2 = forward_prop(*params, X)
        # params[2] is W2, which back_prop needs to propagate the error.
        grads = back_prop(params[2], Z1, A1, Z2, A2, X, Y)
        params = update_params(*params, *grads, alpha)
        if step % 50 == 0:
            acc = np.mean(np.argmax(A2, axis=0) == Y)
            print(f"Iteration {step:4d} — accuracy with train data: {acc*100:5.2f}%")
    W1, b1, W2, b2 = params
    return W1, b1, W2, b2

In [None]:
# Train with plain full-batch gradient descent on the character labels
W1, b1, W2, b2 = gradient_descent(X_train, Yc_train,
                                  iterations=500,
                                  alpha=0.05)

# Final evaluation on test set: the predicted class is the row with the highest probability
_, _, _, A2_test = forward_prop(W1, b1, W2, b2, X_test)
preds = np.argmax(A2_test, axis=0)
test_acc = np.mean(preds == Yc_test)
print(f"Test accuracy: {test_acc*100:.2f}%")

This is a simple two-layer neural network. Its accuracy ranges from about 30% to 45% after 500 iterations on the Chinese MNIST data set; the same model on the English MNIST data set would reach around 80%. I want to improve upon this model and get it to at least 85% accuracy.

In [None]:
# Run this cell to see the current model in action on one random test sample.
# The model will have poor performance at this stage.
show_random_prediction(
    W1, b1, W2, b2,
    X_test, Yc_test,
    idx_to_char,
    forward_prop
)

In [None]:
# Adam optimization for 2‐layer network
def gradient_descent_adam(
    X_train, Y_train,
    X_test,   Y_test,
    iterations=500,
    alpha=0.001,
    beta1=0.9, beta2=0.999,
    epsilon=1e-8,
    print_every=50
):
    """
    Full-batch training with the Adam update rule, logging accuracy on both
    the training data and a held-out set.

      - X_train, Y_train: training inputs (one sample per column) and labels
      - X_test,  Y_test:  held-out data used only for the accuracy log
      - iterations:       number of full-batch update steps
      - alpha:            learning rate
      - beta1, beta2:     decay rates of the first and second moment averages
      - epsilon:          small constant added to the denominator
      - print_every:      log every N iterations (iteration 1 is always logged)

    Returns the trained (W1, b1, W2, b2).
    """
    # Parameters are kept in a list ordered [W1, b1, W2, b2], matching the
    # order of the gradients returned by back_prop.
    params = list(init_params())
    first_moments = [np.zeros_like(p) for p in params]
    second_moments = [np.zeros_like(p) for p in params]

    for t in range(1, iterations + 1):
        # Forward + backward on the whole training set
        Z1, A1, Z2, A2 = forward_prop(*params, X_train)
        grads = back_prop(params[2], Z1, A1, Z2, A2, X_train, Y_train)

        for k, grad in enumerate(grads):
            # Exponential moving averages of the gradient and its square
            first_moments[k] = beta1*first_moments[k] + (1-beta1)*grad
            second_moments[k] = beta2*second_moments[k] + (1-beta2)*(grad**2)

            # Bias correction compensates for the zero initialisation
            m_hat = first_moments[k] / (1 - beta1**t)
            v_hat = second_moments[k] / (1 - beta2**t)

            # In-place parameter update
            params[k] -= alpha * m_hat / (np.sqrt(v_hat) + epsilon)

        if t % print_every == 0 or t == 1:
            # Train accuracy reuses A2 from before this step's update;
            # held-out accuracy uses the freshly updated parameters.
            train_acc = np.mean(np.argmax(A2,axis=0) == Y_train)*100
            _, _, _, A2_test = forward_prop(*params, X_test)
            test_acc = np.mean(np.argmax(A2_test,axis=0) == Y_test)*100

            print(f"Iteration {t:4d} — accuracy with train data: {train_acc:5.2f}%   accuracy with test data: {test_acc:5.2f}%")

    W1, b1, W2, b2 = params
    return W1, b1, W2, b2



In [None]:
# Run Adam with the function's default hyperparameters; the test split is passed
# only so held-out accuracy can be logged during training.
W1, b1, W2, b2 = gradient_descent_adam(
    X_train, Yc_train, X_test,  Yc_test,

)

# Evaluate on the test set: the predicted class is the row with the highest probability
_, _, _, A2_test = forward_prop(W1, b1, W2, b2, X_test)
preds = np.argmax(A2_test, axis=0)
accuracy = np.mean(preds == Yc_test)
print(f"Test accuracy with Adam: {accuracy*100:.2f}%")

This model is better than the previous one, topping out at about 55% to 60% after 500 iterations. That said, it is heavily over-fitted, as you can see from the accuracy on the training data. This needs to be fixed in order to increase the accuracy on held-out data. I will start by adding mini-batches to see if that helps.

In [None]:
# Run this cell to see the current model in action on one random test sample.
show_random_prediction(
    W1, b1, W2, b2,
    X_test, Yc_test,
    idx_to_char,
    forward_prop
)

In [None]:
# Adam optimization with mini-batches
def gradient_descent_adam_mb(
    X_train, Y_train, X_test, Y_test,
    epochs=100, alpha=0.001,
    beta1=0.9, beta2=0.999, epsilon=1e-8,
    batch_size=64, print_every=5
):
    """
    Mini-batch training with the Adam update rule and held-out accuracy logging.

      - X_train, Y_train: training inputs (one sample per column) and labels
      - X_test,  Y_test:  held-out data used only for the accuracy log
      - epochs:           number of full passes over the training data
      - alpha:            learning rate
      - beta1, beta2:     decay rates of the first and second moment averages
      - epsilon:          small constant added to the denominator
      - batch_size:       mini-batch size
      - print_every:      log every N epochs (epoch 1 is always logged)

    Returns the trained (W1, b1, W2, b2).
    """
    # Parameters are kept in a list ordered [W1, b1, W2, b2], matching the
    # order of the gradients returned by back_prop.
    params = list(init_params())
    first_moments = [np.zeros_like(p) for p in params]
    second_moments = [np.zeros_like(p) for p in params]
    t = 0  # global step counter; drives the bias correction across epochs

    for epoch in range(1, epochs + 1):
        for Xb, Yb in iterate_minibatches(X_train, Y_train, batch_size):
            t += 1
            Z1, A1, Z2, A2 = forward_prop(*params, Xb)
            grads = back_prop(params[2], Z1, A1, Z2, A2, Xb, Yb)

            for k, grad in enumerate(grads):
                # Exponential moving averages of the gradient and its square
                first_moments[k] = beta1*first_moments[k] + (1-beta1)*grad
                second_moments[k] = beta2*second_moments[k] + (1-beta2)*(grad**2)

                # Bias-corrected moments
                m_hat = first_moments[k] / (1 - beta1**t)
                v_hat = second_moments[k] / (1 - beta2**t)

                # In-place parameter update
                params[k] -= alpha * m_hat / (np.sqrt(v_hat) + epsilon)

        # Epoch logging: both accuracies use the parameters at the end of the epoch
        if epoch % print_every == 0 or epoch == 1:
            _, _, _, A2_tr = forward_prop(*params, X_train)
            train_acc = np.mean(np.argmax(A2_tr, axis=0) == Y_train) * 100
            _, _, _, A2_val = forward_prop(*params, X_test)
            test_acc = np.mean(np.argmax(A2_val, axis=0) == Y_test) * 100
            print(f"Epoch {epoch:3d} — accuracy with train data: {train_acc:5.2f}%   accuracy with test data: {test_acc:5.2f}%")

    W1, b1, W2, b2 = params
    return W1, b1, W2, b2

In [None]:
# Train with mini-batch Adam using the function's default hyperparameters;
# the test split is passed only so held-out accuracy can be logged.
W1, b1, W2, b2 = gradient_descent_adam_mb(
    X_train, Yc_train, X_test,  Yc_test,
)

# Evaluate on the test set
_, _, _, A2_test = forward_prop(W1, b1, W2, b2, X_test)
preds = np.argmax(A2_test, axis=0)
accuracy = np.mean(preds == Yc_test)
print(f"Test accuracy with Adam and mini-batches: {accuracy*100:.2f}%")

I added mini-batches to try to decrease the over-fitting and also increase the accuracy. This did not decrease the over-fitting at all, but it did increase the accuracy of the model to a maximum of about 70% to 75%. To fix the over-fitting, I will use data augmentation that randomly shifts each image, which should reduce the over-fitting and also increase accuracy.

In [None]:
# Run this cell to see the current model in action on one random test sample.
show_random_prediction(
    W1, b1, W2, b2,
    X_test, Yc_test,
    idx_to_char,
    forward_prop
)

In [None]:
# Adam optimization with mini-batches and shift
def gradient_descent_adam_mb_aug(
    X_train, Y_train, X_test, Y_test,
    epochs=50, alpha=0.001, max_shift=3,
    beta1=0.9, beta2=0.999, epsilon=1e-8,
    batch_size=64, print_every=5
):
    """
    Mini-batch Adam training with random-shift augmentation of every batch.

      - X_train, Y_train: training inputs (one sample per column) and labels
      - X_test,  Y_test:  held-out data used only for the accuracy log
      - epochs:           number of full passes over the training data
      - alpha:            learning rate
      - max_shift:        maximum augmentation shift in pixels, forwarded to
                          augment_batch_shifts
      - beta1, beta2:     decay rates of the first and second moment averages
      - epsilon:          small constant added to the denominator
      - batch_size:       mini-batch size
      - print_every:      log every N epochs (epoch 1 is always logged)

    Returns the trained (W1, b1, W2, b2).
    """
    # Parameters are kept in a list ordered [W1, b1, W2, b2], matching the
    # order of the gradients returned by back_prop.
    params = list(init_params())
    first_moments = [np.zeros_like(p) for p in params]
    second_moments = [np.zeros_like(p) for p in params]
    t = 0  # global step counter; drives the bias correction across epochs

    for epoch in range(1, epochs+1):
        for Xb, Yb in iterate_minibatches(X_train, Y_train, batch_size):
            t += 1

            # 1) augment the batch, honouring the caller's max_shift
            Xb_aug = augment_batch_shifts(Xb, max_shift=max_shift)

            # 2) forward and backward on the same augmented batch
            Z1, A1, Z2, A2 = forward_prop(*params, Xb_aug)
            grads = back_prop(params[2], Z1, A1, Z2, A2, Xb_aug, Yb)

            for k, grad in enumerate(grads):
                # 3) Adam moment updates
                first_moments[k] = beta1*first_moments[k] + (1-beta1)*grad
                second_moments[k] = beta2*second_moments[k] + (1-beta2)*(grad**2)

                # 4) bias correction
                m_hat = first_moments[k] / (1 - beta1**t)
                v_hat = second_moments[k] / (1 - beta2**t)

                # 5) in-place parameter update
                params[k] -= alpha * m_hat / (np.sqrt(v_hat) + epsilon)

        # Logging after each epoch; accuracy is measured on un-augmented data
        if epoch % print_every == 0 or epoch == 1:
            _, _, _, A2_tr = forward_prop(*params, X_train)
            train_acc = np.mean(np.argmax(A2_tr,axis=0)==Y_train)*100
            _, _, _, A2_test = forward_prop(*params, X_test)
            test_acc   = np.mean(np.argmax(A2_test,axis=0)==Y_test)*100
            print(f"Epoch {epoch:3d} — accuracy with train data: {train_acc:5.2f}%   accuracy with test data: {test_acc:5.2f}%")

    W1, b1, W2, b2 = params
    return W1, b1, W2, b2


In [None]:
# Train with mini-batch Adam plus random-shift augmentation, using the
# function's default hyperparameters; the test split is passed only for logging.
W1, b1, W2, b2 = gradient_descent_adam_mb_aug(
    X_train, Yc_train, X_test, Yc_test,
)

# Evaluate on the test set
_, _, _, A2_test = forward_prop(W1, b1, W2, b2, X_test)
preds = np.argmax(A2_test, axis=0)
accuracy = np.mean(preds == Yc_test)
print(f"Test accuracy with Adam, mini-batches and augmentation: {accuracy*100:.2f}%")

As you can see from the training accuracy compared to the test accuracy, adding a random shift to the images helped with the over-fitting problem and also pushed the model to the 85% mark. I want to maximize this accuracy, so I will now implement early stopping, which lets me run more epochs and end with a higher accuracy, and I will also adjust the hyperparameters.

In [None]:
# Run this cell to see the current model in action on one random test sample.
show_random_prediction(
    W1, b1, W2, b2,
    X_test, Yc_test,
    idx_to_char,
    forward_prop
)

In [None]:
# Adam optimization with mini-batches and shift with an early stop for best model possible
# I also tweaked the function input values to get even better results
def gradient_descent_adam_mb_aug_earlystop(
    X_train, Y_train, X_test, Y_test,
    epochs=1000, alpha=5e-4, max_shift=2,
    beta1=0.9, beta2=0.995, epsilon=1e-8,
    batch_size=128, print_every=5,
    patience=100
):
    """
    Mini-batch Adam with random-shift augmentation and early stopping.

      - X_train, Y_train: training inputs (one sample per column) and labels
      - X_test,  Y_test:  held-out data monitored after every epoch. The best
                          weights are selected on this split, so an accuracy
                          measured on it afterwards is optimistically biased;
                          pass a validation split and keep a separate test set
                          for the final score.
      - epochs:           maximum number of passes over the training data
      - alpha:            learning rate
      - max_shift:        maximum augmentation shift in pixels, forwarded to
                          augment_batch_shifts
      - beta1, beta2:     decay rates of the first and second moment averages
      - epsilon:          small constant added to the denominator
      - batch_size:       mini-batch size
      - print_every:      log every N epochs (epoch 1 is always logged)
      - patience:         # of epochs to wait after the last improvement of the
                          monitored accuracy before stopping

    Returns the (W1, b1, W2, b2) snapshot with the highest monitored accuracy.
    """
    # Parameters are kept in a list ordered [W1, b1, W2, b2], matching the
    # order of the gradients returned by back_prop.
    params = list(init_params())
    first_moments = [np.zeros_like(p) for p in params]
    second_moments = [np.zeros_like(p) for p in params]
    t = 0  # global step counter; drives the bias correction across epochs

    best_val = -np.inf
    # Snapshots must be copies because the parameters are updated in place.
    best_params = tuple(p.copy() for p in params)
    wait = 0

    for epoch in range(1, epochs + 1):
        for Xb, Yb in iterate_minibatches(X_train, Y_train, batch_size):
            t += 1
            # Augmentation, honouring the caller's max_shift
            Xb_aug = augment_batch_shifts(Xb, max_shift=max_shift)
            # Forward / backward on the augmented batch
            Z1, A1, Z2, A2 = forward_prop(*params, Xb_aug)
            grads = back_prop(params[2], Z1, A1, Z2, A2, Xb_aug, Yb)

            for k, grad in enumerate(grads):
                # Adam moment updates
                first_moments[k] = beta1*first_moments[k] + (1-beta1)*grad
                second_moments[k] = beta2*second_moments[k] + (1-beta2)*(grad**2)

                # Bias correction
                m_hat = first_moments[k] / (1 - beta1**t)
                v_hat = second_moments[k] / (1 - beta2**t)

                # In-place parameter update
                params[k] -= alpha * m_hat / (np.sqrt(v_hat) + epsilon)

        # End of epoch: evaluate accuracy on the monitored split
        _, _, _, A2_test = forward_prop(*params, X_test)
        test_acc = np.mean(np.argmax(A2_test, axis=0) == Y_test) * 100

        # Only a strict improvement resets the patience counter
        if test_acc > best_val:
            best_val = test_acc
            best_params = tuple(p.copy() for p in params)
            wait = 0
        else:
            wait += 1

        # Logging
        if epoch % print_every == 0 or epoch == 1:
            _, _, _, A2_tr = forward_prop(*params, X_train)
            train_acc = np.mean(np.argmax(A2_tr, axis=0) == Y_train) * 100
            print(f"Epoch {epoch:3d} — accuracy with train data: {train_acc:5.2f}%  accuracy with test data: {test_acc:5.2f}%")

        # Early stopping
        if wait >= patience:
            print(f"Early stopping at epoch {epoch}. Best test accuracy: {best_val:.2f}%")
            break

    # Restore best weights (covers both early stop and running out of epochs)
    W1, b1, W2, b2 = best_params
    print(f"Training finished. Best test accuracy: {best_val:.2f}%")
    return W1, b1, W2, b2

In [None]:
# Early stopping keeps the weights that score best on the split passed as
# X_test/Y_test. Passing the real test set there would select the model on the
# same data it is scored on, so the validation split is monitored instead and
# the test set stays untouched until the final evaluation below.
# (The training log labels the monitored split as "test data".)
W1, b1, W2, b2 = gradient_descent_adam_mb_aug_earlystop(
    X_train, Yc_train, X_val, Yc_val,
)

# Evaluate on the test set, which played no part in training or model selection
_, _, _, A2_test = forward_prop(W1, b1, W2, b2, X_test)
preds = np.argmax(A2_test, axis=0)
accuracy = np.mean(preds == Yc_test)
print(f"Test accuracy with Adam, mini-batches, data augmentation and early stop: {accuracy*100:.2f}%")

I added early stopping so that training ends once the test accuracy tops out. This way I can get the best model possible while keeping the over-fitting low. Adding early stopping, changing the alpha and beta values, and doubling the batch size improved this model a lot: it now reaches over 90% accuracy with only a two-layer network.

In [None]:
# Run this cell to see the final model in action on one random test sample.
show_random_prediction(
    W1, b1, W2, b2,
    X_test, Yc_test,
    idx_to_char,
    forward_prop
)

In [None]:
# Show the final model's predictions for 10 random validation images.
# Titles show class indices: T = true, P = predicted.
predict_random_samples(
    W1, b1, W2, b2,
    X_val, Yc_val, Yd_val, num_samples=10
)

Based on the model's accuracy, about 9 out of 10 of these predictions are expected to be correct on average.


# Author and Credits

**Kaleb Coleman**  
Data Science Major, Northern Arizona University

**Inspired by**: Video tutorial by Samson Zhang: https://www.youtube.com/watch?v=w8yWXqWQYmU