In [1]:
#library imports numpy, pandas and matplotlib
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt


In [2]:
#importing dataset using pandas
#df_train for training and df_test for testig purposes
df_train  = pd.read_csv('data/dataSet1.csv', header = None)
df_test  = pd.read_csv('data/dataSet2.csv', header = None)


In [3]:
#creating numpy arrays train and test out of pd dataframe for easier data manipulation 
train = df_train.to_numpy()
test = df_test.to_numpy()


In [4]:
#checking array shape for verfication 
train.shape


(2810, 65)

In [5]:
#creating train and test data
X_train = train[:, :64] #X_train contains all input values wich are the pixel values here
Y_train = train[:, 64] #Y_train contains all label/output values wich are the actual numbers


#similary this will be test data
X_test = test[:, :64]
Y_test = test[:, 64]




In [6]:
#defining hyperparameters 

#h is the number of neurons in the hidden layer
#training_cycles is number of cycles
#learning_rate is to control the models learning rate

h = 180
training_cycles = 8000
learning_rate = 0.009


In [7]:
#function to initialize parameters using the He initialization

def init_params(h):
    w1 = np.random.randn(h, 64) * np.sqrt(2. / (64))  # He initialization
    b1 = np.zeros((h, 1))  
    w2 = np.random.randn(10, h) * np.sqrt(2. / (h))  # He initialization
    b2 = np.zeros((10, 1))  
    return w1, b1, w2, b2


In [8]:
#defineing Sigmoid function
def sigmoid(Z):
    return 1/(1+np.exp(-Z))


In [9]:
#defineing Leaky ReLU function
def LeakyReLU(Z, alpha=0.01):
    return np.where(Z > 0, Z, alpha * Z)


In [10]:
#Forward Propogation Function

def feed_forward(w1, b1, w2, b2, X):
    Z1 = w1.dot(X.T) + b1 #input layer dot product with bias added
    A1 = LeakyReLU(Z1) #ReLU activation function
    Z2 = w2.dot(A1) + b2 #hidden layer dot product with bias added
    A2 = sigmoid(Z2) #sigmoid activation
   
    return Z1, A1, Z2, A2


In [11]:
#mapping label to an array
#ex: if label is 5 then map the 4th element as 1

def map_labels(Y):
    l1 = len(Y) #get len of the array
    l2 = 10 #10 as the numbers range from 0-9
    mapped_label = np.zeros((l2, l1), dtype=int)
    for i, label in enumerate(Y):
        mapped_label[label][i] = 1
    return mapped_label


In [12]:
# Backpropagation for the sigmoid output

def back_prop(Z1, A1, Z2, A2, w2, X, Y):
    m = Y.size
    mapped_label = map_labels(Y)
    
    dZ2 = A2 - mapped_label  # Error for output layer with sigmoid
    dw2 = (1 / m) * np.dot(dZ2, A1.T)  # Gradient for w2
    db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)
    
    # Backpropagation for the hidden layer (Z1 -> A1)
    dZ1 = np.dot(w2.T, dZ2) * (A1 > 0)  # Derivative of ReLU
    dw1 = (1 / m) * np.dot(dZ1, X)
    db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)
    
    return dw1, db1, dw2, db2


In [13]:
#function to update the params as we train
#alpha is the learning rate

def update_params(w1, b1, w2, b2, dw1, db1, dw2, db2, alpha):
    w1 = w1 - alpha * dw1
    b1 = b1 - alpha * db1
    w2 = w2 - alpha * dw2
    b2 = b2 - alpha * db2
    return w1, b1, w2, b2



In [14]:
#function to get predictions
def get_predictions(A2):
    return np.argmax(A2, axis=0) 

In [15]:
#function to get accuracy of predicted data
def get_accuracy(predictions, Y):
    mapped_label = map_labels(Y)
    accuracy = np.sum(predictions == np.argmax(mapped_label, axis=0)) / Y.size *100
    return accuracy


In [16]:
#function to evaluate model, to be used on test data
def model_eval(X, Y, w1, b1, w2, b2):

    Z1, A1, Z2, A2 = feed_forward(w1, b1, w2, b2, X)  
    predictions = get_predictions(A2)                
    accuracy = get_accuracy(predictions, Y)          
    print(f"Accuracy on Test Data: {accuracy:.2f}%")  
    return accuracy


In [17]:

def train_model(X_train, Y_train, X_test, Y_test, max_iterations=10000, initial_learning_rate=0.009):
    # Initialize parameters
    w1, b1, w2, b2 = init_params(h)  # Ensure h is your hidden layer size
    
    # RMSprop parameters
    epsilon = 1e-8
    weight_decay = 1e-4  # L2 regularization
    
    # RMSprop specific variables
    cache_w1, cache_b1, cache_w2, cache_b2 = np.zeros_like(w1), np.zeros_like(b1), np.zeros_like(w2), np.zeros_like(b2)
    
    best_accuracy = 0
    patience = 10
    no_improve_counter = 0
    
    best_w1, best_b1, best_w2, best_b2 = w1.copy(), b1.copy(), w2.copy(), b2.copy()  
    
    for iteration in range(max_iterations):
        # Dynamic learning rate adjustment
        lr = initial_learning_rate * (0.99 ** (iteration // 500))
        
        # Forward and backward pass
        Z1, A1, Z2, A2 = feed_forward(w1, b1, w2, b2, X_train)
        dw1, db1, dw2, db2 = back_prop(Z1, A1, Z2, A2, w2, X_train, Y_train)
        
        # Add weight decay (L2 regularization)
        dw1 += weight_decay * w1
        dw2 += weight_decay * w2
        
        # RMSprop optimization with squared gradients
        cache_w1 = 0.9 * cache_w1 + 0.1 * (dw1 ** 2)
        cache_b1 = 0.9 * cache_b1 + 0.1 * (db1 ** 2)
        cache_w2 = 0.9 * cache_w2 + 0.1 * (dw2 ** 2)
        cache_b2 = 0.9 * cache_b2 + 0.1 * (db2 ** 2)
        
        # Parameter updates with RMSprop
        w1 -= lr * dw1 / (np.sqrt(cache_w1) + epsilon)
        b1 -= lr * db1 / (np.sqrt(cache_b1) + epsilon)
        w2 -= lr * dw2 / (np.sqrt(cache_w2) + epsilon)
        b2 -= lr * db2 / (np.sqrt(cache_b2) + epsilon)
        
        # Periodic evaluation every 100 iterations
        if iteration % 100 == 0:
            Z1_test, A1_test, Z2_test, A2_test = feed_forward(w1, b1, w2, b2, X_test)
            test_predictions = get_predictions(A2_test)
            test_accuracy = get_accuracy(test_predictions, Y_test)
            
            print(f"Iteration {iteration}: Test Accuracy {test_accuracy:.2f}% | Learning rate: {lr:.6f} | Weight Decay: {weight_decay:.6f}")
            
            # Update best accuracy and monitor no improvement
            if test_accuracy > best_accuracy:
                best_accuracy = test_accuracy
                best_w1, best_b1, best_w2, best_b2 = w1.copy(), b1.copy(), w2.copy(), b2.copy()  # Save the best model
                no_improve_counter = 0
            else:
                no_improve_counter += 1
            
            # If we reach 98% accuracy, stop
            if test_accuracy >= 98.4:
                print(f"Achieved target accuracy of 98% at iteration {iteration}.")
                break
            
            # If no improvement after 'patience' iterations, adjust hyperparameters
            if no_improve_counter > patience:
                print(f"No improvement for {patience} iterations. Adjusting hyperparameters...")
                weight_decay *= 0.5  # Decrease weight decay to allow for more complex models
                initial_learning_rate *= 0.9  # Reduce learning rate for finer updates
                no_improve_counter = 0  # Reset the counter
    
    # Once training is over, return the best model
    return best_w1, best_b1, best_w2, best_b2

# Execute with RMSprop optimization and increased iterations for further tuning
best_w1, best_b1, best_w2, best_b2 = train_until_98_with_rmsprop(
    X_train, Y_train, X_test, Y_test, 
    max_iterations=20000, 
    initial_learning_rate=0.009
)


NameError: name 'train_until_98_with_rmsprop' is not defined

In [None]:
accuracy = model_eval(X_test, Y_test, best_w1, best_b1, best_w2, best_b2)
