In [2]:
import numpy as np
import pandas as pd
import cv2
import torch
import gc

In [3]:
def sigmoid(x, derivative = False):
    """Logistic sigmoid 1 / (1 + exp(-x)), or its derivative, element-wise.

    The value is computed as exp(-log(1 + exp(-x))) through ``np.logaddexp``
    so that large negative inputs do not overflow ``np.exp`` (the naive form
    triggers overflow RuntimeWarnings once -x exceeds roughly 709).

    Args:
        x: Scalar or NumPy array of pre-activation values.
        derivative: If True, return sigmoid(x) * (1 - sigmoid(x)) instead of
            sigmoid(x).

    Returns:
        NumPy scalar or array with the same shape as ``x``.
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x)), evaluated stably.
    val = np.exp(-np.logaddexp(0, -x))
    if derivative:
        return val * (1 - val)
    return val
    
def tanh(x, derivative = False):
    """Hyperbolic tangent activation, or its derivative 1 - tanh(x)**2.

    Args:
        x: Scalar or array of pre-activation values.
        derivative: If True, return the derivative evaluated at ``x``.

    Returns:
        NumPy scalar or array shaped like ``x``.
    """
    squashed = np.tanh(x)
    return 1 - squashed**2 if derivative else squashed
    
def relu(x, derivative = False):
    """Rectified linear unit max(0, x), or its derivative.

    The derivative is an integer mask: 1 where x > 0 and 0 elsewhere
    (so it is 0 at x == 0).

    Args:
        x: Scalar or array of pre-activation values.
        derivative: If True, return the 0/1 derivative mask instead of the
            activation.
    """
    return np.where(x > 0, 1, 0) if derivative else np.maximum(0, x)

def softmax(x):
    """Row-wise softmax of a 2-D array of logits.

    Each row has its maximum subtracted before exponentiation so that large
    logits do not overflow; this shift does not change the result.

    Args:
        x: Array indexed as (row, class); normalisation runs along axis 1.

    Returns:
        Array of the same shape whose rows sum to 1.
    """
    exps = np.exp(x - np.max(x, axis=1, keepdims=True))
    row_totals = np.sum(exps, axis=1, keepdims=True)
    return exps / row_totals

def cross_entropy_loss(y_pred, y_true):
    """Mean cross-entropy between predicted probabilities and target rows.

    Args:
        y_pred: Predicted probabilities, same shape as ``y_true``.
        y_true: Target array; its first dimension is used as the sample count.

    Returns:
        Sum of -y_true * log(y_pred + 1e-9) divided by the number of rows in
        ``y_true``. The 1e-9 offset keeps log() finite when a probability is 0.
    """
    n_rows = y_true.shape[0]
    log_likelihood = np.sum(y_true * np.log(y_pred + 1e-9))
    return -log_likelihood / n_rows

In [4]:
def flatten_image(image_path):
    """Load an image as grayscale and return it as a flat vector scaled by 1/255.

    Args:
        image_path: Path handed to ``cv2.imread``.

    Returns:
        1-D array of pixel values divided by 255.0, or None (after printing a
        warning) when ``cv2.imread`` returns None for the path.
    """
    pixels = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if pixels is not None:
        return pixels.flatten() / 255.0

    print(f"Warning: Unable to load image '{image_path}'")
    return None

In [5]:
def train_sgd(model, X, y, learning_rate):
    """Stochastic gradient descent: one weight update per training sample.

    Samples are visited once each, in a freshly drawn random order.

    Args:
        model: Object accepted by ``train_step``.
        X: Training inputs, one sample per row.
        y: Training targets, row-aligned with ``X``.
        learning_rate: Step size forwarded to ``train_step``.

    Returns:
        Sum of the per-sample losses divided by the number of samples.
    """
    n_samples = X.shape[0]
    running_loss = 0

    for idx in np.random.permutation(n_samples):
        # Slice (rather than index) so each sample keeps its leading batch axis.
        sample_x = X[idx:idx+1]
        sample_y = y[idx:idx+1]
        running_loss += train_step(model, sample_x, sample_y, learning_rate)

    return running_loss/n_samples

def train_minibatch(model, X, y, learning_rate, batch_size):
    """Mini-batch gradient descent: one weight update per mini-batch.

    The data is shuffled once, then consumed in consecutive slices of
    ``batch_size`` rows (the final slice may be shorter).

    Args:
        model: Object accepted by ``train_step``.
        X: Training inputs, one sample per row.
        y: Training targets, row-aligned with ``X``.
        learning_rate: Step size forwarded to ``train_step``.
        batch_size: Number of rows per mini-batch (used as the range step).

    Returns:
        Unweighted mean of the losses returned for each mini-batch.
    """
    n_samples = X.shape[0]
    order = np.random.permutation(n_samples)
    X_perm, y_perm = X[order], y[order]

    loss_sum = 0
    batches_done = 0
    for start in range(0, n_samples, batch_size):
        stop = start + batch_size
        loss_sum += train_step(model, X_perm[start:stop], y_perm[start:stop], learning_rate)
        batches_done += 1

    return loss_sum / batches_done

def train_batch(model, X, y, learning_rate):
    """Full-batch gradient descent: a single weight update using every sample.

    The rows are shuffled (jointly for ``X`` and ``y``) before the update.

    Args:
        model: Object accepted by ``train_step``.
        X: Training inputs, one sample per row.
        y: Training targets, row-aligned with ``X``.
        learning_rate: Step size forwarded to ``train_step``.

    Returns:
        The loss reported by ``train_step`` for the whole dataset.
    """
    order = np.random.permutation(X.shape[0])
    return train_step(model, X[order], y[order], learning_rate)

def train_step(model, X_batch, y_batch, learning_rate):
    """Run forward and backward passes on one batch and apply a gradient step.

    Args:
        model: Object exposing ``forward``, ``backward``, ``weights`` and
            ``biases``; its parameters are updated in place.
        X_batch: Inputs for this batch.
        y_batch: Targets for this batch.
        learning_rate: Multiplier applied to each gradient before subtraction.

    Returns:
        Cross-entropy loss of the batch, computed before the update is applied.
    """
    predictions = model.forward(X_batch)
    batch_loss = cross_entropy_loss(predictions, y_batch)
    weight_grads, bias_grads = model.backward(X_batch, y_batch)

    for layer in range(len(model.weights)):
        model.weights[layer] -= learning_rate * weight_grads[layer]
        model.biases[layer] -= learning_rate * bias_grads[layer]

    return batch_loss

In [6]:
class MLP:
    """Fully connected feed-forward classifier trained with plain gradient descent.

    Every hidden layer uses the same activation (sigmoid, tanh or relu) and the
    output layer is a softmax. ``backward`` uses ``output - y`` as the output
    error, i.e. the gradient of cross-entropy with respect to the softmax logits.
    """

    # Optimizer names that `train` knows how to dispatch.
    _OPTIMIZERS = ('sgd', 'minibatchgd', 'batchgd')

    def __init__(self, input_size, hidden_layers, output_size, activation='relu', optimizer='batchgd', learning_rate=0.01, batch_size=None):
        """Create the layer parameters.

        Args:
            input_size: Number of input features.
            hidden_layers: Sequence of hidden-layer widths (list, tuple, ...).
            output_size: Number of output classes.
            activation: One of 'sigmoid', 'tanh', 'relu' (KeyError otherwise).
            optimizer: One of 'sgd', 'minibatchgd', 'batchgd'; checked in `train`.
            learning_rate: Step size used by `train`.
            batch_size: Rows per mini-batch; only required for 'minibatchgd'.
        """
        # list(...) so that a tuple of hidden sizes works as well as a list.
        self.layers = [input_size] + list(hidden_layers) + [output_size]
        self.batch_size = batch_size
        self.optimizer = optimizer
        self.weights = []
        self.biases = []

        for i in range(len(self.layers) - 1):
            layer_input_size = self.layers[i]
            layer_output_size = self.layers[i + 1]

            # Small Gaussian weights (std 0.1) and zero biases.
            weight_matrix = np.random.randn(layer_input_size, layer_output_size) * 0.1
            bias = np.zeros((1, layer_output_size))

            self.weights.append(weight_matrix)
            self.biases.append(bias)

        activation_funs = {'sigmoid': sigmoid, 'tanh': tanh, 'relu': relu}
        self.activation = activation_funs[activation]

        self.learning_rate = learning_rate

    def forward(self, X):
        """Propagate ``X`` through the network and cache intermediates.

        Stores the layer inputs/outputs in ``self.a`` (``self.a[0]`` is ``X``)
        and the pre-activations in ``self.z_values`` for use by `backward`.

        Returns:
            The softmax output of the last layer.
        """
        self.a = [X]
        self.z_values = []

        last_layer = len(self.weights) - 1
        for i in range(len(self.weights)):
            z = np.dot(self.a[-1], self.weights[i]) + self.biases[i]
            self.z_values.append(z)

            # Hidden layers use the configured activation; the output layer is softmax.
            if i < last_layer:
                activation_output = self.activation(z)
            else:
                activation_output = softmax(z)

            self.a.append(activation_output)

        return self.a[-1]

    def backward(self, X, y):
        """Compute gradients from the values cached by the latest `forward` call.

        Args:
            X: Batch inputs; only its row count is used (to average gradients).
            y: Targets with the same shape as the network output.

        Returns:
            Tuple ``(grads_w, grads_b)`` of lists aligned with ``self.weights``
            and ``self.biases``.
        """
        m = X.shape[0]

        grads_w = [np.zeros_like(w) for w in self.weights]
        grads_b = [np.zeros_like(b) for b in self.biases]

        # Output error for softmax combined with cross-entropy.
        dz = self.a[-1] - y

        for i in reversed(range(len(self.weights))):
            W = self.weights[i]
            a_prev = self.a[i]

            grads_w[i] = np.dot(a_prev.T, dz) / m
            grads_b[i] = np.sum(dz, axis=0, keepdims=True) / m

            if i > 0:
                # Push the error back through W and the previous layer's activation.
                dz = np.dot(dz, W.T) * self.activation(self.z_values[i-1], derivative=True)

        return grads_w, grads_b

    def train(self, X, y, epochs=100):
        """Train for ``epochs`` passes using the configured optimizer.

        Args:
            X: Training inputs, one sample per row.
            y: One-hot targets, row-aligned with ``X``.
            epochs: Number of passes over the data.

        Returns:
            List with the loss value reported for each epoch.

        Raises:
            ValueError: If ``self.optimizer`` is not a supported name, or if
                'minibatchgd' is selected without a positive ``batch_size``.
        """
        # Fail loudly: an unknown name previously fell through every branch and
        # left the model untrained without any error.
        if self.optimizer not in self._OPTIMIZERS:
            raise ValueError(
                f"Unknown optimizer '{self.optimizer}'; expected one of {self._OPTIMIZERS}"
            )
        if self.optimizer == 'minibatchgd' and (self.batch_size is None or self.batch_size < 1):
            raise ValueError("optimizer 'minibatchgd' requires a positive batch_size")

        losses = []
        for epoch in range(epochs):
            if self.optimizer == 'sgd':
                loss_val = train_sgd(self, X, y, self.learning_rate)
            elif self.optimizer == 'minibatchgd':
                loss_val = train_minibatch(self, X, y, self.learning_rate, self.batch_size)
            else:  # 'batchgd'
                loss_val = train_batch(self, X, y, self.learning_rate)

            losses.append(loss_val)

        return losses

In [7]:
def test_model(model, test_csv, symbol_to_index):
    """Evaluate ``model`` on the images listed in a CSV file.

    Each image is loaded once; rows whose image cannot be loaded are dropped
    together with their label so that inputs and labels stay aligned.

    Args:
        model: Object whose ``forward`` returns per-class probabilities.
        test_csv: Path to a CSV file with "path" and "symbol_id" columns.
        symbol_to_index: Dict mapping each symbol id to its class index; its
            length is used as the number of classes.

    Returns:
        Tuple ``(accuracy, loss)``: accuracy as a percentage and the mean
        cross-entropy loss over the loaded images.

    Raises:
        ValueError: If none of the listed images could be loaded.
        KeyError: If a symbol id in the CSV is missing from ``symbol_to_index``.
    """
    df_test = pd.read_csv(test_csv)

    # Collect image/label pairs together; truncating the label column afterwards
    # would pair the wrong labels with images whenever a load fails mid-file.
    images, labels = [], []
    for path, symbol_id in zip(df_test["path"], df_test["symbol_id"]):
        img = flatten_image(path)
        if img is not None:
            images.append(img)
            labels.append(symbol_id)

    if not images:
        raise ValueError(f"No images could be loaded from '{test_csv}'")

    X_test = np.array(images)

    # Convert symbol IDs to class indices and one-hot targets.
    y_true_indices = np.array([symbol_to_index[s] for s in labels])
    y_true_one_hot = np.eye(len(symbol_to_index))[y_true_indices]

    y_pred_probs = model.forward(X_test)
    y_pred_indices = np.argmax(y_pred_probs, axis=1)

    # Comparing class indices is equivalent to comparing symbol ids because
    # symbol_to_index maps each symbol to a distinct index.
    accuracy = np.mean(y_pred_indices == y_true_indices) * 100
    loss = cross_entropy_loss(y_pred_probs, y_true_one_hot)

    # Release the large arrays before returning; folds are evaluated in a loop.
    del df_test, X_test, images
    gc.collect()
    return accuracy, loss

In [8]:
def mlp_executive(hidden_layers_list, activation, optimizer, learning_rate, batch_size, epochs=2, n_folds=10, data_root=".."):
    """Train and evaluate an MLP on each cross-validation fold and print a summary.

    For fold i (1..n_folds) the function reads ``{data_root}/fold-{i}/train.csv``
    and ``{data_root}/fold-{i}/test.csv``, trains a fresh model on the training
    images, and reports accuracy and loss on both files. Means and standard
    deviations across folds are printed at the end.

    Args:
        hidden_layers_list: Hidden-layer widths passed to ``MLP``.
        activation: Activation name passed to ``MLP``.
        optimizer: Optimizer name passed to ``MLP``.
        learning_rate: Learning rate passed to ``MLP``.
        batch_size: Mini-batch size passed to ``MLP``.
        epochs: Training epochs per fold (default 2, the value previously
            hard-coded).
        n_folds: Number of fold directories to process (default 10).
        data_root: Directory containing the ``fold-<i>`` sub-directories
            (default "..").

    Raises:
        ValueError: If no training image of a fold could be loaded.
    """
    # Seed once so weight initialisation and shuffling are repeatable per call.
    np.random.seed(42)

    train_accuracies = []
    test_accuracies = []
    train_losses = []
    test_losses = []

    for i in range(1, n_folds + 1):
        train_csv = f"{data_root}/fold-{i}/train.csv"
        test_csv = f"{data_root}/fold-{i}/test.csv"
        df = pd.read_csv(train_csv)

        # Keep images and labels in lock-step, skipping unreadable files.
        X_train, symbol_ids = [], []
        for path, symbol_id in zip(df["path"], df["symbol_id"]):
            img = flatten_image(path)
            if img is not None:
                X_train.append(img)
                symbol_ids.append(symbol_id)

        if not X_train:
            raise ValueError(f"No training images could be loaded from '{train_csv}'")

        X_train = np.array(X_train)
        symbol_ids = np.array(symbol_ids)

        # `indices` holds, for every sample, the position of its symbol in unique_symbols.
        unique_symbols, indices = np.unique(symbol_ids, return_inverse=True)
        symbol_to_index = {symbol: idx for idx, symbol in enumerate(unique_symbols)}

        y_train = np.eye(len(unique_symbols))[indices]
        model = MLP(input_size=X_train.shape[1], hidden_layers = hidden_layers_list, output_size=len(unique_symbols), activation=activation, optimizer=optimizer, learning_rate=learning_rate, batch_size=batch_size)

        model.train(X_train, y_train, epochs=epochs)

        train_accuracy, train_loss = test_model(model, train_csv, symbol_to_index)
        test_accuracy, test_loss = test_model(model, test_csv, symbol_to_index)

        train_accuracies.append(train_accuracy)
        test_accuracies.append(test_accuracy)
        train_losses.append(train_loss)
        test_losses.append(test_loss)

        print(f"fold : {i}, train accuracy : {train_accuracy}, test accuracy : {test_accuracy}")

        del df, X_train, y_train, model  # Remove large variables
        gc.collect()

    train_accuracy_mean = np.mean(train_accuracies)
    train_accuracy_std = np.std(train_accuracies)
    test_accuracy_mean = np.mean(test_accuracies)
    test_accuracy_std = np.std(test_accuracies)
    train_loss_mean = np.mean(train_losses)
    train_loss_std = np.std(train_losses)
    test_loss_mean = np.mean(test_losses)
    test_loss_std = np.std(test_losses)

    print("\nFinal Statistics:")
    print(f"Train Accuracy - Mean: {train_accuracy_mean:.2f}, Std: {train_accuracy_std:.2f}")
    print(f"Test Accuracy - Mean: {test_accuracy_mean:.2f}, Std: {test_accuracy_std:.2f}")
    print(f"Train loss - Mean: {train_loss_mean:.2f}, Std: {train_loss_std:.2f}")
    print(f"Test loss - Mean: {test_loss_mean:.2f}, Std: {test_loss_std:.2f}")
    

In [8]:
# Full-batch gradient descent with ReLU hidden layers (256, 128) and lr=0.01.
# The batch_size argument (64) is not used on the 'batchgd' path.
mlp_executive([256, 128], 'relu', 'batchgd', 0.01, 64)

fold : 1, train accuracy : 1.8989559709338077, test accuracy : 1.9303201506591336
fold : 2, train accuracy : 0.6338903283803077, test accuracy : 0.554735910298023
fold : 3, train accuracy : 0.8386201427438541, test accuracy : 0.7213386152663631
fold : 4, train accuracy : 0.8826755110400507, test accuracy : 0.7703703703703704
fold : 5, train accuracy : 1.1889428316655106, test accuracy : 1.152155837985509
fold : 6, train accuracy : 0.8208792587651811, test accuracy : 0.7733491969066032
fold : 7, train accuracy : 0.6595626683568372, test accuracy : 0.5784483272705587
fold : 8, train accuracy : 0.5993359779803433, test accuracy : 0.6514463303848912
fold : 9, train accuracy : 0.46391922658131785, test accuracy : 0.5809078931608576
fold : 10, train accuracy : 0.6966985986857731, test accuracy : 0.6242122321589341

Final Statistics:
Train Accuracy - Mean: 0.87, Std: 0.39
Test Accuracy - Mean: 0.83, Std: 0.40
Train loss - Mean: 6.03, Std: 0.04
Test loss - Mean: 6.03, Std: 0.04


In [9]:
# Full-batch gradient descent with sigmoid hidden layers (256, 128) and lr=0.01.
# The batch_size argument (64) is not used on the 'batchgd' path.
mlp_executive([256, 128], 'sigmoid', 'batchgd', 0.01, 64)

fold : 1, train accuracy : 0.2261291580986637, test accuracy : 0.24717514124293788
fold : 2, train accuracy : 0.7204801438316324, test accuracy : 0.719976394216583
fold : 3, train accuracy : 0.07864128998149617, test accuracy : 0.08277656240761544
fold : 4, train accuracy : 0.044265912604553447, test accuracy : 0.035555555555555556
fold : 5, train accuracy : 1.1935664982330987, test accuracy : 1.193728471314883
fold : 6, train accuracy : 0.12283470806944782, test accuracy : 0.12492563950029743
fold : 7, train accuracy : 0.332752337189035, test accuracy : 0.3100960104955573
fold : 8, train accuracy : 0.2620444749539607, test accuracy : 0.2629691608893139
fold : 9, train accuracy : 1.9229880885603987, test accuracy : 1.8924422086477422
fold : 10, train accuracy : 0.03562663288734067, test accuracy : 0.024008162775343615

Final Statistics:
Train Accuracy - Mean: 0.49, Std: 0.59
Test Accuracy - Mean: 0.49, Std: 0.58
Train loss - Mean: 6.05, Std: 0.02
Test loss - Mean: 6.05, Std: 0.02


In [10]:
# Full-batch gradient descent with tanh hidden layers (256, 128) and lr=0.01.
# The batch_size argument (64) is not used on the 'batchgd' path.
mlp_executive([256, 128], 'tanh', 'batchgd', 0.01, 64)

fold : 1, train accuracy : 0.40399098128153077, test accuracy : 0.3766478342749529
fold : 2, train accuracy : 0.46269367035058956, test accuracy : 0.4367069932133373
fold : 3, train accuracy : 0.19825535289452814, test accuracy : 0.18329095961686276
fold : 4, train accuracy : 0.46710448076745203, test accuracy : 0.4977777777777778
fold : 5, train accuracy : 0.2661910895340005, test accuracy : 0.3385200142534743
fold : 6, train accuracy : 0.46558316768258456, test accuracy : 0.41641879833432477
fold : 7, train accuracy : 0.5479849997359109, test accuracy : 0.4234003220227801
fold : 8, train accuracy : 1.2290347918495588, test accuracy : 1.3267989481233564
fold : 9, train accuracy : 0.5477282475995645, test accuracy : 0.5329979638280034
fold : 10, train accuracy : 0.7481592906341541, test accuracy : 0.684232639097293

Final Statistics:
Train Accuracy - Mean: 0.53, Std: 0.27
Test Accuracy - Mean: 0.52, Std: 0.30
Train loss - Mean: 6.06, Std: 0.03
Test loss - Mean: 6.06, Std: 0.03


In [9]:
# Mini-batch gradient descent (64 samples per update) with ReLU hidden layers and lr=0.01.
mlp_executive([256, 128], 'relu', 'minibatchgd', 0.01, 64)

fold : 1, train accuracy : 71.0528229778962, test accuracy : 68.18502824858757
fold : 2, train accuracy : 71.68777431124742, test accuracy : 68.50988492180583
fold : 3, train accuracy : 72.73195876288659, test accuracy : 69.65056465440786
fold : 4, train accuracy : 72.69718151666909, test accuracy : 69.14962962962963
fold : 5, train accuracy : 72.51230225568877, test accuracy : 68.7136239458368
fold : 6, train accuracy : 72.96909980650231, test accuracy : 69.60142772159429
fold : 7, train accuracy : 72.76250462156024, test accuracy : 68.90691156300316
fold : 8, train accuracy : 70.47808265291978, test accuracy : 66.919674874492
fold : 9, train accuracy : 73.1283201900551, test accuracy : 69.12205054497545
fold : 10, train accuracy : 72.6321484179136, test accuracy : 68.72936798511495

Final Statistics:
Train Accuracy - Mean: 72.27, Std: 0.84
Test Accuracy - Mean: 68.75, Std: 0.75
Train loss - Mean: 1.07, Std: 0.03
Test loss - Mean: 1.24, Std: 0.03


In [10]:
# Mini-batch gradient descent (64 samples per update) with sigmoid hidden layers and lr=0.01.
mlp_executive([256, 128], 'sigmoid', 'minibatchgd', 0.01, 64)

fold : 1, train accuracy : 41.529082722277685, test accuracy : 41.36064030131827
fold : 2, train accuracy : 42.07934535455555, test accuracy : 42.20714074948362
fold : 3, train accuracy : 42.544937879989426, test accuracy : 42.03866847986756
fold : 4, train accuracy : 41.99381598594062, test accuracy : 41.68888888888888
fold : 5, train accuracy : 42.225304666600614, test accuracy : 41.72110701983608
fold : 6, train accuracy : 42.55694313281338, test accuracy : 41.77275431290898
fold : 7, train accuracy : 42.3110442085248, test accuracy : 42.01800942214801
fold : 8, train accuracy : 41.35550260394321, test accuracy : 41.638775998087496
fold : 9, train accuracy : 42.63305506978586, test accuracy : 42.65181458857349
fold : 10, train accuracy : 42.84564431425329, test accuracy : 42.5784766820719

Final Statistics:
Train Accuracy - Mean: 42.21, Std: 0.46
Test Accuracy - Mean: 41.97, Std: 0.39
Train loss - Mean: 2.90, Std: 0.03
Test loss - Mean: 2.92, Std: 0.02


In [9]:
# Mini-batch gradient descent (64 samples per update) with tanh hidden layers and lr=0.01.
mlp_executive([256, 128], 'tanh', 'minibatchgd', 0.01, 64)

fold : 1, train accuracy : 67.01423555781831, test accuracy : 64.58921845574388
fold : 2, train accuracy : 66.9061657236529, test accuracy : 64.8686928297433
fold : 3, train accuracy : 67.15305313243456, test accuracy : 65.27523207000532
fold : 4, train accuracy : 67.39452159780124, test accuracy : 64.71703703703704
fold : 5, train accuracy : 67.19838832193929, test accuracy : 64.27723007483074
fold : 6, train accuracy : 67.58286389782265, test accuracy : 64.93753718024985
fold : 7, train accuracy : 67.32160777478477, test accuracy : 64.60134772496869
fold : 8, train accuracy : 66.72035168084699, test accuracy : 64.20631125986135
fold : 9, train accuracy : 67.20493615336392, test accuracy : 64.77422445801892
fold : 10, train accuracy : 67.35412873090016, test accuracy : 64.25184562751336

Final Statistics:
Train Accuracy - Mean: 67.19, Std: 0.24
Test Accuracy - Mean: 64.65, Std: 0.32
Train loss - Mean: 1.38, Std: 0.01
Test loss - Mean: 1.49, Std: 0.02


In [8]:
# Per-sample SGD with ReLU hidden layers and a smaller lr of 0.001.
# The batch_size argument (64) is not used on the 'sgd' path.
mlp_executive([256, 128], 'relu', 'sgd', 0.001, 64)

fold : 1, train accuracy : 66.21352675531107, test accuracy : 64.34792843691149
fold : 2, train accuracy : 66.9696208555867, test accuracy : 65.20507524343464
fold : 3, train accuracy : 68.03991541104944, test accuracy : 65.97883285047004
fold : 4, train accuracy : 66.91023930020216, test accuracy : 64.75851851851851
fold : 5, train accuracy : 67.10261237161069, test accuracy : 64.22377954626441
fold : 6, train accuracy : 66.56848695376529, test accuracy : 64.07495538370019
fold : 7, train accuracy : 65.89222521523266, test accuracy : 63.05683105730813
fold : 8, train accuracy : 67.3045062408829, test accuracy : 65.00717188620607
fold : 9, train accuracy : 67.51047612762729, test accuracy : 65.12756018684873
fold : 10, train accuracy : 67.07505343994933, test accuracy : 64.90006602244763

Final Statistics:
Train Accuracy - Mean: 66.96, Std: 0.59
Test Accuracy - Mean: 64.67, Std: 0.75
Train loss - Mean: 1.27, Std: 0.02
Test loss - Mean: 1.40, Std: 0.04


In [9]:
# Per-sample SGD with sigmoid hidden layers and a smaller lr of 0.001.
# The batch_size argument (64) is not used on the 'sgd' path.
mlp_executive([256, 128], 'sigmoid', 'sgd', 0.001, 64)

fold : 1, train accuracy : 29.581264339696244, test accuracy : 29.384416195856872
fold : 2, train accuracy : 30.2925545978531, test accuracy : 31.03570374741812


KeyboardInterrupt: 

In [10]:
# Per-sample SGD with tanh hidden layers and a smaller lr of 0.001.
# The batch_size argument (64) is not used on the 'sgd' path.
mlp_executive([256, 128], 'tanh', 'sgd', 0.001, 64)

fold : 1, train accuracy : 60.74873876792669, test accuracy : 59.53389830508474


: 

Mean and Standard Deviation of Accuracy:

Mean accuracy: This represents the average accuracy across all 10 folds. It gives you a general idea of how well the model performs across different subsets of data. A higher mean accuracy suggests that, on average, your model performs well across all folds.

Standard deviation: This shows how much the model's accuracy varies between the different folds. If the standard deviation is low, the model's performance is consistent across the different train-test splits. If the standard deviation is high, the model's performance fluctuates significantly from fold to fold, which indicates that the result depends heavily on which samples happen to fall into the training and test sets, and that the model might not generalize well to new data.
Impact of High vs. Low Standard Deviation:

High standard deviation: A high standard deviation in accuracy across the 10 folds suggests that the model’s performance is inconsistent. This could indicate that the model is highly sensitive to specific data splits, and might not generalize well to new, unseen data. For example, if the model performs very well on some folds but poorly on others, its generalization ability may be questionable.
Low standard deviation: A low standard deviation in accuracy suggests that the model is stable, meaning its performance is consistent across different data splits. This is a good sign of the model's generalization ability, as it implies that the model is not overfitting to specific parts of the data.
Choosing Between Two Configurations:

Suppose one configuration has a slightly higher mean accuracy but a significantly higher standard deviation, while another has a marginally lower mean accuracy and a lower standard deviation. In that case it is generally better to choose the configuration with the lower standard deviation, even though its mean accuracy is slightly lower.
This is because model consistency is important for generalization. A configuration with high variance (high standard deviation) suggests that the model's performance might be sensitive to the specific data split, which could lead to poor real-world performance. A model with a lower standard deviation implies that it is stable and less likely to be affected by specific data variations, which is desirable for robustness and reliability.