# **Multiclass Neural Network for Tifinagh Character Recognition**

This notebook implements a multilayer perceptron (MLP) to classify handwritten Tifinagh characters from the AMHCD dataset (28,182 images, 33 classes). It includes data preprocessing, model training, hyperparameter tuning, cross-validation, and evaluation, with bonus features: L2 regularization, Adam optimizer, K-fold cross-validation, and data augmentation.

## **1.Imports and Setup**
Import required libraries and set random seed for reproducibility.

In [1]:
!pip install opencv-python

Collecting opencv-python
  Downloading opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Downloading opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (63.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.0/63.0 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: opencv-python
Successfully installed opencv-python-4.11.0.86


In [2]:
import os
import cv2
import sys
import time
import json
import shutil
import random
import zipfile
import itertools
import kagglehub
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
from tqdm import trange, tqdm
from google.colab import files
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report

# Set random seed for reproducibility
np.random.seed(42)

## **2.Activation Functions**
Define ReLU and Softmax activations for the MLP.

In [3]:
# Activation Functions
def relu(x):
    """ReLU activation: max(0, x)"""
    assert isinstance(x, np.ndarray), "Input to ReLU must be a numpy array"
    result = np.maximum(0, x)
    assert np.all(result >= 0), "ReLU output must be non-negative"
    return result

def relu_derivative(x):
    """Derivative of ReLU: 1 if x > 0, else 0"""
    assert isinstance(x, np.ndarray), "Input to ReLU derivative must be a numpy array"
    result = np.where(x > 0, 1, 0)
    assert np.all((result == 0) | (result == 1)), "ReLU derivative must be 0 or 1"
    return result

def softmax(x):
    """Softmax activation: exp(x) / sum(exp(x))"""
    assert isinstance(x, np.ndarray), "Input to softmax must be a numpy array"
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    result = exp_x / np.sum(exp_x, axis=1, keepdims=True)
    assert np.all((result >= 0) & (result <= 1)), "Softmax output must be in [0,1]"
    assert np.allclose(np.sum(result, axis=1), 1), "Softmax output must sum to 1"
    return result


## **3.Data Augmentation and Preprocessing**
Functions for augmenting images (rotation, translation) and preprocessing (grayscale, resize, normalize).

In [4]:
# Data Augmentation
def augment_image(image, max_rotation=10, max_translation=3):
    """Apply random rotation and translation to a 32x32 grayscale image"""
    assert image.shape == (32, 32), "Image must be 32x32"
    angle = np.random.uniform(-max_rotation, max_rotation)
    M = cv2.getRotationMatrix2D((16, 16), angle, 1.0)
    rotated = cv2.warpAffine(image, M, (32, 32), borderMode=cv2.BORDER_REPLICATE)
    tx = np.random.uniform(-max_translation, max_translation)
    ty = np.random.uniform(-max_translation, max_translation)
    M = np.float32([[1, 0, tx], [0, 1, ty]])
    translated = cv2.warpAffine(rotated, M, (32, 32), borderMode=cv2.BORDER_REPLICATE)
    return translated.flatten()

# Image Preprocessing
def load_and_preprocess_image(image_path, data_dir, target_size=(32, 32)):
    """Load and preprocess image: grayscale, resize, normalize"""
    full_path = os.path.join(data_dir, image_path)
    assert os.path.exists(full_path), f"Image not found: {full_path}"
    img = cv2.imread(full_path, cv2.IMREAD_GRAYSCALE)
    assert img is not None, f"Failed to load image: {full_path}"
    img = cv2.resize(img, target_size)
    img = img.astype(np.float32) / 255.0
    return img.flatten()


## **4.Visualization**
Function to display sample images per Tifinagh class.

In [5]:
# Visualization
def display_sample_images_from_paths(labels_df, data_dir, path_column='image_path', label_column='label',
                                     num_samples=33, samples_per_row=11, random_state=83):
    """Display a grid of sample grayscale images, one per class"""
    assert path_column in labels_df.columns, f"Missing column: {path_column}"
    assert label_column in labels_df.columns, f"Missing column: {label_column}"

    rng = np.random.default_rng(random_state)
    unique_labels = labels_df[label_column].unique()
    num_samples = min(num_samples, len(unique_labels))
    selected_indices = [rng.choice(labels_df[labels_df[label_column] == lbl].index)
                        for lbl in rng.choice(unique_labels, size=num_samples, replace=False)]
    selected_df = labels_df.loc[selected_indices].sort_values(by=label_column)

    num_rows = (num_samples + samples_per_row - 1) // samples_per_row
    fig, axes = plt.subplots(num_rows, samples_per_row, figsize=(samples_per_row * 2, num_rows * 3))
    axes = axes.ravel()

    for i, (path, label) in enumerate(zip(selected_df[path_column], selected_df[label_column])):
        try:
            img = Image.open(os.path.join(data_dir, path))
            axes[i].imshow(img, cmap='gray')
            axes[i].set_title(f"Label: {label}", fontsize=17)
            axes[i].axis('off')
        except Exception as e:
            print(f"Error loading {path}: {e}")
            axes[i].set_title("Error")
            axes[i].axis('off')

    for ax in axes[num_samples:]:
        ax.axis('off')

    plt.tight_layout()
    return fig

## **5.Neural Network Implementation**
MLP class with forward propagation, backpropagation, and training logic.

In [6]:
# Neural Network Class
class MulticlassNeuralNetwork:
    def __init__(self, layer_sizes, learning_rate=0.01, l2_lambda=0.01, optimizer='sgd', beta1=0.9, beta2=0.999, epsilon=1e-8):
        """Initialize MLP with specified architecture and parameters"""
        assert len(layer_sizes) >= 2, "At least 2 layers required"
        assert all(size > 0 for size in layer_sizes), "Layer sizes must be positive"
        self.layer_sizes = layer_sizes
        self.learning_rate = learning_rate
        self.l2_lambda = l2_lambda
        self.optimizer = optimizer.lower()
        self.beta1, self.beta2, self.epsilon = beta1, beta2, epsilon
        self.weights = []
        self.biases = []
        self.m_weights, self.v_weights = [], []
        self.m_biases, self.v_biases = [], []
        self.t = 0

        # He initialization
        np.random.seed(42)
        for i in range(len(layer_sizes) - 1):
            w = np.random.randn(layer_sizes[i], layer_sizes[i + 1]) * np.sqrt(2 / layer_sizes[i])
            b = np.zeros((1, layer_sizes[i + 1]))
            self.weights.append(w)
            self.biases.append(b)
            self.m_weights.append(np.zeros_like(w))
            self.v_weights.append(np.zeros_like(w))
            self.m_biases.append(np.zeros_like(b))
            self.v_biases.append(np.zeros_like(b))

    def forward(self, X):
        """Forward propagation"""
        self.z_values, self.activations = [], [X]
        for i in range(len(self.weights) - 1):
            Z = np.dot(self.activations[-1], self.weights[i]) + self.biases[i]
            self.z_values.append(Z)
            self.activations.append(relu(Z))
        Z = np.dot(self.activations[-1], self.weights[-1]) + self.biases[-1]
        self.z_values.append(Z)
        output = softmax(Z)
        self.activations.append(output)
        return output

    def compute_loss(self, y_true, y_pred):
        """Cross-entropy loss with L2 regularization"""
        m = y_true.shape[0]
        y_pred_clipped = np.clip(y_pred, 1e-15, 1 - 1e-15)
        cross_entropy = -np.sum(y_true * np.log(y_pred_clipped)) / m
        l2_term = (self.l2_lambda / (2 * m)) * sum(np.sum(np.square(W)) for W in self.weights)
        total_loss = cross_entropy + l2_term
        assert not np.isnan(total_loss), "Loss is NaN"
        return total_loss

    def compute_accuracy(self, y_true, y_pred):
        """Compute accuracy"""
        predictions = np.argmax(y_pred, axis=1)
        true_labels = np.argmax(y_true, axis=1)
        return np.mean(predictions == true_labels)

    def backward(self, X, y):
        """Backpropagation"""
        m = X.shape[0]
        self.d_weights = [np.zeros_like(w) for w in self.weights]
        self.d_biases = [np.zeros_like(b) for b in self.biases]
        dZ = self.activations[-1] - y
        self.d_weights[-1] = (np.dot(self.activations[-2].T, dZ) + self.l2_lambda * self.weights[-1]) / m
        self.d_biases[-1] = np.sum(dZ, axis=0, keepdims=True) / m
        dA_prev = dZ
        for i in range(len(self.weights) - 2, -1, -1):
            dA = np.dot(dA_prev, self.weights[i + 1].T)
            dZ = dA * relu_derivative(self.z_values[i])
            self.d_weights[i] = (np.dot(self.activations[i].T, dZ) + self.l2_lambda * self.weights[i]) / m
            self.d_biases[i] = np.sum(dZ, axis=0, keepdims=True) / m
            dA_prev = dZ

    def update_parameters(self):
        """Update parameters using SGD or Adam"""
        if self.optimizer == 'sgd':
            for i in range(len(self.weights)):
                self.weights[i] -= self.learning_rate * self.d_weights[i]
                self.biases[i] -= self.learning_rate * self.d_biases[i]
        elif self.optimizer == 'adam':
            self.t += 1
            for i in range(len(self.weights)):
                self.m_weights[i] = self.beta1 * self.m_weights[i] + (1 - self.beta1) * self.d_weights[i]
                self.v_weights[i] = self.beta2 * self.v_weights[i] + (1 - self.beta2) * (self.d_weights[i] ** 2)
                self.m_biases[i] = self.beta1 * self.m_biases[i] + (1 - self.beta1) * self.d_biases[i]
                self.v_biases[i] = self.beta2 * self.v_biases[i] + (1 - self.beta2) * (self.d_biases[i] ** 2)
                m_w_hat = self.m_weights[i] / (1 - self.beta1 ** self.t)
                v_w_hat = self.v_weights[i] / (1 - self.beta2 ** self.t)
                m_b_hat = self.m_biases[i] / (1 - self.beta1 ** self.t)
                v_b_hat = self.v_biases[i] / (1 - self.beta2 ** self.t)
                self.weights[i] -= self.learning_rate * m_w_hat / (np.sqrt(v_w_hat) + self.epsilon)
                self.biases[i] -= self.learning_rate * m_b_hat / (np.sqrt(v_b_hat) + self.epsilon)

    def train(self, X, y, X_val, y_val, epochs=100, batch_size=100, augment=False, early_stopping=True, verbose=1):
        """Train the MLP"""
        train_losses, val_losses, train_accuracies, val_accuracies = [], [], [], []
        best_val_loss = np.inf
        best_weights = [w.copy() for w in self.weights]
        best_biases = [b.copy() for b in self.biases]
        patience, patience_counter = 10, 0

        for epoch in range(epochs):
            start_time = time.time()

            indices = np.random.permutation(X.shape[0])
            X_shuffled, y_shuffled = X[indices], y[indices]
            epoch_loss = 0
            for i in range(0, X.shape[0], batch_size):
                X_batch = X_shuffled[i:i + batch_size]
                y_batch = y_shuffled[i:i + batch_size]
                if augment:
                    X_batch = np.array([augment_image(img.reshape(32, 32)) for img in X_batch])
                outputs = self.forward(X_batch)
                epoch_loss += self.compute_loss(y_batch, outputs)
                self.backward(X_batch, y_batch)
                self.update_parameters()

            train_pred = self.forward(X)
            val_pred = self.forward(X_val)
            train_loss = epoch_loss / (X.shape[0] // batch_size)
            val_loss = self.compute_loss(y_val, val_pred)
            train_accuracy = self.compute_accuracy(y, train_pred)
            val_accuracy = self.compute_accuracy(y_val, val_pred)

            train_losses.append(train_loss)
            val_losses.append(val_loss)
            train_accuracies.append(train_accuracy)
            val_accuracies.append(val_accuracy)

            end_time = time.time()
            epoch_time = end_time - start_time

            if early_stopping:
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    best_weights = [w.copy() for w in self.weights]
                    best_biases = [b.copy() for b in self.biases]
                    patience_counter = 0
                else:
                    patience_counter += 1
                    if patience_counter >= patience:
                        if verbose >= 1:
                            print(f"Early stopping at epoch {epoch + 1}")
                        self.weights = best_weights
                        self.biases = best_biases
                        break

            log = (epoch % 10 == 0 or epoch == epochs - 1)
            if verbose == 2 and log:
                print(f"Epoch {epoch + 1:03d} | Time: {epoch_time:.2f}s | "
                      f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | "
                      f"Train Acc: {train_accuracy:.4f} | Val Acc: {val_accuracy:.4f}")
            elif verbose == 1 and log:
                print(f"\nEpoch {epoch + 1:03d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | "
                      f"Train Acc: {train_accuracy:.4f} | Val Acc: {val_accuracy:.4f}")

        return train_losses, val_losses, train_accuracies, val_accuracies



    def predict(self, X):
        """Predict class labels"""
        outputs = self.forward(X)
        return np.argmax(outputs, axis=1)


## **6.K-Fold Cross-Validation**
Function for K-fold cross-validation to evaluate model robustness.

In [7]:
# K-Fold Cross-Validation
def k_fold_cross_validation(X, y_one_hot, layer_sizes, k=5, epochs=100, batch_size=32,
                            learning_rate=0.001, l2_lambda=0.01, optimizer='sgd',
                            augment=False, early_stopping=False, verbose=1):
    assert isinstance(X, np.ndarray) and isinstance(y_one_hot, np.ndarray), "X and y_one_hot must be numpy arrays"
    assert X.shape[0] == y_one_hot.shape[0], "X and y_one_hot must have the same number of samples"
    assert isinstance(layer_sizes, list) and len(layer_sizes) >= 2, "layer_sizes must be a list with at least 2 elements"
    assert isinstance(k, int) and k > 1, "k must be an integer greater than 1"
    assert isinstance(epochs, int) and epochs > 0, "Epochs must be a positive integer"
    assert isinstance(batch_size, int) and batch_size > 0, "Batch size must be a positive integer"
    assert optimizer.lower() in ['sgd', 'adam'], "Optimizer must be 'sgd' or 'adam'"

    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    fold_train_losses, fold_val_losses = [], []
    fold_train_accuracies, fold_val_accuracies = [], []

    all_splits = list(kf.split(X))

    for fold in trange(k, desc="Cross-validation folds", disable=(verbose == 0)):
        train_idx, val_idx = all_splits[fold]

        if verbose >= 1:
            print(f"\n=========================== Fold {fold + 1}/{k} ===========================================\n")
        start_time = time.time()

        X_train, X_val = X[train_idx], X[val_idx]
        y_train_one_hot, y_val_one_hot = y_one_hot[train_idx], y_one_hot[val_idx]

        nn = MulticlassNeuralNetwork(
            layer_sizes, learning_rate=learning_rate, l2_lambda=l2_lambda, optimizer=optimizer
        )

        train_losses, val_losses, train_accuracies, val_accuracies = nn.train(
            X_train, y_train_one_hot, X_val, y_val_one_hot,
            epochs=epochs, batch_size=batch_size,
            augment=augment, early_stopping=early_stopping, verbose=verbose
        )

        fold_train_losses.append(train_losses)
        fold_val_losses.append(val_losses)
        fold_train_accuracies.append(train_accuracies)
        fold_val_accuracies.append(val_accuracies)

        final_val_accuracy = val_accuracies[-1]
        if verbose >= 2:
            y_val_pred = nn.predict(X_val)
            print(f"\n===========================Fold {fold + 1} Classification Report: =======================")
            print(classification_report(np.argmax(y_val_one_hot, axis=1), y_val_pred))

        if verbose >= 1:
            print(f"Fold {fold + 1} completed in {time.time() - start_time:.2f} seconds")
            print(f"Final validation accuracy: {final_val_accuracy:.4f}")

    avg_train_losses = np.mean(fold_train_losses, axis=0)
    avg_val_losses = np.mean(fold_val_losses, axis=0)
    avg_train_accuracies = np.mean(fold_train_accuracies, axis=0)
    avg_val_accuracies = np.mean(fold_val_accuracies, axis=0)

    if verbose >= 1:
        print(f"\n=========================== Cross-Validation Summary ===========================")
        print(f"Average final validation accuracy: {avg_val_accuracies[-1]:.4f}")
        print(f"Metrics computed over {epochs} epochs")

    return avg_train_losses, avg_val_losses, avg_train_accuracies, avg_val_accuracies

## **7.Dataset Loading and Preprocessing**
Load AMHCD dataset, preprocess images, encode labels, and split data.

### **7.1. Extracting & Loading AMHCD DataSet**

In [8]:
def cleanup(base_dir="/content/", temp_dir="temp"):
    temp_dir = os.path.join(base_dir, temp_dir)
    shutil.rmtree(temp_dir, ignore_errors=True)
    print(f"{temp_dir} Cleaned up.")

In [9]:
# Download latest version
path = kagglehub.dataset_download("benaddym/amazigh-handwritten-character-database-amhcd")

target_dir = "/content/amhcd"
cleanup(temp_dir=target_dir)
shutil.copytree(path, target_dir, dirs_exist_ok=True)

print("Path to dataset files:", target_dir)

/content/amhcd Cleaned up.
Path to dataset files: /content/amhcd


In [10]:
latin_labels_txt    = target_dir + '/labels/labels/33-common-latin-tifinagh.txt'

tifinagh_labels_txt = target_dir + '/labels/labels/sorted-33-common-tifinagh.txt'
images_root         = target_dir + '/AMHCD_64/AMHCD_64'
output_csv          = target_dir + '/rgb-labels-map.csv'

# Lire les labels Latin et les glyphes Tifinagh
with open(latin_labels_txt, 'r', encoding='utf-8') as f:
    ascii_labels = [line.strip() for line in f if line.strip()]

with open(tifinagh_labels_txt, 'r', encoding='utf-8') as f:
    tif_chars = [line.strip() for line in f if line.strip()]

#  Vérification
if len(ascii_labels) != len(tif_chars):
    print(f"Erreur : {len(ascii_labels)} labels Latin ≠ {len(tif_chars)} glyphes Tifinagh")
    sys.exit(1)



In [11]:
# Génération du CSV
with open(output_csv, 'w', encoding='utf-8', newline='') as out:
    for latin, tif in zip(ascii_labels, tif_chars):
        folder = os.path.join(images_root, latin)
        if not os.path.isdir(folder):
            print(f"Dossier introuvable, je passe : {folder}")
            continue
        for fname in sorted(os.listdir(folder)):
            # filtrer si nécessaire par extension d’image
            if not fname.lower().endswith(('.png','.jpg','.jpeg','.bmp','.tif','.tiff')):
                continue
            rel_path = f"{images_root}/{latin}/{fname}"
            # Écrire UNE ligne contenant chemin + glyphe
            out.write(f"{rel_path},{tif}\n")

print(f"Généré {output_csv} avec {sum(len(os.listdir(os.path.join(images_root,l))) for l in ascii_labels if os.path.isdir(os.path.join(images_root,l)))} entrées.")

Généré /content/amhcd/rgb-labels-map.csv avec 25740 entrées.


In [12]:
labels_df = pd.read_csv(output_csv, names=['image_path', 'label'])

In [13]:
# Debugging: Print sample paths
print("Sample image paths:")
for path in labels_df['image_path'].head(5):
    full_path = os.path.join(target_dir, path)
    print(f"{full_path} -> Exists: {os.path.exists(full_path)}")

print(f"\nLoaded {len(labels_df)} samples with {labels_df['label'].nunique ()} unique classes .")

Sample image paths:
/content/amhcd/AMHCD_64/AMHCD_64/ya/ya_1.jpeg -> Exists: True
/content/amhcd/AMHCD_64/AMHCD_64/ya/ya_10.jpeg -> Exists: True
/content/amhcd/AMHCD_64/AMHCD_64/ya/ya_100.jpeg -> Exists: True
/content/amhcd/AMHCD_64/AMHCD_64/ya/ya_101.jpeg -> Exists: True
/content/amhcd/AMHCD_64/AMHCD_64/ya/ya_102.jpeg -> Exists: True

Loaded 25740 samples with 33 unique classes .


In [14]:
class_counts = labels_df['label'].value_counts()
print("Class distribution summary:")
print(f"Min: {class_counts.min()}, Max: {class_counts.max()}, Mean: {class_counts.mean():.2f}")

Class distribution summary:
Min: 780, Max: 780, Mean: 780.00


In [15]:
print("Label encoding ...")
# Encode labels
label_encoder = LabelEncoder()
labels_df['label_encoded'] = label_encoder.fit_transform(labels_df['label'])
num_classes = len(label_encoder.classes_)

print("==================== Data with label encoded: ======================")
print(labels_df)

Label encoding ...
                                            image_path label  label_encoded
0        /content/amhcd/AMHCD_64/AMHCD_64/ya/ya_1.jpeg     ⴰ              0
1       /content/amhcd/AMHCD_64/AMHCD_64/ya/ya_10.jpeg     ⴰ              0
2      /content/amhcd/AMHCD_64/AMHCD_64/ya/ya_100.jpeg     ⴰ              0
3      /content/amhcd/AMHCD_64/AMHCD_64/ya/ya_101.jpeg     ⴰ              0
4      /content/amhcd/AMHCD_64/AMHCD_64/ya/ya_102.jpeg     ⴰ              0
...                                                ...   ...            ...
25735   /content/amhcd/AMHCD_64/AMHCD_64/yu/yu_95.jpeg     ⵓ             20
25736   /content/amhcd/AMHCD_64/AMHCD_64/yu/yu_96.jpeg     ⵓ             20
25737   /content/amhcd/AMHCD_64/AMHCD_64/yu/yu_97.jpeg     ⵓ             20
25738   /content/amhcd/AMHCD_64/AMHCD_64/yu/yu_98.jpeg     ⵓ             20
25739   /content/amhcd/AMHCD_64/AMHCD_64/yu/yu_99.jpeg     ⵓ             20

[25740 rows x 3 columns]


In [16]:
print(f"Class distribution:")
# display(labels_df['label'].value_counts())
display(labels_df[['label', 'label_encoded']].value_counts())

Class distribution:


Unnamed: 0_level_0,Unnamed: 1_level_0,count
label,label_encoded,Unnamed: 2_level_1
ⴰ,0,780
ⴱ,1,780
ⴳ,2,780
ⴳⵯ,3,780
ⴷ,4,780
ⴹ,5,780
ⴻ,6,780
ⴼ,7,780
ⴽ,8,780
ⴽⵯ,9,780


In [17]:
# Load images
X = np.array([load_and_preprocess_image(path, target_dir) for path in labels_df['image_path']])
y = labels_df['label_encoded'].values

# Verify dimensions
assert X.shape[0] == y.shape[0], "Mismatch between number of images and labels"
assert X.shape[1] == 32 * 32, f"Expected flattened image size of {32 * 32}, got {X.shape[1]}"


In [18]:
fig = display_sample_images_from_paths(labels_df, target_dir, random_state=4)
plt.savefig('sample_images1.png')
files.download('sample_images1.png')
plt.close()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [19]:
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=42)

one_hot_encoder = OneHotEncoder(sparse_output=False)
y_train_one_hot = one_hot_encoder.fit_transform(y_train.reshape(-1, 1))
y_val_one_hot = one_hot_encoder.transform(y_val.reshape(-1, 1))
y_test_one_hot = one_hot_encoder.transform(y_test.reshape(-1, 1))
print(f"Train: {X_train.shape[0]}, Val: {X_val.shape[0]}, Test: {X_test.shape[0]}")

Train: 15444, Val: 5148, Test: 5148


# **8.Hyperparameter Tuning**


In [20]:
layer_sizes = [32 * 32, 64, 32, num_classes]
param_distributions = {
    'learning_rate': [0.01, 0.001],
    'l2_lambda': [0.01, 0.001],
    'batch_size': [64, 32],
    'optimizer': ['adam', 'sgd']
}

n_iter = 16

# All possible combinations
keys = list(param_distributions.keys())
all_combinations = [
    dict(zip(keys, values))
    for values in itertools.product(*param_distributions.values())
]

# Shuffle and pick n_iter unique combinations
random.shuffle(all_combinations)
random_param_combinations  = all_combinations[:n_iter]

assert len(random_param_combinations) == len(set(tuple(sorted(p.items())) for p in random_param_combinations)), "Duplicate combinations found!"

results = []
best_val_accuracy = 0
best_params = None

In [None]:
# ===================== Random Search + CV Loop =======================

for i in trange(n_iter, desc="Random Search"):
    params = random_param_combinations[i]
    print(f"\nTrial {i + 1}/{n_iter}: {params}")
    avg_train_losses, avg_val_losses, avg_train_accuracies, avg_val_accuracies = k_fold_cross_validation(
        X_train, y_train_one_hot, layer_sizes, k=5, epochs=100, **params, augment=True, verbose=2
    )
    final_val_accuracy = avg_val_accuracies[-1]
    results.append({'trial': i + 1, 'params': params, 'final_val_accuracy': final_val_accuracy})
    if final_val_accuracy > best_val_accuracy:
        best_val_accuracy = final_val_accuracy
        best_params = params

Random Search:   0%|          | 0/16 [00:00<?, ?it/s]


Trial 1/16: {'learning_rate': 0.01, 'l2_lambda': 0.001, 'batch_size': 64, 'optimizer': 'adam'}



Cross-validation folds:   0%|          | 0/5 [00:00<?, ?it/s][A



Epoch 001 | Time: 4.63s | Train Loss: 3.6134 | Val Loss: 3.5007 | Train Acc: 0.0322 | Val Acc: 0.0227
Epoch 011 | Time: 4.50s | Train Loss: 3.5172 | Val Loss: 3.5002 | Train Acc: 0.0317 | Val Acc: 0.0246
Epoch 021 | Time: 4.70s | Train Loss: 3.5172 | Val Loss: 3.4996 | Train Acc: 0.0317 | Val Acc: 0.0246
Epoch 031 | Time: 4.80s | Train Loss: 3.5171 | Val Loss: 3.4990 | Train Acc: 0.0322 | Val Acc: 0.0227
Epoch 041 | Time: 4.90s | Train Loss: 3.5184 | Val Loss: 3.4989 | Train Acc: 0.0314 | Val Acc: 0.0259
Epoch 051 | Time: 5.20s | Train Loss: 3.5172 | Val Loss: 3.4991 | Train Acc: 0.0317 | Val Acc: 0.0246
Epoch 061 | Time: 5.29s | Train Loss: 3.5168 | Val Loss: 3.5001 | Train Acc: 0.0322 | Val Acc: 0.0227
Epoch 071 | Time: 5.10s | Train Loss: 3.5172 | Val Loss: 3.4987 | Train Acc: 0.0307 | Val Acc: 0.0288
Epoch 081 | Time: 5.70s | Train Loss: 3.5172 | Val Loss: 3.4991 | Train Acc: 0.0313 | Val Acc: 0.0262
Epoch 091 | Time: 11.60s | Train Loss: 3.5172 | Val Loss: 3.4996 | Train Acc: 0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

Cross-validation folds:  20%|██        | 1/5 [10:11<40:44, 611.16s/it][A


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        90
           1       0.00      0.00      0.00        93
           2       0.00      0.00      0.00        93
           3       0.00      0.00      0.00       101
           4       0.00      0.00      0.00        85
           5       0.00      0.00      0.00        80
           6       0.00      0.00      0.00       102
           7       0.00      0.00      0.00        83
           8       0.00      0.00      0.00        83
           9       0.00      0.00      0.00       109
          10       0.00      0.00      0.00        76
          11       0.00      0.00      0.00        95
          12       0.00      0.00      0.00        95
          13       0.00      0.00      0.00        93
          14       0.00      0.00      0.00        89
          15       0.00      0.00      0.00        90
          16       0.00      0.00      0.00        96
          17       0.00   

In [None]:
# ===================== Result Summary & Best Pick ========================
df = pd.DataFrame([
    {
        'Trial': r['trial'],
        'learning_rate': r['params']['learning_rate'],
        'l2_lambda': r['params']['l2_lambda'],
        'batch_size': r['params']['batch_size'],
        'optimizer': r['params']['optimizer'],
        'val_accuracy': r['final_val_accuracy']
    }
    for r in results
]).sort_values(by='val_accuracy', ascending=False).reset_index(drop=True)

display(df)

df.to_csv('rs_results_summary.csv', index=False)
files.download('rs_results_summary.csv')


In [None]:
# # ===================== Result Summary & Best Pick ========================
# results_df = pd.DataFrame([
#     {
#         'Trial': r['trial'],
#         'learning_rate': r['params']['learning_rate'],
#         'l2_lambda': r['params']['l2_lambda'],
#         'batch_size': r['params']['batch_size'],
#         'optimizer': r['params']['optimizer'],
#         'val_accuracy': r['final_val_accuracy']
#     }
#     for r in results
# ]).sort_values(by='val_accuracy', ascending=False).reset_index(drop=True)

# display(results_df)

# results_df.to_csv('rs_results_summary.csv', index=False)
# files.download('rs_results_summary.csv')

In [None]:
# ========== Save best parameters and best validation accuracy + layer_sizes ==========
with open("random_search_results.json", "w") as f:
    json.dump({
        "best_params": best_params,
        "best_val_accuracy": best_val_accuracy,
        "layer_sizes": layer_sizes
    }, f)

files.download("random_search_results.json")


## **Perform k-fold cross-validation with SGD & Adam**


In [None]:
# =================== Load saved random_search_results.json file ================
with open("random_search_results.json", "r") as f:
    data = json.load(f)
    best_params = data["best_params"]
    layer_sizes = data["layer_sizes"]
    best_val_accuracy = data["best_val_accuracy"]

learning_rate = best_params['learning_rate']
l2_lambda = best_params['l2_lambda']
batch_size = best_params['batch_size']
optimizer = best_params['optimizer']

### **9.1.Perform k-fold cross-validation with SGD**

In [None]:
# Perform k-fold cross-validation with SGD
print("\n========================= K-Fold Cross-Validation with SGD (Augmented): ===========================")
avg_train_losses_sgd, avg_val_losses_sgd, avg_train_accuracies_sgd, avg_val_accuracies_sgd = k_fold_cross_validation(
    X_train, y_train_one_hot, layer_sizes, k=5, epochs=100, batch_size=batch_size, learning_rate=learning_rate, l2_lambda=l2_lambda, optimizer='sgd', augment=True,
)


In [None]:
print("\nK-Fold Cross-Validation with Adam (Augmented):")
avg_train_losses_adam, avg_val_losses_adam, avg_train_accuracies_adam, avg_val_accuracies_adam = k_fold_cross_validation(
    X_train, y_train_one_hot, layer_sizes, k=5, epochs=100, batch_size=batch_size, learning_rate=learning_rate, l2_lambda=l2_lambda, optimizer='adam', augment=True
)

In [None]:
# Ploting k-fold Cross-Validation Results (with SGD & Adam)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
ax1.plot(avg_train_losses_sgd, label='Train Loss SGD')
ax1.plot(avg_val_losses_sgd, label='Val Loss SGD')
ax1.plot(avg_train_losses_adam, label='Train Loss Adam')
ax1.plot(avg_val_losses_adam, label='Val Loss Adam')
ax1.set_title('Average Loss (K-Fold)')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.legend()
ax2.plot(avg_train_accuracies_sgd, label='Train Acc SGD')
ax2.plot(avg_val_accuracies_sgd, label='Val Acc SGD')
ax2.plot(avg_train_accuracies_adam, label='Train Acc Adam')
ax2.plot(avg_val_accuracies_adam, label='Val Acc Adam')
ax2.set_title('Average Accuracy (K-Fold)')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy')
ax2.legend()
plt.tight_layout()
plt.savefig('kfold_loss_accuracy_sgd_adam.png')
files.download('kfold_loss_accuracy_sgd_adam.png')
# plt.close()

In [None]:
# =================== Load saved random_search_results.json file ================
with open("random_search_results.json", "r") as f:
    data = json.load(f)
    best_params = data["best_params"]
    layer_sizes = data["layer_sizes"]
    best_val_accuracy = data["best_val_accuracy"]

learning_rate = best_params['learning_rate']
l2_lambda = best_params['l2_lambda']
batch_size = best_params['batch_size']
optimizer = best_params['optimizer']

## **10.Train final model with Best parameters on full train+val set:**


*   Train models on combined train+val set, evaluate on test set, with and without augmentation.
*   Note: We combine `X_train` and `X_val` into `X_train_val` to maximize training data (~ 18,532 samples) for the final model,after hyperparameter tuning. A new 10% validation set is split from `X_train_val` for early stopping to avoid using `X_test`.

In [None]:
# print(f"""## **10.Train final model with Best parameters on full train+val set:**


# *   Train models on combined train+val set, evaluate on test set, with and without augmentation.
# *   Note: We combine `X_train` and `X_val` into `X_train_val` to maximize training data (~ {X_train_final.shape[0]} samples) for the final model,after hyperparameter tuning. A new 10% validation set is split from `X_train_val` for early stopping to avoid using `X_test`.""")

In [None]:

X_train_val = np.concatenate([X_train, X_val])
y_train_val_one_hot = np.concatenate([y_train_one_hot, y_val_one_hot])

# Split a new validation set from X_train_val
X_train_final, X_val_final, y_train_final_one_hot, y_val_final_one_hot = train_test_split(
    X_train_val, y_train_val_one_hot, test_size=0.1, stratify=np.argmax(y_train_val_one_hot, axis=1), random_state=42
)
print(f"Final Train: {X_train_final.shape[0]}, Final Val: {X_val_final.shape[0]}, Test: {X_test.shape[0]}")


### **Case 1:  No Augmentation**

In [None]:
# Case 1: No Augmentation
nn = MulticlassNeuralNetwork(layer_sizes, learning_rate=best_params['learning_rate'],
                             l2_lambda=best_params['l2_lambda'], optimizer=best_params['optimizer'])
train_losses, val_losses, train_accuracies, val_accuracies = nn.train(
    X_train_final, y_train_final_one_hot, X_val_final, y_val_final_one_hot, epochs=100,
    batch_size=best_params['batch_size'], augment=False, early_stopping=True, verbose=2
)


In [None]:
y_pred = nn.predict(X_test)
print("\nTest Set Classification Report (No Augmentation):")
print(classification_report(np.argmax(y_test_one_hot, axis=1), y_pred, target_names=label_encoder.classes_))

test_accuracy = np.mean(y_pred == np.argmax(y_test_one_hot, axis=1))
print(f"Test Accuracy (No Augmentation): {test_accuracy:.4f}")

In [None]:
cm = confusion_matrix(np.argmax(y_test_one_hot, axis=1), y_pred)
plt.figure(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix (No Augmentation)')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.savefig('confusion_matrix_no_aug.png')
files.download('confusion_matrix_no_aug.png')
# plt.close()


In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
ax1.plot(train_losses, label='Train Loss')
ax1.plot(val_losses, label='Validation Loss')
ax1.set_title('Loss Curve (No Augmentation)')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.legend()
ax2.plot(train_accuracies, label='Train Accuracy')
ax2.plot(val_accuracies, label='Validation Accuracy')
ax2.set_title('Accuracy Curve (No Augmentation)')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy')
ax2.legend()
plt.tight_layout()
plt.savefig('loss_accuracy_no_aug.png')
files.download('loss_accuracy_no_aug.png')
# plt.close()


### **Case 2:  With Augmentation**

In [None]:
# Case 2: With Augmentation
nn_aug = MulticlassNeuralNetwork(layer_sizes, learning_rate=best_params['learning_rate'],
                                 l2_lambda=best_params['l2_lambda'], optimizer=best_params['optimizer'])
train_losses_aug, val_losses_aug, train_accuracies_aug, val_accuracies_aug = nn_aug.train(
    X_train_final, y_train_final_one_hot, X_val_final, y_val_final_one_hot, epochs=100,
    batch_size=best_params['batch_size'], augment=True, early_stopping=True
)


In [None]:
y_pred_aug = nn_aug.predict(X_test)
print("\nTest Set Classification Report (With Augmentation):")
print(classification_report(np.argmax(y_test_one_hot, axis=1), y_pred_aug, target_names=label_encoder.classes_))

test_accuracy_aug = np.mean(y_pred_aug == np.argmax(y_test_one_hot, axis=1))
print(f"Test Accuracy (With Augmentation): {test_accuracy_aug:.4f}")

In [None]:
cm_aug = confusion_matrix(np.argmax(y_test_one_hot, axis=1), y_pred_aug)
plt.figure(figsize=(12, 10))
sns.heatmap(cm_aug, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix (With Augmentation)')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.savefig('confusion_matrix_with_aug.png')
files.download('confusion_matrix_with_aug.png')
# plt.close()


In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
ax1.plot(train_losses_aug, label='Train Loss')
ax1.plot(val_losses_aug, label='Validation Loss')
ax1.set_title('Loss Curve (With Augmentation)')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.legend()
ax2.plot(train_accuracies_aug, label='Train Accuracy')
ax2.plot(val_accuracies_aug, label='Validation Accuracy')
ax2.set_title('Accuracy Curve (With Augmentation)')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy')
ax2.legend()
plt.tight_layout()
plt.savefig('loss_accuracy_with_aug.png')
files.download('loss_accuracy_with_aug.png')
# plt.close()


## **11.Alternative: Train on X_train only, use X_val for early stopping**
This section trains the model on `X_train` (~ 16,908 samples) with `X_val` (~ 5,637 samples) for early stopping to compare against the main approach (`X_train_val` with a 10% validation split).

### **Case 1:  No Augmentation (Alternative)**

In [None]:
# Case 1: No Augmentation (Alternative)
nn_alt = MulticlassNeuralNetwork(layer_sizes, learning_rate=best_params['learning_rate'],
                                 l2_lambda=best_params['l2_lambda'], optimizer=best_params['optimizer'])
train_losses_alt, val_losses_alt, train_accuracies_alt, val_accuracies_alt = nn_alt.train(
    X_train, y_train_one_hot, X_val, y_val_one_hot, epochs=100,
    batch_size=best_params['batch_size'], augment=False, early_stopping=True, verbose=2
)

In [None]:
y_pred_alt = nn_alt.predict(X_test)
print("\nTest Set Classification Report (No Augmentation, Alternative):")
print(classification_report(np.argmax(y_test_one_hot, axis=1), y_pred_alt, target_names=label_encoder.classes_))

test_accuracy_alt = np.mean(y_pred_alt == np.argmax(y_test_one_hot, axis=1))
print(f"Test Accuracy (No Augmentation, Alternative): {test_accuracy_alt:.4f}")

In [None]:
cm_alt = confusion_matrix(np.argmax(y_test_one_hot, axis=1), y_pred_alt)
plt.figure(figsize=(12, 10))
sns.heatmap(cm_alt, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix (No Augmentation, Alternative)')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.savefig('confusion_matrix_no_aug_alt.png')
files.download('confusion_matrix_no_aug_alt.png')
# plt.close()

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
ax1.plot(train_losses_alt, label='Train Loss')
ax1.plot(val_losses_alt, label='Validation Loss')
ax1.set_title('Loss Curve (No Augmentation, Alternative)')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.legend()
ax2.plot(train_accuracies_alt, label='Train Accuracy')
ax2.plot(val_accuracies_alt, label='Validation Accuracy')
ax2.set_title('Accuracy Curve (No Augmentation, Alternative)')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy')
ax2.legend()
plt.tight_layout()
plt.savefig('loss_accuracy_no_aug_alt.png')
files.download('loss_accuracy_no_aug_alt.png')
# plt.close()

### **Case 2:  With Augmentation (Alternative)**

In [None]:
# Case 2: With Augmentation (Alternative)
nn_aug_alt = MulticlassNeuralNetwork(layer_sizes, learning_rate=best_params['learning_rate'],
                                     l2_lambda=best_params['l2_lambda'], optimizer=best_params['optimizer'])
train_losses_aug_alt, val_losses_aug_alt, train_accuracies_aug_alt, val_accuracies_aug_alt = nn_aug_alt.train(
    X_train, y_train_one_hot, X_val, y_val_one_hot, epochs=100,
    batch_size=best_params['batch_size'], augment=True, early_stopping=True
)

In [None]:
y_pred_aug_alt = nn_aug_alt.predict(X_test)
print("\nTest Set Classification Report (With Augmentation, Alternative):")
print(classification_report(np.argmax(y_test_one_hot, axis=1), y_pred_aug_alt, target_names=label_encoder.classes_))

test_accuracy_aug_alt = np.mean(y_pred_aug_alt == np.argmax(y_test_one_hot, axis=1))
print(f"Test Accuracy (With Augmentation, Alternative): {test_accuracy_aug_alt:.4f}")

In [None]:
cm_aug_alt = confusion_matrix(np.argmax(y_test_one_hot, axis=1), y_pred_aug_alt)
plt.figure(figsize=(12, 10))
sns.heatmap(cm_aug_alt, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix (With Augmentation, Alternative)')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.savefig('confusion_matrix_with_aug_alt.png')
files.download('confusion_matrix_with_aug_alt.png')
# plt.close()

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
ax1.plot(train_losses_aug_alt, label='Train Loss')
ax1.plot(val_losses_aug_alt, label='Validation Loss')
ax1.set_title('Loss Curve (With Augmentation, Alternative)')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.legend()
ax2.plot(train_accuracies_aug_alt, label='Train Accuracy')
ax2.plot(val_accuracies_aug_alt, label='Validation Accuracy')
ax2.set_title('Accuracy Curve (With Augmentation, Alternative)')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy')
ax2.legend()
plt.tight_layout()
plt.savefig('loss_accuracy_with_aug_alt.png')
files.download('loss_accuracy_with_aug_alt.png')
# plt.close()

In [None]:
# Comparison Bar Chart
fig, ax = plt.subplots(figsize=(8, 6))
approaches = ['Main (No Aug)', 'Main (With Aug)', 'Alt (No Aug)', 'Alt (With Aug)']
accuracies = [test_accuracy, test_accuracy_aug, test_accuracy_alt, test_accuracy_aug_alt]
ax.bar(approaches, accuracies, color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'])
ax.set_title('Test Accuracy Comparison: Main vs. Alternative')
ax.set_ylabel('Test Accuracy')
ax.set_ylim(0, 1)
for i, v in enumerate(accuracies):
    ax.text(i, v + 0.01, f'{v:.4f}', ha='center')
plt.tight_layout()
plt.savefig('test_accuracy_comparison.png')
files.download('test_accuracy_comparison.png')
# plt.close()

In [None]:
comparison_results = {
    'main_no_aug': test_accuracy,
    'main_with_aug': test_accuracy_aug,
    'alt_no_aug': test_accuracy_alt,
    'alt_with_aug': test_accuracy_aug_alt
}
with open('comparison_results.json', 'w') as f:
    json.dump(comparison_results, f)
files.download('comparison_results.json')

## **12. Results Summary**
- Best parameters: {'learning_rate': 0.001, 'l2_lambda': 0.001, 'batch_size': 32, 'optimizer': 'adam'}
- Best validation accuracy: 82.93%
- **Main Approach**:
  - Test accuracy (No Augmentation): 91.64%
  - Test accuracy (With Augmentation): 84.55%
- **Alternative Approach**:
  - Test accuracy (No Augmentation): 90.69%
  - Test accuracy (With Augmentation): 83.95%
- Figure: Test accuracy comparison (`test_accuracy_comparison.png`)

In [None]:
# Generate Markdown:
print(f"""## **12. Results Summary**
- Best parameters: {best_params}
- Best validation accuracy: {best_val_accuracy:.2%}
- **Main Approach**:
  - Test accuracy (No Augmentation): {test_accuracy:.2%}
  - Test accuracy (With Augmentation): {test_accuracy_aug:.2%}
- **Alternative Approach**:
  - Test accuracy (No Augmentation): {test_accuracy_alt:.2%}
  - Test accuracy (With Augmentation): {test_accuracy_aug_alt:.2%}
- Figure: Test accuracy comparison (`test_accuracy_comparison.png`)
""")


In [None]:
# with open("results_summary.md", "w") as f:
#     f.write("## 12. Results Summary\n")
#     f.write(f"- Best parameters: {best_params}\n")
#     f.write(f"- Best validation accuracy: {best_val_accuracy:.2%}\n")
#     f.write("- **Main Approach**:\n")
#     f.write(f"  - Test accuracy (No Augmentation): {test_accuracy:.2%}\n")
#     f.write(f"  - Test accuracy (With Augmentation): {test_accuracy_aug:.2%}\n")
#     f.write("- **Alternative Approach**:\n")
#     f.write(f"  - Test accuracy (No Augmentation): {test_accuracy_alt:.2%}\n")
#     f.write(f"  - Test accuracy (With Augmentation): {test_accuracy_aug_alt:.2%}\n")
#     f.write("- Figure: Test accuracy comparison (`test_accuracy_comparison.png`)\n")