[Repo Link](https://github.com/habibaelghazouly/ML-projects.git)

## Imports

In [None]:
import os
import sys
sys.path.append("..")
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import time

from src import preprocess_mnist
from src import NNModel
from src import train_model_nn
from src import plot_training_curves
from src import detect_convergence, plot_convergence 
from src.NNs.helpers import get_gradients
from src import LogisticRegressionModel
from src import train_model, test_model
from src import plot_curves, print_confusion_matrix
from src import SoftmaxRegressionModel
from src import CNNModel
from src import train_model_cnn


## Displaying Non-Flattened MNIST Images

In [None]:
# Load MNIST without flattening so each sample can be drawn as an image.
train_loader, val_loader, test_loader = preprocess_mnist(flatten=False)

# Preview the first eight samples of one training batch, titled by label.
images, labels = next(iter(train_loader))
fig, axes = plt.subplots(nrows=1, ncols=8, figsize=(12, 2))
for i, ax in enumerate(axes):
    ax.imshow(images[i].squeeze(), cmap='gray')
    ax.set_title(str(labels[i].item()))
    ax.axis('off')
plt.show()


## Displaying Flattened MNIST Data Information

In [None]:
# Rebuild the loaders with flatten=True and inspect one training batch.
train_loader, val_loader, test_loader = preprocess_mnist(
    batch_size=64,
    augment=False,
    flatten=True,
)

images, labels = next(iter(train_loader))
# Same three report lines as before, driven by a small (caption, value) table.
for caption, value in (
    ("Images batch shape", images.shape),
    ("Labels batch shape", labels.shape),
    ("Example labels", labels[:10]),
):
    print(f"{caption}: {value}")

In [None]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## Binary Classification (0,1)

In [None]:
# Keep only the samples of the requested classes (digits 0 and 1 by default).
def filter_binary(loader, classes=(0, 1), batch_size=64, shuffle=True):
    """Build a DataLoader restricted to the samples of the given classes.

    Walks ``loader.dataset`` sample by sample, keeps every ``(image, label)``
    pair whose label is in ``classes`` and wraps the survivors in a
    ``TensorDataset``.

    Args:
        loader: Object exposing a ``dataset`` attribute that yields
            ``(image_tensor, label)`` pairs. Labels may be Python ints or
            0-d integer tensors.
        classes: Iterable of integer labels to keep. Defaults to ``(0, 1)``.
        batch_size: Batch size of the returned loader. Defaults to 64.
        shuffle: Whether the returned loader shuffles its samples.
            Defaults to True.

    Returns:
        A ``torch.utils.data.DataLoader`` over the filtered ``(X, y)``
        tensors, where ``y`` holds the integer labels.

    Raises:
        ValueError: If no sample of ``loader.dataset`` belongs to ``classes``.
    """
    wanted = {int(c) for c in classes}
    kept_images, kept_labels = [], []
    for img, label in loader.dataset:
        label = int(label)  # treats plain ints and 0-d tensors the same way
        if label in wanted:
            kept_images.append(img)
            kept_labels.append(label)

    # torch.stack() on an empty list fails with an opaque message; fail clearly.
    if not kept_images:
        raise ValueError(f"no samples with labels in {sorted(wanted)} were found")

    X = torch.stack(kept_images)
    y = torch.tensor(kept_labels)
    ds = torch.utils.data.TensorDataset(X, y)
    return torch.utils.data.DataLoader(ds, batch_size=batch_size, shuffle=shuffle)

# Derive the 0-vs-1 subset of each split, in train / val / test order.
train_loader_bin, val_loader_bin, test_loader_bin = (
    filter_binary(split) for split in (train_loader, val_loader, test_loader)
)

## Logistic Regression Model

In [None]:
# Binary cross-entropy loss, paired with a logistic model over 784 inputs.
# NOTE(review): nn.BCELoss expects probabilities in [0, 1]; presumably the
# model applies a sigmoid internally — confirm in src.
loss_fn = nn.BCELoss()
model_log = LogisticRegressionModel(input_dim=784)

## Train

In [None]:

# Wall-clock the binary logistic-regression training run.
start_time = time.time()
logistic_history = train_model(
    model_log,
    train_loader_bin,
    val_loader_bin,
    epochs=30,
    lr=0.01,
    device=device,
    loss_fn=loss_fn,
    binary=True,
)
end_time = time.time()

# The trainer hands back four per-epoch series; unpack them for plotting.
train_losses, val_losses, train_accs, val_accs = logistic_history
training_time_logistic = end_time - start_time
print(f"Training time for Logistic Regression Model: {training_time_logistic:.2f} seconds")

## Plots


In [None]:
# One figure per metric: loss first, then accuracy.
for train_series, val_series, metric in (
    (train_losses, val_losses, "Loss"),
    (train_accs, val_accs, "Accuracy"),
):
    plot_curves(train_series, val_series, f"Binary Logistic Regression - {metric}", metric)

## Test

In [None]:
# Evaluate the binary model on the filtered test split and show its errors.
logistic_eval = test_model(model_log, test_loader_bin, device, binary=True)
acc_logistic, cm = logistic_eval
print(f"Test Accuracy: {acc_logistic*100:.3f}%")
print_confusion_matrix(cm, classes=list("01"))

## Softmax Regression

In [None]:
# Multi-class cross-entropy loss, paired with a 784 -> 10 softmax model.
loss_fn = nn.CrossEntropyLoss()
model_softmax = SoftmaxRegressionModel(input_dim=784, num_classes=10)

In [None]:
# training time
# Build the loaders BEFORE starting the clock: the original timed region also
# covered preprocess_mnist(), so the softmax figure included data preparation
# while the logistic-regression figure did not.
train_loader, val_loader, test_loader = preprocess_mnist(flatten=True)

start_time = time.time()
train_losses, val_losses, train_accs, val_accs = train_model(
    model_softmax, train_loader, val_loader,
    epochs=10, lr=0.01, device=device, loss_fn=loss_fn
)
end_time = time.time()

# Pure optimisation time, directly comparable with training_time_logistic.
training_time_softmax = end_time - start_time
print(f"Training time for Softmax Model: {training_time_softmax:.2f} seconds")

In [None]:
# plot: one figure per metric, loss first, then accuracy
for train_series, val_series, metric in (
    (train_losses, val_losses, "Loss"),
    (train_accs, val_accs, "Accuracy"),
):
    plot_curves(train_series, val_series, f"Softmax Regression - {metric}", metric)

In [None]:
# test
acc_softmax, cm = test_model(model_softmax, test_loader, device)
print(f"Test Accuracy: {acc_softmax*100:.3f}%")
print_confusion_matrix(cm, classes=[str(i) for i in range(10)])

# per class accuracy
class_correct = [0 for _ in range(10)]
class_total = [0 for _ in range(10)]
model_softmax.eval()
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model_softmax(images)
        _, predicted = torch.max(outputs, 1)
        # Deliberately no .squeeze(): on a final batch holding a single sample
        # it would collapse the mask to a 0-d tensor and make indexing fail.
        c = predicted == labels
        # One host transfer per batch instead of one per element.
        for label, hit in zip(labels.tolist(), c.tolist()):
            class_correct[label] += int(hit)
            class_total[label] += 1
for i in range(10):
    if class_total[i] > 0:
        print(f"Accuracy of class {i}: {100 * class_correct[i] / class_total[i]:.2f}%")
    else:
        print(f"Accuracy of class {i}: N/A (no samples)")

## Neural Network Model

In [None]:
# Model, loss, optimizer
# Default-sized NNModel on the selected device; model._init_weights is applied
# through nn.Module.apply, i.e. to every sub-module and to the model itself.
model = NNModel().to(device)
model.apply(model._init_weights)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

epochs = 10

# Create the checkpoint directory up front; exist_ok makes re-runs harmless.
checkpoint_path = "./checkpoints/mnist.pth"
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# Rebinds the global loaders (batch_size=64, augment=False, flatten=True).
train_loader, val_loader, test_loader = preprocess_mnist(batch_size=64, augment=False, flatten=True)

# Train
# `history` is read below through its "train_loss_mean" / "val_loss_mean" keys.
# NOTE(review): presumably train_model_nn saves to checkpoint_path — confirm in src.
history = train_model_nn(model, train_loader, val_loader, criterion, optimizer, epochs=epochs, device=device, checkpoint_path=checkpoint_path)

# Plot
plot_training_curves(history)
# Convergence epoch is derived from the validation-loss series only; the
# criterion itself is defined in src.
conv_epoch = detect_convergence(history["val_loss_mean"])
plot_convergence(history["train_loss_mean"], history["val_loss_mean"], conv_epoch)


# Analysis


### 1. Learning Rate Analysis

In [None]:
# Test values : [0.001, 0.01, 0.1, 1.0]
learning_rates = [0.001, 0.01, 0.1, 1.0]
epochs = 5

# One independent run per learning rate: a freshly initialised model trained
# with plain SGD on whatever the global train_loader / val_loader currently are.
results_lr = {}
for lr in learning_rates:
    print(f"Training with learning rate: {lr}")
    model = NNModel().to(device)
    model.apply(model._init_weights)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    result = train_model_nn(model, train_loader, val_loader, criterion, optimizer, epochs=epochs, device=device)
    results_lr[lr] = result
    plot_training_curves(result)
    conv_epoch = detect_convergence(result["val_loss_mean"])
    plot_convergence(result["train_loss_mean"], result["val_loss_mean"], conv_epoch)

# Pick the learning rate with the highest last-epoch validation accuracy.
# The comparison is strict, so on a tie the earlier learning rate is kept.
# NOTE: the loop variable `history` rebinds the global of the same name, and
# `best_acc` is a global that later cells overwrite as well.
best_lr = None
best_acc = 0.0

for lr, history in results_lr.items():
    val_acc = history["val_acc_mean"][-1]
    print(f"LR {lr:<5} → Final Val Acc: {val_acc*100:.2f}%")
    if val_acc > best_acc:
        best_acc = val_acc
        best_lr = lr

print(f"\nBest Learning Rate: {best_lr} with Val Acc = {best_acc*100:.2f}%")


### 2. Batch Size Analysis


In [None]:
# Batch-size sweep with a gradient-noise estimate.
#
# Gradient noise is the distance between a mini-batch gradient and a reference
# ("full-batch") gradient evaluated at the SAME parameters. The earlier version
# took the reference gradient from a separately trained model and from a single
# 64-sample batch, so the reported number mostly measured the difference
# between two parameter sets. Here both gradients come from the model that was
# just trained, and the reference uses a large fixed sample. The separate
# baseline training run that only fed the old reference gradient is gone.

# Fixed reference sample standing in for the full batch (capped to bound memory).
reference_size = min(len(train_loader.dataset), 8192)
full_batch_images, full_batch_labels = next(
    iter(DataLoader(train_loader.dataset, batch_size=reference_size, shuffle=False))
)
# Flatten exactly like the stochastic batch below, then move to the device.
full_batch_images = full_batch_images.view(full_batch_images.size(0), -1).to(device)
full_batch_labels = full_batch_labels.to(device)

# Test Values : [16, 32, 64, 128]
batch_sizes = [16, 32, 64, 128]
epochs = 5
results_bs = {}
grad_noise_results = []

for bs in batch_sizes:
    print(f"Training with batch size: {bs}")
    train_loader_bs, val_loader_bs, test_loader_bs = preprocess_mnist(batch_size=bs, augment=False, flatten=True)

    # model, loss, optimizer
    model = NNModel().to(device)
    model.apply(model._init_weights)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    start_time = time.time()
    result = train_model_nn(model, train_loader_bs, val_loader_bs, criterion, optimizer, epochs=epochs, device=device)
    train_time = time.time() - start_time
    results_bs[bs] = result

    final_val_acc = result["val_acc_mean"][-1]

    print(f"----- Final Val Acc: {final_val_acc*100:.2f}% | Train Time: {train_time:.2f}s -----")

    # Reference gradient at the current parameters. It is cloned because it is
    # not visible here whether get_gradients returns copies or live .grad
    # tensors that the next call would overwrite.
    full_batch_gradients = [
        g.detach().clone()
        for g in get_gradients(model, criterion, full_batch_images, full_batch_labels)
    ]

    # Random stochastic batch of the size under test
    batch_inputs, batch_targets = next(iter(DataLoader(train_loader_bs.dataset, batch_size=bs, shuffle=True)))
    batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)

    stoch_grads = get_gradients(model, criterion, batch_inputs.view(batch_inputs.size(0), -1), batch_targets)

    # Gradient noise: L2 norm of (mini-batch gradient - reference gradient),
    # taken over all parameter tensors. A single random batch is used, so the
    # value is itself a noisy estimate.
    grad_noise = [sg - tg for sg, tg in zip(stoch_grads, full_batch_gradients)]
    noise_norm = torch.sqrt(sum(g.pow(2).sum() for g in grad_noise)).item()

    grad_noise_results.append((bs, final_val_acc * 100, train_time, noise_norm))
    print(f"----- Gradient Noise : {noise_norm:.6f} -----")


summary_df = pd.DataFrame(grad_noise_results, columns=["Batch Size", "Val Accuracy (%)", "Train Time (s)", "Gradient Noise"])
display(summary_df)


# best batch size: highest last-epoch validation accuracy, ties keep the earlier one
best_bs = None
best_acc = 0.0
for bs, history in results_bs.items():
    val_acc = history["val_acc_mean"][-1]
    print(f"BS {bs:<5} → Final Val Acc: {val_acc*100:.2f}%")
    if val_acc > best_acc:
        best_acc = val_acc
        best_bs = bs


#  visualization
plt.figure(figsize=(6,4))
plt.plot(summary_df["Batch Size"], summary_df["Gradient Noise"], marker='o')
plt.title("Gradient Noise vs Batch Size (MNIST)")
plt.xlabel("Batch Size")
plt.ylabel("Gradient Noise")
plt.grid(True)
plt.show()

print(f"\nBest Batch Size: {best_bs} with Val Acc = {best_acc*100:.2f}%")


Batch size and gradient noise are inversely related.
As the batch size increases, the noise in the gradient estimates decreases, which leads to smoother training curves and more stable convergence.


### 3. Architecture Analysis

In [None]:
# Grid over depth (number of hidden layers) and width (neurons per layer).
layers = [2, 3, 4, 5]
neurons_per_layer = [64, 128, 256, 512]
# Short runs: every one of the 16 configurations is trained for 2 epochs only.
epochs = 2
results_arch = {}

for num_layers in layers:
    for neurons in neurons_per_layer:
        print(f"Training with {num_layers} layers and {neurons} neurons per layer")

        # Same width for every hidden layer: [neurons] repeated num_layers times.
        model = NNModel(hidden_sizes=[neurons]*num_layers).to(device)
        model.apply(model._init_weights)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
        
        # Uses whatever the global train_loader / val_loader currently are.
        result = train_model_nn(model, train_loader, val_loader, criterion, optimizer, epochs=epochs, device=device)
      
        results_arch[(num_layers, neurons)] = result

# One row per configuration, holding its last-epoch validation accuracy in percent.
arch_df = pd.DataFrame(
    [(num_layers, neurons, res["val_acc_mean"][-1] * 100,) 
     for (num_layers, neurons), res in results_arch.items()],
    columns=["Num Layers", "Neurons per Layer", "Val Accuracy (%)"]
)
display(arch_df)            

# Row with the highest validation accuracy; best_row is read again when the
# final model is built.
best_row = arch_df.loc[arch_df["Val Accuracy (%)"].idxmax()]
print(f"\nBest Architecture: {int(best_row['Num Layers'])} layers × {int(best_row['Neurons per Layer'])} neurons")
print(f"Validation Accuracy: {best_row['Val Accuracy (%)']:.2f}%")

## Best NN Model

In [None]:
# Best Architecture Model Training

# Combine the winners of the three sweeps: best_lr, best_bs and the best
# (depth, width) row. All three globals must already exist when this runs.
num_layers = int(best_row["Num Layers"])
neurons = int(best_row["Neurons per Layer"])
hidden_sizes = [neurons] * num_layers

# Dedicated loaders built with the selected batch size.
train_loader_best, val_loader_best, test_loader_best = preprocess_mnist(batch_size=best_bs, augment=False, flatten=True)

# Fresh model and optimizer; nothing is reused from the sweep runs.
best_model = NNModel(hidden_sizes=hidden_sizes).to(device)
best_model.apply(best_model._init_weights)
best_criterion = nn.CrossEntropyLoss()
best_optimizer = torch.optim.SGD(best_model.parameters(), lr=best_lr)

print(f"Best NN model: {num_layers} layers × {neurons} neurons, lr={best_lr}, batch_size={best_bs}")


### Training the best NN Model


In [None]:
# train best_model
train_time_start = time.time()
# Keep the returned history: it is the only record of how THIS run performed.
best_history = train_model_nn(best_model, train_loader_best, val_loader_best, best_criterion, best_optimizer, epochs=5, device=device)
train_time_end = time.time()
train_time_best_nn = train_time_end - train_time_start

# `best_acc` still held the winner of the batch-size sweep, which came from a
# different model. Rebind it to the last-epoch validation accuracy of the model
# trained above so that this message and later readers of `best_acc` describe
# best_model.
best_acc = best_history["val_acc_mean"][-1]
print(f"Training time for Best NN Model: {train_time_best_nn:.2f} seconds with accuracy {best_acc*100:.2f}%")

## Model Comparison


In [None]:
# The logistic and softmax rows are TEST accuracies, so measure the tuned
# network on its test split as well instead of reusing `best_acc`, which is a
# validation figure.
nn_test_acc, _ = test_model(best_model, test_loader_best, device)

# Create summary DataFrame
# NOTE: the logistic row is the binary 0-vs-1 task, the other two are 10-class.
results_df = pd.DataFrame({
    "Model": ["Logistic Regression", "Softmax Regression", "Neural Network"],
    "Accuracy (%)": [acc_logistic * 100, acc_softmax * 100, nn_test_acc * 100],
    "Training Time (s)": [training_time_logistic, training_time_softmax, train_time_best_nn]
})

# Round for neat display
results_df["Accuracy (%)"] = results_df["Accuracy (%)"].round(2)
results_df["Training Time (s)"] = results_df["Training Time (s)"].round(2)

# Display
display(results_df)

# Bar plots: accuracy on the left, training time on the right
fig, ax1 = plt.subplots(1, 2, figsize=(12, 4))

# Accuracy plot
ax1[0].bar(results_df["Model"], results_df["Accuracy (%)"], color="steelblue")
ax1[0].set_title("Model Accuracy Comparison")
ax1[0].set_ylabel("Accuracy (%)")

# Time plot
ax1[1].bar(results_df["Model"], results_df["Training Time (s)"], color="steelblue")
ax1[1].set_title("Model Training Time Comparison")
ax1[1].set_ylabel("Training Time (seconds)")

plt.tight_layout()
plt.show()


##### **Computational Complexity & Training Time**
- **Logistic Regression:** Fastest and simplest model. It trains quickly since it only involves one linear layer.  
- **Softmax Regression:** Slightly heavier than logistic, but still efficient and quick to converge.  
- **Neural Network:** The most computationally complex model: its additional layers and parameters make training slower, but they also give it more capacity.



##### **When to Use Each**
- **Logistic Regression:** Best for simple, linearly separable problems or when speed and interpretability matter most.  
- **Softmax Regression:** Great for multi-class problems with mostly linear relationships.  
- **Neural Network:** Ideal for complex, non-linear data such as images; use it when accuracy matters more than speed.



## Best Model Evaluation

In [None]:
# test best nn model
best_nn_acc, cm = test_model(best_model, test_loader_best, device)
print(f"Best NN Model Test Accuracy: {best_nn_acc*100:.3f}%")
print_confusion_matrix(cm, classes=[str(i) for i in range(10)])

# Analysis of misclassifications
# Each entry is an (image, true label, predicted label) triple on the CPU.
misclassified_indices = []
best_model.eval()      # inference mode for layers such as dropout / batch norm
with torch.no_grad():  # no autograd graph is needed just to read predictions
    for images, labels in test_loader_best:
        images, labels = images.to(device), labels.to(device)
        outputs = best_model(images)
        _, predicted = torch.max(outputs, 1)
        # One host transfer per batch instead of one comparison per element.
        mismatch = (predicted != labels).tolist()
        for i, is_wrong in enumerate(mismatch):
            if is_wrong:
                misclassified_indices.append((images[i].cpu(), labels[i].cpu(), predicted[i].cpu()))

# Display some misclassified samples
num_to_display = 8
fig, axes = plt.subplots(1, num_to_display, figsize=(15, 3))
for i, ax in enumerate(axes):
    if i < len(misclassified_indices):
        img, true_label, pred_label = misclassified_indices[i]
        ax.imshow(img.view(28, 28), cmap='gray')
        ax.set_title(f"T: {true_label.item()} P: {pred_label.item()}")
    # Switch the frame off on every panel, including ones left empty when
    # fewer than num_to_display errors exist.
    ax.axis('off')

plt.show()

# CNN


In [None]:
# 10-class CNN trained with plain SGD and cross-entropy loss.
model_cnn = CNNModel(num_classes=10).to(device)
criterion = nn.CrossEntropyLoss()   
optimizer = torch.optim.SGD(model_cnn.parameters(), lr=0.01)

# Rebinds the global loaders: flatten=False and augment=True are requested
# here, unlike the fully connected experiments that used flatten=True.
train_loader, val_loader, test_loader = preprocess_mnist(batch_size=64, augment=True, flatten=False)
epochs = 10
    
# The return value of train_model_cnn is not captured in this cell.
train_model_cnn(
    model=model_cnn,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=epochs,
    device=device
)

## CNN Evaluation 

In [None]:
# Top-1 accuracy of the CNN over the whole test split, as a percentage.
model_cnn.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model_cnn(images)
        # Index of the largest score along the class dimension.
        _, predicted = outputs.data.max(1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

acc_cnn = 100 * correct / total
print(f'Test Accuracy: {acc_cnn:.2f}%')

## CNN vs Fully-connected Network

In [None]:
# performance comparison
# best_nn_acc is a fraction and is scaled to percent; acc_cnn already is one.
models = ['Best NN Model', 'CNN Model']
accuracies = [best_nn_acc * 100, acc_cnn]

# summary table: column name -> column values, in display order
summary_df = pd.DataFrame(dict(zip(("Model", "Accuracy (%)"), (models, accuracies))))

display(summary_df)
