<a href="https://colab.research.google.com/github/jioffe502/cs5787_a1/blob/main/cs5787_a1_sweep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/cs5787

/content/drive/MyDrive/cs5787


In [3]:
!ls

 accuracy_summary.csv
 convergence_graphs.png
 cs5787_a1.ipynb
 data
'Dropout (GELU) (lr=0.001, batch_size=32, dropout=0.3)_weights.pth'
'Dropout (GELU) (lr=0.001, batch_size=32, dropout=0.5)_weights.pth'
'Dropout (GELU) (lr=0.001, batch_size=32, dropout=0.7)_weights.pth'
'Dropout (GELU) (lr=0.001, batch_size=64, dropout=0.3)_weights.pth'
'Dropout (GELU) (lr=0.001, batch_size=64, dropout=0.5)_weights.pth'
'Dropout (GELU) (lr=0.01, batch_size=128, dropout=0.3)_weights.pth'
'Dropout (GELU) (lr=0.01, batch_size=128, dropout=0.5)_weights.pth'
'Dropout (GELU) (lr=0.01, batch_size=128, dropout=0.7)_weights.pth'
'Dropout (GELU) (lr=0.01, batch_size=32, dropout=0.3)_weights.pth'
'Dropout (GELU) (lr=0.01, batch_size=32, dropout=0.5)_weights.pth'
'Dropout (GELU) (lr=0.01, batch_size=32, dropout=0.7)_weights.pth'
'Dropout (GELU) (lr=0.01, batch_size=64, dropout=0.3)_weights.pth'
'Dropout (GELU) (lr=0.01, batch_size=64, dropout=0.5)_weights.pth'
'Dropout (GELU) (lr=0.01, batch_size=64, dropout=0.7

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from PIL import Image
import struct
import gzip

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [5]:

# Cell 2: Custom FashionMNIST Dataset
def read_idx(filename):
    with gzip.open(filename, 'rb') as f:
        zero, data_type, dims = struct.unpack('>HBB', f.read(4))
        shape = tuple(struct.unpack('>I', f.read(4))[0] for d in range(dims))
        return np.frombuffer(f.read(), dtype=np.uint8).reshape(shape)

class CustomFashionMNIST(Dataset):
    def __init__(self, root, train=True, transform=None):
        self.root = root
        self.transform = transform
        self.train = train

        if self.train:
            self.data = read_idx(f'{self.root}/train-images-idx3-ubyte.gz')
            self.targets = read_idx(f'{self.root}/train-labels-idx1-ubyte.gz')
        else:
            self.data = read_idx(f'{self.root}/test-images-idx3-ubyte.gz')
            self.targets = read_idx(f'{self.root}/test-labels-idx1-ubyte.gz')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        img, target = self.data[index], int(self.targets[index])
        img = Image.fromarray(img, mode='L')

        if self.transform is not None:
            img = self.transform(img)

        return img, target



In [6]:
# Cell 3: LeNet5 Model with GELU
class LeNet5GELU(nn.Module):
    def __init__(self, use_dropout=False, use_batch_norm=False):
        super(LeNet5GELU, self).__init__()
        self.use_dropout = use_dropout
        self.use_batch_norm = use_batch_norm

        # Convolutional layers
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5, padding=2)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5)

        # Batch normalization layers
        if self.use_batch_norm:
            self.bn1 = nn.BatchNorm2d(6)
            self.bn2 = nn.BatchNorm2d(16)

        # Fully connected layers
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

        # Dropout layer
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # First convolutional layer
        x = self.conv1(x)
        if self.use_batch_norm:
            x = self.bn1(x)
        x = nn.functional.gelu(x)
        x = nn.functional.max_pool2d(x, 2)

        # Second convolutional layer
        x = self.conv2(x)
        if self.use_batch_norm:
            x = self.bn2(x)
        x = nn.functional.gelu(x)
        x = nn.functional.max_pool2d(x, 2)

        # Flatten the output
        x = x.view(-1, 16 * 5 * 5)

        # Fully connected layers
        x = nn.functional.gelu(self.fc1(x))
        if self.use_dropout:
            x = self.dropout(x)
        x = nn.functional.gelu(self.fc2(x))
        if self.use_dropout:
            x = self.dropout(x)
        x = self.fc3(x)

        return x



In [7]:
# Cell 4: Data Loading and Preprocessing
data_path = '/content/drive/MyDrive/cs5787/data'
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

train_dataset = CustomFashionMNIST(root=data_path, train=True, transform=transform)
test_dataset = CustomFashionMNIST(root=data_path, train=False, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)



In [8]:
# Cell 5: Training and Evaluation Functions
def train_and_evaluate(model, optimizer, criterion, train_loader, test_loader, num_epochs=50, use_dropout=False):
    model.to(device)

    train_accuracies = []
    test_accuracies = []

    for epoch in range(num_epochs):
        model.train()
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # Evaluate on train set (without dropout)
        model.eval()
        if use_dropout:
            model.dropout.eval()  # Disable dropout for evaluation
        train_accuracy = evaluate_model(model, train_loader)
        train_accuracies.append(train_accuracy)

        # Evaluate on test set
        test_accuracy = evaluate_model(model, test_loader)
        test_accuracies.append(test_accuracy)

        print(f'Epoch [{epoch+1}/{num_epochs}], Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}')

    return train_accuracies, test_accuracies

def evaluate_model(model, data_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in data_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    return accuracy



In [None]:
# Cell 6: Training Configurations and Execution
configs = [
    # {"name": "No Regularization (GELU)", "dropout": False, "weight_decay": 0, "batch_norm": False},
    # {"name": "Dropout (GELU)", "dropout": True, "weight_decay": 0, "batch_norm": False},
    # {"name": "Weight Decay (GELU)", "dropout": False, "weight_decay": 1e-4, "batch_norm": False},
    {"name": "Batch Normalization (GELU)", "dropout": False, "weight_decay": 0, "batch_norm": True}
]

# Hyperparameters to experiment with
learning_rates = [0.01, 0.001, 0.0001]
batch_sizes = [32, 64, 128]
num_epochs = 20
dropout_rates = [0.3, 0.5, 0.7]
weight_decay_values = [1e-3, 1e-4, 1e-5]

results = {}

for config in configs:
    print(f"\nTraining with {config['name']}:")

    # Experiment with different learning rates and batch sizes
    for lr in learning_rates:
        for batch_size in batch_sizes:
            model = LeNet5GELU(use_dropout=config['dropout'], use_batch_norm=config['batch_norm'])
            criterion = nn.CrossEntropyLoss()

            # If using dropout, experiment with different dropout rates
            if config['dropout']:
                for dropout_rate in dropout_rates:
                    model.dropout.p = dropout_rate
                    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=config['weight_decay'])

                    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
                    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

                    train_accuracies, test_accuracies = train_and_evaluate(model, optimizer, criterion, train_loader, test_loader, num_epochs=num_epochs, use_dropout=config['dropout'])

                    key = f"{config['name']} (lr={lr}, batch_size={batch_size}, dropout={dropout_rate})"
                    results[key] = {
                        'train_accuracies': train_accuracies,
                        'test_accuracies': test_accuracies,
                    }

                    print(f"Final Test Accuracy with {key}: {test_accuracies[-1]:.4f}")

                    # Save model weights
                    torch.save(model.state_dict(), f"{key}_weights.pth")

            # If using weight decay, experiment with different values
            elif config['weight_decay'] > 0:
                for wd in weight_decay_values:
                    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

                    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
                    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

                    train_accuracies, test_accuracies = train_and_evaluate(model, optimizer, criterion, train_loader, test_loader, num_epochs=num_epochs, use_dropout=config['dropout'])

                    key = f"{config['name']} (lr={lr}, batch_size={batch_size}, weight_decay={wd})"
                    results[key] = {
                        'train_accuracies': train_accuracies,
                        'test_accuracies': test_accuracies,
                    }

                    print(f"Final Test Accuracy with {key}: {test_accuracies[-1]:.4f}")

                    # Save model weights
                    torch.save(model.state_dict(), f"{key}_weights.pth")

            # For no regularization and batch normalization
            else:
                optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=config['weight_decay'])

                train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
                test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

                train_accuracies, test_accuracies = train_and_evaluate(model, optimizer, criterion, train_loader, test_loader, num_epochs=num_epochs, use_dropout=config['dropout'])

                key = f"{config['name']} (lr={lr}, batch_size={batch_size})"
                results[key] = {
                    'train_accuracies': train_accuracies,
                    'test_accuracies': test_accuracies,
                }

                print(f"Final Test Accuracy with {key}: {test_accuracies[-1]:.4f}")

                # Save model weights
                torch.save(model.state_dict(), f"{key}_weights.pth")


Training with Weight Decay (GELU):
Epoch [1/20], Train Accuracy: 0.8679, Test Accuracy: 0.8585
Epoch [2/20], Train Accuracy: 0.8512, Test Accuracy: 0.8419
Epoch [3/20], Train Accuracy: 0.8641, Test Accuracy: 0.8530
Epoch [4/20], Train Accuracy: 0.8544, Test Accuracy: 0.8416
Epoch [5/20], Train Accuracy: 0.8599, Test Accuracy: 0.8469
Epoch [6/20], Train Accuracy: 0.8707, Test Accuracy: 0.8592
Epoch [7/20], Train Accuracy: 0.8556, Test Accuracy: 0.8460
Epoch [8/20], Train Accuracy: 0.8742, Test Accuracy: 0.8678
Epoch [9/20], Train Accuracy: 0.8533, Test Accuracy: 0.8396
Epoch [10/20], Train Accuracy: 0.8624, Test Accuracy: 0.8545
Epoch [11/20], Train Accuracy: 0.8627, Test Accuracy: 0.8524
Epoch [12/20], Train Accuracy: 0.8783, Test Accuracy: 0.8708
Epoch [13/20], Train Accuracy: 0.8668, Test Accuracy: 0.8520
Epoch [14/20], Train Accuracy: 0.8656, Test Accuracy: 0.8569
Epoch [15/20], Train Accuracy: 0.8759, Test Accuracy: 0.8654
Epoch [16/20], Train Accuracy: 0.8741, Test Accuracy: 0.86

In [None]:
# Cell 7: Plotting Convergence Graphs
plt.figure(figsize=(20, 15))
best_results = {}

for technique in ['No Regularization (GELU)', 'Dropout (GELU)', 'Weight Decay (GELU)', 'Batch Normalization (GELU)']:
    best_accuracy = 0
    best_key = ''
    for key, data in results.items():
        if key.startswith(technique) and data['test_accuracies'][-1] > best_accuracy:
            best_accuracy = data['test_accuracies'][-1]
            best_key = key
    best_results[technique] = results[best_key]

for i, (name, data) in enumerate(best_results.items(), 1):
    plt.subplot(2, 2, i)
    plt.plot(data['train_accuracies'], label='Train')
    plt.plot(data['test_accuracies'], label='Test')
    plt.title(f'{name} - Convergence Graph\nBest Config: {best_key}')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

plt.tight_layout()
plt.savefig('convergence_graphs.png')
plt.show()


In [None]:
# Cell 8: Creating Summary Tables

# Table 1: Best results for each technique
best_results = {}
for technique in ['No Regularization (GELU)', 'Dropout (GELU)', 'Weight Decay (GELU)', 'Batch Normalization (GELU)']:
    best_accuracy = 0
    best_key = ''
    for key, data in results.items():
        if key.startswith(technique) and data['test_accuracies'][-1] > best_accuracy:
            best_accuracy = data['test_accuracies'][-1]
            best_key = key
    best_results[technique] = (best_key, results[best_key])

summary_data = {
    'Technique': [],
    'Best Configuration': [],
    'Train Accuracy': [],
    'Test Accuracy': []
}

for technique, (key, data) in best_results.items():
    summary_data['Technique'].append(technique)
    summary_data['Best Configuration'].append(key)
    summary_data['Train Accuracy'].append(data['train_accuracies'][-1])
    summary_data['Test Accuracy'].append(data['test_accuracies'][-1])

summary_df = pd.DataFrame(summary_data)
print("Best Results for Each Technique:")
print(summary_df.to_string(index=False))
summary_df.to_csv('best_accuracy_summary.csv', index=False)

# Table 2: All results (for ablation study)
all_results_data = {
    'Configuration': [],
    'Train Accuracy': [],
    'Test Accuracy': []
}

for key, data in results.items():
    all_results_data['Configuration'].append(key)
    all_results_data['Train Accuracy'].append(data['train_accuracies'][-1])
    all_results_data['Test Accuracy'].append(data['test_accuracies'][-1])

all_results_df = pd.DataFrame(all_results_data)
all_results_df = all_results_df.sort_values('Test Accuracy', ascending=False)
print("\nAll Results (sorted by Test Accuracy):")
print(all_results_df.to_string(index=False))
all_results_df.to_csv('all_results_summary.csv', index=False)

In [None]:
print("Ablation Study Analysis:")

# Effect of learning rate
print("\n1. Effect of Learning Rate:")
for lr in learning_rates:
    lr_results = all_results_df[all_results_df['Configuration'].str.contains(f'lr={lr}')]
    avg_test_acc = lr_results['Test Accuracy'].mean()
    print(f"  Learning Rate {lr}: Average Test Accuracy = {avg_test_acc:.4f}")

# Effect of batch size
print("\n2. Effect of Batch Size:")
for bs in batch_sizes:
    bs_results = all_results_df[all_results_df['Configuration'].str.contains(f'batch_size={bs}')]
    avg_test_acc = bs_results['Test Accuracy'].mean()
    print(f"  Batch Size {bs}: Average Test Accuracy = {avg_test_acc:.4f}")

# Effect of dropout rate (if applicable)
if 'dropout' in all_results_df['Configuration'].iloc[0]:
    print("\n3. Effect of Dropout Rate:")
    for dr in dropout_rates:
        dr_results = all_results_df[all_results_df['Configuration'].str.contains(f'dropout={dr}')]
        avg_test_acc = dr_results['Test Accuracy'].mean()
        print(f"  Dropout Rate {dr}: Average Test Accuracy = {avg_test_acc:.4f}")

# Effect of weight decay (if applicable)
if 'weight_decay' in all_results_df['Configuration'].iloc[0]:
    print("\n4. Effect of Weight Decay:")
    for wd in weight_decay_values:
        wd_results = all_results_df[all_results_df['Configuration'].str.contains(f'weight_decay={wd}')]
        avg_test_acc = wd_results['Test Accuracy'].mean()
        print(f"  Weight Decay {wd}: Average Test Accuracy = {avg_test_acc:.4f}")


# Loading PTH files for analysis

In [9]:
import os

def load_and_evaluate_models(directory):
    results = {}
    for filename in os.listdir(directory):
        if filename.endswith('.pth'):
            model_name = filename[:-4]  # Remove .pth extension
            model = LeNet5GELU()  # Create a new model instance
            model.load_state_dict(torch.load(os.path.join(directory, filename)))
            model.to(device)
            test_accuracy = evaluate_model(model, test_loader)
            results[model_name] = test_accuracy
    return results

# Evaluate all saved models
model_directory = '/content/drive/MyDrive/cs5787'
evaluation_results = load_and_evaluate_models(model_directory)

# Print results
for model_name, accuracy in evaluation_results.items():
    print(f"{model_name}: Test Accuracy = {accuracy:.4f}")

  model.load_state_dict(torch.load(os.path.join(directory, filename)))


No Regularization (GELU) (lr=0.01, batch_size=32)_weights: Test Accuracy = 0.7715
No Regularization (GELU) (lr=0.01, batch_size=64)_weights: Test Accuracy = 0.8828
No Regularization (GELU) (lr=0.01, batch_size=128)_weights: Test Accuracy = 0.8851
No Regularization (GELU) (lr=0.001, batch_size=32)_weights: Test Accuracy = 0.9023
No Regularization (GELU) (lr=0.001, batch_size=64)_weights: Test Accuracy = 0.9031
No Regularization (GELU) (lr=0.001, batch_size=128)_weights: Test Accuracy = 0.8973
No Regularization (GELU) (lr=0.0001, batch_size=32)_weights: Test Accuracy = 0.8865
No Regularization (GELU) (lr=0.0001, batch_size=64)_weights: Test Accuracy = 0.8830
No Regularization (GELU) (lr=0.0001, batch_size=128)_weights: Test Accuracy = 0.8756
Dropout (GELU) (lr=0.01, batch_size=32, dropout=0.3)_weights: Test Accuracy = 0.1000
Dropout (GELU) (lr=0.01, batch_size=32, dropout=0.5)_weights: Test Accuracy = 0.1000
Dropout (GELU) (lr=0.01, batch_size=32, dropout=0.7)_weights: Test Accuracy = 0.

In [12]:
evaluation_results

{'No Regularization (GELU) (lr=0.01, batch_size=32)_weights': 0.7715,
 'No Regularization (GELU) (lr=0.01, batch_size=64)_weights': 0.8828,
 'No Regularization (GELU) (lr=0.01, batch_size=128)_weights': 0.8851,
 'No Regularization (GELU) (lr=0.001, batch_size=32)_weights': 0.9023,
 'No Regularization (GELU) (lr=0.001, batch_size=64)_weights': 0.9031,
 'No Regularization (GELU) (lr=0.001, batch_size=128)_weights': 0.8973,
 'No Regularization (GELU) (lr=0.0001, batch_size=32)_weights': 0.8865,
 'No Regularization (GELU) (lr=0.0001, batch_size=64)_weights': 0.883,
 'No Regularization (GELU) (lr=0.0001, batch_size=128)_weights': 0.8756,
 'Dropout (GELU) (lr=0.01, batch_size=32, dropout=0.3)_weights': 0.1,
 'Dropout (GELU) (lr=0.01, batch_size=32, dropout=0.5)_weights': 0.1,
 'Dropout (GELU) (lr=0.01, batch_size=32, dropout=0.7)_weights': 0.1,
 'Dropout (GELU) (lr=0.01, batch_size=64, dropout=0.3)_weights': 0.8678,
 'Dropout (GELU) (lr=0.01, batch_size=64, dropout=0.5)_weights': 0.8222,
 'D