Author: Joshua Yu w/ assistance from Gemini 2.5 Pro

In [1]:
import os
import zipfile
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Define paths to the train and test directories
base_dir = './data/Structured'
train_dir = os.path.join(base_dir, 'train')
test_dir = os.path.join(base_dir, 'test')

# Seed PyTorch's global RNG so that shuffling, the random augmentations and the
# initialisation of any layers created later start from the same state on every
# "Restart & Run All". (GPU kernels may still introduce small nondeterminism.)
SEED = 42
torch.manual_seed(SEED)

# Input resolution and per-channel normalization shared by both pipelines.
# These are the standard ImageNet channel statistics.
IMAGE_SIZE = (224, 224)
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Set up data transformations w/ augmentations
train_transforms = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Test transforms have no augmentation
test_transforms = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Load the datasets using ImageFolder (one sub-directory per class).
# Note: the "validation" set used throughout this notebook is the test split.
train_dataset = datasets.ImageFolder(train_dir, transform=train_transforms)
val_dataset = datasets.ImageFolder(test_dir, transform=test_transforms)

# Create data loaders; only the training loader is shuffled
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

# Get number of classes and class names (taken from the training folder layout)
num_classes = len(train_dataset.classes)
class_names = train_dataset.classes
print(f"Found {len(train_dataset)} images in the training set.")
print(f"Found {len(val_dataset)} images in the test set.")
print(f"Number of classes: {num_classes}")
print(f"Class names: {class_names}")

# Set device for training: use the GPU when available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Found 10710 images in the training set.
Found 1890 images in the test set.
Number of classes: 15
Class names: ['calling', 'clapping', 'cycling', 'dancing', 'drinking', 'eating', 'fighting', 'hugging', 'laughing', 'listening_to_music', 'running', 'sitting', 'sleeping', 'texting', 'using_laptop']
Using device: cuda


In [None]:
from tqdm.notebook import tqdm
import torch.nn as nn
from torchvision import models
import torch.optim as optim
import timm
import os
import time

# --- 1. Define Models and Hyperparameters ---

# ResNet18 setup
# `pretrained=True` is deprecated in torchvision (>= 0.13) and emits a warning.
# IMAGENET1K_V1 is the checkpoint that flag resolved to, so the starting
# weights are unchanged.
model_resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
# Adjust the final layer to match the number of classes (15)
model_resnet.fc = nn.Linear(model_resnet.fc.in_features, num_classes)
model_resnet = model_resnet.to(device)

# ViT setup
model_vit = timm.create_model('vit_base_patch16_224', pretrained=True)
# Adjust the final layer to match the number of classes (15)
model_vit.head = nn.Linear(model_vit.head.in_features, num_classes)
model_vit = model_vit.to(device)

# Define loss function and optimizers.
# The loss is shared; each model gets its own optimizer and learning rate.
criterion = nn.CrossEntropyLoss()
optimizer_resnet = optim.Adam(model_resnet.parameters(), lr=0.001)
optimizer_vit = optim.AdamW(model_vit.parameters(), lr=0.0001)


# --- 2. Training Function ---
def train_model(model, criterion, optimizer, num_epochs, train_loader, val_loader, log_file_path):
    """
    Trains a model, saves a per-epoch CSV log, and reports training time and peak GPU memory.

    Uses the notebook-level `device` for all tensors; the model is expected to
    already live on that device.

    Args:
        model: Network to train (updated in place).
        criterion: Loss callable applied as `criterion(outputs, labels)`.
        optimizer: Optimizer already bound to `model`'s parameters.
        num_epochs: Number of full passes over `train_loader`.
        train_loader: Iterable of `(inputs, labels)` training batches.
        val_loader: Iterable of `(inputs, labels)` batches scored after each epoch.
        log_file_path: CSV file receiving one `Epoch,Loss,Validation_Accuracy`
            row per epoch. The header is written only if the file does not yet
            exist, so re-running appends to an existing log.

    Returns:
        None. Progress and the final statistics are printed.
    """
    model_name = model.__class__.__name__
    if 'VisionTransformer' in model_name:
        model_name = 'ViT'

    print(f"\n--- Training Started for {model_name} ---")

    # The CUDA memory-statistics API is only valid for CUDA devices. The notebook
    # falls back to CPU when no GPU is present, so guard every such call.
    track_gpu_memory = torch.device(device).type == 'cuda'

    # Reset peak memory stats and start timer
    if track_gpu_memory:
        torch.cuda.reset_peak_memory_stats(device)
    start_time = time.time()

    # Write a header to the log file if it's new
    if not os.path.exists(log_file_path):
        with open(log_file_path, 'w') as f:
            f.write("Epoch,Loss,Validation_Accuracy\n")

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)

        for inputs, labels in progress_bar:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            batch_loss = loss.item()  # single host sync, reused for both uses below
            running_loss += batch_loss
            progress_bar.set_postfix(loss=batch_loss)

        # Validation
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                # Already under no_grad, so the legacy `.data` access is unnecessary.
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        # Mean of the per-batch losses; guard against empty loaders so an
        # empty dataset reports 0 instead of raising ZeroDivisionError.
        num_batches = len(train_loader)
        epoch_loss = running_loss / num_batches if num_batches else 0.0
        epoch_acc = 100 * correct / total if total else 0.0

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Val Accuracy: {epoch_acc:.2f}%')

        with open(log_file_path, 'a') as f:
            f.write(f"{epoch+1},{epoch_loss:.4f},{epoch_acc:.2f}\n")

    # Report training time and memory usage
    end_time = time.time()
    total_training_time = end_time - start_time
    print(f"Total Training Time: {total_training_time:.2f} seconds")
    if track_gpu_memory:
        peak_memory_mb = torch.cuda.max_memory_allocated(device) / (1024 * 1024)
        print(f"Peak GPU Memory during Training: {peak_memory_mb:.2f} MB")
    else:
        print("Peak GPU Memory during Training: N/A (not running on CUDA)")
    print(f"Training log saved to: {log_file_path}")


# --- 3. Run Training for Both Models ---

# One entry per run: (model, its optimizer, CSV log destination).
# Order matters only for the printed output: ResNet18 first, then ViT.
training_runs = (
    (model_resnet, optimizer_resnet, 'resnet18_training_log.csv'),
    (model_vit, optimizer_vit, 'vit_training_log.csv'),
)

for run_model, run_optimizer, run_log_path in training_runs:
    train_model(
        run_model,
        criterion,
        run_optimizer,
        num_epochs=5,
        train_loader=train_loader,
        val_loader=val_loader,
        log_file_path=run_log_path,
    )




--- Training Started for ResNet ---


Epoch 1/5:   0%|          | 0/335 [00:00<?, ?it/s]

Epoch [1/5], Loss: 1.7490, Val Accuracy: 49.74%


Epoch 2/5:   0%|          | 0/335 [00:00<?, ?it/s]

Epoch [2/5], Loss: 1.3707, Val Accuracy: 53.70%


Epoch 3/5:   0%|          | 0/335 [00:00<?, ?it/s]

Epoch [3/5], Loss: 1.1752, Val Accuracy: 62.28%


Epoch 4/5:   0%|          | 0/335 [00:00<?, ?it/s]

Epoch [4/5], Loss: 1.0527, Val Accuracy: 61.53%


Epoch 5/5:   0%|          | 0/335 [00:00<?, ?it/s]

Epoch [5/5], Loss: 0.9298, Val Accuracy: 60.48%
Total Training Time: 246.73 seconds
Peak GPU Memory during Training: 1187.59 MB
Training log saved to: resnet18_training_log.csv

--- Training Started for ViT ---


Epoch 1/5:   0%|          | 0/335 [00:00<?, ?it/s]

  x = F.scaled_dot_product_attention(


In [None]:


from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from fvcore.nn import FlopCountAnalysis

# --- Save models to calculate disk size ---
# Only the weights (state_dict) are written; the resulting files are what the
# evaluation below measures with os.path.getsize.
checkpoint_targets = (
    (model_resnet, 'resnet18.pth'),
    (model_vit, 'vit.pth'),
)
for trained_model, checkpoint_path in checkpoint_targets:
    torch.save(trained_model.state_dict(), checkpoint_path)


def evaluate_model_fully(model, data_loader, class_names, model_path):
    """
    Generates a comprehensive report including computational and performance metrics.

    Prints parameter count, on-disk checkpoint size, FLOPs for one 224x224 RGB
    image, inference time and peak GPU memory over `data_loader`, then a
    classification report, and finally shows a confusion-matrix heatmap.
    Uses the notebook-level `device`.

    Args:
        model: Trained network; it is switched to eval mode by this function.
        data_loader: Iterable of `(inputs, labels)` batches to evaluate on.
        class_names: Class names, ordered so that index i names label id i.
        model_path: Path of the saved checkpoint whose file size is reported.

    Returns:
        None. Everything is printed or plotted.
    """
    model_name = model.__class__.__name__
    if 'VisionTransformer' in model_name:
        model_name = 'ViT'

    print(f"\n{'='*20} COMPREHENSIVE REPORT FOR {model_name.upper()} {'='*20}")

    # CUDA-only APIs (memory stats, synchronize) must not run on the CPU fallback.
    on_cuda = torch.device(device).type == 'cuda'

    # Switch to eval mode BEFORE the FLOP trace: the trace runs a forward pass on
    # random data, which in train mode would update BatchNorm running statistics
    # (and apply dropout), altering the model being evaluated.
    model.eval()

    # --- 1. Computational Metrics ---
    print("\n--- Computational Stats ---")
    # Parameters
    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total Trainable Parameters: {total_params / 1_000_000:.2f}M")

    # Disk Size
    disk_size = os.path.getsize(model_path) / (1024 * 1024)
    print(f"Disk Size: {disk_size:.2f} MB")

    # FLOPs
    dummy_input = torch.randn(1, 3, 224, 224).to(device)
    flops = FlopCountAnalysis(model, dummy_input)
    print(f"GFLOPs (for a single image): {flops.total() / 1e9:.2f}")

    # --- 2. Inference Performance ---
    print("\n--- Inference Performance ---")
    all_preds, all_labels = [], []

    if on_cuda:
        # Drain previously queued GPU work so it is not billed to inference time.
        torch.cuda.synchronize(device)
        torch.cuda.reset_peak_memory_stats(device)
    start_time = time.time()
    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            predicted = outputs.argmax(dim=1)
            all_preds.extend(predicted.cpu().tolist())
            # Labels are only needed on the host, so skip the device round trip.
            all_labels.extend(labels.tolist())
    if on_cuda:
        # CUDA kernels are asynchronous; wait for them before stopping the clock.
        torch.cuda.synchronize(device)
    end_time = time.time()

    # Timing and Memory
    total_inference_time = end_time - start_time
    num_images = len(all_labels)  # images actually scored
    print(f"Total Inference Time: {total_inference_time:.2f} seconds")
    if num_images:
        print(f"Average Time per Image: {total_inference_time / num_images * 1000:.2f} ms")
    if on_cuda:
        peak_inference_memory_mb = torch.cuda.max_memory_allocated(device) / (1024 * 1024)
        print(f"Peak GPU Memory during Inference: {peak_inference_memory_mb:.2f} MB")
    else:
        print("Peak GPU Memory during Inference: N/A (not running on CUDA)")

    if not num_images:
        print("No samples were evaluated; skipping accuracy metrics.")
        return

    # --- 3. Model Accuracy Metrics ---
    # Pass the full label-id list explicitly. Otherwise sklearn infers the label
    # set from the data, and a class absent from both labels and predictions
    # would make `target_names` / the heatmap tick labels mismatch.
    label_ids = list(range(len(class_names)))

    print("\n--- Accuracy Metrics ---")
    print("\nClassification Report:")
    print("Overall accuracy: ", accuracy_score(all_labels, all_preds))
    print(classification_report(all_labels, all_preds, labels=label_ids,
                                target_names=class_names, zero_division=0))

    # Confusion Matrix
    cm = confusion_matrix(all_labels, all_preds, labels=label_ids)
    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title(f'Confusion Matrix for {model_name}', fontsize=16)
    plt.xlabel('Predicted Label', fontsize=12)
    plt.ylabel('True Label', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

# --- Run the full evaluation for both models ---
# Each model is paired with the checkpoint file whose size should be reported.
evaluation_targets = (
    (model_resnet, 'resnet18.pth'),
    (model_vit, 'vit.pth'),
)
for evaluated_model, checkpoint_path in evaluation_targets:
    evaluate_model_fully(evaluated_model, val_loader, class_names, model_path=checkpoint_path)

In [None]:
from PIL import Image
import matplotlib.pyplot as plt

def predict_and_display_images(model, image_paths, class_names, transform=None):
    """
    Predicts the class for a list of images and displays them
    with the predicted class as the title.

    Uses the notebook-level `device`. A failure on one image is reported and
    the remaining images are still processed.

    Args:
        model: Trained network; it is switched to eval mode by this function.
        image_paths: Iterable of image file paths to classify.
        class_names: Class names, ordered so that index i names label id i.
        transform: Optional preprocessing applied to each PIL image. Defaults
            to resize to 224x224, tensor conversion and ImageNet normalization
            (the same steps as the notebook's evaluation pipeline).

    Returns:
        None. Each image is shown in its own figure.
    """
    model.eval()

    if transform is None:
        transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    for image_path in image_paths:
        try:
            # Open and transform the image. The context manager closes the file
            # handle promptly; convert() returns an independent in-memory copy,
            # so `image` stays usable afterwards.
            with Image.open(image_path) as raw_image:
                image = raw_image.convert('RGB')
            image_tensor = transform(image).unsqueeze(0).to(device)

            # Perform inference
            with torch.no_grad():
                outputs = model(image_tensor)
                _, predicted_idx = torch.max(outputs, 1)
                prediction = class_names[predicted_idx.item()]

            # Display the image with its prediction
            fig = plt.figure(figsize=(6, 6))
            plt.imshow(image)
            plt.title(f'Prediction: {prediction}', fontsize=16, color='green')
            plt.axis('off')
            plt.show()
            # Release the figure so long path lists do not accumulate open figures.
            plt.close(fig)

        except FileNotFoundError:
            print(f"Error: Image not found at {image_path}")
        except Exception as e:
            # Deliberate best-effort: report the problem and move on to the next image.
            print(f"An error occurred while processing {image_path}: {e}")


# --- IMPORTANT ---
# Edit this list to point at the images you want to classify.
# The defaults below are a handful of files from the test split.
demo_image_paths = [
    './data/Structured/test/calling/Image_10899.jpg',
    './data/Structured/test/drinking/Image_10700.jpg',
    './data/Structured/test/fighting/Image_11179.jpg',
    './data/Structured/test/sleeping/Image_10719.jpg',
    './data/Structured/test/eating/Image_10874.jpg',
]

# Run inference and display results: the ViT model first, then ResNet18.
demo_runs = (
    ("\n--- Running Demo Inference with Vision Transformer ---", model_vit),
    ("\n--- Running Demo Inference with ResNet18 ---", model_resnet),
)
for demo_banner, demo_model in demo_runs:
    print(demo_banner)
    predict_and_display_images(demo_model, demo_image_paths, class_names)