# Task 3: Robust Classifier Against Adversarial Examples

This notebook trains a classifier that stays robust to adversarial examples generated by the FGSM and PGD attacks.

In [1]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torchvision.models import resnet18, resnet34, resnet50
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import requests
import os
import json
from torch.utils.data import DataLoader, Dataset, random_split

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")



## 1. Data Loading and Preparation

First, let's load the training and test sets and define the input transformations. The validation split is carved out of the training set later, in Section 5.

In [None]:
# Training-time augmentation: random 32x32 crop (after 4-pixel padding),
# random horizontal flip, then conversion to a tensor
transform_train = transforms.Compose(
    [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
    ]
)

# Evaluation images are only converted to tensors (no augmentation)
transform_test = transforms.Compose([transforms.ToTensor()])

# CIFAR-10 is assumed to match the task's data distribution;
# swap in the actual dataset here if it differs
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform_train
)
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform_test
)

# Batched loaders: shuffle the training data, keep the test order fixed
trainloader = DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2)
testloader = DataLoader(testset, batch_size=128, shuffle=False, num_workers=2)

# Human-readable CIFAR-10 class names
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

## 2. Model Definition

We'll use ResNet50 as our base architecture since it provides a good balance between capacity and computation.

In [None]:
def get_model(model_name='resnet50', num_classes=10):
    """Build a torchvision ResNet adapted to small 3-channel inputs.

    The network is created with ``pretrained=True`` and then three layers are
    replaced: the stem convolution becomes a freshly initialised 3x3/stride-1
    convolution, the max-pool becomes an identity, and the final fully
    connected layer is re-created with ``num_classes`` outputs.

    Args:
        model_name: One of ``'resnet18'``, ``'resnet34'`` or ``'resnet50'``.
        num_classes: Number of output logits of the new classification head.

    Returns:
        The modified ResNet module (not yet moved to any device).

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # Reject unknown architectures before touching any constructor
    if model_name not in ('resnet18', 'resnet34', 'resnet50'):
        raise ValueError(f"Model {model_name} not supported. Use resnet18, resnet34, or resnet50.")

    constructors = {
        'resnet18': resnet18,
        'resnet34': resnet34,
        'resnet50': resnet50,
    }
    net = constructors[model_name](pretrained=True)

    # 3x3 stride-1 stem and no max-pool, so 32x32 inputs are not
    # downsampled before the residual stages
    net.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
    net.maxpool = nn.Identity()

    # New classification head sized for the requested number of classes
    net.fc = nn.Linear(net.fc.in_features, num_classes)

    return net

# Architecture to train; 'resnet18', 'resnet34' and 'resnet50' are supported
model_name = 'resnet50'

# Build the network and move it to the selected device in one step
model = get_model(model_name).to(device)

print(f"Model: {model_name}")

## 3. Implement Adversarial Attack Methods

Now, let's implement FGSM (Fast Gradient Sign Method) and PGD (Projected Gradient Descent) attacks for adversarial training.

In [None]:
# FGSM Attack
def fgsm_attack(model, images, labels, epsilon=0.1):
    """Craft FGSM (Fast Gradient Sign Method) adversarial examples.

    Takes a single step of size ``epsilon`` in the direction of the sign of
    the cross-entropy loss gradient with respect to the input.

    Args:
        model: Callable mapping a batch of images to class logits.
        images: Batch of input images. The result is clamped to [0, 1], so
            inputs are presumably expected in that range. This tensor is not
            modified.
        labels: Ground-truth class indices for ``images``.
        epsilon: Perturbation magnitude applied to every input element.

    Returns:
        A detached tensor of perturbed images with the same shape as
        ``images``, clamped to [0, 1].
    """
    # Work on a detached copy so the caller's tensor never has its
    # requires_grad flag flipped behind its back
    adv_inputs = images.clone().detach().requires_grad_(True)

    # The attack needs input gradients even when the caller is inside a
    # torch.no_grad() block (as during evaluation), so re-enable autograd
    with torch.enable_grad():
        outputs = model(adv_inputs)
        loss = F.cross_entropy(outputs, labels)
        # Differentiate w.r.t. the input only; model parameter .grad
        # buffers are left untouched
        data_grad = torch.autograd.grad(loss, adv_inputs)[0]

    # Single signed-gradient step
    perturbed_images = adv_inputs.detach() + epsilon * data_grad.sign()

    # Clamp to ensure valid pixel range [0,1]
    return torch.clamp(perturbed_images, 0, 1).detach()

# PGD Attack
def pgd_attack(model, images, labels, epsilon=0.1, alpha=0.01, num_iter=10):
    """Craft PGD (Projected Gradient Descent) adversarial examples.

    Starting from the clean images, repeatedly takes a signed-gradient step
    of size ``alpha`` on the cross-entropy loss and projects the result back
    into the L-infinity ball of radius ``epsilon`` around the clean images
    and into the [0, 1] value range.

    Args:
        model: Callable mapping a batch of images to class logits.
        images: Batch of clean input images. This tensor is not modified.
        labels: Ground-truth class indices for ``images``.
        epsilon: Maximum absolute perturbation per input element.
        alpha: Step size of each iteration.
        num_iter: Number of gradient steps. With 0 a detached copy of
            ``images`` is returned.

    Returns:
        A detached tensor of perturbed images with the same shape as
        ``images``.
    """
    # Detached reference copy: the projection must not build a graph through
    # the caller's tensor even if it happens to require gradients
    clean_images = images.clone().detach()
    perturbed_images = clean_images.clone()

    for _ in range(num_iter):
        perturbed_images.requires_grad_(True)

        # Input gradients are needed even when the caller is inside a
        # torch.no_grad() block (as during evaluation)
        with torch.enable_grad():
            outputs = model(perturbed_images)
            loss = F.cross_entropy(outputs, labels)
            # Gradient w.r.t. the input only; parameter .grad buffers are
            # not accumulated into across iterations
            data_grad = torch.autograd.grad(loss, perturbed_images)[0]

        # Single signed-gradient step
        adv_images = perturbed_images.detach() + alpha * data_grad.sign()

        # Project back to the epsilon ball, then to the valid pixel range
        eta = torch.clamp(adv_images - clean_images, -epsilon, epsilon)
        perturbed_images = torch.clamp(clean_images + eta, 0, 1).detach()

    return perturbed_images

## 4. Training Function with Adversarial Training

In [None]:
def train_model(model, trainloader, validloader, epochs=10, lr=0.01, epsilon=0.03, adv_ratio=0.5):
    """Train ``model`` on a per-batch random mix of clean and adversarial data.

    For every batch, with probability ``adv_ratio`` the inputs are replaced by
    adversarial examples (FGSM or PGD, chosen with equal probability);
    otherwise the clean batch is used. After each epoch the model is evaluated
    on ``validloader`` and the checkpoint with the best mean FGSM/PGD accuracy
    is written to ``best_{model_name}_robust.pt``.

    Note: the checkpoint name and its ``'model_name'`` entry read the
    notebook-level global ``model_name``.

    Args:
        model: Network to train; it is updated in place.
        trainloader: Iterable of ``(inputs, targets)`` training batches.
        validloader: Iterable of ``(inputs, targets)`` validation batches.
        epochs: Number of passes over ``trainloader``.
        lr: Initial learning rate for SGD (cosine-annealed over ``epochs``).
        epsilon: Perturbation magnitude passed to both attacks and to the
            per-epoch evaluation.
        adv_ratio: Probability that a given batch is trained adversarially.

    Returns:
        A ``(model, history)`` tuple where ``history`` maps ``'train_loss'``,
        ``'train_acc'``, ``'val_clean_acc'``, ``'val_fgsm_acc'`` and
        ``'val_pgd_acc'`` to per-epoch lists.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    # Training history
    history = {
        'train_loss': [],
        'train_acc': [],
        'val_clean_acc': [],
        'val_fgsm_acc': [],
        'val_pgd_acc': []
    }

    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint, even when adversarial accuracy is exactly 0%
    best_acc = float('-inf')

    for epoch in range(epochs):
        print(f'Epoch {epoch+1}/{epochs}')
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        for inputs, targets in tqdm(trainloader):
            inputs, targets = inputs.to(device), targets.to(device)

            # Random mix of clean and adversarial batches for training
            if np.random.random() < adv_ratio:
                # Choose randomly between FGSM and PGD attacks
                if np.random.random() < 0.5:
                    perturbed_inputs = fgsm_attack(model, inputs, targets, epsilon)
                else:
                    perturbed_inputs = pgd_attack(model, inputs, targets, epsilon)
                # The adversarial batch is plain training data: cut it loose
                # from the attack's autograd graph so the training backward
                # pass does not propagate into the input tensor
                perturbed_inputs = perturbed_inputs.detach()
            else:
                perturbed_inputs = inputs

            # Clears any parameter gradients left behind by the attack
            optimizer.zero_grad()
            outputs = model(perturbed_inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

        scheduler.step()

        # Loss is averaged over batches, accuracy over samples
        train_loss = running_loss / len(trainloader)
        train_accuracy = 100 * correct / total

        print(f'Train Loss: {train_loss:.4f} | Train Accuracy: {train_accuracy:.2f}%')

        # Evaluate on validation set
        clean_acc, fgsm_acc, pgd_acc = evaluate_model(model, validloader, epsilon)

        # Save metrics to history
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_accuracy)
        history['val_clean_acc'].append(clean_acc)
        history['val_fgsm_acc'].append(fgsm_acc)
        history['val_pgd_acc'].append(pgd_acc)

        # Keep the checkpoint with the best mean adversarial accuracy
        avg_adv_acc = (fgsm_acc + pgd_acc) / 2
        if avg_adv_acc > best_acc:
            best_acc = avg_adv_acc
            torch.save({
                'model_state_dict': model.state_dict(),
                'model_name': model_name,
                'clean_acc': clean_acc,
                'fgsm_acc': fgsm_acc,
                'pgd_acc': pgd_acc,
            }, f'best_{model_name}_robust.pt')

    return model, history

def evaluate_model(model, dataloader, epsilon=0.03):
    """Measure clean, FGSM and PGD accuracy of ``model`` on ``dataloader``.

    The model is switched to eval mode. For every batch the adversarial
    examples are crafted with autograd enabled, and all three prediction
    passes then run without gradient tracking.

    Args:
        model: Network to evaluate.
        dataloader: Iterable of ``(inputs, targets)`` batches.
        epsilon: Perturbation magnitude passed to both attacks.

    Returns:
        A ``(clean_acc, fgsm_acc, pgd_acc)`` tuple of percentages.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()

    correct_clean = 0
    correct_fgsm = 0
    correct_pgd = 0
    total = 0

    for inputs, targets in tqdm(dataloader, desc="Evaluating"):
        inputs, targets = inputs.to(device), targets.to(device)
        total += targets.size(0)

        # The attacks differentiate the loss w.r.t. the input, so they must
        # NOT run under torch.no_grad(). Each attack gets its own detached
        # copy so neither can alter the batch used for the clean pass.
        with torch.enable_grad():
            fgsm_inputs = fgsm_attack(model, inputs.clone().detach(), targets, epsilon).detach()
            pgd_inputs = pgd_attack(model, inputs.clone().detach(), targets, epsilon).detach()

        # Pure inference: no graph is needed for the prediction passes
        with torch.no_grad():
            # Clean accuracy
            _, predicted_clean = model(inputs).max(1)
            correct_clean += predicted_clean.eq(targets).sum().item()

            # FGSM accuracy
            _, predicted_fgsm = model(fgsm_inputs).max(1)
            correct_fgsm += predicted_fgsm.eq(targets).sum().item()

            # PGD accuracy
            _, predicted_pgd = model(pgd_inputs).max(1)
            correct_pgd += predicted_pgd.eq(targets).sum().item()

    # The attack helpers may have left gradients on the parameters; clear
    # them so they cannot leak into a later optimizer step
    model.zero_grad()

    if total == 0:
        raise ValueError("evaluate_model received an empty dataloader")

    clean_acc = 100 * correct_clean / total
    fgsm_acc = 100 * correct_fgsm / total
    pgd_acc = 100 * correct_pgd / total

    print(f'Clean Accuracy: {clean_acc:.2f}%')
    print(f'FGSM  Accuracy: {fgsm_acc:.2f}%')
    print(f'PGD   Accuracy: {pgd_acc:.2f}%')

    return clean_acc, fgsm_acc, pgd_acc

## 5. Split Data and Train Model

In [None]:
# Split training data into train and validation
import copy

train_size = int(0.9 * len(trainset))
val_size = len(trainset) - train_size

# Fixed seed so the same samples land in the validation set on every run
train_dataset, val_split = random_split(
    trainset, [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

# Both halves of random_split read through `trainset` and would therefore
# apply the training augmentation (random crop/flip) to validation images.
# Re-wrap the validation indices over a shallow copy that uses the
# evaluation transform instead.
# NOTE(review): relies on the dataset applying `self.transform` in
# __getitem__, as torchvision's CIFAR10 does - confirm if the dataset changes.
val_source = copy.copy(trainset)
val_source.transform = transform_test
val_dataset = torch.utils.data.Subset(val_source, val_split.indices)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False, num_workers=2)

print(f"Training data size: {len(train_dataset)}")
print(f"Validation data size: {len(val_dataset)}")
print(f"Test data size: {len(testset)}")

In [None]:
# Adversarial training run on the train/validation loaders
trained_model, history = train_model(
    model,
    train_loader,
    val_loader,
    epochs=20,       # number of passes over the training split
    lr=0.01,         # initial SGD learning rate
    epsilon=0.03,    # perturbation magnitude used by both attacks
    adv_ratio=0.5,   # probability that a batch is trained adversarially
)

## 6. Visualize Training Results

In [None]:
# Two side-by-side panels: training loss (left) and accuracies (right)
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: per-epoch training loss
loss_ax.plot(history['train_loss'])
loss_ax.set_title('Training Loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')

# Right panel: one curve per accuracy series recorded in `history`
accuracy_curves = (
    ('train_acc', 'Train Acc'),
    ('val_clean_acc', 'Clean Val Acc'),
    ('val_fgsm_acc', 'FGSM Val Acc'),
    ('val_pgd_acc', 'PGD Val Acc'),
)
for history_key, curve_label in accuracy_curves:
    acc_ax.plot(history[history_key], label=curve_label)
acc_ax.set_title('Accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy (%)')
acc_ax.legend()

plt.tight_layout()
plt.show()

## 7. Final Evaluation on Test Set

In [None]:
# Evaluate on the held-out test set.
# The test loader is named `testloader` where it is created in the data
# loading cell; `test_loader` was never defined and raised a NameError.
print("Final Evaluation on Test Set:")
clean_acc, fgsm_acc, pgd_acc = evaluate_model(trained_model, testloader)

# Recap of the three accuracies plus the mean over both attacks
print(f"\nSummary:")
print(f"Clean Accuracy: {clean_acc:.2f}%")
print(f"FGSM Accuracy: {fgsm_acc:.2f}%")
print(f"PGD Accuracy: {pgd_acc:.2f}%")
print(f"Average Adversarial Accuracy: {(fgsm_acc + pgd_acc) / 2:.2f}%")

## 8. Prepare and Save Final Model for Submission

In [None]:
# Load the best checkpoint written during training.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU can still be loaded on a CPU-only machine.
checkpoint = torch.load(f'best_{model_name}_robust.pt', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

# Save the final model for submission (weights plus architecture name only)
submission_path = f'submission_{model_name}.pt'
torch.save({
    'model_state_dict': model.state_dict(),
    'model_name': model_name
}, submission_path)

print(f"Model saved to {submission_path}")
print(f"Model name: {model_name}")

## 9. Submit Model (Optional)

In [None]:
def submit_model(file_path, timeout=(10, 600)):
    """Upload a saved model file to the task-3 submission endpoint.

    The file is sent as the multipart field ``model``. The outcome is printed
    rather than returned.

    Args:
        file_path: Path of the file to upload.
        timeout: Timeout forwarded to ``requests.post``; a
            ``(connect, read)`` tuple in seconds. Without one the call could
            block forever on an unresponsive server.

    Raises:
        OSError: If ``file_path`` cannot be opened.
        requests.RequestException: On connection failure or timeout.
    """
    # NOTE(review): the endpoint is plain HTTP, so the upload is unencrypted.
    url = "http://149.156.182.9:6060/task-3/submit"

    with open(file_path, 'rb') as f:
        response = requests.post(url, files={'model': f}, timeout=timeout)

    if response.status_code == 200:
        print("Submission successful!")
        try:
            print(response.json())
        except ValueError:
            # A 200 response is not guaranteed to carry JSON; show it raw
            print(response.text)
    else:
        print(f"Submission failed with status code: {response.status_code}")
        print(response.text)

# Uncomment to submit
# submit_model(submission_path)

## 10. Additional Experiments (Optional)

To improve robustness further, you might want to try:
1. Different epsilon values for adversarial training
2. Different model architectures (ResNet18, ResNet34)
3. Ensemble methods combining multiple models
4. Different PGD configurations (more iterations, different step sizes)
5. Other regularization techniques