## Optuna Hyperparameter Optimization
---


### Import Libraries


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import transforms
from torchvision.models.video import s3d, S3D_Weights
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
import os
from pathlib import Path
import cv2
import numpy as np
import random
import optuna
from torch.optim.lr_scheduler import CosineAnnealingLR


### Class Names


In [2]:
# Maneuver classes. A class's position in this list is the integer index that
# CachedSurfingManeuverDataset assigns while scanning the per-class cache folders,
# and each name is also the sub-directory name looked up under the cache root.
class_names = [
    'cutback-frontside',
    'take-off',
    '360',
    'roller'
]


### Device Configuration

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


### Hyperparameters


In [5]:
# --- Reproducibility --------------------------------------------------------
# Seed every RNG in use (Python, NumPy, PyTorch CPU and all CUDA devices).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Ask cuDNN for deterministic kernels and disable its autotuner
# (may cost a little performance).
torch.backends.cudnn.deterministic, torch.backends.cudnn.benchmark = True, False

# --- Baseline training configuration ----------------------------------------
BATCH_SIZE = 4          # kept small: video batches are memory intensive
EPOCHS = 35
NUM_FRAMES = 64         # S3D typically uses 64 frames (can be adjusted)
FRAME_SIZE = 224        # S3D input size (reference only; resizing happens in preprocessing)
LEARNING_RATE = 5e-4
NUM_CLASSES = 4         # cutback-frontside, take-off, 360, roller

# --- Loss -------------------------------------------------------------------
criterion = nn.CrossEntropyLoss()

# Optional class-weighted alternative (disabled):
# counts = torch.tensor([166, 84, 125, 366], dtype=torch.float32)  # [cutback-frontside, take-off, 360, roller]
# weights = torch.sqrt(1.0 / counts)  # Gentler weighting using square root
# weights = weights / weights.sum() * len(counts)
# criterion = nn.CrossEntropyLoss(weight=weights.to(device))

# --- Dedicated, seeded generator for DataLoader shuffling ---------------------
dataloader_generator = torch.Generator()
dataloader_generator.manual_seed(SEED)


<torch._C.Generator at 0x1c34cf0e590>

In [6]:
class CachedSurfingManeuverDataset(Dataset):
    """Map-style dataset over preprocessed video clips cached as ``.pt`` files.

    The cache is expected to be laid out as ``cache_dir/<class_name>/*.pt`` for
    each entry of the module-level ``class_names`` list. Every file is read with
    ``torch.load`` and must be a mapping providing ``'video'`` and ``'label'``.

    Attributes:
        cache_dir: Root of the cache as a ``Path``.
        cache_files: Paths of all discovered ``.pt`` files, grouped by class.
        labels: Class index (position in ``class_names``) for each cache file.
    """

    def __init__(self, cache_dir):
        """Scan ``cache_dir`` for cached clips of every known class.

        Args:
            cache_dir: Directory containing one sub-directory per class name.
        """
        self.cache_dir = Path(cache_dir)

        self.cache_files = []
        self.labels = []

        for class_idx, class_name in enumerate(class_names):
            class_cache_dir = self.cache_dir / class_name
            if class_cache_dir.exists():
                # glob() order depends on the filesystem; sort so sample indices
                # (and therefore seeded shuffles) are stable across runs/machines.
                pt_files = sorted(class_cache_dir.glob('*.pt'))
                self.cache_files.extend(pt_files)
                self.labels.extend([class_idx] * len(pt_files))

        print(f"Found {len(self.cache_files)} cached videos across {len(class_names)} classes")

        if len(self.cache_files) == 0:
            print(f"\n WARNING: No cached files found in {cache_dir}")
            print(f"Please run 'python preprocess_videos_cache.py' first to create the cache.")

    def __len__(self):
        """Return the number of cached clips that were discovered."""
        return len(self.cache_files)

    def __getitem__(self, idx):
        """Load the cached sample at ``idx``.

        If that file cannot be loaded, the error is printed and the following
        files are tried in order (wrapping around) until one loads.

        Args:
            idx: Sample index; out-of-range values raise ``IndexError``.

        Returns:
            Tuple ``(video, label)`` taken from the cached mapping.

        Raises:
            IndexError: If ``idx`` is out of range.
            RuntimeError: If none of the cached files could be loaded.
        """
        # Plain indexing first so out-of-range indices still raise IndexError
        # (sequence-style iteration relies on that to terminate).
        self.cache_files[idx]

        num_files = len(self)
        # Bounded scan instead of unbounded recursion: every file is tried at
        # most once, so a fully corrupt cache fails fast with a clear error.
        for offset in range(num_files):
            cache_file = self.cache_files[(idx + offset) % num_files]
            try:
                # Load cached data (already preprocessed).
                # NOTE(review): torch.load unpickles the file; only use caches
                # produced by a trusted preprocessing run.
                data = torch.load(cache_file, map_location='cpu')
                return data['video'], data['label']
            except Exception as e:
                print(f"Error loading cached file {cache_file}: {e}")

        raise RuntimeError(
            f"None of the {num_files} cached files under {self.cache_dir} could be loaded"
        )


### Data Transforms

S3D expects inputs with specific normalization (from Kinetics-400 pretraining).


In [7]:
# S3D preprocessing - using manual transforms since official ones have format issues
from torchvision.models.video import S3D_Weights
import torchvision.transforms.functional as F

# Normalization values from S3D Kinetics-400 pretraining.
# One entry per channel; video_transform broadcasts them over [C, T, H, W].
MEAN = [0.43216, 0.394666, 0.37645]
STD = [0.22803, 0.22145, 0.216989]

def video_transform(video):
    """Preprocess a raw clip for S3D.

    Steps: reorder axes to channels-first, scale by 1/255, resize every frame
    to 256x256, center-crop to 224x224, then normalize per channel with
    ``MEAN`` / ``STD``.

    Args:
        video: Tensor expected in ``[T, H, W, C]`` layout with 3 channels and
            values in the 0-255 range (the function divides by 255).

    Returns:
        Float tensor of shape ``[C, T, 224, 224]`` on the same device as the
        input.
    """
    # [T, H, W, C] -> [C, T, H, W], as floats scaled to [0, 1]
    video = video.permute(3, 0, 1, 2).float() / 255.0

    # Resize + center-crop one frame at a time so the temporal axis is never
    # mistaken for a spatial or batch dimension.
    frames = [
        F.center_crop(F.resize(frame, [256, 256], antialias=True), [224, 224])
        for frame in video.unbind(dim=1)  # each frame is [C, H, W]
    ]
    video = torch.stack(frames, dim=1)  # back to [C, T, H, W]

    # Build the statistics on the clip's own device/dtype; creating them on the
    # default (CPU) device would fail for clips that already live on the GPU.
    mean = torch.tensor(MEAN, dtype=video.dtype, device=video.device).view(3, 1, 1, 1)
    std = torch.tensor(STD, dtype=video.dtype, device=video.device).view(3, 1, 1, 1)

    return (video - mean) / std

# Training and validation share the same transform function.
train_transform = video_transform
val_transform = video_transform


### Load Dataset

Preprocess the videos:
```bash
python preprocess_videos_cache.py
```

Running the script creates `surfing_dataset_cache/`. For each video, it:
- Samples frames uniformly
- Applies the transformations
- Converts the result to the format expected by S3D (C, T, H, W)



In [8]:
# Locations of the preprocessed (cached) splits
CACHE_TRAIN_DIR = './surfing_dataset_cache/train'
CACHE_VAL_DIR = './surfing_dataset_cache/val'
CACHE_TEST_DIR = './surfing_dataset_cache/test'

# Build one dataset per split, in train / val / test order
train_dataset, val_dataset, test_dataset = (
    CachedSurfingManeuverDataset(cache_dir=split_dir)
    for split_dir in (CACHE_TRAIN_DIR, CACHE_VAL_DIR, CACHE_TEST_DIR)
)

# Baseline loaders. Clips are already preprocessed, so no transform is applied.
# Only the training split is shuffled; page-locked memory is used when CUDA exists.
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=BATCH_SIZE,
    num_workers=0,
    pin_memory=torch.cuda.is_available(),
)
val_loader = DataLoader(
    val_dataset,
    shuffle=False,
    batch_size=BATCH_SIZE,
    num_workers=0,
    pin_memory=torch.cuda.is_available(),
)
test_loader = DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=BATCH_SIZE,
    num_workers=0,
    pin_memory=torch.cuda.is_available(),
)

# Summary of split sizes
print(
    f"Training set size: {len(train_dataset)}",
    f"Validation set size: {len(val_dataset)}",
    f"Test set size: {len(test_dataset)}",
    "Using cached dataset",
    sep="\n",
)


Found 741 cached videos across 4 classes
Found 110 cached videos across 4 classes
Found 211 cached videos across 4 classes
Training set size: 741
Validation set size: 110
Test set size: 211
Using cached dataset


### Hyperparameter Optimization with Optuna


In [10]:
# Create an in-memory Optuna study: TPE sampler (seeded) + median pruner.
study = optuna.create_study(
    direction='maximize',  # Maximize validation accuracy
    sampler=optuna.samplers.TPESampler(seed=SEED),
    pruner=optuna.pruners.MedianPruner(
        n_startup_trials=5,  # Number of trials before pruning starts
        n_warmup_steps=2,    # Number of epochs before pruning can occur
    ),
    study_name='surfing-s3d-hpo'
)

print(f"Optuna study created: {study.study_name}")
# study.direction is an enum; formatting it directly printed its integer value
# ("Direction: 2"), so print the member name instead (e.g. "MAXIMIZE").
print(f"Direction: {study.direction.name}")
print(f"Sampler: {type(study.sampler).__name__}")
print(f"Pruner: {type(study.pruner).__name__}")


[I 2025-11-09 00:30:51,981] A new study created in memory with name: surfing-s3d-hpo


Optuna study created: surfing-s3d-hpo
Direction: 2
Sampler: TPESampler
Pruner: MedianPruner


In [11]:
def create_s3d_model(dropout_rate, num_classes=NUM_CLASSES):
    """Build a Kinetics-400 pretrained S3D with a fresh, trainable classifier head.

    Every pretrained parameter is frozen; only the replacement 1x1x1 Conv3d head
    (created after freezing) keeps ``requires_grad=True``.

    Args:
        dropout_rate: Probability assigned to the classifier's dropout layer,
            when ``classifier[0]`` is an ``nn.Dropout``.
        num_classes: Number of output channels of the new head.

    Returns:
        The modified S3D model (not yet moved to any device).
    """
    model = s3d(weights=S3D_Weights.KINETICS400_V1)

    # Freeze the whole pretrained network in one call
    model.requires_grad_(False)

    # Swap in a new head with the same input width as the pretrained one (1024)
    new_head = nn.Conv3d(
        model.classifier[1].in_channels,
        num_classes,
        kernel_size=1,
        stride=1,
    )
    model.classifier[1] = new_head

    # Apply the requested dropout probability
    dropout_layer = model.classifier[0]
    if isinstance(dropout_layer, nn.Dropout):
        dropout_layer.p = dropout_rate

    # Re-initialize the head: Kaiming-normal weights, zero bias
    nn.init.kaiming_normal_(new_head.weight, nonlinearity='linear')
    if new_head.bias is not None:
        nn.init.zeros_(new_head.bias)

    return model


def create_optimizer(model, optimizer_name, learning_rate, weight_decay=1e-4):
    """Create an optimizer over the classifier head (``model.classifier[1]``) only.

    Args:
        model: Model exposing an indexable ``classifier`` whose element 1 is the head.
        optimizer_name: One of ``'Adam'``, ``'SGD'``, ``'RMSprop'``, ``'AdamW'``.
        learning_rate: Learning rate passed as ``lr``.
        weight_decay: Weight decay passed to the optimizer.

    Returns:
        The constructed ``torch.optim`` optimizer.

    Raises:
        ValueError: If ``optimizer_name`` is not one of the supported names.
    """
    head_params = model.classifier[1].parameters()

    # (name, optimizer class, extra keyword arguments); SGD uses momentum 0.9
    supported = (
        ('Adam', optim.Adam, {}),
        ('SGD', optim.SGD, {'momentum': 0.9}),
        ('RMSprop', optim.RMSprop, {}),
        ('AdamW', optim.AdamW, {}),
    )

    for name, optimizer_cls, extra_kwargs in supported:
        if optimizer_name == name:
            return optimizer_cls(
                head_params,
                lr=learning_rate,
                weight_decay=weight_decay,
                **extra_kwargs,
            )

    raise ValueError(f"Unknown optimizer: {optimizer_name}")


# Status line confirming that the helper definitions in this cell were executed.
print("Helper functions defined")


Helper functions defined


In [12]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one full pass over ``loader`` and return ``(mean_loss, accuracy_percent)``.

    With an ``optimizer`` the model is put in train mode and updated after every
    batch; without one it is put in eval mode and run with gradients disabled.

    The loss is averaged over the samples actually seen (batch loss times batch
    size, summed, divided by the sample count), which assumes ``criterion``
    returns a per-batch mean.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is equivalent to eval()

    loss_sum = 0.0
    correct = 0
    total = 0

    with torch.set_grad_enabled(is_training):
        for videos, labels in loader:
            videos, labels = videos.to(device), labels.to(device)

            if is_training:
                optimizer.zero_grad()

            outputs = model(videos)
            loss = criterion(outputs, labels)

            if is_training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item() * videos.size(0)
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

    if total == 0:
        # Fail with a clear message instead of a ZeroDivisionError below.
        phase = "training" if is_training else "validation"
        raise ValueError(f"The {phase} loader produced no samples")

    return loss_sum / total, 100. * correct / total


def train_model_for_hpo(model, train_loader, val_loader, optimizer, criterion, 
                         device, epochs, trial, scheduler=None):
    """Train ``model`` for one Optuna trial and return its best validation accuracy.

    After each epoch the validation accuracy is reported to ``trial`` so the
    study's pruner can stop unpromising trials early.

    Args:
        model: Model to train (already on ``device``).
        train_loader: Iterable of ``(videos, labels)`` training batches.
        val_loader: Iterable of ``(videos, labels)`` validation batches.
        optimizer: Optimizer stepped after every training batch.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        device: Device the batches are moved to.
        epochs: Number of epochs to run.
        trial: Optuna trial used for ``report`` / ``should_prune``.
        scheduler: Optional LR scheduler, stepped once per epoch after training.

    Returns:
        Highest validation accuracy (in percent) over all epochs; ``0.0`` when
        ``epochs`` is 0.

    Raises:
        optuna.exceptions.TrialPruned: When the pruner asks to stop the trial.
        ValueError: If either loader yields no samples.
    """
    best_val_acc = 0.0

    for epoch in range(epochs):
        # NOTE(review): train mode is applied to the whole model, so any
        # BatchNorm/Dropout layers inside the frozen backbone also switch to
        # training behaviour — confirm this is intended for a frozen extractor.
        train_loss, train_acc = _run_epoch(
            model, train_loader, criterion, device, optimizer=optimizer
        )

        # Update scheduler if provided
        if scheduler is not None:
            scheduler.step()

        val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)

        # Track best validation accuracy
        best_val_acc = max(best_val_acc, val_acc)

        # Log before the prune check so the epoch that triggers pruning is
        # visible in the output as well.
        print(f"  Epoch [{epoch+1}/{epochs}] "
              f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}% | "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")

        # Report intermediate value to Optuna and honour its pruning decision
        trial.report(val_acc, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return best_val_acc


# Status line confirming that the training function in this cell was defined.
print("HPO training function defined")


HPO training function defined


In [13]:
def objective(trial):
    """Optuna objective: train the S3D head with sampled settings, return val accuracy.

    Samples batch size, dropout rate, optimizer, learning rate, scheduler on/off,
    weight decay and epoch count from ``trial``, builds the loaders, model,
    optimizer and optional cosine scheduler, then delegates to
    ``train_model_for_hpo``.

    Returns:
        The validation accuracy returned by ``train_model_for_hpo``.

    Raises:
        optuna.exceptions.TrialPruned: Re-raised after logging when the trial is pruned.
    """
    # ---- Search space (sampling order is kept fixed) ----
    batch_size = trial.suggest_int('batch_size', 2, 8, step=2)
    dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.6)
    optimizer_name = trial.suggest_categorical('optimizer', ['Adam', 'AdamW', 'RMSprop'])
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    scheduler_enabled = trial.suggest_categorical('scheduler', [True, False])
    weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-3, log=True)
    # The epoch budget is itself searched, between 5 and 35
    hpo_epochs = trial.suggest_int('epochs', 5, 35)

    # ---- Log the sampled configuration ----
    rule = '=' * 70
    print(f"\n{rule}")
    print(f"Trial {trial.number}: Testing hyperparameters")
    print(rule)
    print(f"  Batch size: {batch_size}")
    print(f"  Dropout rate: {dropout_rate:.4f}")
    print(f"  Optimizer: {optimizer_name}")
    print(f"  Learning rate: {learning_rate:.6f}")
    print(f"  Scheduler: {scheduler_enabled}")
    print(f"  Weight decay: {weight_decay:.6f}")
    print(f"  HPO Epochs: {hpo_epochs}")
    print(f"{rule}\n")

    # ---- Data loaders using the sampled batch size ----
    shared_loader_options = dict(
        batch_size=batch_size,
        num_workers=0,
        pin_memory=torch.cuda.is_available(),
    )
    train_loader_hpo = DataLoader(
        train_dataset,
        shuffle=True,
        generator=dataloader_generator,
        **shared_loader_options,
    )
    val_loader_hpo = DataLoader(val_dataset, shuffle=False, **shared_loader_options)

    # ---- Model, optimizer and optional cosine-annealing schedule ----
    model = create_s3d_model(dropout_rate=dropout_rate).to(device)
    optimizer = create_optimizer(
        model,
        optimizer_name=optimizer_name,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
    )
    scheduler = CosineAnnealingLR(optimizer, T_max=hpo_epochs) if scheduler_enabled else None

    # ---- Train; log and propagate pruning ----
    try:
        val_acc = train_model_for_hpo(
            model=model,
            train_loader=train_loader_hpo,
            val_loader=val_loader_hpo,
            optimizer=optimizer,
            criterion=criterion,
            device=device,
            epochs=hpo_epochs,
            trial=trial,
            scheduler=scheduler,
        )
    except optuna.exceptions.TrialPruned:
        print(f"\nTrial {trial.number} was pruned.\n")
        raise

    print(f"\nTrial {trial.number} completed with validation accuracy: {val_acc:.2f}%\n")
    return val_acc


# Status line confirming that the objective function in this cell was defined.
print("Objective function defined")


Objective function defined


In [14]:
# Launch the hyperparameter search
N_TRIALS = 30

# Opening banner
print(
    f"\n{'='*70}",
    "STARTING HYPERPARAMETER OPTIMIZATION",
    f"{'='*70}",
    f"Number of trials: {N_TRIALS}",
    f"{'='*70}\n",
    sep="\n",
)

study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

# Closing banner followed by the best trial's summary
print(
    f"\n{'='*70}",
    "HYPERPARAMETER OPTIMIZATION COMPLETED!",
    f"{'='*70}",
    sep="\n",
)
print(f"\nBest trial: {study.best_trial.number}")
print(f"Best validation accuracy: {study.best_value:.2f}%")
print("\nBest hyperparameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")
print(f"{'='*70}\n")



STARTING HYPERPARAMETER OPTIMIZATION
Number of trials: 30



  0%|          | 0/30 [00:00<?, ?it/s]


Trial 0: Testing hyperparameters
  Batch size: 4
  Dropout rate: 0.5704
  Optimizer: Adam
  Learning rate: 0.000021
  Scheduler: False
  Weight decay: 0.000064
  HPO Epochs: 26

  Epoch [1/26] Train Loss: 1.2947, Train Acc: 45.75% | Val Loss: 1.3921, Val Acc: 48.18%
  Epoch [2/26] Train Loss: 1.2626, Train Acc: 47.64% | Val Loss: 1.3678, Val Acc: 48.18%
  Epoch [3/26] Train Loss: 1.2642, Train Acc: 47.64% | Val Loss: 1.3458, Val Acc: 47.27%
  Epoch [4/26] Train Loss: 1.2412, Train Acc: 47.77% | Val Loss: 1.3147, Val Acc: 47.27%
  Epoch [5/26] Train Loss: 1.2417, Train Acc: 47.77% | Val Loss: 1.2826, Val Acc: 45.45%
  Epoch [6/26] Train Loss: 1.2632, Train Acc: 46.56% | Val Loss: 1.2457, Val Acc: 45.45%
  Epoch [7/26] Train Loss: 1.2233, Train Acc: 49.26% | Val Loss: 1.2148, Val Acc: 45.45%
  Epoch [8/26] Train Loss: 1.2149, Train Acc: 47.64% | Val Loss: 1.1842, Val Acc: 45.45%
  Epoch [9/26] Train Loss: 1.2049, Train Acc: 49.26% | Val Loss: 1.1633, Val Acc: 45.45%
  Epoch [10/26] Trai

In [15]:
# Visualize optimization results
print("Optimization History:")
fig = optuna.visualization.plot_optimization_history(study)
fig.show()

# The next two plots can fail for several reasons (too few completed trials,
# a missing optional plotting dependency, ...). Catch Exception rather than
# using a bare except, so Ctrl-C still interrupts the cell, and print the real
# cause instead of always blaming the number of trials.
print("\nParameter Importances:")
try:
    fig = optuna.visualization.plot_param_importances(study)
    fig.show()
except Exception as exc:
    print(f"Could not plot parameter importances ({type(exc).__name__}: {exc})")

print("\nParallel Coordinate Plot:")
try:
    fig = optuna.visualization.plot_parallel_coordinate(study)
    fig.show()
except Exception as exc:
    print(f"Could not show parallel coordinate plot ({type(exc).__name__}: {exc})")


Optimization History:



Parameter Importances:



Parallel Coordinate Plot:
