In [1]:
# Install Triton
!pip install --upgrade triton

# Import with updated autocast and GradScaler usage
from torch.amp import autocast
from torch.amp import GradScaler

Collecting triton
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.5/209.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: triton
Successfully installed triton-3.1.0


In [29]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import os
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import time
from datetime import datetime
import json
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler
from torch.optim.lr_scheduler import OneCycleLR
from torchvision.models.resnet import BasicBlock

# Optimization 1: Use torch.compile for performance boost on PyTorch 2.0+
try:
    TORCH_COMPILE_AVAILABLE = hasattr(torch, 'compile')
except:
    TORCH_COMPILE_AVAILABLE = False

torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

In [30]:
class OptimizedCustomCNN(nn.Module):
    """
    An improved CNN architecture for action recognition with enhanced feature extraction,
    better regularization, and modern architectural choices for improved generalization.
    """
    def __init__(self, num_classes=40):
        super(OptimizedCustomCNN, self).__init__()

        # Enhanced feature extraction layers with regularization
        self.features = nn.Sequential(
            # Initial block: larger kernel for better spatial understanding
            nn.Conv2d(3, 32, kernel_size=7, stride=2, padding=3),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.Dropout2d(0.2),  # Spatial dropout to prevent feature co-adaptation
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),

            # First feature block with residual-style double convolution
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.Dropout2d(0.2),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),  # Additional conv for better feature learning
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Second block with increased channel capacity
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.Dropout2d(0.3),  # Increased dropout as we go deeper
            nn.Conv2d(128, 128, kernel_size=3, padding=1),  # Double conv for richer features
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Third block for high-level feature extraction
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Dropout2d(0.4),  # Highest dropout in deepest conv layers
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
        )

        # Global average pooling for better generalization
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))

        # Enhanced classifier with multiple layers and strong regularization
        self.classifier = nn.Sequential(
            # First classifier block
            nn.Linear(256, 512),  # Expand features first
            nn.BatchNorm1d(512),  # Normalize activations
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),  # Heavy dropout for regularization

            # Second classifier block for better feature abstraction
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),

            # Final classification layer
            nn.Linear(256, num_classes)
        )

        # Initialize weights using modern techniques
        self._initialize_weights()

    def _initialize_weights(self):
        """
        Initialize network weights using modern techniques for better training dynamics:
        - Kaiming initialization for conv layers (considering ReLU)
        - Constant initialization for batch norm
        - Normal initialization for linear layers
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        # Extract hierarchical features
        x = self.features(x)

        # Global average pooling
        x = self.global_pool(x)

        # Flatten and classify
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

In [38]:
class OptimizedTrainer:
    def __init__(self, train_loader, val_loader, device, num_epochs=20):
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.device = device
        self.num_epochs = num_epochs

        self.model = OptimizedCustomCNN(num_classes=40).to(device)

        # Warmup compilation
        if TORCH_COMPILE_AVAILABLE:
          self.model = torch.compile(self.model, mode='reduce-overhead')
          print("Model compiled with torch.compile()")

        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.AdamW(
            self.model.parameters(),
            lr=2e-3,
            weight_decay=0.1
        )

        self.scheduler = OneCycleLR(
            self.optimizer,
            max_lr=2e-3,
            epochs=num_epochs,
            steps_per_epoch=len(train_loader),
            pct_start=0.2,
            div_factor=25,
            final_div_factor=1000
        )

        # Add scaler initialization
        self.scaler = torch.amp.GradScaler()

        self.history = {
            'train_loss': [], 'train_acc': [],
            'val_loss': [], 'val_acc': [],
            'learning_rates': []
        }

    def train_epoch(self):
        self.model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        batch_start_time = time.time()

        for batch_idx, (inputs, labels, _) in enumerate(self.train_loader):
            inputs = inputs.to(self.device, non_blocking=True)
            labels = labels.to(self.device, non_blocking=True)

            with torch.amp.autocast(device_type='cuda'):
                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)

            self.optimizer.zero_grad(set_to_none=True)
            self.scaler.scale(loss).backward()

            self.scaler.unscale_(self.optimizer)
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)

            self.scaler.step(self.optimizer)

            self.scaler.unscale_(self.optimizer)
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)

            self.scaler.step(self.optimizer)
            self.scaler.update()
            self.scheduler.step()

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

            if batch_idx % 10 == 0:
                batch_time = time.time() - batch_start_time
                current_loss = running_loss / (batch_idx + 1)
                current_acc = 100. * correct / total
                gpu_mem = torch.cuda.memory_allocated() / 1024**2

                print(f'Batch {batch_idx}/{len(self.train_loader)} | '
                      f'Time: {batch_time:.2f}s | '
                      f'Loss: {current_loss:.4f} | '
                      f'Acc: {current_acc:.2f}% | '
                      f'GPU Mem: {gpu_mem:.0f}MB')

                batch_start_time = time.time()

        epoch_loss = running_loss / len(self.train_loader)
        epoch_acc = 100. * correct / total

        return epoch_loss, epoch_acc

    @torch.no_grad()
    def validate(self):
        self.model.eval()
        running_loss = 0.0
        correct = 0
        total = 0

        for inputs, labels, _ in tqdm(self.val_loader, desc="Validating"):
            inputs = inputs.to(self.device, non_blocking=True)
            labels = labels.to(self.device, non_blocking=True)

            with torch.amp.autocast(device_type='cuda'):
                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

        val_loss = running_loss / len(self.val_loader)
        val_acc = 100. * correct / total

        return val_loss, val_acc

    def train(self, save_dir='model_checkpoints', validate_every=15):
        os.makedirs(save_dir, exist_ok=True)
        best_val_acc = 0.0
        start_time = time.time()

        print(f"\nStarting training...")
        print(f"Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

        for epoch in range(self.num_epochs):
            print(f"\nEpoch {epoch + 1}/{self.num_epochs}")

            # Training phase
            train_loss, train_acc = self.train_epoch()

            # Update training history
            self.history['train_loss'].append(train_loss)
            self.history['train_acc'].append(train_acc)
            self.history['learning_rates'].append(
                self.optimizer.param_groups[0]['lr']
            )

            # Validation phase
            if (epoch + 1) % validate_every == 0 or (epoch + 1) == self.num_epochs:
                val_loss, val_acc = self.validate()
                self.history['val_loss'].append(val_loss)
                self.history['val_acc'].append(val_acc)

                if val_acc > best_val_acc:
                    best_val_acc = val_acc
                    torch.save({
                        'epoch': epoch,
                        'model_state_dict': self.model.state_dict(),
                        'optimizer_state_dict': self.optimizer.state_dict(),
                        'scheduler_state_dict': self.scheduler.state_dict(),
                        'val_acc': val_acc,
                        'history': self.history
                    }, os.path.join(save_dir, 'best_model.pth'))

                print(f"Validation Loss: {val_loss:.4f} | Validation Acc: {val_acc:.2f}%")
            else:
                self.history['val_loss'].append(None)
                self.history['val_acc'].append(None)

            # Print epoch summary
            total_time = time.time() - start_time
            hours = int(total_time // 3600)
            minutes = int((total_time % 3600) // 60)
            seconds = int(total_time % 60)

            print(f"\nEpoch {epoch+1} Summary:")
            print(f"Training Loss: {train_loss:.4f}")
            print(f"Training Accuracy: {train_acc:.2f}%")
            print(f"Learning Rate: {self.optimizer.param_groups[0]['lr']:.6f}")
            print(f"Total Training Time: {hours}h {minutes}m {seconds}s")
            print("-" * 50)

        return self.history

In [32]:
class Stanford40Dataset(Dataset):
    def __init__(self, root_dir, split='train', transform=None):
        """
        Initialize the Stanford40 dataset.

        Args:
            root_dir: The root directory containing the dataset
            split: Either 'train' or 'test'
            transform: Optional transformations to apply to the images
        """
        self.root_dir = root_dir
        self.transform = transform
        self.split = split

        # Load and process action classes
        actions_file = os.path.join(root_dir, 'ImageSplits/actions.txt')
        if not os.path.exists(actions_file):
            raise ValueError(f"Actions file not found at: {actions_file}")

        with open(actions_file, 'r') as f:
            # Skip the first line (count) and clean up action names
            lines = f.readlines()[1:]
            self.classes = [line.split()[0].strip() for line in lines]

        print(f"Found {len(self.classes)} action classes")
        self.class_to_idx = {cls: idx for idx, cls in enumerate(self.classes)}

        # Build list of images
        self.images = []
        for action in self.classes:
            split_file = os.path.join(root_dir, f'ImageSplits/{action}_{split}.txt')

            if not os.path.exists(split_file):
                print(f"Warning: Split file not found: {split_file}")
                continue

            with open(split_file, 'r') as f:
                # Important: Handle the image IDs properly
                # Remove any file extension if present in the text file
                image_ids = [os.path.splitext(line.strip())[0] for line in f.readlines()]
                self.images.extend([(action, img_id) for img_id in image_ids])

        print(f"Loaded {len(self.images)} images for {split} split")

        # Verify first few images exist
        print("\nVerifying first few images...")
        for i, (action, img_id) in enumerate(self.images[:5]):
            img_path = os.path.join(self.root_dir, 'JPEGImages', f'{img_id}.jpg')
            exists = os.path.exists(img_path)
            print(f"Image {i+1}: {img_path} - {'Found' if exists else 'Not found'}")

    def __getitem__(self, idx):
        """
        Get a single item from the dataset.

        Args:
            idx: Index of the item to retrieve

        Returns:
            tuple: (image, label, dummy_bbox)
        """
        action, img_id = self.images[idx]

        # Construct image path - ensure we add only one .jpg extension
        img_path = os.path.join(self.root_dir, 'JPEGImages', f'{img_id}.jpg')

        if not os.path.exists(img_path):
            # Provide detailed error message
            raise FileNotFoundError(
                f"Image not found: {img_path}\n"
                f"Action: {action}\n"
                f"Image ID: {img_id}"
            )

        # Load and process image
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)

        return image, self.class_to_idx[action], torch.tensor([0, 0, 0, 0])

    def __len__(self):
        return len(self.images)

In [39]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms
from torch.cuda.amp import autocast, GradScaler
import os
import time
from datetime import datetime
from tqdm import tqdm

train_transform = transforms.Compose([
    # First, spatial transforms that work with PIL Images
    transforms.Resize((256, 256)),
    transforms.RandomCrop(224),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(15),
    transforms.ColorJitter(
        brightness=0.2,
        contrast=0.2,
        saturation=0.2,
        hue=0.1
    ),
    # Convert to tensor (must come before tensor-based operations)
    transforms.ToTensor(),

    # Tensor-based operations come after ToTensor()
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    transforms.RandomErasing(p=0.2)  # Works on tensors, so must come after ToTensor
])

# Simple validation transform
val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Set up datasets with appropriate transforms
print("Setting up datasets...")
dataset_path = '/content/drive/MyDrive/Stanford40'
train_dataset = Stanford40Dataset(dataset_path, split='train', transform=train_transform)
val_dataset = Stanford40Dataset(dataset_path, split='test', transform=val_transform)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

train_loader = DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
    persistent_workers=True,
    prefetch_factor=4,
    drop_last=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=128,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
    persistent_workers=True,
)

# Set up device and create directories
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
results_dir = '/content/drive/MyDrive/training_results'
os.makedirs(results_dir, exist_ok=True)

# Print system information
print("\nSystem Information:")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"Initial GPU Memory: {torch.cuda.memory_allocated()/1024**2:.1f}MB")

# Create model and trainer
model = OptimizedCustomCNN().to(device)
trainer = OptimizedTrainer(train_loader, val_loader, device)

# Print model architecture summary
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("\nModel Architecture:")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

# Start training with periodic validation
print("\nStarting training process...")
start_time = time.time()
history = trainer.train(
    save_dir=results_dir,
    validate_every=15  # Validate every 3 epochs
)

# Print final training summary
total_time = time.time() - start_time
hours = int(total_time // 3600)
minutes = int((total_time % 3600) // 60)
seconds = int(total_time % 60)

print("\nTraining Summary:")
print("=" * 50)
print(f"Total training time: {hours}h {minutes}m {seconds}s")
print(f"Best validation accuracy: {max(filter(None, history['val_acc'])):.2f}%")
print(f"Final training accuracy: {history['train_acc'][-1]:.2f}%")

# Save training curves
if not os.path.exists(results_dir):
    os.makedirs(results_dir)

try:
    import matplotlib.pyplot as plt

    plt.figure(figsize=(15, 5))

    # Plot training/validation loss
    plt.subplot(1, 2, 1)
    plt.plot(history['train_loss'], label='Training Loss')
    plt.plot([i for i, v in enumerate(history['val_loss']) if v is not None],
             [v for v in history['val_loss'] if v is not None],
             'o-', label='Validation Loss')
    plt.title('Loss Curves')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    # Plot training/validation accuracy
    plt.subplot(1, 2, 2)
    plt.plot(history['train_acc'], label='Training Accuracy')
    plt.plot([i for i, v in enumerate(history['val_acc']) if v is not None],
             [v for v in history['val_acc'] if v is not None],
             'o-', label='Validation Accuracy')
    plt.title('Accuracy Curves')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy (%)')
    plt.legend()

    plt.tight_layout()
    plt.savefig(os.path.join(results_dir, 'training_curves.png'))
    plt.close()

    print(f"\nTraining curves saved to {results_dir}/training_curves.png")
except Exception as e:
    print(f"Could not save training curves: {str(e)}")

print("\nTraining completed! All results saved to the 'training_results' folder.")

Setting up datasets...
Found 40 action classes
Loaded 4000 images for train split

Verifying first few images...
Image 1: /content/drive/MyDrive/Stanford40/JPEGImages/applauding_177.jpg - Found
Image 2: /content/drive/MyDrive/Stanford40/JPEGImages/applauding_119.jpg - Found
Image 3: /content/drive/MyDrive/Stanford40/JPEGImages/applauding_087.jpg - Found
Image 4: /content/drive/MyDrive/Stanford40/JPEGImages/applauding_228.jpg - Found
Image 5: /content/drive/MyDrive/Stanford40/JPEGImages/applauding_260.jpg - Found
Found 40 action classes
Loaded 5532 images for test split

Verifying first few images...
Image 1: /content/drive/MyDrive/Stanford40/JPEGImages/applauding_254.jpg - Found
Image 2: /content/drive/MyDrive/Stanford40/JPEGImages/applauding_073.jpg - Found
Image 3: /content/drive/MyDrive/Stanford40/JPEGImages/applauding_123.jpg - Found
Image 4: /content/drive/MyDrive/Stanford40/JPEGImages/applauding_072.jpg - Found
Image 5: /content/drive/MyDrive/Stanford40/JPEGImages/applauding_036.

RuntimeError: unscale_() is being called after step().

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
