In [None]:
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
from PIL import Image
import matplotlib.pyplot as plt
import os
import numpy as np
from pathlib import Path

# ------------------------
# GPU check
# ------------------------
# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
_cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if _cuda_available else "cpu")
print("Using device:", device)
if _cuda_available:
    # Report which card and which CUDA build PyTorch is using.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")

# ------------------------
# RSUD Dataset for DETR
# ------------------------
class RSUDDatasetDETR(Dataset):
    """RSUD dataset for DETR training.

    Every image in ``img_dir`` is paired with the YOLO-format ``.txt`` file of
    the same stem in ``label_dir``. An image without a label file yields an
    empty target (no boxes).

    Targets follow the format the Hugging Face ``DetrForObjectDetection`` loss
    consumes: ``class_labels`` plus ``boxes`` as normalized
    ``(center_x, center_y, width, height)``. YOLO labels are already stored
    that way, so they are passed through unchanged; being normalized, they
    also stay valid after the image is resized by ``transform``.

    Args:
        img_dir: Directory containing the ``.jpg`` / ``.png`` images.
        label_dir: Directory containing the YOLO ``.txt`` label files.
        transform: Optional callable applied to the PIL image only.
    """

    # Extensions are compared case-insensitively.
    _IMAGE_EXTENSIONS = ('.jpg', '.png')

    def __init__(self, img_dir, label_dir, transform=None):
        self.img_dir = Path(img_dir)
        self.label_dir = Path(label_dir)
        self.transform = transform

        # Sorted so that a dataset index always maps to the same file.
        self.image_files = sorted(
            f for f in os.listdir(img_dir)
            if f.lower().endswith(self._IMAGE_EXTENSIONS)
        )

        # RSUD class names (13 classes)
        self.classes = [
            'Dilarang Berhenti', 'Dilarang Parkir', 'Dilarang Masuk',
            'Bahaya', 'Lampu Lalu Lintas Merah', 'Batas Kecepatan',
            'Wajib', 'Larangan Belok', 'Zona Pejalan Kaki',
            'Petunjuk Arah', 'Rambu Informasi', 'Hati-hati',
            'Zona Khusus'
        ]

        print(f"Found {len(self.image_files)} images in {img_dir}")

    def __len__(self):
        """Return the number of images found in ``img_dir``."""
        return len(self.image_files)

    @staticmethod
    def _parse_yolo_lines(lines):
        """Parse YOLO label lines into ``(boxes, labels)`` Python lists.

        Each usable line is ``class_id cx cy w h``; fields after the fifth
        are ignored and lines with fewer than five fields are skipped.
        Box values are returned exactly as written in the file.
        """
        boxes = []
        labels = []
        for line in lines:
            parts = line.strip().split()
            if len(parts) < 5:
                continue
            labels.append(int(parts[0]))
            boxes.append([float(value) for value in parts[1:5]])
        return boxes, labels

    def __getitem__(self, idx):
        """Return ``(image, target)`` for the image at position ``idx``.

        ``target`` is a dict of tensors:
            boxes:        float32 ``(N, 4)``, normalized ``(cx, cy, w, h)``.
            class_labels: int64 ``(N,)`` class ids (key read by HF DETR).
            labels:       same tensor as ``class_labels``, kept for callers
                          that use the torchvision-style key.
            image_id:     ``tensor([idx])``.
            orig_size:    ``tensor([height, width])`` of the image before
                          ``transform``; multiply the normalized boxes by it
                          to recover pixel coordinates.
        """
        img_name = self.image_files[idx]

        # Close the file handle as soon as the pixels have been decoded.
        with Image.open(self.img_dir / img_name) as img_file:
            image = img_file.convert('RGB')
        img_w, img_h = image.size

        # Swap only the final extension; a chained str.replace would also
        # rewrite '.jpg' / '.png' occurring earlier in the file name.
        label_path = self.label_dir / Path(img_name).with_suffix('.txt').name

        try:
            with open(label_path, 'r') as f:
                boxes, labels = self._parse_yolo_lines(f)
        except FileNotFoundError:
            # Image without annotations.
            boxes, labels = [], []

        # Convert to tensors; the empty case needs an explicit (0, 4) shape.
        if boxes:
            boxes = torch.as_tensor(boxes, dtype=torch.float32)
            labels = torch.as_tensor(labels, dtype=torch.int64)
        else:
            boxes = torch.zeros((0, 4), dtype=torch.float32)
            labels = torch.zeros((0,), dtype=torch.int64)

        target = {
            'boxes': boxes,
            'class_labels': labels,
            'labels': labels,
            'image_id': torch.tensor([idx]),
            'orig_size': torch.tensor([img_h, img_w]),
        }

        if self.transform:
            image = self.transform(image)

        return image, target

# ------------------------
# Data Transforms
# ------------------------
def get_transform(train):
    """Build the image preprocessing pipeline.

    The same pipeline is returned whatever ``train`` is; the flag is accepted
    but not consulted. Steps: resize to 800x800, convert to a tensor, then
    normalize with the mean/std values listed below.
    """
    return T.Compose([
        T.Resize((800, 800)),  # DETR works better with larger images
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])

# ------------------------
# Dataset Setup with RSUD paths
# ------------------------
base_path = "F:/skills-copilot-codespaces-vscode/thesis/rsuddataset/rsud20k"

# Both splits share the layout <base>/images/<split> and <base>/labels/<split>.
# The generator is consumed in order, so the train split is built before val.
train_dataset, val_dataset = (
    RSUDDatasetDETR(
        img_dir=f'{base_path}/images/{split}',
        label_dir=f'{base_path}/labels/{split}',
        transform=get_transform(train=(split == 'train')),
    )
    for split in ('train', 'val')
)

def collate_fn(batch):
    """Custom collate function for DETR.

    Splits a batch of ``(image, target)`` pairs into two parallel lists
    instead of stacking them, leaving any batching of the images to the
    caller.

    Returns:
        ``(images, targets)`` -- two lists in the same order as ``batch``.
    """
    images = [img for img, _ in batch]
    targets = [target for _, target in batch]
    return images, targets

# Settings shared by both loaders; only the training loader shuffles.
_loader_kwargs = dict(batch_size=2, collate_fn=collate_fn, num_workers=0)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)

# Short summary of what was loaded.
print("\nDataset loaded:")
for _split_name, _split_dataset in (("Train", train_dataset), ("Val", val_dataset)):
    print(f"  {_split_name}: {len(_split_dataset)} images")
print(f"  Classes: {len(train_dataset.classes)}")

# ------------------------
# DETR Model Setup
# ------------------------
_rule = "=" * 60
print("\n" + _rule)
print("DETR Model Setup")
print(_rule)

# Number of real object classes in RSUD. Only this count is passed to the
# config; the Hugging Face DETR head adds its own extra "no object" output.
num_classes = 13

# torchvision ships no DETR, so the model comes from the transformers library.
# If that import fails, the cell falls back to model = None so that the
# later setup cells can skip themselves.
try:
    from transformers import DetrConfig, DetrForObjectDetection

    # Build the model from a fresh config (no from_pretrained call).
    config = DetrConfig(num_labels=num_classes)
    model = DetrForObjectDetection(config)
    model.to(device)
    print(f"‚úì DETR model loaded on {device}")

except ImportError:
    for _msg in (
        "‚ö† transformers library not found",
        "Install it with: pip install transformers",
        "\nAlternative: Use YOLO for object detection",
        "  Your trained YOLO model: runs/detect/rsud20k_yolo11/weights/best.pt",
    ):
        print(_msg)
    model = None

# ------------------------
# Optimizer & Scheduler
# ------------------------
if model is not None:
    # Optimize only the parameters that require gradients.
    params = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer = torch.optim.AdamW(params, lr=1e-4, weight_decay=1e-4)
    # Multiply the learning rate by 0.1 every 3 scheduler steps.
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

    _total_params = sum(p.numel() for p in model.parameters())
    _trainable_params = sum(p.numel() for p in params)
    print("‚úì Optimizer: AdamW (lr=1e-4)")
    print(f"‚úì Total parameters: {_total_params:,}")
    print(f"‚úì Trainable parameters: {_trainable_params:,}")

print("=" * 60)

# ------------------------
# Training Loop
# ------------------------
def train_one_epoch(model, optimizer, data_loader, device, epoch):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(pixel_values=..., labels=...)`` whose
            output exposes a scalar ``.loss``.
        optimizer: Optimizer stepped once per batch.
        data_loader: Iterable of ``(images, targets)`` batches, where
            ``images`` is a list of equally sized tensors and ``targets`` a
            list of dicts.
        device: Device the batch is moved to before the forward pass.
        epoch: Zero-based epoch index, used only in the printed summary.

    Returns:
        The mean batch loss as a float, or ``nan`` when the loader yields no
        batches (instead of raising ``ZeroDivisionError``).
    """
    model.train()
    running_loss = 0.0
    num_batches = 0

    for i, (images, targets) in enumerate(data_loader):
        # Move to device; images are stacked into one batch tensor.
        pixel_values = torch.stack([img.to(device) for img in images])
        # Only tensors can be moved; any other target value is passed as-is.
        targets = [
            {k: (v.to(device) if torch.is_tensor(v) else v) for k, v in t.items()}
            for t in targets
        ]

        # Forward pass
        outputs = model(pixel_values=pixel_values, labels=targets)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        num_batches += 1

        if (i + 1) % 100 == 0:
            print(f"  Batch [{i+1}/{len(data_loader)}], Loss: {batch_loss:.4f}")

    # Average over the batches actually seen; undefined for an empty loader.
    avg_loss = running_loss / num_batches if num_batches else float('nan')
    print(f"Epoch [{epoch+1}] Average Loss: {avg_loss:.4f}")
    return avg_loss

# ------------------------
# Evaluation
# ------------------------
@torch.no_grad()
def evaluate(model, data_loader, device):
    """Compute the mean loss of ``model`` over ``data_loader`` without gradients.

    Args:
        model: Module called as ``model(pixel_values=..., labels=...)`` whose
            output exposes a scalar ``.loss``. It is switched to eval mode.
        data_loader: Iterable of ``(images, targets)`` batches, where
            ``images`` is a list of equally sized tensors and ``targets`` a
            list of dicts.
        device: Device the batch is moved to before the forward pass.

    Returns:
        The mean batch loss as a float, or ``nan`` when the loader yields no
        batches (instead of raising ``ZeroDivisionError``).
    """
    model.eval()
    running_loss = 0.0
    num_batches = 0

    for images, targets in data_loader:
        pixel_values = torch.stack([img.to(device) for img in images])
        # Only tensors can be moved; any other target value is passed as-is.
        targets = [
            {k: (v.to(device) if torch.is_tensor(v) else v) for k, v in t.items()}
            for t in targets
        ]

        outputs = model(pixel_values=pixel_values, labels=targets)
        running_loss += outputs.loss.item()
        num_batches += 1

    # Average over the batches actually seen; undefined for an empty loader.
    avg_loss = running_loss / num_batches if num_batches else float('nan')
    print(f"Validation Loss: {avg_loss:.4f}")
    return avg_loss

# ------------------------
# IMPORTANT NOTE FOR TRAINING
# ------------------------
# The notice is kept as data and printed line by line; an empty entry
# produces a blank line.
_info_rule = "=" * 60
_training_info = (
    "\n" + _info_rule,
    "‚ö†Ô∏è  DETR TRAINING INFORMATION",
    _info_rule,
    "",
    "DETR (Detection Transformer) is VERY computationally expensive:",
    "  ‚Ä¢ Typical training: 300+ epochs needed for convergence",
    "  ‚Ä¢ Each epoch: ~15-30 minutes (on RTX 3060)",
    "  ‚Ä¢ Total training time: 75-150 HOURS",
    "  ‚Ä¢ Memory: Uses 800x800 images with batch size 2",
    "",
    "For your thesis, you already have:",
    "  ‚úÖ YOLO: Fast, accurate object detection (already trained)",
    "  ‚úÖ ViT: Classification model (30% accuracy)",
    "  ‚úÖ CNN: Baseline classification",
    "  ‚úÖ DINOv2: Advanced vision transformer",
    "",
    "RECOMMENDATION:",
    "  1. Use your YOLO model for object detection tasks",
    "  2. Focus on improving classification models (ViT/CNN/DINOv2)",
    "  3. Only train DETR if you need transformer-based detection comparison",
    "",
    "If you want to train DETR anyway, uncomment the training code below:",
    _info_rule,
    "",
)
for _info_line in _training_info:
    print(_info_line)

# ------------------------
# Training execution (COMMENTED OUT - Uncomment to train)
# ------------------------
# WARNING: This will take 75-150 hours to train properly!
# The training script below is disabled by wrapping it in a bare
# triple-quoted string: Python evaluates the string and discards it, so none
# of it runs. To train DETR, delete the opening and closing triple quotes.
#
# When enabled, it uses model, optimizer, lr_scheduler, train_loader,
# val_loader and device from the cells above. For each epoch it trains,
# evaluates, steps the scheduler and saves the weights whenever the
# validation loss improves; at the end it saves the final weights and plots
# both loss curves.

"""
if model is not None:
    num_epochs = 10  # Should be 300+ for good results, using 10 for testing
    output_dir = Path(r"F:\skills-copilot-codespaces-vscode\thesis\checkpoints")
    output_dir.mkdir(parents=True, exist_ok=True)
    
    best_val_loss = float('inf')
    train_losses = []
    val_losses = []
    
    print(f"\nüöÄ Starting DETR training for {num_epochs} epochs...")
    print("‚ö†Ô∏è  Note: DETR typically needs 300+ epochs for good results")
    print("="*60)
    
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")
        
        # Train
        train_loss = train_one_epoch(model, optimizer, train_loader, device, epoch)
        train_losses.append(train_loss)
        
        # Evaluate
        val_loss = evaluate(model, val_loader, device)
        val_losses.append(val_loss)
        
        # Update learning rate
        lr_scheduler.step()
        
        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), output_dir / "detr_best.pth")
            print(f"  ‚úì Saved best model (Val Loss: {val_loss:.4f})")
        
        print("="*60)
    
    # Save final model
    torch.save(model.state_dict(), output_dir / "detr_final.pth")
    print(f"\n‚úì DETR training completed!")
    print(f"  Best validation loss: {best_val_loss:.4f}")
    print(f"  Models saved to: {output_dir.absolute()}")
    
    # Plot training curves
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(range(1, num_epochs+1), train_losses, label="Train Loss", marker='o')
    ax.plot(range(1, num_epochs+1), val_losses, label="Val Loss", marker='s')
    ax.set_xlabel("Epochs")
    ax.set_ylabel("Loss")
    ax.set_title("DETR Training and Validation Loss")
    ax.legend()
    ax.grid(True)
    plt.tight_layout()
    plt.savefig(output_dir / "detr_training_curves.png", dpi=150)
    print(f"  Training curves saved!")
    plt.show()
"""

# Closing status message with the two ways to continue from here.
for _closing_line in (
    "\n‚úì DETR notebook setup complete!",
    "  To train: Uncomment the training code above",
    "  To use YOLO: Load your trained model at runs/detect/rsud20k_yolo11/weights/best.pt",
):
    print(_closing_line)

Using device: cuda
GPU: NVIDIA GeForce RTX 3060
CUDA Version: 12.8
Found 18681 images in F:/skills-copilot-codespaces-vscode/thesis/rsuddataset/rsud20k/images/train
Found 1004 images in F:/skills-copilot-codespaces-vscode/thesis/rsuddataset/rsud20k/images/val

Dataset loaded:
  Train: 18681 images
  Val: 1004 images
  Classes: 13

DETR Model Setup
✓ DETR model loaded on cuda
✓ Optimizer: AdamW (lr=1e-4)
✓ Total parameters: 41,504,722
✓ Trainable parameters: 41,282,322
✓ DETR model loaded on cuda
✓ Optimizer: AdamW (lr=1e-4)
✓ Total parameters: 41,504,722
✓ Trainable parameters: 41,282,322
