# 1/ Env Setup
Load necessary libraries to run this notebook. <br>
All libraries are cited in ```requirements.txt```. <br>
Documentation: https://docs.pytorch.org/vision/main/models/generated/torchvision.models.detection.retinanet_resnet50_fpn_v2.html

## 1.1/ Import dependencies
Load libraries:

In [1]:
import sys  
import os

current_dir = os.getcwd() # path to the current working directory (notebook location)
project_root = os.path.abspath(os.path.join(current_dir, "..")) # path to project root

if project_root not in sys.path: # add project root to sys.path
    sys.path.insert(0, project_root)
print(f"Project root added to sys.path: {project_root}")

Project root added to sys.path: /Users/litani/Documents/myCode/steel-defects


In [2]:
from pathlib import Path
import torch 
from torchvision.models.detection import retinanet_resnet50_fpn_v2
from torchvision.models.detection.retinanet import RetinaNetClassificationHead
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import json

## 1.2/ Set reproducibility
Device and seed:

In [3]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
torch.manual_seed(42)
np.random.seed(42)

# 2/ Configuration Management
Define:
- image path
- model hyperparameters
- hardware


In [4]:
class Config:
    # Paths
    DATA_ROOT = Path(project_root) / "data" / "raw"
    TRAIN_IMG = DATA_ROOT / "train_images"
    TRAIN_ANN = DATA_ROOT / "train_annotations"
    VAL_IMG = DATA_ROOT / "valid_images"
    VAL_ANN = DATA_ROOT / "valid_annotations"

    # Model parameters
    NUM_CLASSES = 7 # 6 defects + 1 background
    BACKBONE_PRETRAINED = True 
    
    # Training hyperparameters
    BATCH_SIZE = 16  # no mention of batch size in the paper (go for > 10 when you are sure that training works)
    NUM_EPOCHS = 24 # 24 epochs based on paper. Reduced for quicker testing
    LEARNING_RATE = .002 # 0.0025 based on paper
    MOMENTUM = .9 # 0.9 based on paper
    WEIGHT_DECAY = .0005 # double check this value <<<<<<<

    # Hardware
    DEVICE = device
    NUM_WORKERS = 8
    PIN_MEMORY = False # True if torch.cuda.is_available() else False [in case of GPU avaliable]

    # Visualization
    SAVE_PLOTS = True
    PLOT_INTERVAL = 4

    # Scheduler parameters
    STEP_SIZE = 8 # based on paper
    GAMMA = .5 

config = Config()

In [5]:
config

<__main__.Config at 0x10e9b7cb0>

# 3/ Dataset Class
- Load images and annotations into PyTorch format. 
- This is necessary since RetineNet excepts a dictionary format. The latter requires XML parsing.

In [6]:
from src.utils.dataset import SteelDefectDataset, collate_func

# 4/ Data Augmentation
- We have 1800 images, resorting to image augmentation is mandatory to avoid overfitting. 
- Geometric transformation, simplist form, will be applied as a quick fix:
    - Horizental/Vertical flips
    - Rotate by 90
    - Others: brightness, contrast, adding random noise
- **NB:** OpenCV stores images as [Height in pixels, Width in pixels, RGB] while PyTorch expects [channel, height, width]

In [7]:
from src.utils.transforms_pipeline import get_train_transforms, get_val_transforms

# 5/ Model Initilization
- Apply transfer learning where pretrained RetineNet is loaded the changes are applied based on the dataset

In [8]:
def create_model(num_classes, pretrained = True):
    # Load pretrained RetinaNet w/ ResNet50 backbone,
    model = retinanet_resnet50_fpn_v2(weights = "DEFAULT" if pretrained else None)  # DEFAULT loads ImageNet pretrained weights for transfer learning
    
    # Replace head so that model learns defect-specific patterns
    num_anchors = model.head.classification_head.num_anchors # default is 9 anchors per location >> 3 scales x 3 aspect ratios
    model.head.classification_head = RetinaNetClassificationHead(
        in_channels = 256,          # Input: 256 features from FPN   
        num_anchors = num_anchors,  # Process: 9 anchors per location
        num_classes = num_classes   # Output: 7 classes scores per anchor
    )
    return model

model = create_model(config.NUM_CLASSES).to(device) # Create model instance and move to device CPU/GPU, config.NUM_CLASSES = 7 includes background

# 6/ Data Loaders
- collate_func is a fucntion that works on the collation process of RetinaNet since images have variable bbox counts.

In [9]:
# Create train dataset with augmentations and val dataset without augmentations, only format conversion
train_dataset = SteelDefectDataset(
    config.TRAIN_IMG,
    config.TRAIN_ANN,
    transforms = get_train_transforms()
)

val_dataset = SteelDefectDataset(
    config.VAL_IMG,
    config.VAL_ANN,
    transforms = get_val_transforms()
)

train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size = config.BATCH_SIZE,
    shuffle = True, # shuffle training data for better generalization
    num_workers = config.NUM_WORKERS,
    pin_memory = config.PIN_MEMORY, # only useful if using GPU
    collate_fn = collate_func 
)

val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size = config.BATCH_SIZE,
    shuffle = False, # validation data should be consistent
    num_workers = config.NUM_WORKERS,
    pin_memory = config.PIN_MEMORY,
    collate_fn = collate_func
)

  self._set_keys()


# 7/ Training Set-up and Visualization Function 

In [10]:
from src.utils.trainEval_pipeline import train_one_epoch, evaluate

In [None]:
# Training history storage
history = defaultdict(list)

def plot_training_curves(history, save_path = 'training_curves.png'):
    """
    Create publication-ready training curves.
    
    Visualizations:
    1. Loss curves (train/val) - Shows optimization progress
    2. mAP@0.5 - Primary detection metric
    3. Learning rate schedule - Shows adaptation strategy
    """
    fig, axes = plt.subplots(2, 2, figsize = (14, 10))
    epochs = range(1, len(history['train_loss']) + 1)
    
    # 1. Training Loss
    axes[0, 0].plot(epochs, history['train_loss'], 'b-o', label = 'Train Loss', linewidth = 2, markersize = 4)
    axes[0, 0].set_xlabel('Epoch', fontsize = 11)
    axes[0, 0].set_ylabel('Loss', fontsize = 11)
    axes[0, 0].set_title('Training Loss Convergence', fontsize = 12, fontweight = 'bold')
    axes[0, 0].grid(True, alpha = 0.3)
    axes[0, 0].legend(fontsize = 10)
    
    # 2. Validation mAP@0.5
    axes[0, 1].plot(epochs, history['val_map50'], 'g-s', label = 'Val mAP@0.5', linewidth = 2, markersize = 4)
    axes[0, 1].axhline(y = 0.5, color = 'r', linestyle = '--', alpha = 0.5, label = 'Target (0.5)')
    axes[0, 1].set_xlabel('Epoch', fontsize = 11)
    axes[0, 1].set_ylabel('mAP@0.5', fontsize = 11)
    axes[0, 1].set_title('Validation mAP@0.5 Progress', fontsize = 12, fontweight = 'bold')
    axes[0, 1].grid(True, alpha = 0.3)
    axes[0, 1].legend(fontsize = 10)
    axes[0, 1].set_ylim(0, 1)
    
    # 3. Learning Rate Schedule
    axes[1, 0].plot(epochs, history['learning_rate'], 'r-^', linewidth = 2, markersize = 4)
    axes[1, 0].set_xlabel('Epoch', fontsize = 11)
    axes[1, 0].set_ylabel('Learning Rate', fontsize = 11)
    axes[1, 0].set_title('Learning Rate Schedule', fontsize = 12, fontweight = 'bold')
    axes[1, 0].set_yscale('log')
    axes[1, 0].grid(True, alpha = 0.3)
    
    # 4. Combined Loss & mAP (dual axis)
    ax1 = axes[1, 1]
    ax2 = ax1.twinx()
    
    line1 = ax1.plot(epochs, history['train_loss'], 'b-o', label = 'Train Loss', linewidth = 2, markersize = 4)
    line2 = ax2.plot(epochs, history['val_map50'], 'g-s', label = 'Val mAP@0.5', linewidth = 2, markersize = 4)
    
    ax1.set_xlabel('Epoch', fontsize = 11)
    ax1.set_ylabel('Loss', fontsize = 11, color = 'b')
    ax2.set_ylabel('mAP@0.5', fontsize = 11, color = 'g')
    ax1.set_title('Loss vs mAP@0.5', fontsize = 12, fontweight = 'bold')
    ax1.tick_params(axis = 'y', labelcolor = 'b')
    ax2.tick_params(axis = 'y', labelcolor = 'g')
    ax1.grid(True, alpha = 0.3)
    
    lines = line1 + line2
    labels = [l.get_label() for l in lines]
    ax1.legend(lines, labels, loc = 'center right', fontsize = 10)
    
    plt.tight_layout()
    plt.savefig(save_path, dpi = 300, bbox_inches = 'tight')
    plt.close()
    print(f"Training curves saved: {save_path}")

def save_training_metrics(history, save_path = 'training_metrics.json'):
    """Export metrics for later analysis/reporting."""
    clean_history = {
        k: [float(v) if torch.is_tensor(v) else v for v in vals]
        for k, vals in history.items()
    }
    with open(save_path, 'w') as f:
        json.dump(dict(history), f, indent = 2)
    print(f"Metrics saved: {save_path}")

# 8/ Training Loop and Evaluation Metrics
- Quick and dirty: use SGD as an optimizer for an initial model training, won't be launching/tracking experiments in the beginning
- this is standard supervised learning using RetinaNet loss
- For evaluation, assess model perf without retraining, aim for .5 (50% overlap) with validation images

In [13]:
from src.utils.trainEval_pipeline import train_one_epoch, evaluate

In [14]:
# Optimizer
optimizer = torch.optim.SGD(
    model.parameters(),
    lr = config.LEARNING_RATE,
    momentum = config.MOMENTUM,
    weight_decay = config.WEIGHT_DECAY # >>> double check this value <<<
)

# Scheduler
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size = config.STEP_SIZE, 
    gamma = config.GAMMA
)

# Checkpoint directory
Path("models").mkdir(exist_ok = True)
Path("visualizations").mkdir(exist_ok = True)

history = defaultdict(list)
best_map = 0

print("="*70)
print(f"Starting training: {config.NUM_EPOCHS} epochs")
print(f"Batch size: {config.BATCH_SIZE} | LR: {config.LEARNING_RATE}")
print(f"Device: {device} | Workers: {config.NUM_WORKERS}")
print("="*70)

# Training Loop
for epoch in range(config.NUM_EPOCHS):
    print(f"\n{"=" * 70}")
    print(f"Epoch {epoch + 1}/{config.NUM_EPOCHS}")
    print(f"{"=" * 70}")
    
    # Training
    train_loss = train_one_epoch(model, train_loader, optimizer, device)
    
    # Validation
    results = evaluate(model, val_loader, device)
    val_map50 = results["map_50"]

    # Update learning rate
    current_lr = optimizer.param_groups[0]["lr"]
    scheduler.step()

    # Store metrics
    history["train_loss"].append(train_loss)
    history["val_map50"].append(val_map50)
    history["learning_rate"].append(current_lr)
    
    # Display progress
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Val mAP@0.5: {val_map50:.4f}")
    print(f"Learning Rate: {current_lr:.6f}")
    
    # Save best model
    if val_map50 > best_map:
        best_map = val_map50
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'best_map': best_map,
        }, 'models/retinanet_best.pth')
        print(f"✓ Best model saved (mAP: {best_map:.4f})")
    
    # Periodic checkpoint
    if (epoch + 1) % 8 == 0:
        checkpoint_path = f"models/retinanet_epoch_{epoch+1}.pth"
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Checkpoint saved: {checkpoint_path}")
    
    # Generate visualizations
    if config.SAVE_PLOTS and (epoch + 1) % config.PLOT_INTERVAL == 0:
        plot_path = f"visualizations/training_curves_epoch{epoch+1}.png"
        plot_training_curves(history, plot_path)

# Final outputs
print("\n" + "="*70)
print("Training completed.")
print(f"Best validation mAP@0.5: {best_map:.4f}")
print("="*70)

# Save final visualizations and metrics
plot_training_curves(history, 'visualizations/training_curves_final.png')
save_training_metrics(history, 'models/training_metrics.json')

# Save final model
torch.save(model.state_dict(), 'models/retinanet_final.pth')
print("Final model saved: models/retinanet_final.pth")

Starting training: 24 epochs
Batch size: 16 | LR: 0.002
Device: mps | Workers: 8

Epoch 1/24
  Batch [10/107] Loss: 1.5642
  Batch [20/107] Loss: 1.5910
  Batch [30/107] Loss: 1.5537
  Batch [40/107] Loss: 1.5244
  Batch [50/107] Loss: 1.5331
  Batch [60/107] Loss: 1.5259
  Batch [70/107] Loss: 1.5183
  Batch [80/107] Loss: 1.5455
  Batch [90/107] Loss: 1.4931
  Batch [100/107] Loss: 1.4929
Train Loss: 1.5289
Val mAP@0.5: 0.0000
Learning Rate: 0.002000

Epoch 2/24
  Batch [10/107] Loss: 1.4685
  Batch [20/107] Loss: 1.4716
  Batch [30/107] Loss: 1.4563
  Batch [40/107] Loss: 1.4162
  Batch [50/107] Loss: 1.4002
  Batch [60/107] Loss: 1.4237
  Batch [70/107] Loss: 1.2105
  Batch [80/107] Loss: 1.1789
  Batch [90/107] Loss: 1.1748
  Batch [100/107] Loss: 1.0784




Train Loss: 1.3278
Val mAP@0.5: 0.0778
Learning Rate: 0.002000
✓ Best model saved (mAP: 0.0778)

Epoch 3/24
  Batch [10/107] Loss: 0.9305
  Batch [20/107] Loss: 1.0087
  Batch [30/107] Loss: 1.0501
  Batch [40/107] Loss: 1.0462
  Batch [50/107] Loss: 0.9629
  Batch [60/107] Loss: 0.9672
  Batch [70/107] Loss: 0.9338
  Batch [80/107] Loss: 1.0362
  Batch [90/107] Loss: 0.9551
  Batch [100/107] Loss: 0.8885
Train Loss: 0.9977
Val mAP@0.5: 0.1778
Learning Rate: 0.002000
✓ Best model saved (mAP: 0.1778)

Epoch 4/24
  Batch [10/107] Loss: 0.9937
  Batch [20/107] Loss: 0.9958
  Batch [30/107] Loss: 0.8558
  Batch [40/107] Loss: 0.7590
  Batch [50/107] Loss: 0.9295
  Batch [60/107] Loss: 0.9533
  Batch [70/107] Loss: 0.7299
  Batch [80/107] Loss: 0.7775
  Batch [90/107] Loss: 0.8031
  Batch [100/107] Loss: 0.9154
Train Loss: 0.8956
Val mAP@0.5: 0.2999
Learning Rate: 0.002000
✓ Best model saved (mAP: 0.2999)
Training curves saved: visualizations/training_curves_epoch4.png

Epoch 5/24
  Batch [

TypeError: Object of type Tensor is not JSON serializable

# 9/ Training Results Summary

In [15]:
# Display final metrics
print("\nTraining Summary")
print("="*70)
print(f"Total epochs: {len(history['train_loss'])}")
print(f"Final train loss: {history['train_loss'][-1]:.4f}")
print(f"Final val mAP@0.5: {history['val_map50'][-1]:.4f}")
print(f"Best val mAP@0.5: {max(history['val_map50']):.4f}")
print(f"\nImprovement: {history['val_map50'][-1] - history['val_map50'][0]:.4f}")

# Show visualization paths
print("\nGenerated Files:")
print("- visualizations/training_curves_final.png")
print("- models/training_metrics.json")
print("- models/retinanet_best.pth")
print("- models/retinanet_final.pth")


Training Summary
Total epochs: 24
Final train loss: 0.5973
Final val mAP@0.5: 0.6397
Best val mAP@0.5: 0.6537

Improvement: 0.6397

Generated Files:
- visualizations/training_curves_final.png
- models/training_metrics.json
- models/retinanet_best.pth
- models/retinanet_final.pth
