# YOLOv11 Complete Training Pipeline
## From Raw Merged Dataset to Trained Model

**Pipeline:**
1. Preprocessing (static masking + label cleaning)
2. Dataset analysis & seeding (class imbalance)
3. Training with augmentation
4. Evaluation with negative class metrics

## 1. Check GPU & Install Dependencies

In [None]:
# Confirm a GPU is attached to this runtime before doing anything expensive;
# nvidia-smi reports the driver/CUDA versions and current GPU memory usage.
!nvidia-smi

Thu Nov 13 18:38:31 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Install dependencies
!pip install -q ultralytics albumentations opencv-python-headless matplotlib seaborn

[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m [32m1.1/1.1 MB[0m [31m49.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.1/1.1 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[?25h

## 2. Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; the dataset and output paths configured
# below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 3. Define Pipeline Configuration

In [None]:
# =============================================================================
# Configuration
# =============================================================================

class Config:
    """Centralized configuration for the preprocessing / seeding / training pipeline.

    All options are plain class attributes, so they can be read from the class
    itself or overridden per run on an instance (``cfg = Config(); cfg.EPOCHS = 50``).
    """

    # ==================== PATHS (EDIT THESE) ====================
    DRIVE_ROOT = "/content/drive/MyDrive"
    RAW_DATASET_PATH = f"{DRIVE_ROOT}/mergedDataset"  # Raw merged dataset (already split into train/val/test)
    PROCESSED_DATASET_PATH = f"{DRIVE_ROOT}/preprocessedDataset"  # Written by the preprocessing step
    OUTPUT_PATH = f"{DRIVE_ROOT}/yolo_training_runs"  # Plots, seeded dataset and training runs

    # ==================== PREPROCESSING ====================

    # Static masking: black out image bands that carry no road surface. Values
    # are fractions of the image height.
    TOP_MASK = 0.20  # Top band (sky)
    BOTTOM_MASK = 0.08  # Bottom band (hood, dashboard, etc.)

    # Label cleaning for boxes that fall under the masked bands
    # (might need to revise this).
    MASKED_OVERLAP_THRESHOLD = 0.25  # Drop a box if more than 25% of it is masked
    # NOTE(review): not referenced by any code in this notebook.
    VISIBLE_PARTIAL_THRESHOLD = 0.6  # Mark partial if <60% visible

    IGNORE_CLASSES = [3]  # Class 3 (rutting) has too few instances
    TAXONOMY = {
        0: "road_crack_longitudinal",
        1: "road_crack_transverse",
        2: "road_crack_alligator",
        3: "road_rutting",  # filtered out via IGNORE_CLASSES
        4: "pothole",
        5: "marking_faded",
        6: "distractor_manhole",
        7: "distractor_patch",
    }
    POSITIVE_CLASSES = [0, 1, 2, 4, 5]
    NEGATIVE_CLASSES = [6, 7]  # Distractor classes

    # ==================== AUGMENTATION ====================
    # NOTE(review): neither option is referenced by the code in this notebook;
    # training uses the augmentation values hard-coded in YOLOTrainer.train().
    USE_AUGMENTATION = True
    AUGMENTATION_PROB = 0.5  # Apply aug to 50% of images

    # ==================== DATASET SEEDING ====================
    SEED_STRATEGY = "oversampling"  # "stratified", "oversampling", or "none"
    OVERSAMPLE_RARE_THRESHOLD = 4000  # Oversample classes with <4000 instances

    # ==================== TRAINING ====================
    # Ultralytics publishes YOLO11 weights/configs as "yolo11<scale>" (no "v"):
    # "yolov11l.pt" does not resolve to a downloadable checkpoint.
    MODEL_SIZE = "yolo11l"
    PRETRAINED = True
    INPUT_SIZE = 640

    EPOCHS = 100
    BATCH_SIZE = 16
    WORKERS = 4
    PATIENCE = 20  # Early-stopping patience, in epochs

    LR0 = 0.001  # Initial learning rate
    LRF = 0.01  # Final learning-rate factor
    WARMUP_EPOCHS = 3

    # Loss component weights passed to the trainer as cls / box / dfl.
    CLS_WEIGHT = 0.5
    BOX_WEIGHT = 7.5
    DFL_WEIGHT = 1.5
    # NOTE(review): not referenced by any code in this notebook.
    NEGATIVE_WEIGHT = 2.0

    # ==================== OTHER ====================
    RANDOM_SEED = 42
    CONF_THRESHOLD = 0.25
    IOU_THRESHOLD = 0.45

## 4. Configure Paths

In [None]:

# Build the run configuration: start from the class defaults, then apply the
# per-run overrides below as instance attributes.
config = Config()

# NOTE: only SEED_STRATEGY == "oversampling" triggers any seeding code in this
# notebook; "stratified" therefore currently behaves the same as "none".
_run_overrides = {
    # Dataset / output locations on Drive
    "RAW_DATASET_PATH": "/content/drive/MyDrive/mergedDataset",
    "PROCESSED_DATASET_PATH": "/content/drive/MyDrive/preprocessedDataset",
    "OUTPUT_PATH": "/content/drive/MyDrive/yolo_training_runs",
    # Training settings
    "EPOCHS": 100,
    "BATCH_SIZE": 16,
    "SEED_STRATEGY": "stratified",
    # Masking bands and distractor weighting
    "TOP_MASK": 0.20,
    "BOTTOM_MASK": 0.08,
    "NEGATIVE_WEIGHT": 2.0,
}
for _option, _value in _run_overrides.items():
    setattr(config, _option, _value)

# Echo the settings that matter most for this run.
print("Configuration:")
print(f"  Raw dataset: {config.RAW_DATASET_PATH}")
print(f"  Output: {config.OUTPUT_PATH}")
print(f"  Epochs: {config.EPOCHS}")
print(f"  Batch size: {config.BATCH_SIZE}")
print(f"  Seeding strategy: {config.SEED_STRATEGY}")

Configuration:
  Raw dataset: /content/drive/MyDrive/mergedDataset
  Output: /content/drive/MyDrive/yolo_training_runs
  Epochs: 100
  Batch size: 16
  Seeding strategy: stratified


## 5. Verify Dataset Structure

In [None]:
import os
dataset_path = config.RAW_DATASET_PATH

print("Dataset structure:")
!ls -lh {dataset_path}
splits = []
for split in ["train", "val", "test"]:
    img_dir = f"{dataset_path}/images/{split}"
    if os.path.isdir(img_dir):
        count = len(os.listdir(img_dir))
        splits.append(f"{split}: {count}")
        print(f"  {split}: {count} images")

if not splits:
    print("\n ERROR: No train/val/test splits found")
    print("Expected structure:")
    print("  mergedDataset/")
    print("    images/")
    print("      train/")
    print("      val/")
    print("      test/")
    print("    labels/")
    print("      train/")
    print("      val/")
    print("      test/")
else:
    print("\n‚úì Dataset structure looks good!")

Dataset structure:
total 8.5M
-rw------- 1 root root  247 Nov 13 09:11 data.yaml
drwx------ 5 root root 4.0K Nov 13 12:21 images
drwx------ 2 root root 4.0K Nov 13 12:21 labels
-rw------- 1 root root 8.5M Nov 13 09:11 provenance.csv
  train: 27514 images
  val: 5897 images
  test: 5897 images

‚úì Dataset structure looks good!


## 6. Run Complete Pipeline

In [None]:
"""
Complete YOLOv11 Training Pipeline for Google Colab
From raw merged dataset to trained model.

Pipeline:
1. Preprocessing (static masking + label cleaning)
2. Data augmentation (photometric, geometric, synthetic occlusions)
3. Dataset seeding (class imbalance handling)

Usage in Colab:
    1. Upload this file to Colab
    2. Mount Google Drive
    3. Run the main() function
"""

import os
import yaml
import shutil
import random
import numpy as np
from pathlib import Path
from collections import Counter, defaultdict
from typing import Dict, List, Tuple
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

import cv2
import torch
import torch.nn as nn

# Ultralytics YOLOv11
from ultralytics import YOLO
from tqdm import tqdm

# Albumentations for augmentation
import albumentations as A

# =============================================================================
# Module 1: Preprocessing
# =============================================================================

class Preprocessor:
    """Mask fixed image bands, clean YOLO labels, and drop ignored classes.

    For every split the preprocessor blacks out a top and a bottom band of each
    image, removes boxes whose class is listed in ``config.IGNORE_CLASSES`` or
    that are mostly hidden by the bands, and writes the masked image plus the
    surviving labels to a mirrored ``images/<split>`` / ``labels/<split>`` tree.
    Running totals are accumulated in ``self.stats``.
    """

    # Image suffixes handled by the pipeline. Matched case-insensitively so
    # files such as ``IMG_0001.JPG`` are not silently skipped.
    IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png")

    def __init__(self, config: "Config"):
        """Store the configuration and start with empty counters.

        Args:
            config: Object exposing TOP_MASK, BOTTOM_MASK,
                MASKED_OVERLAP_THRESHOLD, IGNORE_CLASSES and TAXONOMY.
        """
        self.config = config
        self.stats = defaultdict(int)

    def generate_mask(self, img_height: int, img_width: int) -> np.ndarray:
        """Generate binary mask (0=keep, 1=masked).

        The top ``TOP_MASK`` and bottom ``BOTTOM_MASK`` fractions of the image
        height are marked as masked.
        """
        mask = np.zeros((img_height, img_width), dtype=np.uint8)

        top_h = int(self.config.TOP_MASK * img_height)
        bottom_h = int(self.config.BOTTOM_MASK * img_height)

        mask[:top_h, :] = 1
        # Indexing from ``img_height - bottom_h`` (not ``-bottom_h``) keeps a
        # zero-height band empty instead of selecting the whole image.
        mask[img_height - bottom_h:, :] = 1

        return mask

    def apply_mask(self, img: np.ndarray, mask: np.ndarray) -> np.ndarray:
        """Return a copy of ``img`` with the masked pixels set to 0."""
        img_out = img.copy()
        img_out[mask > 0] = 0
        return img_out

    def compute_visible_fraction(self, box: Tuple, mask: np.ndarray,
                                 img_w: int, img_h: int) -> float:
        """Compute the fraction of a box's pixels that is NOT masked.

        Args:
            box: ``(cls_id, xc, yc, w, h)`` with normalised YOLO coordinates.
            mask: Binary mask as produced by :meth:`generate_mask`.
            img_w: Image width in pixels.
            img_h: Image height in pixels.

        Returns:
            A value in ``[0, 1]``; ``0.0`` when the box is empty after being
            clipped to the image.
        """
        cls_id, xc, yc, w, h = box

        # Convert the normalised centre/size box to clipped pixel corners.
        x1 = max(0, int((xc - w/2) * img_w))
        y1 = max(0, int((yc - h/2) * img_h))
        x2 = min(img_w, int((xc + w/2) * img_w))
        y2 = min(img_h, int((yc + h/2) * img_h))

        if x2 <= x1 or y2 <= y1:
            return 0.0

        box_mask = mask[y1:y2, x1:x2]
        masked_pixels = (box_mask > 0).sum()
        box_area = (y2 - y1) * (x2 - x1)

        return 1.0 - (masked_pixels / max(1, box_area))

    def filter_labels(self, boxes: List[Tuple], mask: np.ndarray,
                     img_w: int, img_h: int) -> List[Tuple]:
        """Filter boxes based on mask overlap and ignored classes.

        A box is kept when its class is not ignored and at least
        ``1 - MASKED_OVERLAP_THRESHOLD`` of it remains visible. Kept boxes are
        returned unchanged (they are not clipped to the visible region).
        """
        kept = []
        drop_threshold = 1.0 - self.config.MASKED_OVERLAP_THRESHOLD

        for box in boxes:
            cls_id = box[0]

            # Filter out ignored classes (e.g., class 3)
            if cls_id in self.config.IGNORE_CLASSES:
                self.stats['boxes_ignored_class'] += 1
                continue

            # Filter based on mask overlap
            vis_frac = self.compute_visible_fraction(box, mask, img_w, img_h)
            if vis_frac >= drop_threshold:
                kept.append(box)
                self.stats['boxes_kept'] += 1
            else:
                self.stats['boxes_dropped_mask'] += 1

        return kept

    def load_yolo_labels(self, label_path: str) -> List[Tuple]:
        """Load YOLO format labels.

        Returns one ``(cls_id, xc, yc, w, h)`` tuple per well-formed line.
        A missing file yields an empty list; blank lines, lines that do not
        have exactly five fields and lines with non-numeric fields are skipped.
        """
        if not os.path.isfile(label_path):
            return []

        boxes = []
        with open(label_path) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                parts = line.split()
                if len(parts) != 5:
                    continue
                try:
                    cls_id = int(parts[0])
                    xc, yc, w, h = map(float, parts[1:])
                    boxes.append((cls_id, xc, yc, w, h))
                    self.stats['boxes_original'] += 1
                except ValueError:
                    continue
        return boxes

    def _list_images(self, img_dir: str) -> List[Path]:
        """Return the image files in ``img_dir`` (case-insensitive suffix match), sorted."""
        return sorted(
            entry for entry in Path(img_dir).iterdir()
            if entry.is_file() and entry.suffix.lower() in self.IMAGE_SUFFIXES
        )

    def process_split(self, input_dir: str, output_dir: str, split: str):
        """Process a single split (train/val/test).

        Images that cannot be read or written are counted in
        ``stats['images_failed']`` and get no label file, so the output never
        contains a label without its image.
        """
        input_img_dir = os.path.join(input_dir, "images", split)
        input_lbl_dir = os.path.join(input_dir, "labels", split)
        output_img_dir = os.path.join(output_dir, "images", split)
        output_lbl_dir = os.path.join(output_dir, "labels", split)

        if not os.path.isdir(input_img_dir):
            print(f"‚ö†Ô∏è  Split '{split}' not found, skipping")
            return

        os.makedirs(output_img_dir, exist_ok=True)
        os.makedirs(output_lbl_dir, exist_ok=True)

        image_files = self._list_images(input_img_dir)

        print(f"\nProcessing {split}: {len(image_files)} images")

        for img_path in tqdm(image_files, desc=f"{split}"):
            stem = img_path.stem
            lbl_path = Path(input_lbl_dir) / f"{stem}.txt"

            # Load image
            img = cv2.imread(str(img_path))
            if img is None:
                self.stats['images_failed'] += 1
                continue

            img_h, img_w = img.shape[:2]

            # Generate and apply mask
            mask = self.generate_mask(img_h, img_w)
            img_masked = self.apply_mask(img, mask)

            # Write the image first: cv2.imwrite reports failure through its
            # return value, and a failed write must not be counted as processed.
            out_img_path = Path(output_img_dir) / img_path.name
            if not cv2.imwrite(str(out_img_path), img_masked):
                self.stats['images_failed'] += 1
                continue

            # Load and filter labels
            boxes = self.load_yolo_labels(str(lbl_path))
            kept_boxes = self.filter_labels(boxes, mask, img_w, img_h)

            # A label file is written even when no box survives.
            out_lbl_path = Path(output_lbl_dir) / f"{stem}.txt"
            with open(out_lbl_path, "w") as f:
                for cls_id, xc, yc, w, h in kept_boxes:
                    f.write(f"{cls_id} {xc:.6f} {yc:.6f} {w:.6f} {h:.6f}\n")

            self.stats['images_processed'] += 1

    def _install_data_yaml(self, src_yaml: str, output_dir: str):
        """Copy ``src_yaml`` into ``output_dir``, re-pointing its dataset root.

        If the source file declares a ``path`` key it refers to the input
        dataset; copying it verbatim would make the new data.yaml point back at
        the unprocessed images. In that case the key is rewritten to
        ``output_dir``. A file without a ``path`` key (or one that cannot be
        parsed as a mapping) is copied unchanged.
        """
        dst_yaml = os.path.join(output_dir, "data.yaml")
        try:
            with open(src_yaml) as f:
                data = yaml.safe_load(f)
        except yaml.YAMLError:
            data = None

        if isinstance(data, dict) and 'path' in data:
            data['path'] = output_dir
            with open(dst_yaml, 'w') as f:
                yaml.dump(data, f, sort_keys=False)
        else:
            shutil.copy2(src_yaml, dst_yaml)

    def preprocess_dataset(self, input_dir: str, output_dir: str):
        """Preprocess every split and write a data.yaml for the result.

        Args:
            input_dir: Root of the raw dataset (``images/<split>``, ``labels/<split>``).
            output_dir: Root of the processed dataset to create.

        Returns:
            ``output_dir``.
        """
        print("\n" + "="*60)
        print("STEP 1: PREPROCESSING")
        print("="*60)
        print(f"Input:  {input_dir}")
        print(f"Output: {output_dir}")
        print(f"Ignoring classes: {self.config.IGNORE_CLASSES}")
        print("="*60)

        for split in ["train", "val", "test"]:
            self.process_split(input_dir, output_dir, split)

        # The output root may not exist yet if no split was found.
        os.makedirs(output_dir, exist_ok=True)

        src_yaml = os.path.join(input_dir, "data.yaml")
        if os.path.isfile(src_yaml):
            self._install_data_yaml(src_yaml, output_dir)
        else:
            self.create_data_yaml(output_dir)

        print(f"\n" + "="*60)
        print("PREPROCESSING SUMMARY")
        print("="*60)
        print(f"Images processed:          {self.stats['images_processed']}")
        print(f"Images failed:             {self.stats['images_failed']}")
        print(f"\nBox statistics:")
        print(f"  Original boxes:          {self.stats['boxes_original']}")
        print(f"  Boxes kept:              {self.stats['boxes_kept']}")
        print(f"  Boxes dropped (masked):  {self.stats['boxes_dropped_mask']}")
        print(f"  Boxes ignored (class 3): {self.stats['boxes_ignored_class']}")
        print(f"  Total filtered:          {self.stats['boxes_dropped_mask'] + self.stats['boxes_ignored_class']}")
        print("="*60)

        return output_dir

    def create_data_yaml(self, output_dir: str):
        """Create data.yaml from the configured taxonomy, rooted at ``output_dir``."""
        data_yaml = {
            'path': output_dir,
            'train': 'images/train',
            'val': 'images/val',
            'test': 'images/test',
            'names': self.config.TAXONOMY,
            'nc': len(self.config.TAXONOMY),
        }

        with open(os.path.join(output_dir, "data.yaml"), 'w') as f:
            yaml.dump(data_yaml, f, sort_keys=False)


# =============================================================================
# Module 2: Dataset Analysis & Seeding
# =============================================================================

class DatasetSeeder:
    """Analyze class distribution and apply seeding strategies.

    Only the "oversampling" strategy is implemented here: training images that
    contain a rare class are duplicated on disk.
    """

    # Image suffixes considered part of the dataset (matched case-insensitively).
    IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png")

    def __init__(self, config: "Config"):
        """Store the configuration; analysis results are filled in later.

        Args:
            config: Object exposing TAXONOMY, IGNORE_CLASSES, NEGATIVE_CLASSES,
                SEED_STRATEGY and OVERSAMPLE_RARE_THRESHOLD.
        """
        self.config = config
        self.class_distribution = None  # {class id: box count}, set by analyze_distribution()
        self.image_class_map = {}  # {label-file stem: [class ids present]}

    def analyze_distribution(self, dataset_path: str, split: str = "train"):
        """Count boxes per class in ``labels/<split>``.

        Updates ``self.class_distribution`` and ``self.image_class_map``; only
        label files containing at least one box appear in the latter.

        Returns:
            A ``Counter`` mapping class id to number of boxes.
        """
        labels_dir = os.path.join(dataset_path, "labels", split)

        class_counts = Counter()
        image_class_map = {}

        for label_file in Path(labels_dir).glob("*.txt"):
            with open(label_file) as f:
                classes = []
                for line in f:
                    if line.strip():
                        cls_id = int(line.split()[0])
                        class_counts[cls_id] += 1
                        classes.append(cls_id)

                if classes:
                    image_class_map[label_file.stem] = list(set(classes))

        self.class_distribution = dict(class_counts)
        self.image_class_map = image_class_map

        return class_counts

    def print_distribution(self):
        """Print class distribution report."""
        print("\n" + "="*60)
        print("CLASS DISTRIBUTION ANALYSIS (After Filtering)")
        print("="*60)

        if not self.class_distribution:
            print("No distribution data available")
            return

        total = sum(self.class_distribution.values())

        print(f"\n{'Class':<5} {'Name':<30} {'Count':>8} {'%':>7} {'Type':>12}")
        print("-"*60)

        for cls_id in sorted(self.class_distribution.keys()):
            count = self.class_distribution[cls_id]
            pct = 100 * count / total
            name = self.config.TAXONOMY.get(cls_id, f"unknown_{cls_id}")

            if cls_id in self.config.IGNORE_CLASSES:
                type_str = "IGNORED"
            elif cls_id in self.config.NEGATIVE_CLASSES:
                type_str = "NEGATIVE"
            else:
                type_str = "POSITIVE"

            print(f"{cls_id:<5} {name:<30} {count:>8} {pct:>6.2f}% {type_str:>12}")

        print("-"*60)
        print(f"{'TOTAL':<5} {'':<30} {total:>8} {'100.00%':>7}")
        # Counts only images that have at least one box (see analyze_distribution).
        print(f"\nTotal images: {len(self.image_class_map)}")

        # Every ignored class should be gone after preprocessing. Driven by
        # IGNORE_CLASSES rather than a hard-coded id so the check follows the config.
        for ignored_id in self.config.IGNORE_CLASSES:
            ignored_name = self.config.TAXONOMY.get(ignored_id, f"unknown_{ignored_id}")
            if ignored_id in self.class_distribution:
                print(f"\n‚ö†Ô∏è  WARNING: Class {ignored_id} ({ignored_name}) still present with {self.class_distribution[ignored_id]} boxes!")
                print("   This shouldn't happen after preprocessing. Check IGNORE_CLASSES config.")
            else:
                print(f"\n‚úì Class {ignored_id} ({ignored_name}) successfully filtered out")

        print("="*60)

    def plot_distribution(self, output_dir: str):
        """Save a bar chart of the class distribution to ``class_distribution.png``.

        Does nothing when no distribution has been computed.
        """
        if not self.class_distribution:
            return

        # Make sure the target directory exists before saving the figure.
        os.makedirs(output_dir, exist_ok=True)

        plt.figure(figsize=(14, 7))

        classes = sorted(self.class_distribution.keys())
        counts = [self.class_distribution[c] for c in classes]

        # Color coding
        colors = []
        for c in classes:
            if c in self.config.IGNORE_CLASSES:
                colors.append('gray')
            elif c in self.config.NEGATIVE_CLASSES:
                colors.append('red')
            else:
                colors.append('green')

        labels = [self.config.TAXONOMY.get(c, f"Class {c}") for c in classes]

        bars = plt.bar(range(len(classes)), counts, color=colors, alpha=0.7)
        plt.xlabel('Class', fontsize=12)
        plt.ylabel('Number of Boxes', fontsize=12)
        plt.title('Class Distribution After Preprocessing\n(Green=Positive, Red=Negative, Gray=Ignored)', fontsize=14)
        plt.xticks(range(len(classes)), labels, rotation=45, ha='right')
        plt.grid(axis='y', alpha=0.3)

        # Add count labels on bars
        for bar, count in zip(bars, counts):
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height,
                    f'{count}',
                    ha='center', va='bottom', fontsize=9)

        plt.tight_layout()

        plot_path = os.path.join(output_dir, "class_distribution.png")
        plt.savefig(plot_path, dpi=150, bbox_inches='tight')
        print(f"\n‚úì Class distribution plot saved: {plot_path}")
        plt.close()

    @staticmethod
    def _read_class_ids(label_path: str) -> List[int]:
        """Return the class id of every non-blank line in a YOLO label file ([] if missing)."""
        if not os.path.exists(label_path):
            return []
        with open(label_path) as f:
            return [int(line.split()[0]) for line in f if line.strip()]

    @staticmethod
    def _install_data_yaml(src_yaml: str, dst_yaml: str, dataset_root: str):
        """Copy data.yaml to the seeded dataset, re-pointing its dataset root.

        A ``path`` key in the source file refers to the source dataset; copied
        verbatim it would make training read the un-oversampled images. When the
        key is present it is rewritten to ``dataset_root``; otherwise (or when
        the file cannot be parsed as a mapping) the file is copied unchanged.
        """
        try:
            with open(src_yaml) as f:
                data = yaml.safe_load(f)
        except yaml.YAMLError:
            data = None

        if isinstance(data, dict) and 'path' in data:
            data['path'] = dataset_root
            with open(dst_yaml, 'w') as f:
                yaml.dump(data, f, sort_keys=False)
        else:
            shutil.copy2(src_yaml, dst_yaml)

    def oversample_rare_classes(self, dataset_path: str, output_path: str):
        """Oversample rare classes by duplicating images.

        Classes with fewer than ``OVERSAMPLE_RARE_THRESHOLD`` boxes get a
        replication factor of ``max_count // count`` clamped to ``[2, 10]``.
        Every training image containing such a class is copied that many times
        (using the largest factor among its classes); val/test are copied once.

        Args:
            dataset_path: Root of the dataset analysed with analyze_distribution().
            output_path: Root of the oversampled dataset to create.

        Returns:
            ``output_path`` when oversampling ran, otherwise ``dataset_path``.
        """
        print("\n" + "="*60)
        print("STEP 2: DATASET SEEDING (Oversampling)")
        print("="*60)

        if self.config.SEED_STRATEGY != "oversampling":
            print("Skipping oversampling (strategy not enabled)")
            return dataset_path

        # analyze_distribution() may not have been called yet; treat that the
        # same as an empty dataset instead of failing on None.
        distribution = self.class_distribution or {}

        # Find rare classes (excluding ignored classes)
        active_classes = {
            cls_id: count
            for cls_id, count in distribution.items()
            if cls_id not in self.config.IGNORE_CLASSES
        }

        if not active_classes:
            print("No active classes found")
            return dataset_path

        max_count = max(active_classes.values())
        rare_classes = {
            cls_id: max(2, min(10, max_count // count))  # Cap replication at 10x
            for cls_id, count in active_classes.items()
            if count < self.config.OVERSAMPLE_RARE_THRESHOLD
        }

        if not rare_classes:
            print("No rare classes found, skipping oversampling")
            return dataset_path

        print(f"\nRare classes to oversample:")
        for cls_id, factor in rare_classes.items():
            name = self.config.TAXONOMY.get(cls_id, f"Class {cls_id}")
            current = distribution[cls_id]
            print(f"  Class {cls_id} ({name}): {current} boxes ‚Üí {factor}x replication")

        # Copy dataset with oversampling (train only)
        for split in ["train", "val", "test"]:
            src_img_dir = os.path.join(dataset_path, "images", split)
            src_lbl_dir = os.path.join(dataset_path, "labels", split)
            dst_img_dir = os.path.join(output_path, "images", split)
            dst_lbl_dir = os.path.join(output_path, "labels", split)

            if not os.path.isdir(src_img_dir):
                continue

            os.makedirs(dst_img_dir, exist_ok=True)
            os.makedirs(dst_lbl_dir, exist_ok=True)

            copy_count = 0

            # Only oversample train split
            apply_oversampling = (split == "train")

            # Get all images in this split, keyed by stem (suffix match is
            # case-insensitive; sorted for a deterministic copy order).
            all_images = {
                img_path.stem: img_path
                for img_path in sorted(Path(src_img_dir).iterdir())
                if img_path.is_file() and img_path.suffix.lower() in self.IMAGE_SUFFIXES
            }

            for img_stem, img_path in all_images.items():
                src_lbl = os.path.join(src_lbl_dir, f"{img_stem}.txt")

                # Determine replication factor: the largest factor among the
                # rare classes present in this image.
                replication = 1
                if apply_oversampling:
                    for cls_id in self._read_class_ids(src_lbl):
                        if cls_id in rare_classes:
                            replication = max(replication, rare_classes[cls_id])

                # Copy with replication; the first copy keeps the original name.
                for i in range(replication):
                    suffix = f"_rep{i}" if i > 0 else ""
                    dst_img = os.path.join(dst_img_dir, f"{img_stem}{suffix}{img_path.suffix}")
                    dst_lbl = os.path.join(dst_lbl_dir, f"{img_stem}{suffix}.txt")

                    shutil.copy2(img_path, dst_img)
                    if os.path.exists(src_lbl):
                        shutil.copy2(src_lbl, dst_lbl)
                    copy_count += 1

            original_count = len(all_images)
            print(f"  {split}: {copy_count} images (original: {original_count}, {copy_count/max(1,original_count):.1f}x)")

        # Copy data.yaml, re-pointing its dataset root at the seeded copy.
        src_yaml = os.path.join(dataset_path, "data.yaml")
        dst_yaml = os.path.join(output_path, "data.yaml")
        if os.path.isfile(src_yaml):
            self._install_data_yaml(src_yaml, dst_yaml, output_path)

        print(f"\n‚úì Oversampling complete: {output_path}")
        print("="*60)

        return output_path


# =============================================================================
# Module 3: Training Pipeline
# =============================================================================

class YOLOTrainer:
    """Train and evaluate a YOLO11 detector with Ultralytics' built-in augmentation.

    Distractor ("negative") classes are trained like any other class; they are
    only marked separately in the evaluation report.
    """

    # Run directory name under config.OUTPUT_PATH, shared by train() and evaluate().
    RUN_NAME = 'yolov11l_road_defects'

    def __init__(self, config: "Config"):
        """Store the configuration; the model is created lazily in train()."""
        self.config = config
        self.model = None

    @staticmethod
    def _resolve_model_name(model_size: str) -> str:
        """Map a ``yolov11<scale>`` spelling onto Ultralytics' ``yolo11<scale>``.

        YOLO11 weights and configs are published without the "v", so
        ``yolov11l.pt`` does not resolve to a checkpoint. Other names are
        returned unchanged.
        """
        prefix = "yolov11"
        if model_size.lower().startswith(prefix):
            return "yolo11" + model_size[len(prefix):]
        return model_size

    @staticmethod
    def _per_class_metrics(box_metrics) -> Dict[int, Tuple[float, float, float]]:
        """Return ``{class id: (precision, recall, AP50)}`` from Ultralytics box metrics.

        ``p`` / ``r`` / ``ap50`` hold one entry per class that occurs in the
        evaluated split, in the order given by ``ap_class_index``; they are NOT
        indexed by class id. Classes absent from the split are simply missing
        from the returned mapping.
        """
        class_ids = getattr(box_metrics, "ap_class_index", None)
        if class_ids is None:
            return {}

        rows = {}
        for position, cls_id in enumerate(class_ids):
            if position >= len(box_metrics.p):
                break
            rows[int(cls_id)] = (
                float(box_metrics.p[position]),
                float(box_metrics.r[position]),
                float(box_metrics.ap50[position]),
            )
        return rows

    def setup_environment(self):
        """Setup Colab environment: mount Drive, seed RNGs, create the output directory."""
        # Check if in Colab
        try:
            from google.colab import drive
            print("üìÅ Mounting Google Drive...")
            drive.mount('/content/drive')
            print("‚úì Google Drive mounted")
        except ImportError:
            print("‚ö†Ô∏è  Not in Colab, skipping Drive mount")

        # Set random seeds
        random.seed(self.config.RANDOM_SEED)
        np.random.seed(self.config.RANDOM_SEED)
        torch.manual_seed(self.config.RANDOM_SEED)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(self.config.RANDOM_SEED)

        # Create output directory
        os.makedirs(self.config.OUTPUT_PATH, exist_ok=True)

        print(f"\n‚úì Environment setup complete")
        print(f"  Device: {'GPU' if torch.cuda.is_available() else 'CPU'}")
        if torch.cuda.is_available():
            print(f"  GPU: {torch.cuda.get_device_name(0)}")

    def train(self, data_yaml_path: str):
        """Train the model described by the configuration.

        Args:
            data_yaml_path: Path to the dataset's data.yaml.

        Returns:
            Whatever ``YOLO.train`` returns.
        """
        print("\n" + "="*60)
        print("STEP 3: TRAINING")
        print("="*60)

        model_name = self._resolve_model_name(self.config.MODEL_SIZE)

        # Load model: pretrained weights (.pt) or an untrained architecture (.yaml).
        if self.config.PRETRAINED:
            print(f"\nLoading pretrained {model_name}...")
            self.model = YOLO(f"{model_name}.pt")
        else:
            print(f"\nInitializing {model_name} from scratch...")
            self.model = YOLO(f"{model_name}.yaml")

        # Training arguments
        train_args = {
            # Data
            'data': data_yaml_path,
            'imgsz': self.config.INPUT_SIZE,

            # Training
            'epochs': self.config.EPOCHS,
            'batch': self.config.BATCH_SIZE,
            'workers': self.config.WORKERS,
            'device': 0 if torch.cuda.is_available() else 'cpu',

            # Optimization
            'optimizer': 'AdamW',
            'lr0': self.config.LR0,
            'lrf': self.config.LRF,
            'momentum': 0.937,
            'weight_decay': 0.0005,
            'warmup_epochs': self.config.WARMUP_EPOCHS,

            # Loss weights
            'box': self.config.BOX_WEIGHT,
            'cls': self.config.CLS_WEIGHT,
            'dfl': self.config.DFL_WEIGHT,

            # Augmentation (built-in YOLO augmentations)
            'hsv_h': 0.015,
            'hsv_s': 0.7,
            'hsv_v': 0.4,
            'degrees': 5.0,
            'translate': 0.1,
            'scale': 0.5,
            'shear': 2.0,
            'flipud': 0.0,
            'fliplr': 0.5,
            'mosaic': 1.0,
            'mixup': 0.1,
            'copy_paste': 0.1,

            # Validation
            'val': True,
            'save': True,
            'patience': self.config.PATIENCE,

            # Output
            'project': self.config.OUTPUT_PATH,
            'name': self.RUN_NAME,
            'exist_ok': True,  # reuse the same run directory on re-runs

            # Other
            'pretrained': self.config.PRETRAINED,
            'verbose': True,
            'seed': self.config.RANDOM_SEED,
            'cos_lr': True,
            'close_mosaic': 10,
        }

        print("\nüöÄ Starting training...")
        results = self.model.train(**train_args)

        print("\n‚úì Training complete!")
        return results

    def evaluate(self, data_yaml_path: str):
        """Evaluate the best checkpoint of the run on the test split.

        Args:
            data_yaml_path: Path to the dataset's data.yaml.

        Returns:
            The Ultralytics validation result, or ``None`` when no ``best.pt``
            exists in the run directory.
        """
        print("\n" + "="*60)
        print("STEP 4: EVALUATION")
        print("="*60)

        # Load best model
        best_model_path = os.path.join(
            self.config.OUTPUT_PATH,
            self.RUN_NAME,
            'weights',
            'best.pt'
        )

        if not os.path.exists(best_model_path):
            print(f"‚ö†Ô∏è  Best model not found: {best_model_path}")
            return

        print(f"\nLoading best model: {best_model_path}")
        eval_model = YOLO(best_model_path)

        # Evaluate on test set
        # NOTE(review): Ultralytics validates with a very low confidence
        # threshold by default; evaluating at CONF_THRESHOLD (0.25) cuts off the
        # low-confidence end of the PR curve, so these mAP values are presumably
        # not comparable with default `val` runs — confirm this is intended.
        print("\nEvaluating on test set...")
        test_results = eval_model.val(
            data=data_yaml_path,
            split='test',
            imgsz=self.config.INPUT_SIZE,
            batch=self.config.BATCH_SIZE,
            conf=self.config.CONF_THRESHOLD,
            iou=self.config.IOU_THRESHOLD,
            device=0 if torch.cuda.is_available() else 'cpu',
        )

        # Print results
        print("\n" + "="*60)
        print("TEST RESULTS")
        print("="*60)
        print(f"{'Class':<30} {'Precision':>10} {'Recall':>10} {'mAP50':>10}")
        print("-"*60)

        # Look metrics up by class id; classes with no test instances (e.g. the
        # ignored class) have no row and are reported as 0.
        per_class = self._per_class_metrics(test_results.box)

        for i, name in test_results.names.items():
            if i in self.config.IGNORE_CLASSES:
                marker = "üö´"
            elif i in self.config.NEGATIVE_CLASSES:
                marker = "‚ùå"
            else:
                marker = "‚úÖ"

            p, r, ap = per_class.get(i, (0.0, 0.0, 0.0))

            print(f"{marker} {name:<27} {p:>10.3f} {r:>10.3f} {ap:>10.3f}")

        print("-"*60)
        print(f"{'Overall mAP50':<30} {test_results.box.map50:>10.3f}")
        print(f"{'Overall mAP50-95':<30} {test_results.box.map:>10.3f}")
        print("="*60)

        return test_results


# =============================================================================
# Main Pipeline
# =============================================================================

def main(config=None, run_training=True):
    """Run the complete pipeline: preprocess, analyse/seed, train, evaluate.

    Args:
        config: Configuration object to use. When omitted a fresh ``Config()``
            with the class defaults is created, so pass the notebook's
            configured instance to have per-run overrides take effect.
        run_training: When False, stop after dataset preparation (useful for
            checking preprocessing without committing to hours of training).

    Returns:
        ``(trainer, results, test_results)`` after training and evaluation, or
        ``None`` when ``run_training`` is False.
    """
    print("\n" + "="*70)
    print(" "*15 + "YOLOv11 COMPLETE TRAINING PIPELINE")
    print("="*70)

    if config is None:
        config = Config()
    trainer = YOLOTrainer(config)
    trainer.setup_environment()

    # Step 1: Preprocessing
    preprocessor = Preprocessor(config)
    processed_path = preprocessor.preprocess_dataset(
        config.RAW_DATASET_PATH,
        config.PROCESSED_DATASET_PATH
    )

    # Step 2: Analyze & seed dataset
    seeder = DatasetSeeder(config)
    seeder.analyze_distribution(processed_path, "train")
    seeder.print_distribution()
    seeder.plot_distribution(config.OUTPUT_PATH)

    # Apply seeding if enabled; any other strategy trains on the processed
    # dataset as-is.
    if config.SEED_STRATEGY == "oversampling":
        seeded_path = os.path.join(config.OUTPUT_PATH, "seeded_dataset")
        final_dataset_path = seeder.oversample_rare_classes(processed_path, seeded_path)
    else:
        final_dataset_path = processed_path

    data_yaml_path = os.path.join(final_dataset_path, "data.yaml")

    if not run_training:
        print(f"\nDataset ready at {final_dataset_path}; training skipped (run_training=False)")
        return None

    # Step 3: Train
    results = trainer.train(data_yaml_path)

    # Step 4: Evaluate
    test_results = trainer.evaluate(data_yaml_path)

    return trainer, results, test_results


# Run full pipeline (this will take several hours)
# Pipeline steps:
#   1. Preprocessing (static masking + label cleaning)
#   2. Dataset seeding (class imbalance handling)
#   3. Training with augmentation
#   4. Evaluation
#
# The notebook-level `config` is passed in so the overrides set in the
# configuration cell are the ones actually used.
pipeline_outputs = main(config)

Creating new Ultralytics Settings v0.0.6 file ‚úÖ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.

               YOLOv11 COMPLETE TRAINING PIPELINE
üìÅ Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úì Google Drive mounted

‚úì Environment setup complete
  Device: GPU
  GPU: Tesla T4

STEP 1: PREPROCESSING
Input:  /content/drive/MyDrive/mergedDataset
Output: /content/drive/MyDrive/preprocessedDataset
Ignoring classes: [3]

Processing train: 27514 images


train:   2%|‚ñè         | 452/27514 [10:00<8:44:48,  1.16s/it]

## 7. Visualize Results

In [None]:
import os
from IPython.display import Image, display

results_dir = f"{config.OUTPUT_PATH}/yolov11l_road_defects"

# (heading, candidate files) for every figure to show; the first candidate that
# exists is displayed. A missing figure is reported instead of raising, so one
# absent artefact does not hide the others.
# NOTE(review): recent Ultralytics releases appear to save the PR curve as
# "BoxPR_curve.png" rather than "PR_curve.png" — confirm against the installed
# version; both names are tried.
result_figures = [
    ("Class Distribution:", [f"{config.OUTPUT_PATH}/class_distribution.png"]),
    ("\nTraining Curves:", [f"{results_dir}/results.png"]),
    ("\nConfusion Matrix:", [f"{results_dir}/confusion_matrix.png"]),
    ("\nPR Curve:", [f"{results_dir}/BoxPR_curve.png", f"{results_dir}/PR_curve.png"]),
]

for heading, candidates in result_figures:
    print(heading)
    figure_path = next((path for path in candidates if os.path.isfile(path)), None)
    if figure_path is None:
        print(f"  (not found: {candidates[-1]})")
    else:
        display(Image(filename=figure_path))

## 8. Test Inference

In [None]:
from ultralytics import YOLO
import cv2
import matplotlib.pyplot as plt
import os

# Load best model
best_model = YOLO(f"{results_dir}/weights/best.pt")

# Get a test image: image files only, sorted so the same sample is picked on
# every run (os.listdir order is arbitrary).
test_img_dir = f"{config.PROCESSED_DATASET_PATH}/images/test"
test_images = sorted(
    entry for entry in os.listdir(test_img_dir)
    if entry.lower().endswith((".jpg", ".jpeg", ".png"))
)

if test_images:
    sample_img = os.path.join(test_img_dir, test_images[0])

    # Predict with the thresholds and input size from the configuration.
    results = best_model.predict(
        sample_img,
        conf=config.CONF_THRESHOLD,
        iou=config.IOU_THRESHOLD,
        imgsz=config.INPUT_SIZE,
        save=False,
        verbose=False
    )

    # Visualize (plot() returns a BGR array, matplotlib expects RGB)
    annotated = results[0].plot()

    plt.figure(figsize=(15, 10))
    plt.imshow(cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.title('Sample Detection Result')
    plt.show()

    # Print detections. Class names come from the model itself, falling back to
    # the configured taxonomy for ids the model does not name.
    model_names = results[0].names
    print("\nDetections:")
    for box in results[0].boxes:
        cls_id = int(box.cls[0])
        conf = float(box.conf[0])
        cls_name = model_names.get(cls_id, config.TAXONOMY.get(cls_id, f"unknown_{cls_id}"))
        is_neg = cls_id in config.NEGATIVE_CLASSES
        marker = "‚ùå DISTRACTOR" if is_neg else "‚úÖ DEFECT"
        print(f"  {cls_name}: {conf:.3f}  {marker}")
else:
    print(f"No test images found in {test_img_dir}")

## 9. Export Model

In [None]:
# Export to ONNX for deployment. export() returns the location of the exported
# file, so report that instead of assuming where it was written.
onnx_path = best_model.export(format='onnx', imgsz=config.INPUT_SIZE, simplify=True)

print(f"\n‚úì Model exported to: {onnx_path}")

## 10. Download Results (Optional)

In [None]:
# Compress results for download
!cd {config.OUTPUT_PATH} && zip -r results.zip yolov11l_road_defects/weights yolov11l_road_defects/*.png class_distribution.png

# Download
from google.colab import files
files.download(f'{config.OUTPUT_PATH}/results.zip')