Test

In [None]:
#Uncomment the code below if needed to install the required dependencies
#!pip install nuscenes-devkit
#!pip install scipy==1.5.0 scikit-learn==1.0.2 --force-reinstall --no-deps
#!pip install -U ultralytics
#!pip install Preprocessing


In [6]:
# Mount Google Drive into the Colab runtime; the project paths defined in the
# next cell all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Global variable definitions
import sys

# Root of the project build tree on the mounted Google Drive. Every other
# location below is derived from it, so this is the single place to edit.
BUILD_ROOT = "/content/drive/MyDrive/Colab Notebooks/MSAAI521_FinalProject/build"

# Make modules in the build tree importable. The relative "build" entry is
# inserted last so it ends up first on sys.path, as before. The membership
# check keeps re-running this cell from stacking duplicate entries.
for _module_dir in (BUILD_ROOT, "build"):
    if _module_dir not in sys.path:
        sys.path.insert(0, _module_dir)

# Raw nuScenes data
DATA_ROOT = f"{BUILD_ROOT}/data"
NUSCENES_ROOT = f"{DATA_ROOT}/raw/v1.0-mini"
NUSCENES_VERSION = "v1.0-mini"

# Derived datasets
PREPROCESSED_ROOT = f"{DATA_ROOT}/preprocessed"
YOLO_BEV_ROOT = f"{DATA_ROOT}/yolo_bev"

# Model, run, result and figure output locations
MODELS_ROOT = f"{BUILD_ROOT}/models"

RUNS_ROOT = f"{BUILD_ROOT}/runs"
RESULTS_ROOT = f"{BUILD_ROOT}/results"

VISUALIZATIONS_ROOT = f"{BUILD_ROOT}/visualizations"



In [8]:
#Creating the Downloader Class
from pathlib import Path
#from Globals import NUSCENES_ROOT


class DataDownloader:
    """
    Checks whether the nuScenes dataset folder is present and, if it is not,
    prints manual download instructions. Nothing is downloaded by this class.

    Attributes:
        root: Path of the folder expected to hold the dataset
    """

    def __init__(self, root_path=NUSCENES_ROOT):
        """
        Args:
            root_path: Location where the dataset is expected (defaults to NUSCENES_ROOT).
        """
        self.root = Path(root_path)

    def check_and_prompt(self):
        """
        Report whether the dataset folder exists.

        Returns:
            True when ``self.root`` exists. Otherwise the download instructions
            are printed to the console and False is returned.
        """
        dataset_present = self.root.exists()

        if not dataset_present:
            instructions = (
                f"nuScenes dataset not found at: {self.root}",
                "",
                "Download instructions:",
                "1. Visit: https://www.nuscenes.org/nuscenes#download",
                "2. Download v1.0-mini (4 GB)",
                f"3. Extract to: {self.root}",
                "",
            )
            for line in instructions:
                print(line)

        return dataset_present

In [9]:
from pathlib import Path
#from Globals import NUSCENES_ROOT, NUSCENES_VERSION


class DataValidator:
    """
    Verifies that a nuScenes dataset folder contains the directories and
    metadata files this project relies on.

    Attributes:
        root: Path of the dataset folder
        version: Name of the metadata sub-folder (e.g. the dataset version string)
        required_dirs: Paths, relative to ``root``, that must exist
        required_files: Metadata JSON files, relative to ``root``, that must exist
    """

    def __init__(self, root_path=NUSCENES_ROOT, version=NUSCENES_VERSION):
        """
        Args:
            root_path: Dataset folder to validate (defaults to NUSCENES_ROOT).
            version: Metadata sub-folder name (defaults to NUSCENES_VERSION).
        """
        self.root = Path(root_path)
        self.version = version

        self.required_dirs = ['samples/LIDAR_TOP', 'sweeps/LIDAR_TOP', self.version]

        # One JSON file per metadata table, all stored under the version folder.
        metadata_tables = (
            'sample',
            'sample_data',
            'sample_annotation',
            'ego_pose',
            'calibrated_sensor',
            'scene',
            'instance',
            'category',
        )
        self.required_files = [f'{self.version}/{table}.json' for table in metadata_tables]

    def validate(self):
        """
        Check the dataset layout.

        Returns:
            True only if ``root`` exists and every entry of ``required_dirs`` and
            ``required_files`` exists beneath it; False as soon as one is missing.
        """
        if not self.root.exists():
            return False

        # Directories are checked before files; all() stops at the first miss.
        expected_entries = [*self.required_dirs, *self.required_files]
        return all((self.root / relative).exists() for relative in expected_entries)



In [10]:
"""
RawDataInspector: A comprehensive tool for exploring and visualizing nuScenes autonomous driving dataset.

This class provides methods to inspect, analyze, and visualize various components of the nuScenes dataset
including LiDAR point clouds, 3D bounding box annotations, camera images, and scene metadata.

Technical Overview:
- Interfaces with nuScenes API to access dataset metadata and sensor data
- Processes LiDAR point clouds stored in binary .pcd.bin format
- Handles 3D coordinate transformations and spatial data visualization
- Generates matplotlib-based visualizations of multi-sensor autonomous vehicle data
"""

import numpy as np
import os
from pathlib import Path
from nuscenes.utils.data_classes import LidarPointCloud
from nuscenes.utils.data_classes import Box
from nuscenes.utils.geometry_utils import BoxVisibility
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D



class RawDataInspector:
    """
    Inspector class for analyzing raw nuScenes dataset components.

    This class provides a suite of methods to explore autonomous vehicle sensor data,
    including LiDAR point clouds, camera images, and 3D object annotations. All visualizations
    are saved to disk rather than displayed interactively for compatibility with headless environments.

    Attributes:
        nusc: NuScenes instance providing access to the dataset API
        output_dir: Path object pointing to directory where visualizations are saved
    """

    def __init__(self, nusc, output_dir='build/visualizations'):
        """
        Initialize the RawDataInspector with a nuScenes dataset instance.

        Args:
            nusc: A NuScenes instance that provides API access to the dataset.
                  This object contains all the metadata tables (scenes, samples, annotations, etc.)
                  and methods to query and render the dataset.
            output_dir: String path where visualization files will be saved.
                       Defaults to 'build/visualizations'. Directory is created if it doesn't exist.

        Technical Details:
            - Creates output directory structure using pathlib for cross-platform compatibility
            - Stores nuScenes instance for accessing dataset metadata tables and file paths
        """
        self.nusc = nusc
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)


    def inspect_point_cloud(self, sample_data_token):
        """
        Load one LiDAR capture and summarise the extent of its point array.

        Args:
            sample_data_token: Token of the ``sample_data`` record whose point
                               cloud file should be loaded.

        Returns:
            Dictionary with the keys, in this order:
                - shape: Shape of the loaded point array
                - num_points: Length of the array's second axis (one column per point)
                - x_range: (min, max) of row 0
                - y_range: (min, max) of row 1
                - z_range: (min, max) of row 2
                - intensity_range: (min, max) of row 3

        Technical Details:
            - The file path is ``nusc.dataroot`` joined with the record's 'filename'
            - The file is read with LidarPointCloud.from_file
            - Rows are treated as x, y, z, intensity; this follows the nuScenes
              devkit layout and is not re-checked here
            - An empty point cloud makes min()/max() raise
        """
        record = self.nusc.get('sample_data', sample_data_token)
        cloud = LidarPointCloud.from_file(
            os.path.join(self.nusc.dataroot, record['filename'])
        )
        points = cloud.points

        def extent(row):
            # (smallest, largest) value found in one row of the point array
            return (row.min(), row.max())

        x_row, y_row, z_row, intensity_row = points[0], points[1], points[2], points[3]

        return {
            'shape': points.shape,
            'num_points': points.shape[1],
            'x_range': extent(x_row),
            'y_range': extent(y_row),
            'z_range': extent(z_row),
            'intensity_range': extent(intensity_row),
        }

    def inspect_annotations(self, sample_token):
        """
        Extract all 3D bounding box annotations for a given sample (timestamp).

        Each sample in nuScenes represents a synchronized snapshot from all sensors at a specific
        timestamp. This method retrieves all object annotations (3D bounding boxes) associated
        with that sample.

        Args:
            sample_token: String UUID identifying a sample (multi-sensor snapshot at one timestamp).
                         References an entry in the sample table.

        Returns:
            List of dictionaries, one per annotated object, each containing:
                - category: String object class (e.g., 'vehicle.car', 'human.pedestrian.adult')
                - translation: [x, y, z] center of 3D bounding box in global coordinates (meters)
                - size: [width, length, height] dimensions of bounding box (meters)
                - rotation: Quaternion [w, x, y, z] representing 3D orientation of the box

        Technical Details:
            1. Queries sample table to get list of annotation tokens for this timestamp
            2. Each sample contains 'anns' field: list of annotation token UUIDs
            3. For each annotation token, queries sample_annotation table for full metadata
            4. Coordinate system: Global world coordinates (not ego vehicle frame)
            5. Rotation format: Quaternion for 3D rotation (avoids gimbal lock issues)
            6. Size convention: [width (left-right), length (forward-back), height (up-down)]
        """
        # Retrieve the sample record containing list of annotation tokens
        sample = self.nusc.get('sample', sample_token)

        # Collect detailed annotation data for each object in this sample
        annotations = []
        for ann_token in sample['anns']:  # Iterate over all annotation UUIDs
            # Query sample_annotation table for full annotation metadata
            ann = self.nusc.get('sample_annotation', ann_token)

            # Extract key 3D bounding box parameters
            annotations.append({
                'category': ann['category_name'],  # Object class label
                'translation': ann['translation'],  # 3D position: [x, y, z] in global frame (meters)
                'size': ann['size'],  # Box dimensions: [width, length, height] (meters)
                'rotation': ann['rotation']  # Orientation: quaternion [w, x, y, z]
            })

        return annotations

    def visualize_3d_scene(self, sample_token, max_points=10000):
        """
        Save a 3D scatter plot of a sample's LIDAR_TOP point cloud with the
        annotated boxes overlaid as red wireframes.

        Args:
            sample_token: Token of the ``sample`` record to visualize.
            max_points: Upper bound on the number of points drawn. When the cloud
                        is larger, a random subset of this size is plotted.
                        Defaults to 10000.

        Returns:
            None. The figure is written to
            ``<output_dir>/3d_scene_<sample_token>.png`` and the path is printed.

        Technical Details:
            1. Data Loading:
               - Reads the sample's LIDAR_TOP ``sample_data`` record and loads its file
               - Keeps only the first three rows (x, y, z) of the point array

            2. Point Subsampling:
               - numpy.random.choice without replacement, at most ``max_points`` columns
               - No seed is set here, so the chosen subset differs between calls

            3. 3D Scatter Plot:
               - Points are coloured by their z value with the 'viridis' colormap
               - Small markers (s=0.1) with alpha=0.5

            4. Bounding Box Overlay:
               - Boxes come from ``nusc.get_sample_data`` for the same LiDAR token, so
                 they are expected to be in the same frame as the loaded points
               - ``box.corners()`` yields a 3x8 array; per the nuScenes devkit the
                 first four columns are the forward-facing face and the last four the
                 rear face. The previous code joined only columns 0-3 while calling
                 them the bottom face, which drew a single vertical rectangle per box.
               - All 12 edges are drawn: front ring, rear ring, four connecting edges

            5. Output:
               - Saved through the figure handle at 150 DPI with a tight bounding box
               - That specific figure is closed afterwards to release memory
        """
        # Locate and load the LIDAR_TOP point cloud that belongs to this sample.
        sample = self.nusc.get('sample', sample_token)
        lidar_token = sample['data']['LIDAR_TOP']
        sample_data = self.nusc.get('sample_data', lidar_token)
        pcl_path = os.path.join(self.nusc.dataroot, sample_data['filename'])
        points = LidarPointCloud.from_file(pcl_path).points[:3, :]  # drop intensity row

        # Random subset keeps rendering fast on large clouds.
        num_points = points.shape[1]
        indices = np.random.choice(num_points,
                                   size=min(max_points, num_points),
                                   replace=False)
        points_sampled = points[:, indices]

        fig = plt.figure(figsize=(12, 8))
        ax = fig.add_subplot(111, projection='3d')

        ax.scatter(points_sampled[0],
                   points_sampled[1],
                   points_sampled[2],
                   c=points_sampled[2],  # colour encodes height
                   cmap='viridis',
                   s=0.1,
                   alpha=0.5)

        # get_sample_data returns (data_path, boxes, camera_intrinsic); only the
        # boxes are needed here.
        _, boxes, _ = self.nusc.get_sample_data(lidar_token,
                                                box_vis_level=BoxVisibility.ANY)

        # Corner index pairs forming the 12 edges of a cuboid:
        # front ring (0-3), rear ring (4-7), then front-to-rear connectors.
        box_edges = (
            (0, 1), (1, 2), (2, 3), (3, 0),
            (4, 5), (5, 6), (6, 7), (7, 4),
            (0, 4), (1, 5), (2, 6), (3, 7),
        )
        for box in boxes:
            corners = box.corners()  # 3x8: rows are x, y, z
            for start, end in box_edges:
                ax.plot([corners[0, start], corners[0, end]],
                        [corners[1, start], corners[1, end]],
                        [corners[2, start], corners[2, end]],
                        'r-', linewidth=2)

        ax.set_xlabel('X (m)')
        ax.set_ylabel('Y (m)')
        ax.set_zlabel('Z (m)')
        ax.set_title('3D LiDAR Scene with Annotations')

        # Save and close via the explicit handle so an unrelated "current figure"
        # can never be written or closed by mistake.
        output_path = self.output_dir / f'3d_scene_{sample_token}.png'
        fig.savefig(output_path, dpi=150, bbox_inches='tight')
        plt.close(fig)
        print(f"Saved 3D scene visualization to: {output_path}")

    def list_scenes(self):
        """
        Print the dataset's scene listing to the console.

        Thin wrapper: the call is forwarded unchanged to ``self.nusc.list_scenes()``,
        so the columns and formatting are whatever that API prints.

        Returns:
            None. Nothing is collected or returned by this method.

        Technical Details:
            - Presumably one line per scene with name, timestamp, duration,
              location and annotation count -- TODO confirm against the installed
              nuScenes devkit version
            - Useful for browsing the dataset and picking scenes for the
              visualization methods of this class
        """
        self.nusc.list_scenes()

    def visualize_sample(self, scene_index=1):
        """
        Save the nuScenes multi-sensor rendering of a scene's first sample.

        Args:
            scene_index: Position of the scene in ``nusc.scene`` whose first sample
                         is rendered. Defaults to 1 (the second scene), which was
                         previously hard-coded. An out-of-range index raises
                         IndexError.

        Returns:
            None. The image is written to
            ``<output_dir>/sample_<first_sample_token>.png`` and the path is printed.

        Technical Details:
            1. The scene's 'first_sample_token' selects the sample to render
            2. Rendering is delegated to ``nusc.render_sample``; the layout and the
               overlaid annotations are decided by the devkit
            3. The output path is handed to the devkit through ``out_path`` so the
               file is written while the figure still exists. The devkit may call
               ``plt.show()`` itself, and under the notebook inline backend that
               closes the figure, so a ``plt.savefig`` issued afterwards would write
               an empty canvas.
            4. Resolution and padding of the saved image are therefore the devkit's
            5. Any figure left open is closed afterwards to release memory
        """
        my_scene = self.nusc.scene[scene_index]
        first_sample_token = my_scene["first_sample_token"]

        output_path = self.output_dir / f'sample_{first_sample_token}.png'

        # Let the devkit save the figure before it gets a chance to show/close it.
        self.nusc.render_sample(first_sample_token, out_path=str(output_path))
        plt.close()  # no-op when the backend already closed the figure
        print(f"Saved sample visualization to: {output_path}")

    def visualize_sample_data(self, scene_index=1, sensor='CAM_FRONT'):
        """
        Save the nuScenes rendering of one sensor capture from a scene's first sample.

        Args:
            scene_index: Position of the scene in ``nusc.scene`` to use. Defaults to 1
                         (previously hard-coded). An out-of-range index raises
                         IndexError.
            sensor: Key into the sample's 'data' mapping naming the sensor channel
                    to render. Defaults to 'CAM_FRONT' (previously hard-coded). An
                    unknown channel raises KeyError.

        Returns:
            None. The image is written to
            ``<output_dir>/sample_data_<sample_data_token>.png`` and the path is
            printed.

        Technical Details:
            1. Data Flow:
               - scene -> 'first_sample_token' -> sample record
               - sample['data'][sensor] -> sample_data token -> sample_data record

            2. Rendering:
               - Delegated to ``nusc.render_sample_data``; image loading, box
                 projection and visibility filtering are done by the devkit

            3. Saving:
               - The output path is passed as ``out_path`` so the devkit writes the
                 file while the figure still exists. The devkit may call
                 ``plt.show()`` itself, which under the notebook inline backend
                 closes the figure and would leave a later ``plt.savefig`` with an
                 empty canvas.
               - Resolution and padding of the saved image are therefore the devkit's
               - Any figure left open is closed afterwards to release memory
        """
        my_scene = self.nusc.scene[scene_index]
        first_sample_token = my_scene["first_sample_token"]
        my_sample = self.nusc.get("sample", first_sample_token)

        sensor_record = self.nusc.get('sample_data', my_sample['data'][sensor])

        output_path = self.output_dir / f'sample_data_{sensor_record["token"]}.png'

        # Let the devkit save the figure before it gets a chance to show/close it.
        self.nusc.render_sample_data(sensor_record['token'], out_path=str(output_path))
        plt.close()  # no-op when the backend already closed the figure
        print(f"Saved sample data visualization to: {output_path}")

    def visualize_annotation(self):
        """
        Render individual object annotations across all camera views where visible.

        For each annotated object, creates a visualization showing the object's 2D bounding box
        projected onto all camera images where it appears. Demonstrates how a single 3D object
        annotation maps to multiple 2D views.

        Technical Details:
            1. Annotation Selection:
               - Processes first 3 annotations from scene 1's first sample
               - Limits to 3 to avoid excessive output (samples can have 50+ annotations)

            2. For Each Annotation:
               a) Prints full annotation metadata to console:
                  - token: Unique annotation identifier
                  - sample_token: Parent sample (timestamp) reference
                  - instance_token: Object instance ID (tracks same object across frames)
                  - visibility_token: How well object is visible (1-4 scale)
                  - category_name: Object class (e.g., 'vehicle.car', 'human.pedestrian.adult')
                  - translation: 3D center position [x, y, z] in global coordinates (meters)
                  - size: [width, length, height] of 3D bounding box (meters)
                  - rotation: Quaternion [w, x, y, z] for 3D orientation
                  - num_lidar_pts: Point cloud points inside this box
                  - num_radar_pts: Radar detections for this object

               b) Calls nuScenes' render_annotation() which:
                  - Loads images from all 6 cameras
                  - Projects 3D box to 2D in each camera view
                  - Only shows cameras where object is visible (in field of view)
                  - Creates multi-panel figure with relevant camera views

            3. Use Cases:
               - Understanding annotation structure and metadata
               - Verifying annotation quality across multiple views
               - Debugging object tracking and visibility

            4. Output: One PNG per annotation showing all relevant camera views
        """
        # Navigate to scene 1, first timestamp
        my_scene = self.nusc.scene[1]
        first_sample_token = my_scene["first_sample_token"]
        my_sample = self.nusc.get("sample", first_sample_token)
        annotation_tokens = my_sample['anns']  # List of all annotation tokens for this sample

        # Process first 3 annotations only (for demonstration)
        for idx, annotation_token in enumerate(annotation_tokens[:3]):
            # Print complete annotation metadata to console for inspection
            my_annotation_metadata = self.nusc.get('sample_annotation', annotation_token)
            print(my_annotation_metadata)

            # Render this annotation across all cameras where visible
            self.nusc.render_annotation(annotation_token)  # Creates matplotlib figure
            output_path = self.output_dir / f'annotation_{idx}_{annotation_token}.png'
            plt.savefig(output_path, dpi=150, bbox_inches='tight')
            plt.close()  # Free memory
            print(f"Saved annotation visualization to: {output_path}")


In [11]:
"""
DatasetConfigGenerator: Create YOLO dataset.yaml configuration file.

This module generates the dataset.yaml file required by YOLO training, which
specifies paths to train/val/test images and defines class names. The configuration
references files directly in the preprocessed directory without duplication.

Key Operations:
- Generate dataset.yaml with absolute paths
- Define class names and IDs
- Save configuration for YOLO training
"""

from pathlib import Path
import yaml
#from Globals import PREPROCESSED_ROOT, DATA_ROOT


class DatasetConfigGenerator:
    """
    Generator for YOLO dataset configuration files.

    Creates a dataset.yaml file whose train/val/test entries each point at a text
    file listing that split's images. The images and labels stay in their
    original preprocessed locations; nothing is copied.

    Attributes:
        class_names: List of detection class names
        num_classes: Number of detection classes
    """

    # Splits that the dataset.yaml always declares.
    _YAML_SPLITS = ('train', 'val', 'test')

    def __init__(self):
        """
        Initialize the dataset config generator with class definitions.

        Technical Details:
            - Class IDs map to list indices: 0=car, 1=truck_bus, 2=pedestrian, 3=cyclist
            - The order is meant to match the class mapping used when the labels
              were written (YOLOAnnotationConverter, defined outside this cell)
        """
        self.class_names = ['car', 'truck_bus', 'pedestrian', 'cyclist']
        self.num_classes = len(self.class_names)

    @staticmethod
    def _relative_to_root(image_path, root):
        """Return image_path relative to root, or unchanged if it lies outside root."""
        try:
            return image_path.relative_to(root)
        except ValueError:
            return image_path

    def generate(self, splits, output_path):
        """
        Generate the YOLO dataset.yaml file and the per-split image lists.

        Args:
            splits: Mapping of split name to a mapping with an 'images' entry, an
                    iterable of image paths (str or Path), as produced by
                    DataSplitter.
            output_path: Path where dataset.yaml should be saved. Its parent
                         directory is created if needed.

        Returns:
            Path to the generated dataset.yaml file

        Technical Details:

            **Why list files:**
            Previously train, val and test all pointed at the same ``images``
            directory, so YOLO trained and evaluated on every image and the
            computed split was ignored. Each split now points at a text file that
            lists only its own images.

            **YAML Structure:**
            ```yaml
            path: /absolute/path/to/preprocessed
            train: /absolute/path/to/split_manifests/train.txt
            val: /absolute/path/to/split_manifests/val.txt
            test: /absolute/path/to/split_manifests/test.txt

            names:
              0: car
              1: truck_bus
              2: pedestrian
              3: cyclist

            nc: 4
            ```

            **Files written next to dataset.yaml, in split_manifests/:**
            - ``<split>.txt``: absolute image paths, one per line (read by YOLO)
            - ``<split>_files.txt``: the same images relative to the preprocessed
              root (reference manifest, unchanged format)

            **Path Handling:**
            - Image paths are resolved to absolute paths before being written
            - Ultralytics derives each label path from the image path by swapping
              the ``images`` directory for ``labels``, so the parallel
              images/labels layout is still required
            - An image outside the preprocessed root is recorded with its absolute
              path in the reference manifest instead of raising
            - A split that is missing from ``splits`` or has no images falls back
              to the whole ``images`` directory, and a warning is printed
        """
        print(f"\nGenerating dataset.yaml...")

        preprocessed_path = Path(PREPROCESSED_ROOT).resolve()
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        manifest_dir = output_path.parent / 'split_manifests'
        manifest_dir.mkdir(exist_ok=True)

        # ============================================================
        # STEP 1: Normalize the image paths of every split
        # ============================================================
        # Path(...) accepts both str and Path entries; resolve() makes them
        # absolute so they are valid regardless of the working directory.
        resolved_splits = {
            split_name: [Path(img_path).resolve() for img_path in split_data['images']]
            for split_name, split_data in splits.items()
        }

        # ============================================================
        # STEP 2: Build and save the dataset configuration
        # ============================================================
        config = {'path': str(preprocessed_path)}
        for split_name in self._YAML_SPLITS:
            if resolved_splits.get(split_name):
                config[split_name] = str((manifest_dir / f'{split_name}.txt').resolve())
            else:
                # Keep the YAML loadable, but make the overlap visible.
                print(f"  Warning: no images supplied for '{split_name}' split; "
                      f"falling back to the full images directory")
                config[split_name] = 'images'

        config['names'] = {i: name for i, name in enumerate(self.class_names)}
        config['nc'] = self.num_classes

        with open(output_path, 'w') as f:
            yaml.dump(config, f, default_flow_style=False, sort_keys=False)

        print(f"✓ Dataset configuration saved to: {output_path}")

        # ============================================================
        # STEP 3: Write the per-split list and manifest files
        # ============================================================
        for split_name, image_paths in resolved_splits.items():
            # Absolute list consumed by YOLO through dataset.yaml
            list_path = manifest_dir / f'{split_name}.txt'
            with open(list_path, 'w') as f:
                f.writelines(f"{img_path}\n" for img_path in image_paths)

            # Human-readable manifest relative to the preprocessed root
            manifest_path = manifest_dir / f'{split_name}_files.txt'
            with open(manifest_path, 'w') as f:
                for img_path in image_paths:
                    rel_path = self._relative_to_root(img_path, preprocessed_path)
                    f.write(f"{rel_path}\n")

            print(f"  Saved {split_name} manifest: {manifest_path}")

        return output_path



In [12]:
"""
DataSplitter: Split preprocessed dataset into train/validation/test sets.

This module handles the splitting of the preprocessed BEV dataset into training,
validation, and test subsets. The splits are performed in memory by creating lists
of file paths - no files are copied or duplicated.

Key Operations:
- Load all preprocessed image/label pairs
- Perform stratified train/val/test split
- Return file path lists for each split
- Preserve class distribution across splits
"""

from pathlib import Path
from sklearn.model_selection import train_test_split
import random
#from Globals import PREPROCESSED_ROOT


class DataSplitter:
    """
    Splitter for dividing preprocessed dataset into train/val/test sets in memory.

    This class shuffles the samples with a fixed random seed and divides them
    according to the configured ratios. The split is not stratified, so the class
    distribution may differ between splits. File paths are organized into lists
    without copying or moving any actual files.

    Attributes:
        preprocessed_root: Path to preprocessed dataset directory
        images_dir: Path to preprocessed images directory
        labels_dir: Path to preprocessed labels directory
        train_ratio: Fraction of data for training (default: 0.7)
        val_ratio: Fraction of data for validation (default: 0.15)
        test_ratio: Fraction of data for testing (default: 0.15)
        random_seed: Random seed for reproducibility
    """

    def __init__(self, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, random_seed=42):
        """
        Initialize the data splitter with split ratios.

        Args:
            train_ratio: Fraction of data for training (0 < x < 1)
            val_ratio: Fraction of data for validation (0 < x < 1)
            test_ratio: Fraction of data for testing (0 < x < 1)
            random_seed: Random seed for reproducible splits

        Technical Details:
            - Ratios must sum to 1.0
            - Typical splits: 70/15/15 or 80/10/10
            - Random seed ensures same split across runs
        """
        # Validate ratios sum to 1.0
        total = train_ratio + val_ratio + test_ratio
        if abs(total - 1.0) > 1e-6:
            raise ValueError(f"Split ratios must sum to 1.0, got {total}")

        self.train_ratio = train_ratio
        self.val_ratio = val_ratio
        self.test_ratio = test_ratio
        self.random_seed = random_seed

        # Set random seeds for reproducibility
        random.seed(random_seed)

        # Setup paths
        self.preprocessed_root = Path(PREPROCESSED_ROOT)
        self.images_dir = self.preprocessed_root / 'images'
        self.labels_dir = self.preprocessed_root / 'labels'

    def split(self):
        """
        Split preprocessed dataset into train/val/test sets in memory.

        Returns file path lists without copying any files. The returned paths
        reference the original files in the preprocessed directory.

        Returns:
            Dictionary with keys 'train', 'val', 'test', each containing:
            {
                'images': List of Path objects to image files,
                'labels': List of Path objects to label files
            }

        Technical Details:
            - Two-stage split: first separate test, then split remainder into train/val
            - File paths are kept as Path objects, no files copied
            - Shuffle ensures random distribution
            - Random seed provides reproducibility
        """
        # ============================================================
        # STEP 1: Get all image files
        # ============================================================
        print("Scanning preprocessed dataset...")
        image_files = sorted(list(self.images_dir.glob('*.png')))

        if len(image_files) == 0:
            raise FileNotFoundError(
                f"No preprocessed images found in {self.images_dir}\n"
                f"Have you run the preprocessing stage?"
            )

        print(f"Found {len(image_files)} preprocessed samples")

        # ============================================================
        # STEP 2: Create list of filename stems
        # ============================================================
        file_stems = [f.stem for f in image_files]

        # ============================================================
        # STEP 3: First split - separate test set
        # ============================================================
        train_val_stems, test_stems = train_test_split(
            file_stems,
            test_size=self.test_ratio,
            random_state=self.random_seed,
            shuffle=True
        )

        # ============================================================
        # STEP 4: Second split - separate train and val
        # ============================================================
        val_ratio_adjusted = self.val_ratio / (self.train_ratio + self.val_ratio)

        train_stems, val_stems = train_test_split(
            train_val_stems,
            test_size=val_ratio_adjusted,
            random_state=self.random_seed,
            shuffle=True
        )

        # ============================================================
        # STEP 5: Build file path dictionaries (no copying)
        # ============================================================
        splits = {
            'train': self._build_file_lists(train_stems),
            'val': self._build_file_lists(val_stems),
            'test': self._build_file_lists(test_stems)
        }

        # Print split summary
        print(f"\nDataset split:")
        print(f"  Train: {len(train_stems)} samples ({len(train_stems)/len(file_stems)*100:.1f}%)")
        print(f"  Val:   {len(val_stems)} samples ({len(val_stems)/len(file_stems)*100:.1f}%)")
        print(f"  Test:  {len(test_stems)} samples ({len(test_stems)/len(file_stems)*100:.1f}%)")

        return splits

    def _build_file_lists(self, file_stems):
        """
        Build lists of image and label file paths from filename stems.

        Args:
            file_stems: List of filename stems (without extensions)

        Returns:
            Dictionary with 'images' and 'labels' keys containing Path lists
        """
        images = [self.images_dir / f"{stem}.png" for stem in file_stems]
        labels = [self.labels_dir / f"{stem}.txt" for stem in file_stems]

        return {
            'images': images,
            'labels': labels
        }



In [13]:
"""
ModelEvaluator: Evaluate trained YOLO model on test set.

This module handles model evaluation including inference on test set, metrics
computation (mAP, precision, recall), and performance analysis. It provides
comprehensive assessment of detection performance.

Key Operations:
- Run inference on test set
- Compute detection metrics (mAP@0.5, mAP@0.5:0.95)
- Calculate per-class performance
- Measure inference speed
"""

from ultralytics import YOLO
from pathlib import Path


class ModelEvaluator:
    """
    Evaluator for trained YOLO detection models.

    This class handles model evaluation on test data, computing standard object
    detection metrics and analyzing performance across different classes and
    confidence thresholds.

    Attributes:
        model: YOLO model instance loaded from weights
        model_path: Path to model weights file
        dataset_yaml: Path to dataset configuration
        class_names: List of detection class names
    """

    def __init__(self, model_path, dataset_yaml):
        """
        Initialize the model evaluator.

        Args:
            model_path: Path to trained model weights (.pt file)
            dataset_yaml: Path to dataset.yaml configuration

        Technical Details:
            - Loads model from weights file
            - Validates model architecture
            - Prepares for evaluation on test set
        """
        self.model_path = Path(model_path)
        self.dataset_yaml = str(dataset_yaml)
        self.class_names = ['Car', 'Truck/Bus', 'Pedestrian', 'Cyclist']

        # Load trained model
        print(f"\nLoading model from: {self.model_path}")
        if not self.model_path.exists():
            raise FileNotFoundError(f"Model weights not found: {self.model_path}")

        self.model = YOLO(str(self.model_path))
        print("✓ Model loaded successfully")

    def evaluate(self, conf_threshold=0.25, iou_threshold=0.45, img_size=1000):
        """
        Evaluate model on test set.

        Runs inference on all test images and computes comprehensive detection
        metrics including mAP, precision, recall, and per-class performance.

        Args:
            conf_threshold: Confidence threshold for predictions (default: 0.25)
            iou_threshold: IoU threshold for NMS (default: 0.45)
            img_size: Input image size (default: 1000)

        Returns:
            Results object containing metrics and predictions

        Technical Details:

            **Metrics Computed:**

            • mAP@0.5 (Mean Average Precision at IoU=0.5):
              - Standard COCO metric for loose localization
              - Considers detection "correct" if IoU ≥ 0.5
              - Values typically 0.5-0.9 for good detectors

            • mAP@0.5:0.95 (Mean Average Precision at IoU=0.5:0.95):
              - Averaged over IoU thresholds from 0.5 to 0.95 (step 0.05)
              - More stringent metric requiring tighter localization
              - Values typically 0.3-0.6 for good detectors

            • Precision:
              - TP / (TP + FP)
              - Proportion of correct detections among all detections
              - Higher = fewer false alarms

            • Recall:
              - TP / (TP + FN)
              - Proportion of ground truth objects detected
              - Higher = fewer missed objects

            **Confidence Threshold:**
            - 0.25: Default, balances precision and recall
            - Lower: More detections, higher recall, lower precision
            - Higher: Fewer detections, lower recall, higher precision

            **NMS IoU Threshold:**
            - 0.45: Removes overlapping boxes (keep best)
            - Lower: More aggressive suppression
            - Higher: Keeps more overlapping detections
        """
        print("\n" + "="*80)
        print("MODEL EVALUATION")
        print("="*80)

        print(f"\nEvaluation settings:")
        print(f"  Confidence threshold: {conf_threshold}")
        print(f"  NMS IoU threshold: {iou_threshold}")
        print(f"  Image size: {img_size}")

        # ============================================================
        # Run validation on test split
        # ============================================================
        print(f"\nRunning inference on test set...")

        results = self.model.val(
            data=self.dataset_yaml,
            split='test',
            imgsz=img_size,
            batch=16,
            conf=conf_threshold,
            iou=iou_threshold,
            plots=True,
            save_json=True,
            save_txt=True
        )

        print("✓ Evaluation complete")

        return results

    def print_metrics(self, results):
        """
        Print comprehensive evaluation metrics.

        Displays overall and per-class performance metrics in a formatted
        table for easy interpretation.

        Args:
            results: Results object from evaluate()

        Technical Details:
            - Extracts metrics from YOLO results object
            - Formats for console display
            - Includes inference timing statistics
        """
        print("\n" + "="*80)
        print("EVALUATION METRICS")
        print("="*80)

        # ============================================================
        # Overall metrics
        # ============================================================
        print(f"\nOverall Performance:")
        print(f"  mAP@0.5:      {results.box.map50:.4f}")
        print(f"  mAP@0.5:0.95: {results.box.map:.4f}")
        print(f"  Precision:    {results.box.mp:.4f}")
        print(f"  Recall:       {results.box.mr:.4f}")

        # ============================================================
        # Per-class metrics
        # ============================================================
        print(f"\nPer-Class Performance:")
        print(f"  {'Class':<15} {'mAP@0.5':<10} {'Precision':<12} {'Recall':<10}")
        print(f"  {'-'*50}")

        for i, class_name in enumerate(self.class_names):
            map50 = results.box.maps[i] if hasattr(results.box, 'maps') else 0.0
            precision = results.box.p[i] if hasattr(results.box, 'p') else 0.0
            recall = results.box.r[i] if hasattr(results.box, 'r') else 0.0

            print(f"  {class_name:<15} {map50:<10.4f} {precision:<12.4f} {recall:<10.4f}")

        # ============================================================
        # Inference speed
        # ============================================================
        print(f"\nInference Speed:")
        if hasattr(results, 'speed'):
            preprocess_time = results.speed.get('preprocess', 0)
            inference_time = results.speed.get('inference', 0)
            postprocess_time = results.speed.get('postprocess', 0)
            total_time = preprocess_time + inference_time + postprocess_time

            print(f"  Preprocess:  {preprocess_time:.2f} ms")
            print(f"  Inference:   {inference_time:.2f} ms")
            print(f"  Postprocess: {postprocess_time:.2f} ms")
            print(f"  Total:       {total_time:.2f} ms")
            print(f"  FPS:         {1000 / total_time:.2f}")

        print("\n" + "="*80)

    def predict_batch(self, image_paths, conf_threshold=0.25, save_dir=None):
        """
        Run inference on a batch of images.

        Useful for visualizing predictions on specific images or creating
        demo outputs.

        Args:
            image_paths: List of paths to images
            conf_threshold: Confidence threshold (default: 0.25)
            save_dir: Directory to save prediction visualizations (optional)

        Returns:
            List of prediction results, one per image

        Technical Details:
            - Processes images in batch for efficiency
            - Optionally saves annotated images
            - Returns raw prediction results for further processing
        """
        results = self.model.predict(
            source=image_paths,
            conf=conf_threshold,
            save=save_dir is not None,
            project=save_dir,
            exist_ok=True
        )

        return results



Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [14]:
"""
PerformanceAnalyzer: Analyze model performance across different scenarios.

This module provides detailed performance analysis including per-class metrics,
confidence threshold analysis, and error pattern identification.

Key Operations:
- Compute detailed per-class statistics
- Analyze performance across confidence thresholds
- Identify common failure modes
- Generate performance summary tables
"""

import numpy as np
from pathlib import Path


class PerformanceAnalyzer:
    """
    Analyzer for detailed model performance assessment.

    Provides tools for understanding model behavior beyond basic metrics,
    including analysis of failure cases and performance across different
    object types and scenarios.

    Attributes:
        class_names: List of detection class names
    """

    def __init__(self):
        """
        Initialize the performance analyzer.

        Technical Details:
            - Prepares analysis frameworks
            - Sets up metric tracking structures
        """
        self.class_names = ['Car', 'Truck/Bus', 'Pedestrian', 'Cyclist']

    def analyze_class_distribution(self, labels_dir):
        """
        Analyze class distribution in the dataset.

        Counts instances of each class across all labels to understand
        dataset balance and potential class imbalance issues.

        Args:
            labels_dir: Directory containing YOLO label files

        Returns:
            Dictionary mapping class names to instance counts

        Technical Details:
            - Parses all label files
            - Aggregates class counts
            - Identifies imbalanced classes
        """
        print("\nAnalyzing class distribution...")

        labels_dir = Path(labels_dir)
        class_counts = {name: 0 for name in self.class_names}

        # Count instances in each label file
        for label_file in labels_dir.glob('*.txt'):
            with open(label_file, 'r') as f:
                for line in f:
                    parts = line.strip().split()
                    if len(parts) == 5:
                        class_id = int(parts[0])
                        if 0 <= class_id < len(self.class_names):
                            class_counts[self.class_names[class_id]] += 1

        # Print distribution
        total = sum(class_counts.values())
        print("\nClass Distribution:")
        print(f"  {'Class':<15} {'Count':<10} {'Percentage':<10}")
        print(f"  {'-'*40}")

        for class_name, count in class_counts.items():
            percentage = (count / total * 100) if total > 0 else 0
            print(f"  {class_name:<15} {count:<10} {percentage:>6.2f}%")

        print(f"\n  Total: {total}")

        return class_counts

    def compute_performance_summary(self, results):
        """
        Compute comprehensive performance summary.

        Extracts and organizes all key metrics from evaluation results
        into a structured summary for reporting or further analysis.

        Args:
            results: Evaluation results from ModelEvaluator

        Returns:
            Dictionary containing organized performance metrics

        Technical Details:
            - Extracts metrics from YOLO results object
            - Organizes by category (overall, per-class, speed)
            - Returns structured data for downstream use
        """
        summary = {
            'overall': {
                'mAP_50': float(results.box.map50),
                'mAP_50_95': float(results.box.map),
                'precision': float(results.box.mp),
                'recall': float(results.box.mr)
            },
            'per_class': {},
            'speed': {}
        }

        # Per-class metrics
        for i, class_name in enumerate(self.class_names):
            summary['per_class'][class_name] = {
                'mAP_50': float(results.box.maps[i]) if hasattr(results.box, 'maps') else 0.0,
                'precision': float(results.box.p[i]) if hasattr(results.box, 'p') else 0.0,
                'recall': float(results.box.r[i]) if hasattr(results.box, 'r') else 0.0
            }

        # Speed metrics
        if hasattr(results, 'speed'):
            summary['speed'] = {
                'preprocess_ms': results.speed.get('preprocess', 0),
                'inference_ms': results.speed.get('inference', 0),
                'postprocess_ms': results.speed.get('postprocess', 0),
                'total_ms': sum(results.speed.values()),
                'fps': 1000 / sum(results.speed.values()) if sum(results.speed.values()) > 0 else 0
            }

        return summary

    def print_summary(self, summary):
        """
        Print formatted performance summary.

        Args:
            summary: Performance summary dictionary from compute_performance_summary()

        Technical Details:
            - Pretty-prints structured metrics
            - Formatted for console display
        """
        print("\n" + "="*80)
        print("PERFORMANCE SUMMARY")
        print("="*80)

        print("\nOverall Metrics:")
        for metric, value in summary['overall'].items():
            print(f"  {metric}: {value:.4f}")

        print("\nPer-Class Performance:")
        for class_name, metrics in summary['per_class'].items():
            print(f"\n  {class_name}:")
            for metric, value in metrics.items():
                print(f"    {metric}: {value:.4f}")

        if summary['speed']:
            print("\nInference Speed:")
            for metric, value in summary['speed'].items():
                if 'fps' in metric:
                    print(f"  {metric}: {value:.2f}")
                else:
                    print(f"  {metric}: {value:.2f} ms")

        print("\n" + "="*80)



In [15]:
"""
ResultsVisualizer: Visualize model predictions and performance.

This module creates visualizations of model predictions including detection
boxes overlaid on BEV images, confusion matrices, and performance plots.

Key Operations:
- Visualize predictions on test images
- Generate confusion matrix
- Plot precision-recall curves
- Create performance comparison charts
"""

import matplotlib.pyplot as plt
import cv2
import numpy as np
from pathlib import Path
import random
#from Globals import RESULTS_ROOT


class ResultsVisualizer:
    """
    Visualizer for model predictions and evaluation results.

    Creates visual outputs to assess model performance and understand prediction
    behavior across different scenarios.

    Attributes:
        results_dir: Directory for saving visualization outputs
        class_names: List of detection class names
        colors: Color scheme for each class
    """

    def __init__(self):
        """
        Initialize the results visualizer.

        Technical Details:
            - Creates results directory structure
            - Sets up color scheme for consistent visualization
            - Configures matplotlib defaults
        """
        self.results_dir = Path(RESULTS_ROOT)
        self.results_dir.mkdir(parents=True, exist_ok=True)

        self.class_names = ['Car', 'Truck/Bus', 'Pedestrian', 'Cyclist']
        self.colors = [
            (255, 0, 0),    # Red: Cars
            (0, 255, 0),    # Green: Trucks/Buses
            (0, 0, 255),    # Blue: Pedestrians
            (255, 255, 0)   # Yellow: Cyclists
        ]

    def visualize_predictions(self, model, test_images_dir, num_samples=10, conf_threshold=0.25):
        """
        Visualize model predictions on random test images.

        Selects random images from test set, runs inference, and creates
        visualizations with predicted bounding boxes overlaid.

        Args:
            model: Trained YOLO model instance
            test_images_dir: Directory containing test images
            num_samples: Number of images to visualize (default: 10)
            conf_threshold: Confidence threshold for predictions (default: 0.25)

        Returns:
            None (saves visualizations to disk)

        Technical Details:
            - Random sampling ensures diverse visualization
            - Predictions drawn with confidence scores
            - Color-coded by class
            - Saved as high-resolution PNG files
        """
        print(f"\nGenerating prediction visualizations...")

        # ============================================================
        # Select random test images
        # ============================================================
        test_images_dir = Path(test_images_dir)
        all_images = list(test_images_dir.glob('*.png'))

        if len(all_images) == 0:
            print(f"Warning: No test images found in {test_images_dir}")
            return

        sample_images = random.sample(all_images, min(num_samples, len(all_images)))

        # ============================================================
        # Create output directory
        # ============================================================
        vis_dir = self.results_dir / 'predictions'
        vis_dir.mkdir(exist_ok=True)

        # ============================================================
        # Run predictions and visualize
        # ============================================================
        for img_path in sample_images:
            # Run inference
            results = model.predict(
                source=str(img_path),
                conf=conf_threshold,
                verbose=False
            )

            # Get first result (single image)
            result = results[0]

            # Load original image
            img = cv2.imread(str(img_path))
            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            # Draw predictions
            if result.boxes is not None and len(result.boxes) > 0:
                boxes = result.boxes.xyxy.cpu().numpy()  # x1, y1, x2, y2
                confs = result.boxes.conf.cpu().numpy()  # Confidence scores
                classes = result.boxes.cls.cpu().numpy().astype(int)  # Class IDs

                for box, conf, cls in zip(boxes, confs, classes):
                    x1, y1, x2, y2 = box.astype(int)

                    # Draw bounding box
                    cv2.rectangle(img_rgb, (x1, y1), (x2, y2), self.colors[cls], 2)

                    # Draw label with confidence
                    label = f"{self.class_names[cls]}: {conf:.2f}"
                    cv2.putText(img_rgb, label, (x1, y1-10),
                              cv2.FONT_HERSHEY_SIMPLEX, 0.5, self.colors[cls], 2)

            # Save visualization
            output_path = vis_dir / f"pred_{img_path.name}"
            plt.figure(figsize=(15, 15))
            plt.imshow(img_rgb)
            plt.title(f"Predictions: {img_path.name}\nDetections: {len(result.boxes) if result.boxes else 0}")
            plt.axis('off')
            plt.tight_layout()
            plt.savefig(output_path, dpi=150, bbox_inches='tight')
            plt.close()

        print(f"✓ Saved {len(sample_images)} prediction visualizations to: {vis_dir}")

    def generate_performance_report(self, results, output_path=None):
        """
        Generate comprehensive performance report.

        Creates a text report summarizing model performance including overall
        metrics, per-class breakdown, and inference timing.

        Args:
            results: Evaluation results from ModelEvaluator
            output_path: Path for report file (default: results/evaluation_report.txt)

        Returns:
            Path to generated report file

        Technical Details:
            - Plain text format for easy viewing
            - Includes all key metrics
            - Suitable for documentation or sharing
        """
        if output_path is None:
            output_path = self.results_dir / 'evaluation_report.txt'
        else:
            output_path = Path(output_path)

        output_path.parent.mkdir(parents=True, exist_ok=True)

        print(f"\nGenerating performance report...")

        with open(output_path, 'w') as f:
            f.write("="*80 + "\n")
            f.write("nuScenes BEV Object Detection - Evaluation Report\n")
            f.write("="*80 + "\n\n")

            f.write("Model Configuration:\n")
            f.write("-"*80 + "\n")
            f.write("Base Model: YOLOv12s\n")
            f.write("Training Strategy: Two-stage transfer learning\n")
            f.write("Input Resolution: 1000×1000 pixels\n")
            f.write("Detection Classes: 4 (Car, Truck/Bus, Pedestrian, Cyclist)\n\n")

            f.write("Overall Performance:\n")
            f.write("-"*80 + "\n")
            f.write(f"mAP@0.5:      {results.box.map50:.4f}\n")
            f.write(f"mAP@0.5:0.95: {results.box.map:.4f}\n")
            f.write(f"Precision:    {results.box.mp:.4f}\n")
            f.write(f"Recall:       {results.box.mr:.4f}\n\n")

            f.write("Per-Class Performance:\n")
            f.write("-"*80 + "\n")
            f.write(f"{'Class':<15} {'mAP@0.5':<12} {'Precision':<12} {'Recall':<10}\n")
            f.write("-"*80 + "\n")

            for i, class_name in enumerate(self.class_names):
                map50 = results.box.maps[i] if hasattr(results.box, 'maps') else 0.0
                precision = results.box.p[i] if hasattr(results.box, 'p') else 0.0
                recall = results.box.r[i] if hasattr(results.box, 'r') else 0.0

                f.write(f"{class_name:<15} {map50:<12.4f} {precision:<12.4f} {recall:<10.4f}\n")

            if hasattr(results, 'speed'):
                total_time = sum(results.speed.values())
                fps = 1000 / total_time if total_time > 0 else 0

                f.write("\n" + "="*80 + "\n")
                f.write("Inference Speed:\n")
                f.write("-"*80 + "\n")
                f.write(f"Average FPS: {fps:.2f}\n")
                f.write(f"Total latency: {total_time:.2f} ms\n\n")

            f.write("="*80 + "\n")
            f.write("Conclusion:\n")
            f.write("-"*80 + "\n")
            f.write("The model demonstrates the feasibility of using YOLO for LiDAR-based\n")
            f.write("object detection by converting 3D point clouds to BEV representations.\n")
            f.write("The two-stage training approach with transfer learning enables efficient\n")
            f.write("domain adaptation from COCO to the nuScenes BEV dataset.\n")

        print(f"✓ Report saved to: {output_path}")
        return output_path



In [16]:
"""
BEVInspector: Visualize and validate preprocessed BEV dataset with YOLO annotations.

This module provides tools for inspecting the preprocessed BEV dataset by loading
BEV images and their corresponding YOLO annotations, then visualizing the bounding
boxes overlaid on the images. This is essential for:
- Validating preprocessing pipeline correctness
- Debugging annotation alignment issues
- Visual quality assessment of the dataset
- Understanding class distribution and box sizes
"""

import matplotlib.pyplot as plt
import cv2
import numpy as np
from pathlib import Path
import os
#from Globals import PREPROCESSED_ROOT


class BEVInspector:
    """
    Inspector for visualizing preprocessed BEV images with YOLO bounding boxes.

    Loads pairs of BEV images and their YOLO label files, draws the bounding
    boxes with color-coded classes, and displays them for visual inspection and
    quality control. Typical problems this helps to spot:
    - Misaligned bounding boxes (coordinate transformation errors)
    - Missing or extra annotations
    - Incorrect class labels
    - Box size anomalies

    Attributes:
        class_names: List of human-readable class names [Car, Truck/Bus, Pedestrian, Cyclist]
        colors: List of RGB color tuples, one per class. Boxes are always drawn
                on an image that has already been converted from BGR to RGB, so
                a class has the same color in every view.
        images_dir: Path to directory containing BEV images
        labels_dir: Path to directory containing YOLO label files
    """

    # Style used for class IDs that are not listed in class_names
    _UNKNOWN_COLOR = (255, 255, 255)

    def __init__(self):
        """
        Initialize the BEV inspector with class definitions and data paths.

        Technical Details:
            - Class names map to class IDs [0, 1, 2, 3]
            - Colors are RGB tuples (matplotlib convention); every drawing
              routine converts the OpenCV BGR image to RGB *before* drawing
            - Color scheme:
              * Red (255,0,0): Cars
              * Green (0,255,0): Trucks/Buses
              * Blue (0,0,255): Pedestrians
              * Yellow (255,255,0): Cyclists
        """
        # Human-readable class labels (index = class ID)
        self.class_names = ['Car', 'Truck/Bus', 'Pedestrian', 'Cyclist']

        # Colors for bounding boxes (RGB; applied to RGB-converted images)
        self.colors = [
            (255, 0, 0),   # Class 0 (Car): Red
            (0, 255, 0),   # Class 1 (Truck/Bus): Green
            (0, 0, 255),   # Class 2 (Pedestrian): Blue
            (255, 255, 0)  # Class 3 (Cyclist): Yellow
        ]

        # Paths to preprocessed dataset directories
        self.images_dir = Path(PREPROCESSED_ROOT) / 'images'
        self.labels_dir = Path(PREPROCESSED_ROOT) / 'labels'

    def _class_style(self, class_id):
        """Return (display name, RGB color) for a class ID, with a fallback for unknown IDs."""
        if 0 <= class_id < len(self.class_names):
            return self.class_names[class_id], self.colors[class_id]
        return f'Class {class_id}', self._UNKNOWN_COLOR

    @staticmethod
    def _to_pixel_corners(label, img_w, img_h):
        """Convert one normalized YOLO label [cls, xc, yc, w, h] to integer pixel corners (x1, y1, x2, y2)."""
        # Denormalize: [0, 1] -> pixel values
        x_center, y_center = label[1] * img_w, label[2] * img_h
        box_w, box_h = label[3] * img_w, label[4] * img_h

        # Center/size -> top-left and bottom-right corners
        return (
            int(x_center - box_w / 2),
            int(y_center - box_h / 2),
            int(x_center + box_w / 2),
            int(y_center + box_h / 2),
        )

    @staticmethod
    def _parse_label_file(label_path):
        """Parse a YOLO label file into [class_id, x, y, w, h] lists; a missing file yields []."""
        if not label_path.exists():
            return []

        annotations = []
        with open(label_path, 'r') as f:
            for line in f:
                parts = line.split()

                # A valid line has exactly 5 whitespace-separated values
                if len(parts) != 5:
                    continue

                try:
                    annotations.append([int(parts[0])] + [float(v) for v in parts[1:]])
                except ValueError:
                    # Non-numeric content: treat as a malformed line and skip it
                    continue
        return annotations

    def load_samples(self, num_samples):
        """
        Load a subset of BEV images and their corresponding YOLO labels.

        Images are sampled with a uniform stride over the sorted file list, so
        the selection is reproducible and spread across the dataset.

        Args:
            num_samples: Number of image-label pairs to load. Values <= 0
                         return two empty lists.

        Returns:
            Tuple of (bev_images, yolo_labels_list) where:
            - bev_images: List of numpy arrays (BGR images from OpenCV)
            - yolo_labels_list: List of lists, each containing YOLO annotations
              Format per annotation: [class_id, x_center, y_center, width, height]

        Technical Details:
            1. Sampling Strategy:
               - Lists all PNG files in the images directory (sorted)
               - step = max(1, file_count // num_samples)
               - Takes every step-th file, at most num_samples of them

            2. File Loading:
               - Images: cv2.imread loads as BGR uint8 arrays
               - Images that cannot be decoded are reported and skipped, so the
                 two returned lists never contain None
               - Matching: the image filename stem selects "<stem>.txt"

            3. Label Parsing:
               - Each line must hold exactly 5 whitespace-separated values
               - First value -> int class_id, remaining four -> float
               - Lines with the wrong count or non-numeric values are skipped

            4. Error Handling:
               - A missing label file gives an empty label list (not an error)
        """
        # Nothing requested: avoid dividing by zero in the stride computation
        if num_samples <= 0:
            return [], []

        # Get all BEV image files, sorted for consistent ordering
        image_files = sorted(self.images_dir.glob('*.png'))

        # Uniform stride across the dataset, then cap at the requested count
        step = max(1, len(image_files) // num_samples)
        selected_files = image_files[::step][:num_samples]

        bev_images = []
        yolo_labels_list = []

        for image_path in selected_files:
            # Load BEV image (BGR format, uint8); imread returns None on failure
            bev_image = cv2.imread(str(image_path))
            if bev_image is None:
                print(f"Warning: could not read image, skipping: {image_path}")
                continue

            # Matching label file shares the stem with the image
            label_path = self.labels_dir / f"{image_path.stem}.txt"

            bev_images.append(bev_image)
            yolo_labels_list.append(self._parse_label_file(label_path))

        return bev_images, yolo_labels_list

    def visualize(self, bev_image, yolo_labels):
        """
        Visualize a single BEV image with bounding boxes and labels overlaid.

        Args:
            bev_image: numpy array (H, W, 3) in BGR format from OpenCV
            yolo_labels: List of annotations, each [class_id, x, y, w, h] normalized

        Technical Details:
            1. Color Space Conversion:
               - The BGR input is converted to RGB for matplotlib; the
                 conversion returns a new array, so bev_image is not modified

            2. Coordinate Denormalization:
               - YOLO coordinates are normalized to [0, 1] and are multiplied by
                 the image width/height to get pixels

            3. Box Format Conversion:
               - YOLO: (center_x, center_y, width, height)
               - OpenCV rectangle: top-left and bottom-right corners
               - top_left = center - size/2, bottom_right = center + size/2

            4. Rendering:
               - Boxes drawn with 2-pixel thickness
               - Class name placed 10 px above the top-left corner, clamped so
                 it stays visible for boxes touching the top edge
               - Class IDs without a defined style are drawn in white as
                 "Class <id>"
        """
        # Convert BGR (OpenCV) to RGB (matplotlib); drawing happens in RGB
        img_rgb = cv2.cvtColor(bev_image, cv2.COLOR_BGR2RGB)
        h, w = img_rgb.shape[:2]

        for label in yolo_labels:
            name, color = self._class_style(int(label[0]))
            x1, y1, x2, y2 = self._to_pixel_corners(label, w, h)

            # Draw rectangle (modifies img_rgb in-place)
            cv2.rectangle(img_rgb, (x1, y1), (x2, y2), color, 2)

            # putText anchors at the text baseline; keep it inside the image
            cv2.putText(img_rgb, name, (x1, max(y1 - 10, 12)),
                       cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        # Display using matplotlib
        plt.figure(figsize=(12, 12))
        plt.imshow(img_rgb)
        plt.title('BEV Image with YOLO Annotations')
        plt.axis('off')  # Hide axis ticks and labels
        plt.show()

    def visualize_grid(self, bev_images, yolo_labels_list, num_cols=2):
        """
        Visualize multiple BEV images in a grid layout.

        Public interface method that delegates to _draw_grid for rendering.

        Args:
            bev_images: List of BEV images (numpy arrays)
            yolo_labels_list: List of label lists (one per image)
            num_cols: Number of columns in the grid layout

        Returns:
            Result from _draw_grid (None)
        """
        return self._draw_grid(bev_images, yolo_labels_list, num_cols)

    def _draw_grid(self, bev_images, yolo_labels_list, num_cols=2):
        """
        Internal method to render multiple BEV images with annotations in a grid.

        Args:
            bev_images: List of numpy arrays (BGR images)
            yolo_labels_list: List of annotation lists
            num_cols: Number of columns in grid (default: 2); must be >= 1

        Raises:
            ValueError: If num_cols is smaller than 1.

        Technical Details:
            1. Grid Layout:
               - Rows = ceil(num_samples / num_cols)
               - Unused subplots (if any) are hidden

            2. Figure Sizing:
               - Each subplot is 10x10 inches
                 (total: 10 * num_cols by 10 * num_rows)

            3. Annotation Rendering:
               - Boxes are drawn without text labels to reduce clutter
               - Each image is converted BGR -> RGB first (which yields a new
                 array, leaving the original untouched) and the boxes are drawn
                 in RGB, so colors match the single-image view

            4. Subplot Titles:
               - Format: "Sample {index} ({count} objects)"

            5. Edge Cases:
               - plt.subplots is called with squeeze=False so `axes` is always
                 a 2D array, whatever the sample/column combination
               - An empty image list prints a notice and draws nothing
        """
        if num_cols < 1:
            raise ValueError(f"num_cols must be >= 1, got {num_cols}")

        num_samples = len(bev_images)
        if num_samples == 0:
            print("No BEV images to display.")
            return None

        # Ceiling division for the number of grid rows
        num_rows = (num_samples + num_cols - 1) // num_cols

        # squeeze=False guarantees a 2D ndarray of Axes, even for a 1x1 grid
        fig, axes = plt.subplots(
            num_rows, num_cols,
            figsize=(10 * num_cols, 10 * num_rows),
            squeeze=False,
        )
        axes = axes.flatten()

        for i in range(num_samples):
            labels = yolo_labels_list[i]

            # Convert to RGB before drawing; cvtColor returns a fresh array
            img_rgb = cv2.cvtColor(bev_images[i], cv2.COLOR_BGR2RGB)
            h, w = img_rgb.shape[:2]

            # Draw boxes only (no text) to keep the grid view readable
            for label in labels:
                _, color = self._class_style(int(label[0]))
                x1, y1, x2, y2 = self._to_pixel_corners(label, w, h)
                cv2.rectangle(img_rgb, (x1, y1), (x2, y2), color, 2)

            axes[i].imshow(img_rgb)
            axes[i].set_title(f'Sample {i} ({len(labels)} objects)')
            axes[i].axis('off')

        # Hide unused subplots (grid may have more cells than images)
        for ax in axes[num_samples:]:
            ax.axis('off')

        plt.tight_layout()  # Reduce whitespace between subplots
        plt.show()



In [17]:
"""
BEVRasterizer: Convert 3D LiDAR point clouds to 2D Bird's Eye View (BEV) images.

This module implements the core rasterization algorithm that projects 3D point clouds
onto a 2D grid viewed from above (bird's eye perspective). The resulting BEV images
encode height, intensity, and density information in three channels, making them
suitable for 2D object detection models.

Key Concepts:
- Orthographic projection from 3D to 2D (top-down view)
- Multi-channel encoding: height, intensity, density
- Pixel-space quantization and accumulation
- Normalization and perceptual enhancement
"""

import numpy as np


class BEVRasterizer:
    """
    Rasterizer that converts 3D point clouds into 2D Bird's Eye View (BEV) images.

    Points are projected top-down onto a regular grid and summarized per pixel
    into three channels (height, intensity, density), giving an image that can
    be fed to 2D object detectors such as YOLO.

    The BEV representation has several advantages:
    - Preserves spatial relationships and distances (unlike perspective images)
    - Eliminates scale variation with distance
    - Provides consistent object sizes regardless of distance
    - Suitable for accurate localization and planning

    Attributes:
        x_range: Tuple (min, max) extent along the point cloud's X axis (meters)
        y_range: Tuple (min, max) extent along the point cloud's Y axis (meters)
        z_range: Tuple (min, max) height extent used for normalization (meters)
        resolution: Meters per pixel in BEV image
        width: Image width in pixels
        height: Image height in pixels
    """

    def __init__(self, x_range=(-50, 50), y_range=(-50, 50), z_range=(-3, 5), resolution=0.1):
        """
        Initialize BEV rasterizer with spatial parameters.

        Args:
            x_range: (min, max) coverage along X (meters)
                    Default: (-50, 50) = 100m range
            y_range: (min, max) coverage along Y (meters)
                    Default: (-50, 50) = 100m range
            z_range: (min, max) height range for normalization (meters)
                    Default: (-3, 5) = 8m height range
            resolution: Spatial resolution in meters per pixel
                       Default: 0.1m = 10cm per pixel

        Technical Details:
            1. Resolution Trade-offs:
               - Smaller (e.g., 0.05m): Higher detail, larger images, more memory
               - Larger (e.g., 0.2m): Lower detail, smaller images, faster processing

            2. Image Dimensions:
               - Width = (x_range[1] - x_range[0]) / resolution
               - Height = (y_range[1] - y_range[0]) / resolution
               - Both are truncated to int
               - Example: 100m range / 0.1m resolution = 1000 pixels

            3. Memory Footprint:
               - 1000x1000x3 channels x 1 byte = ~3MB per BEV image
        """
        self.x_range = x_range
        self.y_range = y_range
        self.z_range = z_range
        self.resolution = resolution

        # Calculate image dimensions from spatial extent and resolution
        self.width = int((x_range[1] - x_range[0]) / resolution)   # Pixels in X direction
        self.height = int((y_range[1] - y_range[0]) / resolution)  # Pixels in Y direction

    def rasterize(self, points):
        """
        Convert 3D point cloud to 2D Bird's Eye View image with three channels.

        Args:
            points: numpy.ndarray of shape (4, N) containing:
                   - Row 0: X coordinates in meters
                   - Row 1: Y coordinates in meters
                   - Row 2: Z coordinates (height) in meters
                   - Row 3: Intensity values (LiDAR reflectivity)

        Returns:
            numpy.ndarray: BEV image of shape (height, width, 3) with dtype uint8
                          - Channel 0: Height map (maximum Z value per pixel)
                          - Channel 1: Intensity map (average intensity per pixel)
                          - Channel 2: Density map (point count per pixel)
                          All values are scaled to [0, 255]; pixels that receive
                          no point are 0 in every channel.

        Technical Details:

            **Algorithm Overview:**
            1. Project 3D points to 2D pixel coordinates
            2. Accumulate max height, intensity sum and point count per pixel
            3. Average the intensity
            4. Normalize and enhance channels
            5. Stack into a 3-channel image

            **Channel Semantics:**
            - Height: max(z) over all points that fall into the pixel. The
              accumulator starts at -inf, so negative heights (e.g. everything
              below the sensor) are kept; empty pixels are then set to
              z_range[0], which normalizes to 0.
            - Intensity: mean(intensity) of the points in the pixel.
            - Density: count(points) per pixel, log-normalized.

            **Coordinate Transformation:**
            - X_pixel = (X - X_min) / resolution
            - Y_pixel = (Y - Y_min) / resolution, then flipped vertically
              because the image origin is the top-left corner
            - Pixel indices are clamped to the image, so points outside the
              configured ranges end up on the border pixels. Filter the cloud
              to the same ranges beforehand to avoid that.

            **Normalization Strategy:**
            - Height: linear to [0, 1] over z_range (clipped), then sqrt
            - Intensity: divided by the per-frame maximum, then sqrt
            - Density: log1p, divided by the per-frame maximum, then power 0.3
            - All channels scaled by 255 and cast to uint8
        """
        points = np.asarray(points)
        grid_shape = (self.height, self.width)

        # ============================================================
        # STEP 1: Project 3D points to 2D pixel coordinates
        # ============================================================
        x_img = ((points[0, :] - self.x_range[0]) / self.resolution).astype(np.int32)
        y_img = ((points[1, :] - self.y_range[0]) / self.resolution).astype(np.int32)

        # Clamp pixel coordinates to image boundaries
        x_img = np.clip(x_img, 0, self.width - 1)
        y_img = np.clip(y_img, 0, self.height - 1)

        # Flip vertically: image row 0 is the top, larger Y should be higher up
        y_img = self.height - 1 - y_img
        pixel_index = (y_img, x_img)

        # ============================================================
        # STEP 2: Accumulate values for each pixel (vectorized)
        # ============================================================
        # ufunc.at is unbuffered, so repeated pixel indices are all applied.
        # Starting at -inf (not 0) keeps the true maximum for negative heights.
        height_map = np.full(grid_shape, -np.inf, dtype=np.float32)
        np.maximum.at(height_map, pixel_index, points[2, :].astype(np.float32))

        intensity_map = np.zeros(grid_shape, dtype=np.float32)
        np.add.at(intensity_map, pixel_index, points[3, :].astype(np.float32))

        density_map = np.zeros(grid_shape, dtype=np.int32)
        np.add.at(density_map, pixel_index, 1)

        # ============================================================
        # STEP 3: Compute average intensity per pixel
        # ============================================================
        occupied = density_map > 0

        # Sum -> mean, only where there is at least one point
        intensity_map[occupied] = intensity_map[occupied] / density_map[occupied]

        # Empty pixels sit at the bottom of the height range (normalizes to 0)
        height_map[~occupied] = self.z_range[0]

        # ============================================================
        # STEP 4: Normalize and enhance each channel
        # ============================================================

        # Height: (height - min) / (max - min), clipped to [0, 1]
        height_map = np.clip((height_map - self.z_range[0]) / (self.z_range[1] - self.z_range[0]), 0, 1)
        # Square root brightens small values so low objects stand out
        height_map = np.power(height_map, 0.5)

        # Intensity: scale by the frame maximum (floor of 1e-6 avoids 0/0)
        intensity_map = intensity_map / max(intensity_map.max(), 1e-6)
        intensity_map = np.power(intensity_map, 0.5)

        # Density: log1p compresses the wide range of point counts
        density_norm = np.log1p(density_map.astype(np.float32))
        density_norm = density_norm / max(density_norm.max(), 1e-6)
        # Power 0.3 emphasizes presence of points over the exact count
        density_norm = np.power(density_norm, 0.3)

        # ============================================================
        # STEP 5: Stack channels and convert to uint8 image format
        # ============================================================
        # Channel order: [height, intensity, density]
        bev_image = np.stack([height_map, intensity_map, density_norm], axis=-1)

        # Scale from [0, 1] float to [0, 255] uint8
        return (bev_image * 255).astype(np.uint8)



In [18]:
"""
PointCloudProcessor: Transform and filter LiDAR point clouds for BEV processing.

This module loads 3D LiDAR point clouds from nuScenes and filters them to a region of
interest (ROI) suitable for Bird's Eye View (BEV) image generation. Points are kept in the
LiDAR sensor (LIDAR_TOP) frame; the sensor-to-ego transformation is intentionally disabled.

Key Operations:
- Load binary LiDAR data from nuScenes dataset
- Keep points in the sensor frame (no rotation/translation is applied)
- Filter points to defined spatial boundaries
- Prepare point clouds for 2D rasterization
"""

import numpy as np
from nuscenes.utils.data_classes import LidarPointCloud
from pyquaternion import Quaternion
import os


class PointCloudProcessor:
    """
    Processor for loading and filtering 3D LiDAR point clouds.

    Point clouds are loaded from the nuScenes dataset and returned in the LiDAR
    sensor frame (no sensor-to-ego transformation is applied), then cropped to a
    region of interest given by per-axis ranges.

    Attributes:
        nusc: NuScenes dataset instance for accessing metadata and files
        x_range: Tuple (min, max) kept along the X axis, in meters
        y_range: Tuple (min, max) kept along the Y axis, in meters
        z_range: Tuple (min, max) kept along the Z axis, in meters
    """

    def __init__(self, nusc, x_range=(-50, 50), y_range=(-50, 50), z_range=(-3, 5)):
        """
        Initialize the point cloud processor with spatial filtering boundaries.

        Args:
            nusc: NuScenes instance providing access to dataset
            x_range: (min_x, max_x) in meters.
                    Default (-50, 50) = 100m total range
            y_range: (min_y, max_y) in meters.
                    Default (-50, 50) = 100m total range
            z_range: (min_z, max_z) in meters.
                    Default (-3, 5) = 8m total height

        Technical Details:
            - Range selection impacts BEV image resolution and coverage area
            - The ranges are compared against the coordinates exactly as they
              are passed to filter_points; for clouds coming from
              load_and_transform that is the LiDAR sensor frame
        """
        self.nusc = nusc
        self.x_range = x_range
        self.y_range = y_range
        self.z_range = z_range

    def load_and_transform(self, sample_data_token):
        """
        Load a LiDAR point cloud and return it in the LiDAR sensor frame.

        Despite the method name, no transformation is applied: the points are
        returned exactly as stored in the file. The sensor-to-ego rotation and
        translation that used to be applied here were disabled because they
        misaligned the points with the annotations.

        Args:
            sample_data_token: Token identifying a specific LiDAR capture

        Returns:
            numpy.ndarray: Point cloud in the sensor frame with shape (4, N):
                - Row 0: X coordinates in meters
                - Row 1: Y coordinates in meters
                - Row 2: Z coordinates in meters
                - Row 3: Intensity values (reflectivity)

        Technical Details:
            1. Data Loading:
               - The file path is built from the dataset root and the
                 'filename' field of the sample_data record
               - LidarPointCloud.from_file reads the binary point cloud file

            2. Coordinate Frame Strategy:
               - LiDAR: kept in the sensor frame (nothing to do here)
               - Annotations: expected to be transformed global -> ego -> sensor
                 by the preprocessing code, so that boxes and points share one
                 coordinate system for the BEV projection
               - The disabled sensor->ego step was:
                   pc.rotate(Quaternion(cs_record['rotation']).rotation_matrix)
                   pc.translate(np.array(cs_record['translation']))
                 with cs_record being the capture's calibrated_sensor record
        """
        # Get metadata record for this LiDAR capture
        sample_data = self.nusc.get('sample_data', sample_data_token)

        # Construct full path to binary point cloud file
        pcl_path = os.path.join(self.nusc.dataroot, sample_data['filename'])

        # Load point cloud from binary file (4xN array: x, y, z, intensity)
        pc = LidarPointCloud.from_file(pcl_path)

        # Intentionally no rotate/translate: keep the points in the sensor frame
        return pc.points

    def filter_points(self, points):
        """
        Filter point cloud to region of interest (ROI) using spatial boundaries.

        Keeps only the points whose x, y and z all lie inside the configured
        ranges (both ends inclusive).

        Args:
            points: numpy.ndarray of shape (4, N) with rows [x, y, z, intensity]

        Returns:
            numpy.ndarray: Filtered point cloud with shape (4, M) where M <= N
                          Contains only points within the defined spatial boundaries

        Technical Details:
            1. Filtering Logic:
               - Builds one boolean mask from vectorized numpy comparisons
               - The six bound checks are combined with logical AND (&)
               - The mask selects columns, so all four rows stay aligned

            2. Impact on BEV:
               - Points outside the ranges are discarded completely
               - The ranges should match the spatial extent that is rasterized
        """
        # Create boolean mask: True for points inside ROI, False outside
        mask = (
            (points[0, :] >= self.x_range[0]) & (points[0, :] <= self.x_range[1]) &  # X bounds
            (points[1, :] >= self.y_range[0]) & (points[1, :] <= self.y_range[1]) &  # Y bounds
            (points[2, :] >= self.z_range[0]) & (points[2, :] <= self.z_range[1])    # Z bounds
        )

        # Apply mask to select only points within boundaries (boolean indexing)
        return points[:, mask]



In [19]:
"""
YOLOAnnotationConverter: Transform 3D bounding boxes to YOLO format for BEV images.

This module converts nuScenes 3D bounding box annotations (in meters with global coordinates)
to YOLO-compatible 2D bounding box format (normalized pixel coordinates). It handles the
projection from 3D world space to 2D BEV image space and maps nuScenes' 23 object categories
to 4 simplified classes for object detection.

Key Operations:
- 3D bounding box → 2D bounding box projection (top-down view)
- Coordinate transformation: meters → pixels → normalized [0,1]
- Category mapping: 23 nuScenes classes → 4 detection classes
- Validation: Ensures boxes are within image bounds
"""

import numpy as np


class YOLOAnnotationConverter:
    """
    Turn metric bird's-eye-view boxes into normalized YOLO label rows.

    A YOLO label row has the layout
    <class_id> <x_center> <y_center> <width> <height>
    where the four spatial values are fractions of the image dimensions.

    The converter places a box centre and its X/Y extents (given in meters) on
    the image grid described by ``x_range``, ``y_range`` and ``resolution``,
    and collapses fine-grained nuScenes category names into four classes.

    Attributes:
        image_width: BEV image width in pixels
        image_height: BEV image height in pixels
        x_range: (min, max) extent along X in meters (only the minimum is read here)
        y_range: (min, max) extent along Y in meters (only the minimum is read here)
        resolution: Meters per pixel
        class_mapping: Dictionary from nuScenes category name to class ID (0-3)
    """

    def __init__(self, image_width, image_height, x_range=(-50, 50), y_range=(-50, 50), resolution=0.1):
        """
        Store the image geometry and build the category lookup table.

        Args:
            image_width: Width of the BEV image in pixels
            image_height: Height of the BEV image in pixels
            x_range: (min, max) extent along X in meters
            y_range: (min, max) extent along Y in meters
            resolution: Meters per pixel

        Notes:
            Nothing here checks that the image size agrees with the ranges and
            resolution; the caller is responsible for passing values that match
            the rasterized image.
        """
        self.image_width, self.image_height = image_width, image_height
        self.x_range, self.y_range = x_range, y_range
        self.resolution = resolution

        # Category names grouped by the class ID they collapse into. Names that
        # do not appear here (e.g. vehicle.trailer, vehicle.emergency.*,
        # movable_object.*, animal) are dropped by convert_annotation().
        grouped_categories = (
            # Class 0: cars
            (0, ('vehicle.car',
                 'vehicle.taxi')),
            # Class 1: large vehicles (trucks, buses, construction)
            (1, ('vehicle.truck',
                 'vehicle.bus.bendy',
                 'vehicle.bus.rigid',
                 'vehicle.construction')),
            # Class 2: pedestrians of every kind
            (2, ('human.pedestrian.adult',
                 'human.pedestrian.child',
                 'human.pedestrian.construction_worker',
                 'human.pedestrian.police_officer')),
            # Class 3: two-wheeled vehicles
            (3, ('vehicle.bicycle',
                 'vehicle.motorcycle')),
        )
        self.class_mapping = {
            category: class_id
            for class_id, categories in grouped_categories
            for category in categories
        }

    def convert_annotation(self, box_translation, box_size, category_name):
        """
        Produce the YOLO label for one box, or None when it must be skipped.

        Args:
            box_translation: Indexable [x, y, ...] box centre in meters. Only
                indices 0 and 1 are read.
            box_size: Indexable [x_extent, y_extent, ...] in meters. Index 0
                becomes the label width (image horizontal), index 1 becomes the
                label height (image vertical). Further entries are ignored.
            category_name: nuScenes category string (e.g. 'vehicle.car')

        Returns:
            List [class_id, x_center, y_center, width, height] with the four
            spatial values normalized by the image size, or None if:
            - category_name is not a key of class_mapping
            - normalized width or height is <= 0 or > 1
            - normalized centre lies outside [0, 1] on either axis

        Conversion steps:
            1. Centre in pixels: (coordinate - range minimum) / resolution
            2. Vertical flip of the Y pixel coordinate:
               image_height - 1 - y_pixels
            3. Extents in pixels: extent / resolution
            4. Normalization: X values by image_width, Y values by image_height

        Notes:
            - Only the box centre is bounds-checked; a box whose centre is
              inside the image may still extend past an image edge.
            - The returned spatial values pass through np.clip(…, 0, 1), which
              is a no-op for every value that survives the checks above.
            - NOTE(review): the "- 1" in the flip matches a pixel-index flip;
              whether it lines up with the rasterized image depends on the
              rasterizer's own row convention — confirm against BEVRasterizer.
        """
        # Categories without a mapping are not detection targets.
        class_id = self.class_mapping.get(category_name)
        if class_id is None:
            return None

        # Box centre: meters -> pixels, with the vertical axis mirrored.
        col_px = (box_translation[0] - self.x_range[0]) / self.resolution
        row_px = self.image_height - 1 - (box_translation[1] - self.y_range[0]) / self.resolution

        # Box extents: meters -> pixels.
        box_w_px = box_size[0] / self.resolution
        box_h_px = box_size[1] / self.resolution

        # Pixels -> fractions of the image size.
        x_norm = col_px / self.image_width
        y_norm = row_px / self.image_height
        w_norm = box_w_px / self.image_width
        h_norm = box_h_px / self.image_height

        # Discard degenerate or oversized boxes.
        if any((w_norm <= 0, h_norm <= 0, w_norm > 1, h_norm > 1)):
            return None

        # Discard boxes whose centre falls off the image.
        if any((x_norm < 0, x_norm > 1, y_norm < 0, y_norm > 1)):
            return None

        # Clamp as a last safety net, then assemble the label row.
        return [class_id, *(np.clip(value, 0, 1) for value in (x_norm, y_norm, w_norm, h_norm))]



In [20]:
"""
DataPreprocessor: Orchestrate the full nuScenes → YOLO BEV dataset conversion pipeline.

This is the main preprocessing orchestrator that combines all preprocessing components to
convert the raw nuScenes dataset into a YOLO-compatible BEV object detection dataset.

Pipeline Overview:
1. Load 3D LiDAR point cloud → PointCloudProcessor
2. Transform to ego vehicle frame → PointCloudProcessor
3. Filter to region of interest → PointCloudProcessor
4. Rasterize to BEV image → BEVRasterizer
5. Transform 3D annotations to 2D YOLO format → YOLOAnnotationConverter
6. Save paired images and labels to disk

Output Structure:
    build/data/preprocessed/
    ├── images/
    │   ├── scene-0001_<token>.png
    │   ├── scene-0002_<token>.png
    │   └── ...
    └── labels/
        ├── scene-0001_<token>.txt
        ├── scene-0002_<token>.txt
        └── ...
"""

#from Preprocessing.PointCloudProcessor import PointCloudProcessor
#from Preprocessing.BEVRasterizer import BEVRasterizer
#from Preprocessing.YOLOAnnotationConverter import YOLOAnnotationConverter
from nuscenes.utils.data_classes import Box
from pyquaternion import Quaternion
import numpy as np
import cv2
import os
from pathlib import Path
#from Globals import PREPROCESSED_ROOT


class DataPreprocessor:
    """
    Main preprocessing pipeline orchestrator for nuScenes → YOLO BEV conversion.

    For every sample in the dataset this class builds a BEV image from the
    LIDAR_TOP point cloud, converts the sample's 3D annotations to YOLO label
    rows, and writes both to disk under matching file names:

    - ``<output_root>/images/<scene>_<sample_token>.png``
    - ``<output_root>/labels/<scene>_<sample_token>.txt`` (one box per line)

    Attributes:
        nusc: NuScenes dataset instance
        pc_processor: PointCloudProcessor for loading and filtering point clouds
        rasterizer: BEVRasterizer for converting point clouds to images
        converter: YOLOAnnotationConverter for transforming annotations
        output_root: Root directory for preprocessed dataset
        images_dir: Directory for BEV images
        labels_dir: Directory for YOLO labels
    """

    def __init__(self, nusc):
        """
        Initialize the data preprocessor with all required components.

        Args:
            nusc: NuScenes instance providing access to the raw dataset

        Technical Details:
            - Components are created with their default parameters
            - The converter receives the rasterizer's width/height so labels are
              normalized against the same image size that is rasterized
            - Output directories are created if they do not exist yet
        """
        self.nusc = nusc

        # Initialize preprocessing components with default parameters
        self.pc_processor = PointCloudProcessor(nusc)
        self.rasterizer = BEVRasterizer()

        # Pass rasterizer dimensions to ensure coordinate transformation consistency
        self.converter = YOLOAnnotationConverter(
            self.rasterizer.width,
            self.rasterizer.height
        )

        # Setup output directory structure
        self.output_root = Path(PREPROCESSED_ROOT)
        self.images_dir = self.output_root / 'images'  # BEV PNG files
        self.labels_dir = self.output_root / 'labels'  # YOLO TXT files

        # Create directories (parents=True creates intermediate dirs, exist_ok ignores if exists)
        self.images_dir.mkdir(parents=True, exist_ok=True)
        self.labels_dir.mkdir(parents=True, exist_ok=True)

    def _annotation_to_yolo(self, ann, ego_pose, cs_record):
        """
        Convert one sample_annotation record into a YOLO label.

        Args:
            ann: nuScenes 'sample_annotation' record (global-frame box)
            ego_pose: 'ego_pose' record of the LIDAR_TOP sample_data
            cs_record: 'calibrated_sensor' record of the LIDAR_TOP sample_data

        Returns:
            The list returned by YOLOAnnotationConverter.convert_annotation, or
            None when the converter rejects the box (unmapped category,
            out-of-image centre, invalid size).

        Technical Details:
            The box is moved Global → Ego → Sensor by applying, in this exact
            order, translate(-ego_t), rotate(ego_q⁻¹), translate(-sensor_t),
            rotate(sensor_q⁻¹). The order matters: each rotation is expressed
            about the origin reached by the preceding translation.

            YOLO labels carry no rotation angle, so the rotated box is replaced
            by the axis-aligned rectangle that encloses all of its corners in
            the X/Y plane.
        """
        # Box in the global frame, exactly as stored in the annotation record
        box = Box(
            ann['translation'],
            ann['size'],
            Quaternion(ann['rotation'])
        )

        # Global → Ego: centre on the ego pose, then undo its orientation
        box.translate(-np.array(ego_pose['translation']))
        box.rotate(Quaternion(ego_pose['rotation']).inverse)

        # Ego → Sensor: centre on the sensor mount, then undo its orientation
        box.translate(-np.array(cs_record['translation']))
        box.rotate(Quaternion(cs_record['rotation']).inverse)

        # Axis-aligned footprint: min/max of the corner X (row 0) and Y (row 1)
        corners = box.corners()
        x_min, x_max = corners[0, :].min(), corners[0, :].max()
        y_min, y_max = corners[1, :].min(), corners[1, :].max()

        # Centre is the midpoint of the extents; Z is carried over unchanged
        aa_center = np.array([(x_min + x_max) / 2, (y_min + y_max) / 2, box.center[2]])

        # Size is [X extent, Y extent, original height]
        aa_size = np.array([x_max - x_min, y_max - y_min, box.wlh[2]])

        return self.converter.convert_annotation(aa_center, aa_size, ann['category_name'])

    def _save_sample(self, filename, bev_image, yolo_labels):
        """
        Write one image/label pair that shares the stem ``filename``.

        Args:
            filename: File stem (no extension) used for both outputs
            bev_image: Image array handed to cv2.imwrite
            yolo_labels: Iterable of [class_id, x, y, w, h] rows

        Raises:
            OSError: If cv2.imwrite reports that the image was not written.
                The label file is not created in that case, so the dataset
                never ends up with a label that has no image.
        """
        image_path = self.images_dir / f"{filename}.png"
        # cv2.imwrite signals failure through its return value, not an exception
        if not cv2.imwrite(str(image_path), bev_image):
            raise OSError(f"Failed to write BEV image: {image_path}")

        label_path = self.labels_dir / f"{filename}.txt"
        with open(label_path, 'w') as f:
            for label in yolo_labels:
                # Format: <class> <x> <y> <w> <h>
                f.write(f"{label[0]} {label[1]} {label[2]} {label[3]} {label[4]}\n")

    def process_all_samples(self):
        """
        Process all samples in the nuScenes dataset and save as YOLO BEV dataset.

        Returns:
            int: Total number of samples processed

        Raises:
            OSError: If a BEV image cannot be written to disk.

        Processing Pipeline per Sample:
            1. Load the LIDAR_TOP point cloud (PointCloudProcessor.load_and_transform)
            2. Filter to the region of interest (PointCloudProcessor.filter_points)
            3. Rasterize to a BEV image (BEVRasterizer.rasterize)
            4. Transform every annotation Global → Ego → LIDAR_TOP sensor frame
               and convert its axis-aligned footprint to a YOLO label
            5. Save image and labels with matching filenames

        Technical Details:

            **Coordinate Frames:**
            Annotations are stored in the global frame and are brought into the
            LIDAR_TOP sensor frame here, using the ego_pose and
            calibrated_sensor records of the LIDAR_TOP sample_data.
            NOTE(review): the frame of the points returned by
            load_and_transform is not visible from this class. Labels only
            align with the image if those points are in the same sensor frame —
            confirm against PointCloudProcessor.

            **File Naming Convention:**
            - Format: {scene_name}_{sample_token}.{ext}
            - Image and label share the same stem so they pair up automatically

            **YOLO Label Format (per line in .txt file):**
            <class_id> <x_center> <y_center> <width> <height>
            - Space-separated, one line per object
            - A sample without any accepted annotation gets an empty .txt file
        """
        total_samples = len(self.nusc.sample)

        for sample in self.nusc.sample:
            # LIDAR_TOP is the sensor the BEV image is built from
            lidar_token = sample['data']['LIDAR_TOP']

            # --------------------------------------------------------
            # STAGE 1: Generate BEV Image from Point Cloud
            # --------------------------------------------------------
            points = self.pc_processor.load_and_transform(lidar_token)
            filtered_points = self.pc_processor.filter_points(points)
            bev_image = self.rasterizer.rasterize(filtered_points)

            # --------------------------------------------------------
            # STAGE 2: Transform Annotations to YOLO Format
            # --------------------------------------------------------
            # Pose and calibration are per sample, so look them up once
            sample_data = self.nusc.get('sample_data', lidar_token)
            ego_pose = self.nusc.get('ego_pose', sample_data['ego_pose_token'])
            cs_record = self.nusc.get('calibrated_sensor', sample_data['calibrated_sensor_token'])

            yolo_labels = []
            for ann_token in sample['anns']:
                ann = self.nusc.get('sample_annotation', ann_token)
                yolo_label = self._annotation_to_yolo(ann, ego_pose, cs_record)

                # The converter returns None for filtered or invalid boxes
                if yolo_label is not None:
                    yolo_labels.append(yolo_label)

            # --------------------------------------------------------
            # STAGE 3: Save Image and Labels to Disk
            # --------------------------------------------------------
            scene_name = self.nusc.get('scene', sample['scene_token'])['name']
            self._save_sample(f"{scene_name}_{sample['token']}", bev_image, yolo_labels)

        # Return total number of processed samples
        return total_samples



In [21]:
"""
ModelInitializer: Initialize YOLO model with pretrained weights.

This module handles loading the YOLOv12 model with pretrained COCO weights for
transfer learning. It downloads weights if needed and prepares the model for training.

Key Operations:
- Load YOLOv12 with pretrained weights
- Validate model architecture
- Prepare for transfer learning
"""

from ultralytics import YOLO
from pathlib import Path
#from Globals import MODELS_ROOT


class ModelInitializer:
    """
    Initializer for YOLO models with transfer learning support.

    Builds an Ultralytics YOLO instance either from pretrained weights stored
    in ``models_dir`` or from an architecture config with random weights.

    Attributes:
        model_size: YOLO model size variant ('n', 's', 'm', 'l')
        pretrained: Whether to use pretrained weights
        models_dir: Directory for storing model weights
    """

    def __init__(self, model_size='s', pretrained=True):
        """
        Initialize the model initializer.

        Args:
            model_size: Model variant - 'n' (nano), 's' (small), 'm' (medium), 'l' (large)
            pretrained: Load pretrained weights (True) or random initialization (False)

        Technical Details:
            - The value is interpolated into the weight/config file name without
              validation; an unknown variant fails when the model is loaded
            - models_dir (MODELS_ROOT) is created if it does not exist
        """
        self.model_size = model_size
        self.pretrained = pretrained
        self.models_dir = Path(MODELS_ROOT)
        self.models_dir.mkdir(parents=True, exist_ok=True)

    def initialize(self):
        """
        Initialize YOLO model with optional pretrained weights.

        Returns:
            YOLO model instance

        Technical Details:

            **Weight Management:**
            - Pretrained weights are addressed as ``models_dir/yolo12<size>.pt``.
              Passing the full path (rather than the bare file name) is what
              makes the weights live in models_dir instead of the current
              working directory.
            - Ultralytics is expected to download the file to that path when it
              is missing and to reuse it afterwards — see the Ultralytics
              download documentation.

            **Random Initialization:**
            - With pretrained=False the model is built from the
              ``yolo12<size>.yaml`` architecture config, which Ultralytics
              resolves itself, so no models_dir prefix is applied.

            **Reporting:**
            - Total and trainable parameter counts are computed from the loaded
              model and printed in millions.
        """
        print(f"\nInitializing YOLOv12{self.model_size} model...")

        # ============================================================
        # STEP 1: Construct model name
        # ============================================================
        if self.pretrained:
            # Full path so the checkpoint is stored in / read from models_dir
            model_name = str(self.models_dir / f'yolo12{self.model_size}.pt')
            print("  Loading with COCO pretrained weights")
        else:
            # Architecture config format: yolo12s.yaml
            model_name = f'yolo12{self.model_size}.yaml'
            print("  Random initialization (no pretrained weights)")

        # ============================================================
        # STEP 2: Initialize model
        # ============================================================
        model = YOLO(model_name)

        # ============================================================
        # STEP 3: Print model info
        # ============================================================
        # Materialize once so both counts come from a single traversal
        parameters = list(model.model.parameters())
        total_params = sum(p.numel() for p in parameters)
        trainable_params = sum(p.numel() for p in parameters if p.requires_grad)

        print(f"  Model: YOLOv12{self.model_size}")
        print(f"  Total parameters: {total_params / 1e6:.2f}M")
        print(f"  Trainable parameters: {trainable_params / 1e6:.2f}M")
        print(f"  Pretrained: {self.pretrained}")

        return model



In [22]:
"""
TrainingOrchestrator: Orchestrate two-stage YOLO training pipeline.

This module manages the complete training workflow including warm-up stage with
frozen backbone and fine-tuning stage with all layers trainable. It handles
hyperparameter configuration, training execution, and checkpoint management.

Key Operations:
- Configure training hyperparameters for each stage
- Execute two-stage training (warm-up + fine-tuning)
- Manage checkpoints and logging
- Monitor training progress
"""

from pathlib import Path
#from Globals import RUNS_ROOT


class TrainingOrchestrator:
    """
    Orchestrator for the two-stage YOLO training pipeline.

    Stage 1 trains the supplied model with its first 10 layers frozen. Stage 2
    loads the Stage 1 weights into a fresh YOLO instance and trains with no
    layers frozen at a 10x lower initial learning rate.

    Attributes:
        model: YOLO model instance used for Stage 1
        dataset_yaml: Path to the dataset configuration file (stored as str)
        runs_dir: Directory for training outputs (RUNS_ROOT)
    """

    def __init__(self, model, dataset_yaml):
        """
        Initialize the training orchestrator.

        Args:
            model: YOLO model instance from ModelInitializer
            dataset_yaml: Path to dataset.yaml configuration file

        Technical Details:
            - Training outputs are written below RUNS_ROOT
            - The directory is created here if it does not exist yet
            - Each stage writes to its own run name under <RUNS_ROOT>/detect
        """
        self.model = model
        self.dataset_yaml = str(dataset_yaml)
        self.runs_dir = Path(RUNS_ROOT)
        self.runs_dir.mkdir(parents=True, exist_ok=True)

    def train_stage1_warmup(self, epochs=50, batch_size=16, img_size=1000):
        """
        Stage 1: Warm-up training with frozen backbone.

        Trains `self.model` with the first 10 layers frozen so the remaining
        layers can adapt to the BEV detection task first.

        Args:
            epochs: Number of training epochs (default: 50)
            batch_size: Batch size (default: 16, adjust based on GPU memory)
            img_size: Input image size (default: 1000 to match BEV resolution)

        Returns:
            The object returned by `model.train()`; its `save_dir` attribute is
            the run directory that holds `weights/best.pt`.

        Technical Details:

            **Freezing Strategy:**
            - freeze=10: Freezes the first 10 layers (treated as the backbone)
            - Remaining layers stay trainable

            **Learning Rate:**
            - lr0=0.01 with AdamW, cosine schedule down to lr0 * lrf
            - 3 warm-up epochs

            **Data Augmentation:**
            - Mosaic applied with probability 0.5
            - MixUp disabled (0.0) in this stage
            - Geometric: rotation (15 degrees), translation (0.1), scale (0.5),
              horizontal flip (0.5)

            NOTE(review): Ultralytics may round `imgsz` to a multiple of the
            model stride (1000 -> 1024) - confirm against the training log.
        """
        print("\n" + "="*80)
        print("STAGE 1: WARM-UP TRAINING (Frozen Backbone)")
        print("="*80)

        # Single source of truth for the freeze depth (config + progress print).
        frozen_layers = 10

        # ============================================================
        # Configure Stage 1 hyperparameters
        # ============================================================
        config = {
            # Dataset
            'data': self.dataset_yaml,
            'imgsz': img_size,
            'batch': batch_size,

            # Training duration
            'epochs': epochs,

            # Optimizer
            'optimizer': 'AdamW',
            'lr0': 0.01,          # Initial learning rate
            'lrf': 0.01,          # Final LR (fraction of lr0)
            'momentum': 0.937,
            'weight_decay': 0.0005,

            # Learning rate schedule
            'cos_lr': True,       # Cosine annealing
            'warmup_epochs': 3,
            'warmup_momentum': 0.8,
            'warmup_bias_lr': 0.1,

            # Freezing
            'freeze': frozen_layers,  # Freeze first 10 layers (backbone)

            # Data augmentation
            'degrees': 15.0,      # Rotation
            'translate': 0.1,     # Translation
            'scale': 0.5,         # Scale
            'fliplr': 0.5,        # Horizontal flip
            'mosaic': 0.5,        # Mosaic augmentation
            'mixup': 0.0,         # MixUp disabled for warm-up

            # Hardware
            'device': 0,          # GPU 0 (use 'cpu' for CPU training)
            'workers': 2,         # DataLoader workers
            'cache': False,       # Do not cache images
            'amp': True,          # Automatic mixed precision

            # Logging and saving
            'project': str(self.runs_dir / 'detect'),
            'name': 'stage1_warmup',
            'exist_ok': True,     # Re-use the run directory on re-runs
            'save': True,
            'save_period': 10,    # Save checkpoint every 10 epochs
            'plots': True,
            'verbose': True,

            # Other
            'seed': 42,
            'deterministic': True,
            'val': True,          # Run validation
        }

        # ============================================================
        # Execute training
        # ============================================================
        print(f"\nStarting Stage 1 training...")
        print(f"  Epochs: {epochs}")
        print(f"  Batch size: {batch_size}")
        print(f"  Image size: {img_size}")
        print(f"  Frozen layers: {frozen_layers} (backbone)")

        results = self.model.train(**config)

        print(f"\n✓ Stage 1 complete")
        print(f"  Results saved to: {results.save_dir}")
        print(f"  Best weights: {results.save_dir}/weights/best.pt")

        return results

    def train_stage2_finetune(self, stage1_weights_path, epochs=150, batch_size=16, img_size=1000):
        """
        Stage 2: Fine-tuning with all layers trainable.

        Loads the Stage 1 weights into a new YOLO instance (`self.model` is not
        used here) and trains end-to-end with a reduced learning rate.

        Args:
            stage1_weights_path: Path to best weights from Stage 1
            epochs: Number of training epochs (default: 150)
            batch_size: Batch size (default: 16)
            img_size: Input image size (default: 1000)

        Returns:
            The object returned by `model.train()`; its `save_dir` attribute is
            the run directory that holds `weights/best.pt`.

        Technical Details:

            **Unfreezing Strategy:**
            - freeze=0: All layers trainable

            **Learning Rate:**
            - lr0=0.001: 10x lower than Stage 1
            - No warm-up epochs

            **Data Augmentation:**
            - Same geometric settings as Stage 1
            - Stronger mixing than Stage 1: mosaic=1.0 and mixup=0.1

            **Early Stopping:**
            - patience=50: Stop if no improvement for 50 epochs
        """
        print("\n" + "="*80)
        print("STAGE 2: FINE-TUNING (All Layers Trainable)")
        print("="*80)

        # ============================================================
        # Load best model from Stage 1
        # ============================================================
        from ultralytics import YOLO
        model = YOLO(stage1_weights_path)
        print(f"Loaded Stage 1 weights from: {stage1_weights_path}")

        # ============================================================
        # Configure Stage 2 hyperparameters
        # ============================================================
        config = {
            # Dataset
            'data': self.dataset_yaml,
            'imgsz': img_size,
            'batch': batch_size,

            # Training duration
            'epochs': epochs,

            # Optimizer
            'optimizer': 'AdamW',
            'lr0': 0.001,         # Lower LR for fine-tuning
            'lrf': 0.001,
            'momentum': 0.937,
            'weight_decay': 0.0005,

            # Learning rate schedule
            'cos_lr': True,
            'warmup_epochs': 0,   # No warmup needed

            # Freezing
            'freeze': 0,          # Unfreeze all layers

            # Data augmentation (geometry as in Stage 1; mosaic/mixup stronger)
            'degrees': 15.0,
            'translate': 0.1,
            'scale': 0.5,
            'fliplr': 0.5,
            'mosaic': 1.0,
            'mixup': 0.1,

            # Hardware (cache/amp are not set here, so library defaults apply)
            'device': 0,
            'workers': 8,

            # Logging and saving
            'project': str(self.runs_dir / 'detect'),
            'name': 'stage2_finetune',
            'exist_ok': True,
            'save': True,
            'save_period': 10,
            'plots': True,
            'verbose': True,

            # Early stopping
            'patience': 50,       # Stop if no improvement for 50 epochs

            # Other
            'seed': 42,
            'deterministic': True,
            'val': True,
        }

        # ============================================================
        # Execute training
        # ============================================================
        print(f"\nStarting Stage 2 training...")
        print(f"  Epochs: {epochs}")
        print(f"  Batch size: {batch_size}")
        print(f"  Image size: {img_size}")
        print(f"  All layers trainable")

        results = model.train(**config)

        print(f"\n✓ Stage 2 complete")
        print(f"  Results saved to: {results.save_dir}")
        print(f"  Best weights: {results.save_dir}/weights/best.pt")

        return results

    def train_full_pipeline(self, stage1_epochs=50, stage2_epochs=150, batch_size=16, img_size=1000):
        """
        Execute complete two-stage training pipeline.

        Convenience method that runs both training stages sequentially,
        automatically passing the best weights from Stage 1 to Stage 2.

        Args:
            stage1_epochs: Epochs for warm-up stage (default: 50)
            stage2_epochs: Epochs for fine-tuning stage (default: 150)
            batch_size: Batch size for both stages (default: 16)
            img_size: Input image size for both stages (default: 1000)

        Returns:
            Tuple of (stage1_results, stage2_results)
        """
        # Stage 1: Warm-up
        stage1_results = self.train_stage1_warmup(
            epochs=stage1_epochs,
            batch_size=batch_size,
            img_size=img_size
        )

        # Get best weights from Stage 1
        stage1_best = Path(stage1_results.save_dir) / 'weights' / 'best.pt'

        # Stage 2: Fine-tuning (same image size so both stages see the same input)
        stage2_results = self.train_stage2_finetune(
            stage1_weights_path=stage1_best,
            epochs=stage2_epochs,
            batch_size=batch_size,
            img_size=img_size
        )

        return stage1_results, stage2_results



In [29]:
from pathlib import Path
from nuscenes.nuscenes import NuScenes
# Dataset config shared by the data-preparation, training and evaluation steps.
dataset_yaml_path = Path(DATA_ROOT).joinpath('dataset.yaml')

# Build the model once at notebook level so every training stage re-uses it.
model_initializer = ModelInitializer(pretrained=True, model_size='s')
model = model_initializer.initialize()

# The orchestrator is referenced as a global by the train* helpers below.
trainer = TrainingOrchestrator(model=model, dataset_yaml=dataset_yaml_path)




def download():
    """Check that the raw nuScenes data is on disk.

    Returns:
        0 when DataDownloader.check_and_prompt() reports the dataset is present,
        1 otherwise (the downloader prints manual download instructions).
    """
    dataset_present = DataDownloader().check_and_prompt()
    return 0 if dataset_present else 1

def validate():
    """Run DataValidator on the dataset.

    Returns:
        0 when validation succeeds, 1 (after printing a failure notice) otherwise.
    """
    dataset_ok = DataValidator().validate()
    if dataset_ok:
        return 0

    print("Dataset validation failed")
    return 1

def inspect(sample_index=10):
    """Load nuScenes and run the raw-data inspection (Inspection Point 1).

    Lists scenes, renders the devkit visualizations, then inspects the LIDAR_TOP
    point cloud and annotations of one sample and prints its coordinate ranges.

    Args:
        sample_index: Index into `nusc.sample` of the sample to inspect in
            detail (default: 10, the value previously hard-coded).

    Returns:
        The loaded NuScenes object, so later stages can re-use it.

    Raises:
        IndexError: If `sample_index` is outside `nusc.sample`.
    """
    nusc = NuScenes(version=NUSCENES_VERSION, dataroot=NUSCENES_ROOT, verbose=False)

    # One inspector serves both the overview and the per-sample inspection.
    inspector = RawDataInspector(nusc)
    inspector.list_scenes()
    inspector.visualize_sample()
    inspector.visualize_sample_data()
    inspector.visualize_annotation()

    sample = nusc.sample[sample_index]
    lidar_token = sample['data']['LIDAR_TOP']

    pc_info = inspector.inspect_point_cloud(lidar_token)
    # Called for whatever it reports; the return value is not needed here.
    inspector.inspect_annotations(sample['token'])
    inspector.visualize_3d_scene(sample['token'])

    print("\n=== Inspection Point 1: Raw Data ===")
    print(f"\nPoint Cloud: {pc_info['num_points']} points")
    print(f"X: [{pc_info['x_range'][0]:.2f}, {pc_info['x_range'][1]:.2f}] m")
    print(f"Y: [{pc_info['y_range'][0]:.2f}, {pc_info['y_range'][1]:.2f}] m")
    print(f"Z: [{pc_info['z_range'][0]:.2f}, {pc_info['z_range'][1]:.2f}] m")
    return nusc
def preprocess(nusc=None):
    """Run preprocessing, the BEV inspection point, and the train/val/test split.

    Args:
        nusc: Loaded NuScenes object (e.g. the value returned by `inspect()`).
            When omitted, the dataset is loaded here from NUSCENES_ROOT so the
            stage can be run on its own.

    Returns:
        The splits object produced by `DataSplitter.split()`.
    """
    if nusc is None:
        # Allows a bare `preprocess()` call, as used in the notebook cells.
        nusc = NuScenes(version=NUSCENES_VERSION, dataroot=NUSCENES_ROOT, verbose=False)

    print("\n=== Preprocessing Stage ===")
    preprocessor = DataPreprocessor(nusc)
    total = preprocessor.process_all_samples()
    print(f"Processed {total} samples")

    print("\n=== Inspection Point 2: Preprocessed Data ===")
    bev_inspector = BEVInspector()
    bev_images, yolo_labels_list = bev_inspector.load_samples(4)
    bev_inspector.visualize_grid(bev_images, yolo_labels_list, num_cols=2)

    print("\n=== Data Preparation Stage ===")
    splitter = DataSplitter(train_ratio=0.7, val_ratio=0.15, test_ratio=0.15)
    splits = splitter.split()

    # Writes the dataset config to the notebook-level dataset_yaml_path.
    config_generator = DatasetConfigGenerator()
    config_generator.generate(splits, dataset_yaml_path)
    print("✓ Data preparation complete")
    return splits
def train():
    """Run both training stages through the notebook-level `trainer`.

    Uses 50 warm-up epochs, 150 fine-tuning epochs and batch size 16, then
    prints where the Stage 2 best weights were written. Returns nothing.
    An interactive "proceed?" prompt used to guard this call; it is disabled
    so the notebook can run unattended.
    """
    print("\n=== Training Stage ===")

    pipeline_args = {
        'stage1_epochs': 50,
        'stage2_epochs': 150,
        'batch_size': 16,
    }
    _warmup_results, finetune_results = trainer.train_full_pipeline(**pipeline_args)

    weights_file = Path(finetune_results.save_dir).joinpath('weights', 'best.pt')
    print(f"\n✓ Training complete. Best model: {weights_file}")
def train_warmup(epochs=50, batch_size=16):
    """Run only Stage 1 (warm-up) through the notebook-level `trainer`.

    Args:
        epochs: Warm-up epochs (default: 50)
        batch_size: Batch size (default: 16)

    Returns:
        Path to `weights/best.pt` inside the Stage 1 run directory.
    """
    print("\n=== Training Stage - Warmup only ===")
    warmup_results = trainer.train_stage1_warmup(epochs=epochs, batch_size=batch_size)

    # Stage 2 is started from this checkpoint.
    return Path(warmup_results.save_dir).joinpath('weights', 'best.pt')
def train_finetune(stage1_best, epochs=150, batch_size=16):
    """Run only Stage 2 (fine-tuning) through the notebook-level `trainer`.

    Args:
        stage1_best: Path to the Stage 1 best weights to start from
        epochs: Fine-tuning epochs (default: 150)
        batch_size: Batch size (default: 16)

    Returns:
        Whatever `trainer.train_stage2_finetune` returns for the run.
    """
    print("\n=== Training Stage - Fine-tuning ===")
    return trainer.train_stage2_finetune(
        stage1_weights_path=stage1_best,
        epochs=epochs,
        batch_size=batch_size,
    )
def evaluate(best_model_path, test_images_dir=None):
    """Evaluate a trained model and produce the analysis and visual outputs.

    Args:
        best_model_path: Path to the weights file to evaluate
        test_images_dir: Directory of test images used for the prediction
            visualizations. Defaults to `<DATA_ROOT>/split_data/images/test`,
            which is the location that used to be hard-coded here.

    Returns:
        0 once every step has run.
    """
    print("\n=== Evaluation Stage ===")

    if test_images_dir is None:
        # Derived from the notebook globals rather than an absolute Drive path.
        test_images_dir = f"{DATA_ROOT}/split_data/images/test"

    evaluator = ModelEvaluator(best_model_path, dataset_yaml_path)
    results = evaluator.evaluate()
    evaluator.print_metrics(results)

    analyzer = PerformanceAnalyzer()
    summary = analyzer.compute_performance_summary(results)

    labels_dir = Path(PREPROCESSED_ROOT) / 'labels'
    analyzer.analyze_class_distribution(labels_dir)

    visualizer = ResultsVisualizer()
    visualizer.visualize_predictions(evaluator.model, test_images_dir, num_samples=10)
    visualizer.generate_performance_report(results)

    print("\n✓ Pipeline complete")
    print(f"mAP@0.5: {summary['overall']['mAP_50']:.4f}")
    print(f"mAP@0.5:0.95: {summary['overall']['mAP_50_95']:.4f}")

    return 0


#if __name__ == "__main__":
#    exit(main())



Initializing YOLOv12s model...
  Loading with COCO pretrained weights
  Model: YOLOv12s
  Total parameters: 9.29M
  Trainable parameters: 0.00M
  Pretrained: True


In [None]:
# Run the pipeline stages in order, one cell per stage.
# Step 1: check that the raw nuScenes data is on disk.
# download() returns 0 when the dataset directory exists and 1 when it is missing.
download()

nuScenes dataset not found at: build/data/raw/v1.0-mini

Download instructions:
1. Visit: https://www.nuscenes.org/nuscenes#download
2. Download v1.0-mini (4 GB)
3. Extract to: build/data/raw/v1.0-mini



1

In [None]:
# Step 2: validate the dataset; validate() returns 0 on success and 1 on failure.
validate()

In [None]:
# Step 3: inspect the raw data (Inspection Point 1).
# inspect() returns the loaded NuScenes object; it is not captured in this cell.
inspect()

In [None]:
# Step 4: preprocess, inspect the BEV output, and build the train/val/test split.
# preprocess() needs the loaded dataset, so load it here and pass it in.
nusc = NuScenes(version=NUSCENES_VERSION, dataroot=NUSCENES_ROOT, verbose=False)
splits = preprocess(nusc)

In [None]:
# Step 5a: Stage 1 (warm-up) training only.
# To run both stages (warm-up and fine-tuning) in one go, call train() instead:
#train()
stage1_results = trainer.train_stage1_warmup(
  epochs=50,
  batch_size=16
)
# Checkpoint that Stage 2 starts from.
stage1_best = Path(stage1_results.save_dir) / 'weights' / 'best.pt'




STAGE 1: WARM-UP TRAINING (Frozen Backbone)

Starting Stage 1 training...
  Epochs: 50
  Batch size: 16
  Image size: 1000
  Frozen layers: 10 (backbone)
Ultralytics 8.3.231 🚀 Python-3.12.12 torch-2.9.0+cu126 CUDA:0 (Tesla T4, 15095MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=True, cutmix=0.0, data=/content/drive/MyDrive/Colab Notebooks/MSAAI521_FinalProject/build/data/dataset.yaml, degrees=15.0, deterministic=True, device=0, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=50, erasing=0.4, exist_ok=True, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=10, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=1000, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, m

In [None]:
# Show where the Stage 1 best weights were written.
print(stage1_best)

/content/drive/MyDrive/Colab Notebooks/MSAAI521_FinalProject/build/runs/detect/stage1_warmup/weights/best.pt


In [None]:
# Step 5b: Stage 2 (fine-tuning), started from the Stage 1 best weights.

# On a fresh kernel where Stage 1 was not re-run, uncomment this line to point
# at the previously saved Stage 1 checkpoint:
#stage1_best = '/content/drive/MyDrive/Colab Notebooks/MSAAI521_FinalProject/build/runs/detect/stage1_warmup/weights/best.pt'

# Note: this run uses batch_size=8, whereas the Stage 1 cell used 16.
stage2_results = trainer.train_stage2_finetune(
    stage1_weights_path=stage1_best,
    epochs=150,
    batch_size=8
)


STAGE 2: FINE-TUNING (All Layers Trainable)
Loaded Stage 1 weights from: /content/drive/MyDrive/Colab Notebooks/MSAAI521_FinalProject/build/runs/detect/stage1_warmup/weights/best.pt

Starting Stage 2 training...
  Epochs: 150
  Batch size: 8
  Image size: 1000
  All layers trainable
Ultralytics 8.3.231 🚀 Python-3.12.12 torch-2.9.0+cu126 CUDA:0 (Tesla T4, 15095MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=8, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=True, cutmix=0.0, data=/content/drive/MyDrive/Colab Notebooks/MSAAI521_FinalProject/build/data/dataset.yaml, degrees=15.0, deterministic=True, device=0, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=150, erasing=0.4, exist_ok=True, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=0, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, img

In [31]:
# Step 6: evaluate the fine-tuned model.

# Stage 2 saves its run to <RUNS_ROOT>/detect/stage2_finetune, with the best
# checkpoint at weights/best.pt. Build the path from RUNS_ROOT instead of
# repeating the absolute Drive location.
best_model_path = f"{RUNS_ROOT}/detect/stage2_finetune/weights/best.pt"

evaluate(best_model_path)


=== Evaluation Stage ===

Loading model from: /content/drive/MyDrive/Colab Notebooks/MSAAI521_FinalProject/build/runs/detect/stage2_finetune/weights/best.pt
✓ Model loaded successfully

MODEL EVALUATION

Evaluation settings:
  Confidence threshold: 0.25
  NMS IoU threshold: 0.45
  Image size: 1000

Running inference on test set...
Ultralytics 8.3.231 🚀 Python-3.12.12 torch-2.9.0+cu126 CUDA:0 (Tesla T4, 15095MiB)
YOLOv12s summary (fused): 159 layers, 9,232,428 parameters, 0 gradients, 21.2 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.4±0.1 ms, read: 47.3±7.0 MB/s, size: 65.8 KB)
[K[34m[1mval: [0mScanning /content/drive/MyDrive/Colab Notebooks/MSAAI521_FinalProject/build/data/split_data/labels/test.cache... 41 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 41/41 80.4Kit/s 0.0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 3/3 1.6s/it 4.7s
                   all         41       1096      0.771      0.388 

0