# Object Detection Metrics with Confidence Intervals

## 1. Setup and Installation

First, ensure you have the required packages installed:

In [None]:
%pip install tqdm PyYAML ultralytics

In [None]:
import time
from pathlib import Path
from confidenceinterval import MetricEvaluator 
from ultralytics import YOLO
import shutil

# Reaching this line means every import in this cell resolved.
print("✓ Imports successful")

## 2. Load Model and Run Predictions

We'll use a pre-trained YOLOv8 model to generate predictions on the COCO128 validation set.

In [None]:
# Load the YOLOv8 nano checkpoint (smallest and fastest variant).
print('Loading YOLOv8n model...')
model = YOLO('yolov8n.pt')  # Automatically downloads if not present
print('✓ Model loaded')

# Run one validation pass against the COCO128 dataset config.
# NOTE(review): presumably this call is also what fetches COCO128 on first
# use, and `epochs` looks like a training-only setting -- confirm both.
model.val(data="coco128.yaml", epochs=1)

# Dataset root used by every later step; relative to the working directory.
# NOTE(review): verify this matches the directory the dataset is stored in.
dataset_path = Path('coco128')

# Check the dataset structure
def _move_files_up(candidates, source_dir, target_dir, kind, verbose):
    """Move every regular file in *candidates* from *source_dir* to *target_dir*.

    *source_dir* is removed afterwards if nothing is left in it; it is kept
    when anything (e.g. a sub-directory or an unmatched file) still lives there.
    """
    target_dir.mkdir(exist_ok=True, parents=True)
    moved = 0
    # Snapshot the listing so the directory is never mutated while it is
    # still being iterated.
    for entry in list(candidates):
        if entry.is_file():
            shutil.move(str(entry), str(target_dir / entry.name))
            moved += 1
    if verbose:
        print(f"🔧 Moved {moved} {kind} from {source_dir} → {target_dir}")
    if not any(source_dir.iterdir()):
        source_dir.rmdir()


def validate_yolo_dataset(
    dataset_path: Path,
    split: str = "train2017",
    fix_structure: bool = False,
    verbose: bool = True,
):
    """Check that a YOLO dataset has matching image and label files.

    The dataset is expected to be flat: images directly in
    ``<dataset_path>/images`` and ``.txt`` labels directly in
    ``<dataset_path>/labels``. Files are paired by their stem.

    Args:
        dataset_path: Root directory of the dataset.
        split: Name of the split sub-folder (``images/<split>`` and
            ``labels/<split>``); only used when ``fix_structure`` is true.
        fix_structure: When true, first move the files found in the split
            sub-folders up into ``images/`` and ``labels/`` and delete the
            split folders that end up empty. This modifies the dataset on disk.
        verbose: When true, print a progress report and any problems found.

    Returns:
        A ``(is_valid, issues)`` tuple. ``is_valid`` is true when at least one
        image/label pair was matched. ``issues`` is a dict of lists:
        ``missing_labels`` (image stems without a label), ``missing_images``
        (label stems without an image) and ``structure_errors`` (missing
        directories). ``extra_labels`` and ``extra_images`` are part of the
        returned dict but are never filled in by this function.
    """
    dataset_path = Path(dataset_path)

    issues = {
        "missing_labels": [],
        "missing_images": [],
        "extra_labels": [],
        "extra_images": [],
        "structure_errors": [],
    }

    def _fail(message):
        # Record the structural problem and, in verbose mode, also show it so
        # the failure is not silent when the return value is discarded.
        issues["structure_errors"].append(message)
        if verbose:
            print(f"\n❌ {message}")
        return False, issues

    if verbose:
        print("\n" + "=" * 70)
        print("YOLO DATASET VALIDATION")
        print("=" * 70)
        print(f"Dataset path: {dataset_path}")
        print(f"Split: {split}")

    if not dataset_path.is_dir():
        return _fail(f"Dataset directory not found: {dataset_path}")

    images_root = dataset_path / "images"
    labels_root = dataset_path / "labels"
    images_split = images_root / split
    labels_split = labels_root / split

    # --------------------------------------------------
    # Optional fix: move files UP from the split folder to the root
    # --------------------------------------------------
    if fix_structure:
        if images_split.is_dir():
            # Every file in the image split folder is moved, whatever its type.
            _move_files_up(
                images_split.iterdir(), images_split, images_root, "images", verbose
            )
        if labels_split.is_dir():
            # Only *.txt label files are moved.
            _move_files_up(
                labels_split.glob("*.txt"), labels_split, labels_root, "labels", verbose
            )

    # --------------------------------------------------
    # Validate final structure
    # --------------------------------------------------
    if not images_root.is_dir():
        return _fail(f"Images directory not found: {images_root}")

    if not labels_root.is_dir():
        return _fail(f"Labels directory not found: {labels_root}")

    if verbose:
        print("\n✓ Directory structure:")
        print(f"  Images: {images_root}")
        print(f"  Labels: {labels_root}")

    # --------------------------------------------------
    # Collect files
    # --------------------------------------------------
    image_suffixes = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}
    # Compare the lower-cased suffix so mixed-case names such as "x.Jpg" are
    # recognised too, and skip directories that merely look like images.
    image_stems = {
        entry.stem
        for entry in images_root.iterdir()
        if entry.is_file() and entry.suffix.lower() in image_suffixes
    }
    label_stems = {
        entry.stem for entry in labels_root.glob("*.txt") if entry.is_file()
    }

    if verbose:
        print("\n📊 Dataset Statistics:")
        print(f"  Total images: {len(image_stems)}")
        print(f"  Total labels: {len(label_stems)}")

    issues["missing_labels"] = sorted(image_stems - label_stems)
    issues["missing_images"] = sorted(label_stems - image_stems)

    matched = len(image_stems & label_stems)
    is_valid = matched > 0 and not issues["structure_errors"]

    if verbose:
        # Only the first ten offenders of each kind are listed to keep the
        # report short; the full lists are in the returned ``issues``.
        if issues["missing_labels"]:
            print(f"\n⚠️ Images without labels: {len(issues['missing_labels'])}")
            for stem in issues["missing_labels"][:10]:
                print(f"   - {stem}")

        if issues["missing_images"]:
            print(f"\n⚠️ Labels without images: {len(issues['missing_images'])}")
            for stem in issues["missing_images"][:10]:
                print(f"   - {stem}.txt")

        print(f"\n{'✓' if is_valid else '❌'} Matched pairs: {matched}")

    return is_valid, issues

# Flatten images/train2017 and labels/train2017 into images/ and labels/,
# then report how many image/label pairs match.
validate_yolo_dataset(dataset_path, split='train2017', fix_structure=True, verbose=True)

# Run predictions on the COCO128 images
print('\nRunning predictions on COCO128...')
print('=' * 50)

# fix_structure=True above moves the images up into images/ and removes the
# emptied split folder, so only use the split folder if it still exists.
images_dir = dataset_path / 'images'
split_dir = images_dir / 'train2017'
predict_source = split_dir if split_dir.is_dir() else images_dir

# perf_counter is monotonic, which makes it the safer clock for elapsed time.
start = time.perf_counter()

results = model.predict(
    source=predict_source,
    imgsz=640,
    conf=0.25,  # Confidence threshold
    verbose=False
)

elapsed = time.perf_counter() - start

print(f"✓ Predictions completed in {elapsed:.2f}s")
print(f"  - Predicted on {len(results)} images")
if results:
    # Guarded so an empty source folder does not raise ZeroDivisionError.
    print(f"  - Average: {elapsed/len(results)*1000:.1f}ms per image")

## 3. Compute mAP@0.5:0.95 with Confidence Interval

Now we'll compute the mean Average Precision with a 95% confidence interval using bootstrap resampling.

In [None]:
# Initialize the metric evaluator
evaluate = MetricEvaluator()
print("✓ Metric evaluator initialized")

# Compute mAP@0.5:0.95 with confidence interval
print('\nComputing mAP@0.5:0.95 with 95% CI...')
print('=' * 50)

# The call returns the point estimate plus the (lower, upper) interval bounds.
map_value, (lower, upper) = evaluate.evaluate(
    y_true=str(dataset_path),  # Dataset root directory
    y_pred=results,             # prediction results
    task='detection',
    metric='map',           # available: 'map', 'precision', 'recall'
    method='bootstrap_percentile', # default method 'bootstrap_bca'
    n_resamples=100,  # Use 100 for speed (use 1000+ for production)
    plot=True,  # Create histogram plot
    # plot_per_class=True, # Plot per-class distributions default: False
)

print("\n" + "=" * 50)
print("RESULTS")
print("=" * 50)
print(f"mAP@0.5:0.95: {map_value:.4f}")
print(f"95% CI: [{lower:.4f}, {upper:.4f}]")
print(f"CI width: {upper - lower:.4f}")
print("\n✓ Histogram plot saved to results/ directory")

Image not found for label 000000000656.txt, using default shape (640, 640)
Image not found for label 000000000659.txt, using default shape (640, 640)


✓ Metric evaluator initialized

Computing mAP@0.5:0.95 with 95% CI...
  ⚠️  Skipped 2 predictions without matching ground truth labels:
     - 000000000250.txt (label file not found)
     - 000000000508.txt (label file not found)


Bootstrap CI:   3%|▎         | 3/100 [00:00<00:04]

Bootstrap CI: 100%|██████████| 100/100 [00:04<00:00]


Histogram plot saved to: results/mAP@0.5:0.95_bootstrap_percentile_20251222_134358.png

RESULTS
mAP@0.5:0.95: 0.4916
95% CI: [0.4389, 0.5357]
CI width: 0.0967

✓ Histogram plot saved to results/ directory
