# 2. Data Exploration & Registration

This notebook handles:
- Loading and registering COCO-format datasets with Detectron2.
- Exploring dataset statistics (categories, image counts, annotation distribution).
- Visualizing annotated samples.

**Prerequisites:** Run `1_setup.ipynb` first.

## 2.1 Imports

In [None]:
import os
import json
from collections import Counter

import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

from detectron2.data.datasets import register_coco_instances
from detectron2.data import MetadataCatalog, DatasetCatalog

import config
from utils.visualization import show_dataset_samples, show_specific_image

## 2.2 Choose Dataset

Select which dataset to explore by setting `DATASET_SOURCE` and `SUBSET`.

**Part 1 (AGAR):** `DATASET_SOURCE = 'agar'`, `SUBSET` ∈ `{'total', 'bright', 'dark', 'vague', 'lowres'}`

**Part 2 (Curated):** `DATASET_SOURCE = 'roboflow'` (`SUBSET` is ignored); download the dataset first.

In [None]:
# ===================== CONFIGURE HERE =====================
DATASET_SOURCE = "agar"      # 'agar' or 'roboflow'
SUBSET = "total"             # For AGAR: 'total', 'bright', 'dark', 'vague', 'lowres'
# =========================================================

# Resolve, for the selected source, the three COCO annotation files, the image
# directory of each split, and the names under which the splits get registered.
# Invalid settings are rejected here with an explicit message instead of
# surfacing later as a NameError/KeyError in an unrelated line.
if DATASET_SOURCE == "agar":
    if SUBSET not in config.AGAR_DATASETS:
        raise ValueError(
            f"Unknown SUBSET {SUBSET!r} for DATASET_SOURCE='agar'; "
            f"available subsets: {list(config.AGAR_DATASETS)}"
        )
    dataset = config.AGAR_DATASETS[SUBSET]
    img_dir = config.AGAR_IMG_DIR
    train_path, val_path, test_path = dataset["train"], dataset["val"], dataset["test"]
    # All AGAR splits read their images from the same directory.
    img_dir_train = img_dir_val = img_dir_test = img_dir
    train_name = f"{SUBSET}_train"
    val_name = f"{SUBSET}_val"
    test_name = f"{SUBSET}_test"

elif DATASET_SOURCE == "roboflow":
    # Download Roboflow dataset (run once)
    # !curl -L "{config.ROBOFLOW_DOWNLOAD_URL}" > roboflow.zip; unzip roboflow.zip; rm roboflow.zip
    dataset = config.ROBOFLOW_DATASETS["curated"]
    train_path = dataset["train"]
    val_path = dataset["val"]
    test_path = dataset["test"]
    # Each Roboflow split has its own image directory.
    img_dir_train = dataset["train_dir"]
    img_dir_val = dataset["val_dir"]
    img_dir_test = dataset["test_dir"]
    train_name = "robo_train"
    val_name = "robo_val"
    test_name = "robo_test"

else:
    raise ValueError(
        f"Unknown DATASET_SOURCE {DATASET_SOURCE!r}; expected 'agar' or 'roboflow'"
    )

# SUBSET only applies to AGAR; the Roboflow branch always uses the 'curated' entry.
print(f"Dataset: {DATASET_SOURCE} / {SUBSET if DATASET_SOURCE == 'agar' else 'curated'}")
print(f"Train: {train_path}")
print(f"Val:   {val_path}")
print(f"Test:  {test_path}")

## 2.3 Register Datasets

In [None]:
# Register datasets with Detectron2 (safe to re-run)
# Any previous registration under the same name is dropped first so that
# re-running this cell (possibly with different paths) does not fail.
# The two catalogs are checked independently: a name may be present in only
# one of them, and removing a name that is absent would raise.
for name in [train_name, val_name, test_name]:
    if name in DatasetCatalog.list():
        DatasetCatalog.remove(name)
    if name in MetadataCatalog.list():
        MetadataCatalog.remove(name)

# The empty dict is the extra-metadata argument; no additional metadata is attached.
register_coco_instances(train_name, {}, train_path, img_dir_train)
register_coco_instances(val_name, {}, val_path, img_dir_val)
register_coco_instances(test_name, {}, test_path, img_dir_test)

print(f"Registered: {train_name}, {val_name}, {test_name}")

## 2.4 Dataset Statistics

In [None]:
# Load the training split's COCO annotation file and print summary statistics.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding (JSON files are expected to be UTF-8).
with open(train_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Total annotation count; reused as the denominator for the percentages below.
num_annotations = len(data['annotations'])

print("Categories:", data['categories'])
print(f"Number of training images: {len(data['images'])}")
print(f"Number of annotations: {num_annotations}")

# Class distribution: number of annotations per category id.
class_counts = Counter(ann['category_id'] for ann in data['annotations'])
cat_names = {c['id']: c['name'] for c in data['categories']}
print("\nClass distribution:")
for cat_id, count in sorted(class_counts.items()):
    # Fall back to a synthetic label when an annotation references a category
    # id that is not listed under 'categories'.
    name = cat_names.get(cat_id, f"class_{cat_id}")
    print(f"  {name}: {count} ({100*count/num_annotations:.1f}%)")

## 2.5 Visualize Training Samples

In [None]:
# Display 5 annotated samples from the registered training split
# (scale=0.5, fixed seed=42) using the project's visualization helper.
show_dataset_samples(
    train_name,
    num_samples=5,
    scale=0.5,
    seed=42,
)

## 2.6 Visualize a Specific Image

In [None]:
# Uncomment the call below and change the filename to inspect a specific image
# show_specific_image(train_name, "429.jpg", img_dir_train, scale=0.5)