# 01 – Data Exploration

This notebook inspects the raw image inventory and the derived labeling artifacts. It expects the preprocessing step to have produced `image_manifest.csv`, `majority_labels.csv`, and `label_summary.json` under `/app/output`. The focus is to understand dataset size, folder/source composition, and the gap between available images and labels.

In [2]:
from pathlib import Path
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Plot defaults applied to the figures created in this notebook.
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 5)})

# Locations of the preprocessing artifacts read by this notebook; everything
# lives directly under `output_root`.
output_root = Path("/app/output")
manifest_path = output_root.joinpath("image_manifest.csv")
labels_path = output_root.joinpath("majority_labels.csv")
summary_path = output_root.joinpath("label_summary.json")

# Helper to ensure directories exist for exports
def ensure_dir(path: "str | Path") -> Path:
    """Create the parent directory of ``path`` if needed and return ``path``.

    Meant for export targets (e.g. figure files): call it on the destination
    file path before writing so the containing folder is guaranteed to exist.

    Args:
        path: Destination file path. Strings and other path-like values are
            accepted and coerced to ``Path``.

    Returns:
        ``path`` as a ``Path`` object, so the result can be passed straight
        to the function that writes the file.
    """
    path = Path(path)
    # Only the parent is created: `path` itself is treated as a file name.
    path.parent.mkdir(parents=True, exist_ok=True)
    return path


## Load manifest and labels

In [5]:
# Read each preprocessing artifact when it exists; otherwise fall back to an
# empty frame so the cells below can still run and report the gap.
if manifest_path.exists():
    manifest = pd.read_csv(manifest_path)
else:
    manifest = pd.DataFrame()

if labels_path.exists():
    majority_labels = pd.read_csv(labels_path)
else:
    majority_labels = pd.DataFrame()

# Placeholders filled in by the "Label coverage" section; defined here so the
# later `.empty` checks work even when that section produces nothing.
dataset = pd.DataFrame()
missing = pd.DataFrame()

print(f"Manifest rows: {len(manifest)}")
print(f"Majority labels rows: {len(majority_labels)}")

# The label summary is optional: show it only when the JSON file is present.
summary = {}
if summary_path.exists():
    with summary_path.open(encoding="utf-8") as summary_file:
        summary = json.load(summary_file)
    display(summary)


Manifest rows: 0
Majority labels rows: 0


## Dataset composition

In [7]:
if not manifest.empty:
    # Peek at the raw rows before aggregating.
    display(manifest.head())

    # Image counts per grouping column. Folder and identifier counts are
    # ranked largest-first; source counts keep the order groupby returns.
    by_folder = (
        manifest.groupby("folder")
        .size()
        .sort_values(ascending=False)
    )
    by_source = manifest.groupby("source").size()
    by_identifier = (
        manifest.groupby("identifier")
        .size()
        .sort_values(ascending=False)
    )

    print("Images by folder (top 10):")
    display(by_folder.head(10))

    print("Images by source:")
    display(by_source)

    print("Images by identifier (top 10):")
    display(by_identifier.head(10))
else:
    print("Manifest is empty or missing; run preprocessing first.")


Manifest is empty or missing; run preprocessing first.


## Visualization: source and identifier distribution

In [None]:
if not manifest.empty:
    # Two side-by-side panels: image count per source, and the ten
    # identifiers with the most images.
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    source_ax, identifier_ax = axes

    sns.countplot(x="source", data=manifest, ax=source_ax)
    source_ax.set(title="Images by source", xlabel="Source", ylabel="Count")

    top_ids = (
        manifest["identifier"]
        .value_counts()
        .nlargest(10)
        .reset_index()
    )
    # Fix the column names explicitly so the barplot can refer to them.
    top_ids.columns = ["identifier", "count"]
    sns.barplot(
        x="identifier",
        y="count",
        data=top_ids,
        ax=identifier_ax,
        color="steelblue",
    )
    identifier_ax.set(
        title="Top 10 identifiers by image count",
        xlabel="Identifier",
        ylabel="Count",
    )
    identifier_ax.tick_params(axis='x', rotation=45)

    fig.tight_layout()
    # ensure_dir creates the figures folder and hands the path back.
    figure_path = ensure_dir(output_root / "figures" / "source_identifier.png")
    fig.savefig(figure_path, dpi=150)
    plt.show()


## Label coverage

In [None]:
if not (manifest.empty or majority_labels.empty):
    # Left join keeps every manifest row; images without a matching label
    # end up with NaN in `majority_label`.
    # NOTE(review): assumes `file_upload` is unique in majority_labels;
    # duplicate keys would multiply manifest rows in the merge. Confirm.
    dataset = manifest.merge(majority_labels, how="left", on="file_upload")
    has_label = dataset["majority_label"].notna()
    labeled = dataset[has_label]
    missing = dataset[~has_label]

    print(f"Labeled images: {len(labeled)} / {len(dataset)} ({len(labeled)/len(dataset):.1%})")
    print(f"Unlabeled images: {len(missing)}")

    by_source = labeled.groupby("source")["file_upload"].count()
    print("Labeled images by source:")
    display(by_source)

    # Horizontal bars, ordered from the source with the most labeled images.
    coverage_fig, coverage_ax = plt.subplots(figsize=(6, 4))
    sns.countplot(
        y="source",
        data=labeled,
        order=by_source.sort_values(ascending=False).index,
        ax=coverage_ax,
    )
    coverage_ax.set_title("Labeled coverage by source")
    coverage_ax.set_xlabel("Count")
    coverage_ax.set_ylabel("Source")
    coverage_fig.tight_layout()

    coverage_path = ensure_dir(output_root / "figures" / "label_coverage_by_source.png")
    coverage_fig.savefig(coverage_path, dpi=150)
    plt.show()
else:
    print("Manifest or labels missing; run preprocessing first.")


## Missing labels (top examples)

In [None]:
# `dataset` and `missing` are empty placeholders unless the label coverage
# cell built them, so nothing is shown when that step produced no data.
if not dataset.empty:
    top_missing = missing.head(10)
    if missing.empty:
        # Avoid printing a "sample" heading above an empty table.
        print("No unlabeled images: every manifest row has a majority label.")
    else:
        print("Sample of unlabeled images (top 10):")
        display(top_missing)
