In [6]:
import os
import numpy as np

def loadtxt_safe_2d(path, dtype=float, delimiter=None, ncols=5):
    """Load a numeric label file into a 2D array of shape (n_rows, ncols).

    Args:
        path: Path of the text file to read.
        dtype: Element type passed to ``np.loadtxt`` and used for empty results.
        delimiter: Column separator; ``None`` means any whitespace, ``','`` for CSV.
        ncols: Number of columns every row is required to have.

    Returns:
        Array of shape (n_rows, ncols). A missing file, an empty file, or a file
        that cannot be parsed yields an empty array of shape (0, ncols).

    Raises:
        ValueError: If the file parses but its column count differs from ``ncols``.
    """
    import warnings  # local import: only needed on the parse-failure path

    # Missing or zero-byte file → no boxes; skip numpy entirely.
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        return np.empty((0, ncols), dtype=dtype)

    try:
        # ndmin=2 keeps the (rows, cols) orientation. Loading with ndmin=1 and
        # reshaping to (1, -1) would turn a one-column file with N rows into a
        # single row of N values, which can slip past the column check below.
        arr = np.loadtxt(path, dtype=dtype, delimiter=delimiter, ndmin=2)
    except Exception as exc:
        # Deliberate best-effort: an unreadable file is treated as "no boxes",
        # but it is reported so corrupt labels do not go unnoticed.
        warnings.warn(f"{path}: could not parse label file ({exc}); treating as no boxes",
                      stacklevel=2)
        return np.empty((0, ncols), dtype=dtype)

    # Whitespace/comment-only files parse to an empty array of arbitrary width.
    if arr.size == 0:
        return np.empty((0, ncols), dtype=dtype)

    if arr.shape[1] != ncols:
        raise ValueError(f"{path}: expected {ncols} columns, got {arr.shape[1]}")
    return arr

# Label files for this split: every *.txt directly under label_root.
label_root = '/local_data/dataset/polyp/detection/patients_complete/labels/train/'
labels = [
    os.path.join(label_root, fname)
    for fname in os.listdir(label_root)
    if fname.endswith('.txt')
]

# Running tallies over the whole split
num_bg_images = 0                 # label files that yield no boxes
num_images_more_than_one_box = 0  # label files with two or more boxes
num_images_both_classes = 0       # label files holding class 0 and class 1 together
num_boxes_total = 0
num_boxes_cls0 = 0
num_boxes_cls1 = 0

# One entry per box: w*h, i.e. the fraction of the image the box covers
areas = []

for label_path in labels:
    lbl = loadtxt_safe_2d(label_path, dtype=float, ncols=5)
    n_boxes = len(lbl)

    if n_boxes == 0:
        # Nothing annotated → background image, no further stats to collect
        num_bg_images += 1
        continue

    # Force the x, y, w, h columns into [0, 1] before using them
    lbl[:, 1:5] = np.clip(lbl[:, 1:5], 0.0, 1.0)

    # Column 0 carries the class id of each box
    class_ids = lbl[:, 0]
    n_cls0 = int((class_ids == 0).sum())
    n_cls1 = int((class_ids == 1).sum())

    num_boxes_total += n_boxes
    num_boxes_cls0 += n_cls0
    num_boxes_cls1 += n_cls1

    # Per-image flags
    if n_boxes > 1:
        num_images_more_than_one_box += 1
    if n_cls0 > 0 and n_cls1 > 0:
        num_images_both_classes += 1

    # Columns 3 and 4 are the normalized width and height
    areas.extend((lbl[:, 3] * lbl[:, 4]).tolist())

# ---- Dataset-level counts ----
print(f"Total images:               {len(labels)}")
print(f"Background-only images:     {num_bg_images}")
print(f"Images with >1 box:         {num_images_more_than_one_box}")
print(f"Images with both classes:   {num_images_both_classes}")
print()
print(f"Total boxes:                {num_boxes_total}")
print(f"  class 0 boxes:            {num_boxes_cls0}")
print(f"  class 1 boxes:            {num_boxes_cls1}")

# ---- Box size distribution ----
areas = np.array(areas, dtype=float)
if areas.size > 0:
    # Work in percent of image area from here on
    areas_pct = areas * 100.0

    # Hand-picked bin edges (in % of image area), finer at the small end
    bins_pct = np.array([0, 0.1, 0.5, 1, 2, 5, 10, 25, 100], dtype=float)
    hist, edges = np.histogram(areas_pct, bins=bins_pct)
    hist_pct = hist / hist.sum() * 100.0

    print("\nBox size distribution (% of image area):")
    for i, (lo, hi) in enumerate(zip(edges[:-1], edges[1:])):
        print(f"  [{lo:>5.1f}% – {hi:>5.1f}%]: {hist[i]:6d} boxes ({hist_pct[i]:5.2f}%)")

    # Central tendency and upper-tail percentiles
    p50, p75, p90, p95 = np.percentile(areas_pct, [50, 75, 90, 95])
    print("\nSummary (box area as % of image):")
    print(f"  mean = {areas_pct.mean():.4f}%,  std = {areas_pct.std():.4f}%")
    print(f"  p50  = {p50:.4f}%,  p75 = {p75:.4f}%,  p90 = {p90:.4f}%,  p95 = {p95:.4f}%")
else:
    print("\nNo boxes found → size distribution not available.")


Total images:               6442
Background-only images:     509
Images with >1 box:         374
Images with both classes:   53

Total boxes:                6511
  class 0 boxes:            2803
  class 1 boxes:            3708

Box size distribution (% of image area):
  [  0.0% –   0.1%]:      0 boxes ( 0.00%)
  [  0.1% –   0.5%]:    304 boxes ( 4.67%)
  [  0.5% –   1.0%]:    740 boxes (11.37%)
  [  1.0% –   2.0%]:   1120 boxes (17.20%)
  [  2.0% –   5.0%]:   1972 boxes (30.29%)
  [  5.0% –  10.0%]:   1211 boxes (18.60%)
  [ 10.0% –  25.0%]:    900 boxes (13.82%)
  [ 25.0% – 100.0%]:    264 boxes ( 4.05%)

Summary (box area as % of image):
  mean = 6.3317%,  std = 8.7060%
  p50  = 3.2855%,  p75 = 7.4179%,  p90 = 14.8906%,  p95 = 22.6160%


In [7]:
import os
import numpy as np

def loadtxt_safe_2d(path, dtype=float, delimiter=None, ncols=5):
    """Load a numeric label file into a 2D array of shape (n_rows, ncols).

    Args:
        path: Path of the text file to read.
        dtype: Element type passed to ``np.loadtxt`` and used for empty results.
        delimiter: Column separator; ``None`` means any whitespace, ``','`` for CSV.
        ncols: Number of columns every row is required to have.

    Returns:
        Array of shape (n_rows, ncols). A missing file, an empty file, or a file
        that cannot be parsed yields an empty array of shape (0, ncols).

    Raises:
        ValueError: If the file parses but its column count differs from ``ncols``.
    """
    import warnings  # local import: only needed on the parse-failure path

    # Missing or zero-byte file → no boxes; skip numpy entirely.
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        return np.empty((0, ncols), dtype=dtype)

    try:
        # ndmin=2 keeps the (rows, cols) orientation. Loading with ndmin=1 and
        # reshaping to (1, -1) would turn a one-column file with N rows into a
        # single row of N values, which can slip past the column check below.
        arr = np.loadtxt(path, dtype=dtype, delimiter=delimiter, ndmin=2)
    except Exception as exc:
        # Deliberate best-effort: an unreadable file is treated as "no boxes",
        # but it is reported so corrupt labels do not go unnoticed.
        warnings.warn(f"{path}: could not parse label file ({exc}); treating as no boxes",
                      stacklevel=2)
        return np.empty((0, ncols), dtype=dtype)

    # Whitespace/comment-only files parse to an empty array of arbitrary width.
    if arr.size == 0:
        return np.empty((0, ncols), dtype=dtype)

    if arr.shape[1] != ncols:
        raise ValueError(f"{path}: expected {ncols} columns, got {arr.shape[1]}")
    return arr

# Label files for this split: every *.txt directly under label_root.
label_root = '/local_data/dataset/polyp/detection/patients_complete/labels/val/'
labels = [
    os.path.join(label_root, fname)
    for fname in os.listdir(label_root)
    if fname.endswith('.txt')
]

# Running tallies over the whole split
num_bg_images = 0                 # label files that yield no boxes
num_images_more_than_one_box = 0  # label files with two or more boxes
num_images_both_classes = 0       # label files holding class 0 and class 1 together
num_boxes_total = 0
num_boxes_cls0 = 0
num_boxes_cls1 = 0

# One entry per box: w*h, i.e. the fraction of the image the box covers
areas = []

for label_path in labels:
    lbl = loadtxt_safe_2d(label_path, dtype=float, ncols=5)
    n_boxes = len(lbl)

    if n_boxes == 0:
        # Nothing annotated → background image, no further stats to collect
        num_bg_images += 1
        continue

    # Force the x, y, w, h columns into [0, 1] before using them
    lbl[:, 1:5] = np.clip(lbl[:, 1:5], 0.0, 1.0)

    # Column 0 carries the class id of each box
    class_ids = lbl[:, 0]
    n_cls0 = int((class_ids == 0).sum())
    n_cls1 = int((class_ids == 1).sum())

    num_boxes_total += n_boxes
    num_boxes_cls0 += n_cls0
    num_boxes_cls1 += n_cls1

    # Per-image flags
    if n_boxes > 1:
        num_images_more_than_one_box += 1
    if n_cls0 > 0 and n_cls1 > 0:
        num_images_both_classes += 1

    # Columns 3 and 4 are the normalized width and height
    areas.extend((lbl[:, 3] * lbl[:, 4]).tolist())

# ---- Dataset-level counts ----
print(f"Total images:               {len(labels)}")
print(f"Background-only images:     {num_bg_images}")
print(f"Images with >1 box:         {num_images_more_than_one_box}")
print(f"Images with both classes:   {num_images_both_classes}")
print()
print(f"Total boxes:                {num_boxes_total}")
print(f"  class 0 boxes:            {num_boxes_cls0}")
print(f"  class 1 boxes:            {num_boxes_cls1}")

# ---- Box size distribution ----
areas = np.array(areas, dtype=float)
if areas.size > 0:
    # Work in percent of image area from here on
    areas_pct = areas * 100.0

    # Hand-picked bin edges (in % of image area), finer at the small end
    bins_pct = np.array([0, 0.1, 0.5, 1, 2, 5, 10, 25, 100], dtype=float)
    hist, edges = np.histogram(areas_pct, bins=bins_pct)
    hist_pct = hist / hist.sum() * 100.0

    print("\nBox size distribution (% of image area):")
    for i, (lo, hi) in enumerate(zip(edges[:-1], edges[1:])):
        print(f"  [{lo:>5.1f}% – {hi:>5.1f}%]: {hist[i]:6d} boxes ({hist_pct[i]:5.2f}%)")

    # Central tendency and upper-tail percentiles
    p50, p75, p90, p95 = np.percentile(areas_pct, [50, 75, 90, 95])
    print("\nSummary (box area as % of image):")
    print(f"  mean = {areas_pct.mean():.4f}%,  std = {areas_pct.std():.4f}%")
    print(f"  p50  = {p50:.4f}%,  p75 = {p75:.4f}%,  p90 = {p90:.4f}%,  p95 = {p95:.4f}%")
else:
    print("\nNo boxes found → size distribution not available.")

Total images:               704
Background-only images:     25
Images with >1 box:         45
Images with both classes:   8

Total boxes:                734
  class 0 boxes:            323
  class 1 boxes:            411

Box size distribution (% of image area):
  [  0.0% –   0.1%]:      0 boxes ( 0.00%)
  [  0.1% –   0.5%]:     35 boxes ( 4.77%)
  [  0.5% –   1.0%]:     79 boxes (10.76%)
  [  1.0% –   2.0%]:    147 boxes (20.03%)
  [  2.0% –   5.0%]:    213 boxes (29.02%)
  [  5.0% –  10.0%]:    123 boxes (16.76%)
  [ 10.0% –  25.0%]:    114 boxes (15.53%)
  [ 25.0% – 100.0%]:     23 boxes ( 3.13%)

Summary (box area as % of image):
  mean = 6.1166%,  std = 7.7775%
  p50  = 3.1609%,  p75 = 7.5050%,  p90 = 15.4938%,  p95 = 20.8863%
