This notebook uses K-means clustering on the training-set bounding boxes to find suitable anchor sizes for Mask R-CNN training.

Link to our dataset on Roboflow:
https://universe.roboflow.com/segmentationcmmd/segmentation-calc-mass-aarac
Original dataset:
https://www.cancerimagingarchive.net/analysis-result/tompei-cmmd/

In [None]:
import torch
import torchvision.transforms as T
from torchvision.datasets import CocoDetection
import os
import numpy as np
import cv2
from PIL import Image
from pycocotools import mask as maskUtils
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [None]:
# Root folder of the downloaded dataset. Defaults to the Colab download
# location; set the DATASET_ROOT environment variable to run elsewhere.
dataset_root = os.environ.get("DATASET_ROOT", "/content/segmentation-calc-mass-12/")

# Each split keeps its COCO annotation file in the same folder as its images.
ann_fileTr = os.path.join(dataset_root, "train/_annotations.coco.json")
ann_fileTe = os.path.join(dataset_root, "test/_annotations.coco.json")
ann_fileV = os.path.join(dataset_root, "valid/_annotations.coco.json")
img_folderTr = os.path.join(dataset_root, "train/")
img_folderTe = os.path.join(dataset_root, "test/")
img_folderV = os.path.join(dataset_root, "valid/")


In [None]:
class CustomCocoDataset(CocoDetection):
    """COCO dataset that yields samples in the Mask R-CNN target format.

    Each item is ``(image, target)`` where ``target`` is a dict with:

    - ``boxes``: float32 tensor ``(N, 4)`` as ``[xmin, ymin, xmax, ymax]``
    - ``labels``: int64 tensor ``(N,)`` of COCO ``category_id`` values
    - ``masks``: uint8 tensor ``(N, H, W)`` with one binary mask per annotation
    - ``image_id``: tensor holding the COCO image id
    - ``size``: tensor ``[height, width]`` of the image before ``transform``
    """

    def __init__(self, img_folder, ann_file, transform=None):
        """Create the dataset.

        Args:
            img_folder: Directory containing the images.
            ann_file: Path to the COCO annotation JSON file.
            transform: Optional callable applied to the image only; the
                target dict is never transformed.
        """
        # The transform is not forwarded to CocoDetection: __getitem__ needs
        # the untransformed PIL image to read its size before converting it.
        super().__init__(img_folder, ann_file)
        self.transform = transform
        self.img_folder = img_folder

    @staticmethod
    def _annotation_mask(ann, height, width):
        """Rasterise one annotation's segmentation into a (height, width) uint8 mask.

        Handles the three COCO encodings: polygon lists, uncompressed RLE
        (``counts`` is a list) and compressed RLE.
        """
        seg = ann["segmentation"]

        if isinstance(seg, list):  # Polygon format: list of flat [x0, y0, x1, y1, ...]
            mask = np.zeros((height, width), dtype=np.uint8)
            for flat_coords in seg:
                poly = np.array(flat_coords, dtype=np.int32).reshape((-1, 2))
                cv2.fillPoly(mask, [poly], 1)
            return mask

        # RLE format. maskUtils.decode only understands compressed RLE, so an
        # uncompressed RLE (integer list in "counts") is converted first.
        if isinstance(seg.get("counts"), list):
            seg = maskUtils.frPyObjects(seg, height, width)
        return maskUtils.decode(seg)

    def __getitem__(self, idx):
        """Return ``(image, target_dict)`` for the sample at position ``idx``."""
        # CocoDetection already opens the image as RGB and gathers its
        # annotation dicts, so the file is not read from disk a second time.
        img, target = super().__getitem__(idx)
        image_id = self.ids[idx]  # Get the COCO image ID
        height, width = img.height, img.width

        # Convert COCO annotations to Mask R-CNN format
        boxes, masks, labels = [], [], []
        for ann in target:
            # COCO stores boxes as [x, y, w, h]; Mask R-CNN expects corners.
            xmin, ymin, w, h = ann["bbox"]
            boxes.append([xmin, ymin, xmin + w, ymin + h])
            labels.append(ann["category_id"])
            masks.append(self._annotation_mask(ann, height, width))

        # Convert to tensors
        if boxes:
            boxes = torch.as_tensor(np.array(boxes, dtype=np.float32), dtype=torch.float32)
            labels = torch.as_tensor(np.array(labels, dtype=np.int64), dtype=torch.int64)
            masks = torch.as_tensor(np.array(masks, dtype=np.uint8), dtype=torch.uint8)
        else:
            # Images without annotations still need correctly shaped empty tensors.
            boxes = torch.zeros((0, 4), dtype=torch.float32)
            labels = torch.zeros((0,), dtype=torch.int64)
            masks = torch.zeros((0, height, width), dtype=torch.uint8)

        target_dict = {
            "boxes": boxes,
            "labels": labels,
            "masks": masks,
            "image_id": torch.tensor([image_id]),  # Add image_id as tensor
            "size": torch.tensor([height, width])  # Useful for evaluation
        }

        if self.transform:
            img = self.transform(img)

        return img, target_dict

# Images are only converted to tensors; no further preprocessing is applied.
transform = T.Compose([T.ToTensor()])

# Build the three splits in train, valid, test order.
train_dataset, val_dataset, test_dataset = [
    CustomCocoDataset(folder, ann_path, transform=transform)
    for folder, ann_path in (
        (img_folderTr, ann_fileTr),
        (img_folderV, ann_fileV),
        (img_folderTe, ann_fileTe),
    )
]

loading annotations into memory...
Done (t=0.74s)
creating index...
index created!
loading annotations into memory...
Done (t=0.02s)
creating index...
index created!
loading annotations into memory...
Done (t=0.03s)
creating index...
index created!


Image size: 957x1147 pixels.


In [None]:
# 1) Collect box dimensions from TRAINING DATA ONLY
# Widths/heights are read straight from the COCO annotations (bbox = [x, y, w, h])
# instead of iterating train_dataset, which would decode every image and
# rasterise every mask just to look at the boxes.
train_coco = train_dataset.coco  # Only use train_dataset!
all_wh = np.asarray(
    [
        ann["bbox"][2:4]
        for img_id in train_dataset.ids
        for ann in train_coco.loadAnns(train_coco.getAnnIds(imgIds=img_id))
    ],
    dtype=np.float32,
).reshape(-1, 2)  # Shape: (total_boxes, 2)

# Zero-sized boxes carry no shape information and would pull clusters towards 0.
all_wh = all_wh[(all_wh > 0).all(axis=1)]

# 2) K-means clustering for anchor sizes
NUM_CLUSTERS = 5  # Matches typical FPN levels
if len(all_wh) < NUM_CLUSTERS:
    raise ValueError(
        f"Need at least {NUM_CLUSTERS} valid training boxes for clustering, found {len(all_wh)}"
    )
# n_init is pinned because its default differs between scikit-learn releases,
# which would otherwise make the anchors depend on the installed version.
kmeans = KMeans(n_clusters=NUM_CLUSTERS, n_init=10, random_state=42)
kmeans.fit(all_wh)
anchors = kmeans.cluster_centers_

# 3) Prepare for FPN integration
# Sort anchors by area (width*height)
anchors = anchors[anchors.prod(axis=1).argsort()]

# Convert to base sizes (sqrt(wh)) for AnchorGenerator.
# Plain Python floats keep the tuple free of NumPy scalar types, so the sizes
# print as ordinary numbers.
anchor_sizes = tuple((float(np.sqrt(w * h)),) for w, h in anchors)  # One size per FPN level

# Use standard aspect ratios (better than clustering ratios)
ASPECT_RATIOS = (0.5, 1.0, 2.0)  # Same for all FPN levels
aspect_ratios = (ASPECT_RATIOS,) * NUM_CLUSTERS

print("Custom anchors configured for FPN:")
print(f"Sizes per level: {[round(s[0], 1) for s in anchor_sizes]}")
print(f"Aspect ratios: {ASPECT_RATIOS}")

Custom anchors configured for FPN:
Sizes per level: [np.float32(67.5), np.float32(126.1), np.float32(191.9), np.float32(277.5), np.float32(407.9)]
Aspect ratios: (0.5, 1.0, 2.0)


In [None]:
def compute_ious(anchors, gt_wh):
    """Return the mean best IoU between ground-truth box sizes and anchors.

    Boxes are compared by size only: each anchor and ground-truth box is
    treated as sharing the same corner, so the intersection is
    ``min(w_anchor, w_gt) * min(h_anchor, h_gt)``.

    Args:
        anchors: Array-like of shape (K, 2) with anchor (width, height) pairs.
        gt_wh: Array-like of shape (N, 2) with ground-truth (width, height) pairs.

    Returns:
        float: For every ground-truth box the highest IoU reached by any
        anchor, averaged over all ground-truth boxes. NaN when ``gt_wh``
        contains no boxes.

    Raises:
        ValueError: If ``anchors`` is empty or either input is not (*, 2).
    """
    anchors = np.asarray(anchors, dtype=np.float64)
    gt_wh = np.asarray(gt_wh, dtype=np.float64)

    if anchors.ndim != 2 or anchors.shape[1] != 2 or anchors.shape[0] == 0:
        raise ValueError(f"anchors must be a non-empty (K, 2) array, got shape {anchors.shape}")
    if gt_wh.ndim != 2 or gt_wh.shape[1] != 2:
        raise ValueError(f"gt_wh must be an (N, 2) array, got shape {gt_wh.shape}")
    if gt_wh.shape[0] == 0:
        # The mean over zero boxes is undefined.
        return float("nan")

    a_areas = anchors.prod(axis=1)  # (K,)
    gt_areas = gt_wh.prod(axis=1)   # (N,)

    # Broadcasting (N, 1) against (K,) yields one row per ground-truth box
    # and one column per anchor.
    inter_w = np.minimum(anchors[:, 0], gt_wh[:, 0, None])
    inter_h = np.minimum(anchors[:, 1], gt_wh[:, 1, None])
    inter = inter_w * inter_h

    union = a_areas + gt_areas[:, None] - inter
    # A zero union only occurs when both boxes have zero area; report IoU 0
    # there instead of letting 0/0 turn the whole mean into NaN.
    ious = np.divide(inter, union, out=np.zeros_like(union), where=union > 0)

    return float(ious.max(axis=1).mean())  # Mean best IoU

# Report how well the clustered anchors cover the training boxes.
mean_iou_line = f"Mean IoU: {compute_ious(anchors, all_wh):.2f}"
print(mean_iou_line)

Mean IoU: 0.70
