In [1]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
from coco_dataset import COCOPanopticDataset
from pixeldecoder import PixelDecoder
from backbone import BackboneWithMultiScaleFeatures
from tokenizer import TaskTokenizer
from mlp import TaskMLP
from text_mapper import TextMapper
from contrastive_loss import ContrastiveLoss
from query_formulation import TaskConditionedQueryFormulator
from compute_loss import SetCriterion
from hungarian_matcher import HungarianMatcher
from transformer_decoder import TransformerDecoder
from predict import MaskClassPredictor
from torchmetrics import JaccardIndex
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from panopticapi.evaluation import pq_compute

# Define hyperparameters
vocab_size = 30000
embed_dim = 256
max_seq_len = 128
num_queries = 100
temperature = 0.2
num_heads = 8
num_layers = 6
num_classes = 80
contrastive_weight = 0.5
primary_loss_weight = 1.0

# COCO Dataset Paths (Change the path if necessary)
train_image_dir = "datasets/coco/train2017"
train_instance_file = "datasets/coco/annotations/instances_train2017.json"
train_panoptic_file = "datasets/coco/annotations/panoptic_train2017.json"
train_panoptic_mask_dir = "datasets/coco/panoptic_train2017"

data_transform = transforms.Compose([
    transforms.Resize((512, 512)),
    transforms.ToTensor(),
])

# Initialize the dataset and DataLoader
full_train_dataset = COCOPanopticDataset(
    image_dir=train_image_dir,
    instance_file=train_instance_file,
    panoptic_file=train_panoptic_file,
    panoptic_mask_dir=train_panoptic_mask_dir,
    transform=data_transform
)

train_dataset = torch.utils.data.Subset(full_train_dataset, range(5000))
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=4)





  from .autonotebook import tqdm as notebook_tqdm


In [None]:
backbone = BackboneWithMultiScaleFeatures()
pixel_decoder = PixelDecoder(input_channels=[256, 512, 1024, 2048])
tokenizer = TaskTokenizer(vocab_size, embed_dim, max_seq_len)
mlp = TaskMLP(input_dim=embed_dim, hidden_dim=embed_dim, output_dim=embed_dim)
text_mapper = TextMapper(vocab_size=vocab_size, embed_dim=embed_dim)
contrastive_loss_fn = ContrastiveLoss(temperature)
task_query_formulator = TaskConditionedQueryFormulator(num_queries=num_queries, embed_dim=embed_dim)
matcher = HungarianMatcher(cost_class=1, cost_mask=1, cost_dice=1)
criterion = SetCriterion(matcher=matcher, num_classes=num_classes, weight_dict={'loss_ce': 1, 'loss_mask': 1, 'loss_dice': 1}, eos_coef=0.1, losses=['labels', 'masks'])
transformer_decoder = TransformerDecoder(
    embed_dim=embed_dim,
    num_queries=num_queries,
    num_classes=num_classes,
    num_heads=num_heads,
    num_layers=num_layers
)

mask_class_predictor = MaskClassPredictor(embed_dim, num_queries, num_classes)

optimizer = torch.optim.Adam([
    {"params": backbone.parameters()},
    {"params": pixel_decoder.parameters()},
    {"params": transformer_decoder.parameters()},
    {"params": mask_class_predictor.parameters()},
    {"params": mlp.parameters()},
    {"params": text_mapper.parameters()},
    {"params": task_query_formulator.parameters()},
], lr=1e-4)

# Initialize IoU metric for semantic segmentation
miou_metric = JaccardIndex(task='multiclass', num_classes=num_classes)

all_pred_logits, all_pred_masks, all_gt_labels, all_gt_masks = [], [], [], []

for image_batch, mask_batch in train_loader:
    optimizer.zero_grad()

    #Extract Multi-Scale Features
    multi_scale_features = backbone(image_batch)
    decoded_features = pixel_decoder(multi_scale_features)
    image_features_1_4 = decoded_features[0]
    
    #Tokenize Task Texts
    task_texts = ["panoptic", "instance", "semantic"]
    task_embeddings = tokenizer.forward(task_texts)  # [3, max_seq_len, embed_dim]
    task_embeddings = mlp(task_embeddings.mean(dim=1).unsqueeze(1)).squeeze(1)  # [3, embed_dim]
    
    #Map Task Embeddings to Q_text
    q_text = text_mapper(
        panoptic_text=task_embeddings[0].unsqueeze(0).long(),
        instance_text=task_embeddings[1].unsqueeze(0).long(),
        semantic_text=task_embeddings[2].unsqueeze(0).long()
    )

    #Generate Q_task
    batch_size = image_batch.size(0)
    q_task = task_query_formulator(task_embeddings.unsqueeze(1), batch_size).permute(1, 0, 2)

    #Calculate Contrastive Loss between Q_text and Q_task
    contrastive_loss = contrastive_loss_fn(q_text, q_task)
    print(f"Contrastive Loss: {contrastive_loss.item()}")

    if q_text.size(0) == 1:
        q_text = q_text.expand(q_task.size(0), -1, -1)

    q_text = F.normalize(q_text, dim=-1)
    q_task = F.normalize(q_task, dim=-1)

    batch_size, num_tasks, embed_dim = q_text.size()
    _, num_queries, _ = q_task.size()
    
    q_text = q_text.reshape(batch_size * num_tasks, embed_dim)
    q_task = q_task.reshape(batch_size * num_queries, embed_dim)

    decoder_output = transformer_decoder(q_task, multi_scale_features)

    flattened_image_features_1_4 = image_features_1_4.view(1, embed_dim, 128 * 128).permute(0, 2, 1)

    combined_input = torch.cat([decoder_output, flattened_image_features_1_4], dim=1)
    
    mask_pred, class_pred = mask_class_predictor(combined_input)
    
    outputs = {'pred_logits': class_pred, 'pred_masks': mask_pred}
    targets = [{'labels': mask_batch[0]}]
    primary_loss = criterion(outputs, targets)
    
    total_loss = contrastive_weight * contrastive_loss + primary_loss_weight * sum(primary_loss.values())
    
    total_loss.backward()
    optimizer.step()
    
    print(f"Contrastive Loss: {contrastive_loss.item()}, Primary Loss: {sum(primary_loss.values()).item()}, Total Loss: {total_loss.item()}")

    all_pred_logits.append(class_pred)
    all_pred_masks.append(mask_pred)
    all_gt_labels.append(mask_batch[0])
    all_gt_masks.append(mask_batch[0])

    mask_pred_resized = F.interpolate(mask_pred, size=(512, 512), mode="nearest")
    mask_pred_classes = mask_pred_resized.argmax(dim=1)

    mask_pred_classes = mask_pred_classes.squeeze(0).long()
    mask_batch_labels = mask_batch[0].long()

    mask_batch_labels = mask_batch_labels.clone()
    mask_batch_labels[(mask_batch_labels >= num_classes) | (mask_batch_labels < 0)] = -1

    mask_pred_classes = mask_pred_classes.to(mask_batch_labels.device)

    miou_metric.update(mask_pred_classes, mask_batch_labels)

    break



Contrastive Loss: 5.762584686279297
Cross-entropy loss: 0.0
Contrastive Loss: 5.762584686279297, Primary Loss: 0.0, Total Loss: 2.8812923431396484
mask_batch shape: torch.Size([1, 512, 512])
mask_pred_classes shape after processing: torch.Size([512, 512])
mask_batch_labels shape: torch.Size([512, 512])


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x11dc7c0e0>
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 1604, in __del__
    self._shutdown_workers()
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 1568, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
          ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/popen_fork.py", line 40, in wait
    if not wait([self.sentinel], timeout):
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/connection

In [None]:
import numpy as np
from PIL import Image  # Use PIL for saving images
import os
import json
from pycocotools.coco import COCO
from pycocotools import mask as maskUtils
import torch
from panopticapi.evaluation import pq_compute

# Directory to save predicted masks
pred_folder = "predicted_masks_folder"
os.makedirs(pred_folder, exist_ok=True)

# Convert all_pred_masks to images and save them
for i, mask in enumerate(all_pred_masks):
    mask_np = mask.cpu().detach().numpy()  # (1, num_classes, 128, 128)
    print(f"Original mask shape: {mask_np.shape}")

    if mask_np.shape[1] > 1:
        mask_np = np.argmax(mask_np, axis=1)
    else:
        mask_np = mask_np.squeeze(1)
    mask_np = np.squeeze(mask_np)

    if mask_np.ndim != 2:
        print(f"Skipping mask {i} due to incompatible shape: {mask_np.shape}")
        continue

    mask_np = (mask_np * (255 // (mask_np.max() + 1))).astype(np.uint8)
    mask_image = Image.fromarray(mask_np)
    mask_filename = os.path.join(pred_folder, f"pred_mask_{i}.png")
    mask_image.save(mask_filename)

print("All compatible masks processed and saved.")

# Mean IoU Calculation
miou = miou_metric.compute()
print(f"Mean IoU (mIoU): {miou}")

# COCO Evaluation for Instance Segmentation
val_instance_file = "datasets/coco/annotations/instances_val2017.json"
val_panoptic_file = "datasets/coco/annotations/panoptic_val2017.json"
coco_gt = COCO(val_instance_file)

with open(val_panoptic_file, "r") as f:
    gt_data = json.load(f)
image_id_map = {img['file_name']: img['id'] for img in gt_data['images']}

coco_results = []
for i, (logit, mask) in enumerate(zip(all_pred_logits, all_pred_masks)):
    image_file_name = f"{str(i).zfill(12)}.jpg"  # Adjust this if necessary based on actual filenames
    image_id = image_id_map.get(image_file_name)

    if image_id is None:
        print(f"Warning: No image ID found for {image_file_name}. Skipping.")
        continue

    mask_np = mask.cpu().detach().numpy().astype(np.uint8)
    mask_rle = maskUtils.encode(np.asfortranarray(mask_np[0]))  # Assuming batch size of 1
    mask_rle['counts'] = mask_rle['counts'].decode("utf-8")

    coco_results.append({
        "image_id": image_id,
        "category_id": int(torch.argmax(logit).item()),
        "segmentation": mask_rle,
        "score": float(torch.max(logit).item())
    })

# Save and Evaluate
if coco_results:
    with open("predictions.json", "w") as f:
        json.dump(coco_results, f)

    coco_dt = coco_gt.loadRes("predictions.json")
    coco_eval = COCOeval(coco_gt, coco_dt, iouType='segm')
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
else:
    print("Error: coco_results is empty. No predictions available for COCO evaluation.")

# Panoptic Quality Evaluation
pq_results = pq_compute(
    gt_json_file=val_panoptic_file,
    pred_json_file="predictions.json",
    gt_folder="datasets/coco/panoptic_val2017",
    pred_folder=pred_folder
)

print(f"Panoptic Quality (PQ): {pq_results['All']['pq']}")
print(f"Segmentation Quality (SQ): {pq_results['All']['sq']}")
print(f"Recognition Quality (RQ): {pq_results['All']['rq']}")


Original mask shape: (1, 100, 128, 128)
All compatible masks processed and saved.
Mean IoU (mIoU): 3.1534666049992666e-05
loading annotations into memory...
Done (t=0.16s)
creating index...
index created!
Error: coco_results is empty. No predictions available for COCO evaluation.
Evaluation panoptic segmentation metrics:
Ground truth:
	Segmentation folder: datasets/coco/panoptic_val2017
	JSON file: datasets/coco/annotations/panoptic_val2017.json
Prediction:
	Segmentation folder: predicted_masks_folder
	JSON file: predictions.json


TypeError: list indices must be integers or slices, not str

In [None]:
# Step 11: Calculate metrics after running on the dataset
miou = miou_metric.compute()
print(f"Mean IoU (mIoU): {miou}")

import json
from pycocotools.coco import COCO
from pycocotools import mask as maskUtils
import numpy as np
import torch

# Paths to COCO files
train_instance_file = "datasets/coco/annotations/instances_train2017.json"
val_instance_file = "datasets/coco/annotations/instances_val2017.json"
val_panoptic_file = "datasets/coco/annotations/panoptic_val2017.json"

# Load the COCO ground truth data
coco_gt = COCO(val_instance_file)

# Dictionary to map image filenames to COCO image IDs from the validation set
with open(val_panoptic_file, "r") as f:
    gt_data = json.load(f)
image_id_map = {img['file_name']: img['id'] for img in gt_data['images']}

coco_results = []
for i, (logit, mask) in enumerate(zip(all_pred_logits, all_pred_masks)):
    # Obtain the image file name for this prediction (adjust as per your data loader setup)
    image_file_name = "YOUR_IMAGE_FILE_NAME"  # Replace with the method to get the actual image filename
    image_id = image_id_map.get(image_file_name)

    if image_id is None:
        print(f"Warning: No image ID found for {image_file_name}. Skipping.")
        continue

    # Detach and prepare mask for RLE encoding
    mask = mask.cpu().detach().numpy().astype(np.uint8)
    mask_rle = maskUtils.encode(np.asfortranarray(mask))
    mask_rle['counts'] = mask_rle['counts'].decode("utf-8")  # JSON-compatible format

    # Append prediction to results
    coco_results.append({
        "image_id": image_id,
        "category_id": int(torch.argmax(logit).item()),
        "segmentation": mask_rle,
        "score": float(torch.max(logit).item())
    })

# Perform COCO evaluation only if there are results
if coco_results:
    # Save predictions to a JSON file if necessary
    with open("predictions.json", "w") as f:
        json.dump(coco_results, f)

    # Load predictions into COCO format for evaluation
    coco_dt = coco_gt.loadRes(coco_results)
    coco_eval = COCOeval(coco_gt, coco_dt, iouType='segm')
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
else:
    print("Error: coco_results is empty. No predictions available for COCO evaluation.")

# Panoptic Quality Evaluation
pq_results = pq_compute(
    gt_json_file=val_panoptic_file,  # Replace with your ground truth JSON file path
    pred_json_file="predictions.json",  # Ensure this JSON file has the prediction results in the required format
    gt_folder="datasets/coco/panoptic_val2017",  # Path to your ground truth mask folder
    pred_folder=pred_folder  # Path to the folder with predicted masks
)

print(f"Panoptic Quality (PQ): {pq_results['All']['pq']}")
print(f"Segmentation Quality (SQ): {pq_results['All']['sq']}")
print(f"Recognition Quality (RQ): {pq_results['All']['rq']}")


Mean IoU (mIoU): 3.1534666049992666e-05
loading annotations into memory...
Done (t=0.15s)
creating index...
index created!
Error: coco_results is empty. No predictions available for COCO evaluation.
Evaluation panoptic segmentation metrics:
Ground truth:
	Segmentation folder: datasets/coco/panoptic_val2017
	JSON file: datasets/coco/annotations/panoptic_val2017.json
Prediction:
	Segmentation folder: predicted_masks_folder
	JSON file: predictions.json


TypeError: list indices must be integers or slices, not str

http://images.cocodataset.org/zips/train2017.zip
http://images.cocodataset.org/zips/val2017.zip
http://images.cocodataset.org/annotations/annotations_trainval2017.zip
http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip
http://images.cocodataset.org/annotations/stuff_annotations_trainval2017.zip