## install deps

In [None]:
!pip install -qq -U openmim==0.3.9
# !pip install -qq -U mmdet==3.2.0
!mim install -qq mmengine
!mim install -qq "mmcv>=2.0.0"

In [None]:
!git clone https://github.com/open-mmlab/mmdetection.git
%cd mmdetection
!pip install -qq -v -e .
%cd /content

In [None]:
# panopticapi provides IdGenerator, which the Cityscapes -> COCO-panoptic
# converter script written later in this notebook imports.
!pip install -qq git+https://github.com/cocodataset/panopticapi.git

## download rtmdet config and weights

In [None]:
# Fetch the RTMDet-tiny config (.py) and checkpoint (.pth) into the current
# directory; the demo cell below refers to both files by name.
!mim download mmdet --config rtmdet_tiny_8xb32-300e_coco --dest .

## run test detection

In [None]:
# Smoke test of the installation: run RTMDet-tiny on the demo image bundled
# with mmdetection, on CPU. Arguments are: input image, config, then options.
!python mmdetection/demo/image_demo.py \
        mmdetection/demo/demo.jpg \
        rtmdet_tiny_8xb32-300e_coco.py \
        --weights rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth \
        --device cpu

In [None]:
from IPython.display import Image


# Show the visualisation produced by the demo run above. The file is addressed
# by its path (as the later preview cells do) instead of changing into
# outputs/vis and back: if the display step failed, the kernel would otherwise
# be left in the wrong working directory for every following cell.
display(Image("outputs/vis/demo.jpg"))

## download cityscapes dataset

In [None]:
import gdown

In [None]:
def download_from_colab(file_id: str, output_file: str = "cityscapes.zip") -> str:
    """Download a file shared on Google Drive into the working directory.

    Args:
        file_id: Google Drive file id (the ``id=`` query parameter of the URL).
        output_file: Local path to save the download to. Defaults to
            ``"cityscapes.zip"``, the name the extraction cells expect.

    Returns:
        The path of the downloaded file as reported by gdown.

    Raises:
        RuntimeError: If gdown reports that nothing was downloaded.
    """
    url = f"https://drive.google.com/uc?id={file_id}"
    downloaded = gdown.download(url, output_file, quiet=True)
    # Some gdown versions signal a failed download (quota exceeded, missing
    # permission, ...) by returning None rather than raising. Fail here so the
    # following unzip step does not run against a missing or stale archive.
    if downloaded is None:
        raise RuntimeError(
            f"Could not download Google Drive file {file_id!r} to {output_file!r}"
        )
    return downloaded

In [None]:
# Start from a clean slate: remove dataset folders left over from earlier runs
# so the archives below are extracted into an empty tree.
%cd /content
%rm -rf data
%rm -rf cityscapes

In [None]:
# Download the first Cityscapes archive and unpack it into data/cityscapes.
# -o overwrites existing files without prompting; the zip is removed afterwards.
# NOTE(review): which part of the dataset this file id holds (annotations or
# images) is not visible from this notebook — confirm.
download_from_colab("1PMPMfEKWK0kvwadQAtvcrEY1SDTYvRgt")

!mkdir -p data
!unzip -qq -o cityscapes.zip -d /content/data/cityscapes
!rm cityscapes.zip

In [None]:
# Download the second Cityscapes archive and unpack it into the same
# data/cityscapes tree as the first one.
# NOTE(review): which part of the dataset this file id holds is not visible
# from this notebook — confirm.
download_from_colab("1eUn338xKhhmJ6ykfWu0_CAwVMLZ7aFx1")

!mkdir -p data
!unzip -qq -o cityscapes.zip -d /content/data/cityscapes
!rm cityscapes.zip

## convert cityscapes to coco

In [None]:
# Create the folder that receives the converter script written below.
%cd /content
%mkdir -p scripts

In [None]:
!pip install -qq tqdm fire cityscapesscripts

In [None]:
%%writefile scripts/convert_cityscapes_to_coco_panoptic.py
import os
import fire
import json
import glob
import numpy as np
import shutil
import PIL.Image as Image

from tqdm import tqdm
from joblib import Parallel, delayed
from panopticapi.utils import IdGenerator


try:
    # Cityscapes label definitions (id, name, colour, hasInstances, ignoreInEval).
    from cityscapesscripts.helpers.labels import labels, id2label
except ImportError as exc:
    # Only a missing package should trigger this hint; any other failure while
    # importing propagates unchanged instead of being mislabelled.
    raise ImportError(
        "Please load Cityscapes scripts from https://github.com/mcordts/cityscapesScripts"
    ) from exc


class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to native Python types."""

    def default(self, obj):
        """Return a JSON-serialisable equivalent of ``obj``.

        NumPy booleans, integers and floats become ``bool``/``int``/``float``;
        arrays become (nested) lists. Anything else is delegated to the base
        class, which raises ``TypeError``.
        """
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)


def image_converter(f, categories_dict, out_folder):
    """Convert one Cityscapes ``*_instanceIds.png`` into a COCO-panoptic PNG.

    Args:
        f: Path to the instance-id annotation image.
        categories_dict: Mapping from category id to its category dict. Each
            entry must contain ``'isthing'``; the mapping is also handed to
            panopticapi's ``IdGenerator`` to pick segment ids and colours.
        out_folder: Existing directory the panoptic PNG is written to.

    Returns:
        ``(image_config, annotation_config)``: the entries for the ``images``
        and ``annotations`` lists of the COCO-panoptic JSON. All numbers are
        native Python ints, so the result is JSON-serialisable as is.
    """
    # Close the file handle as soon as the pixels are copied into the array.
    with Image.open(f) as gt_image:
        original_format = np.array(gt_image)
    img_height, img_width = original_format.shape[:2]

    # The image id is the first three '_'-separated fields of the file name.
    file_name = os.path.basename(f)
    image_id = '_'.join(file_name.split('_')[:3])
    image_filename = '{}.png'.format(image_id)
    segm_filename = '{}.png'.format(image_id)

    # image entry, id for image is its filename without extension
    image_config = {
        "id": image_id,
        "width": img_width,
        "height": img_height,
        "file_name": image_filename
    }

    pan_format = np.zeros((img_height, img_width, 3), dtype=np.uint8)
    id_generator = IdGenerator(categories_dict)

    segm_info = []
    for el in np.unique(original_format):
        # Pixel values below 1000 are used directly as the category id and are
        # flagged as crowd; larger values carry the category id in el // 1000.
        if el < 1000:
            semantic_id = int(el)
            is_crowd = 1
        else:
            semantic_id = int(el) // 1000
            is_crowd = 0
        if semantic_id not in categories_dict:
            continue
        # Stuff categories are never crowd regions.
        if categories_dict[semantic_id]['isthing'] == 0:
            is_crowd = 0

        mask = original_format == el
        segment_id, color = id_generator.get_id_and_color(semantic_id)
        pan_format[mask] = color

        # Tight bounding box [x, y, width, height] around the segment.
        cols = np.flatnonzero(mask.any(axis=0))
        rows = np.flatnonzero(mask.any(axis=1))
        x, y = int(cols[0]), int(rows[0])
        bbox = [x, y, int(cols[-1]) - x + 1, int(rows[-1]) - y + 1]

        segm_info.append({
            "id": int(segment_id),
            "category_id": semantic_id,
            "area": int(mask.sum()),
            "bbox": bbox,
            "iscrowd": is_crowd,
        })

    annotation_config = {
        'image_id': image_id,
        'file_name': segm_filename,
        "segments_info": segm_info,
    }

    Image.fromarray(pan_format).save(os.path.join(out_folder, segm_filename))
    return image_config, annotation_config


def panoptic_converter(
        gt_folder_path: str,
        gt_output_folder_path: str,
        gt_output_annotations_file_path: str,
        img_folder_path: str,
        img_output_folder_path: str,
        n_jobs: int = 4,
        remove_folders: bool = False,
):
    """Convert one Cityscapes split to COCO-panoptic layout.

    Args:
        gt_folder_path: Folder with ``<city>/*_gtFine_instanceIds.png`` files.
        gt_output_folder_path: Folder that receives the panoptic PNGs.
        gt_output_annotations_file_path: Path of the COCO-panoptic JSON to write.
        img_folder_path: Folder with ``<city>/*_leftImg8bit.png`` images.
        img_output_folder_path: Flat folder the images are copied into, renamed
            to ``<image_id>.png`` so they match the annotation file names.
        n_jobs: Number of joblib workers.
        remove_folders: If true, delete each input folder once it has been
            processed.

    Raises:
        FileNotFoundError: If no instance-id PNG is found under ``gt_folder_path``.
    """
    # makedirs also creates missing parent folders, which os.mkdir would reject.
    if not os.path.isdir(gt_output_folder_path):
        print("Creating folder {} for panoptic segmentation GT PNGs".format(gt_output_folder_path))
        os.makedirs(gt_output_folder_path, exist_ok=True)

    if not os.path.isdir(img_output_folder_path):
        print("Creating folder {} for panoptic segmentation 8-bit PNGs".format(img_output_folder_path))
        os.makedirs(img_output_folder_path, exist_ok=True)

    annotations_dir = os.path.dirname(gt_output_annotations_file_path)
    if annotations_dir:
        os.makedirs(annotations_dir, exist_ok=True)

    # Only labels that take part in evaluation become categories.
    categories = [
        {
            'id': el.id,
            'name': el.name,
            'color': el.color,
            'supercategory': el.category,
            'isthing': 1 if el.hasInstances else 0
        }
        for el in labels
        if not el.ignoreInEval
    ]
    categories_dict = {cat['id']: cat for cat in categories}

    gt_file_list = sorted(glob.glob(os.path.join(gt_folder_path, '*/*_gtFine_instanceIds.png')))
    if not gt_file_list:
        # Without this check the unpacking below fails with an opaque ValueError.
        raise FileNotFoundError(
            "No '*_gtFine_instanceIds.png' files found under {}".format(gt_folder_path)
        )

    result = Parallel(n_jobs=n_jobs, return_as="list")(
        delayed(image_converter)(f, categories_dict, gt_output_folder_path)
        for f in tqdm(gt_file_list, total=len(gt_file_list), desc='Converting images')
    )
    # result is a list of (image_config, annotation_config) pairs; split it
    # into the two parallel lists the JSON layout expects.
    images, annotations = (list(entries) for entries in zip(*result))

    d = {
        'images': images,
        'annotations': annotations,
        'categories': categories,
    }

    with open(gt_output_annotations_file_path, 'w') as f:
        json.dump(d, f, cls=NpEncoder)

    if remove_folders:
        shutil.rmtree(gt_folder_path)

    # Copy the RGB images into a flat folder under the same '<image_id>.png'
    # names that the annotations refer to.
    img_file_list = sorted(glob.glob(os.path.join(img_folder_path, '*/*_leftImg8bit.png')))
    Parallel(n_jobs=n_jobs, return_as="list")(
        delayed(shutil.copyfile)(
            f,
            os.path.join(img_output_folder_path, f"{'_'.join(os.path.basename(f).split('_')[:3])}.png")
        )
        for f in tqdm(img_file_list, total=len(img_file_list), desc='Move 8-bit images')
    )

    if remove_folders:
        shutil.rmtree(img_folder_path)


if __name__ == "__main__":
    # Expose panoptic_converter's parameters as --flags on the command line.
    # The guard keeps the CLI from running if the module is ever imported
    # rather than executed as a script.
    fire.Fire(panoptic_converter)


In [None]:
%%bash
# Convert every Cityscapes split with the same flags; only the split name
# changes. `set -e` stops at the first failing conversion, like the original
# `&&` chain did.
set -euo pipefail
mkdir -p data/cityscapes/annotations
for split in test val train; do
    python scripts/convert_cityscapes_to_coco_panoptic.py \
        --gt_folder_path="data/cityscapes/gtFine/${split}/" \
        --gt_output_folder_path="data/cityscapes/gtFine/cityscapes_panoptic_${split}/" \
        --gt_output_annotations_file_path="data/cityscapes/annotations/cityscapes_panoptic_${split}.json" \
        --img_folder_path="data/cityscapes/leftImg8bit/${split}" \
        --img_output_folder_path="data/cityscapes/leftImg8bit/cityscapes_panoptic_${split}" \
        --n_jobs=6
done

In [None]:
# Sanity check: print the first 250 bytes of the generated train annotations.
!head -c 250 data/cityscapes/annotations/cityscapes_panoptic_train.json

## download mask2former (coco and panoptic coco)

In [None]:
# Fetch config + checkpoint for two COCO-trained Mask2Former (ResNet-50) models:
# the panoptic variant and the "coco" variant used in the demo cells below.
%cd /content
!mim download mmdet --config mask2former_r50_8xb2-lsj-50e_coco-panoptic --dest .
!mim download mmdet --config mask2former_r50_8xb2-lsj-50e_coco --dest .

## run test segmentation mask2former coco

In [None]:
# Run the COCO Mask2Former checkpoint on the bundled demo image (CPU).
!python mmdetection/demo/image_demo.py \
        mmdetection/demo/demo.jpg \
        mask2former_r50_8xb2-lsj-50e_coco.py \
        --weights mask2former_r50_8xb2-lsj-50e_coco_20220506_191028-41b088b6.pth \
        --device cpu

In [None]:
from IPython.display import Image


# Preview the visualisation of demo.jpg; each demo run on demo.jpg writes to
# this same path, so this shows the result of the most recent run.
Image("outputs/vis/demo.jpg")

## run test panoptic segmentation mask2former coco

In [None]:
# Run the COCO-panoptic Mask2Former checkpoint on the bundled demo image (CPU).
!python mmdetection/demo/image_demo.py \
        mmdetection/demo/demo.jpg \
        mask2former_r50_8xb2-lsj-50e_coco-panoptic.py \
        --weights mask2former_r50_8xb2-lsj-50e_coco-panoptic_20230118_125535-54df384a.pth \
        --device cpu

In [None]:
from IPython.display import Image


# Preview the panoptic visualisation of demo.jpg written by the run above.
Image("outputs/vis/demo.jpg")

## run test segmentation on one image from cityscapes

### CLI

In [None]:
# Run the COCO-panoptic checkpoint, without any fine-tuning, on one Cityscapes
# training image (CPU).
!python mmdetection/demo/image_demo.py \
        /content/data/cityscapes/leftImg8bit/train/krefeld/krefeld_000000_000108_leftImg8bit.png \
        mask2former_r50_8xb2-lsj-50e_coco-panoptic.py \
        --weights mask2former_r50_8xb2-lsj-50e_coco-panoptic_20230118_125535-54df384a.pth \
        --device cpu

In [None]:
from IPython.display import Image


# image_demo.py names each visualisation after its input image, so the result
# of the Cityscapes run above is NOT outputs/vis/demo.jpg (that file still
# holds the earlier COCO demo) but a file named like the input picture.
Image("outputs/vis/krefeld_000000_000108_leftImg8bit.png")

### Python SDK

In [None]:
# Work from inside the checkout for the duration of this cell; the working
# directory is restored at the end.
%cd /content/mmdetection
from mmdet.apis import DetInferencer


# Initialize the DetInferencer from a model name instead of local config and
# checkpoint paths.
# NOTE(review): presumably mmdet resolves the name to a config and downloads
# the matching weights — confirm.
inferencer = DetInferencer("mask2former_r50_8xb2-lsj-50e_coco-panoptic")

# Perform inference. return_vis=True makes the result also carry the rendered
# image (read back below through inf_result["visualization"]). out_dir is
# relative, so files land under /content/mmdetection/outputs.
inf_result = inferencer(
    "/content/data/cityscapes/leftImg8bit/train/krefeld/krefeld_000000_000108_leftImg8bit.png",
    return_vis=True,
    out_dir="./outputs",
)
%cd /content

In [None]:
# Top-level keys of the inference result; the cells below read "predictions"
# and "visualization".
inf_result.keys()

In [None]:
# Fields available for the first (and only) input image.
inf_result["predictions"][0].keys()

In [None]:
# Panoptic segmentation output for the image.
# NOTE(review): likely a per-pixel array; the notebook repr truncates it — confirm the layout.
inf_result["predictions"][0]["panoptic_seg"]

In [None]:
# First entry of the "masks" field of the prediction.
# NOTE(review): encoding of a mask entry (e.g. RLE vs. array) is not visible here — confirm.
inf_result["predictions"][0]["masks"][0]

In [None]:
from matplotlib import cm
from PIL import Image as PILImage


# inf_result["visualization"] holds the rendered image requested with
# return_vis=True. PIL's Image class is imported under an alias so it does not
# rebind the notebook-wide name `Image`, which the preview cells use for
# IPython.display.Image (a hidden-state trap when cells run out of order).
im = PILImage.fromarray(inf_result["visualization"][0])
display(im)

In [None]:
# from panopticapi import evaluation

# evaluation.pq_compute(
#     gt_json_file='/content/data/cityscapes/annotations/instancesonly_filtered_gtFine_train.json',
#     pred_json_file='',
# )

## Infer one image and compute metrics using torchmetrics

In [None]:
# !pip install -qq torchmetrics==1.2.1

In [None]:
# from torch import tensor

# preds = tensor([[[[6, 0], [0, 0], [6, 0], [6, 0]],
#                  [[0, 0], [0, 0], [6, 0], [0, 1]],
#                  [[0, 0], [0, 0], [6, 0], [0, 1]],
#                  [[0, 0], [7, 0], [6, 0], [1, 0]],
#                  [[0, 0], [7, 0], [7, 0], [7, 0]]]])
# target = tensor([[[[6, 0], [0, 1], [6, 0], [0, 1]],
#                   [[0, 1], [0, 1], [6, 0], [0, 1]],
#                   [[0, 1], [0, 1], [6, 0], [1, 0]],
#                   [[0, 1], [7, 0], [1, 0], [1, 0]],
#                   [[0, 1], [7, 0], [7, 0], [7, 0]]]])

# print(preds.shape, target.shape)

In [None]:
# from torchmetrics.detection import PanopticQuality

# metric = PanopticQuality(things = {0, 1}, stuffs = {6, 7})
# metric.update(preds, target)
# fig_, ax_ = metric.plot()

In [None]:
from IPython.display import Image


# Ground-truth colour-coded annotation of the same Cityscapes image, for a
# visual comparison with the predictions above.
Image("/content/data/cityscapes/gtFine/train/krefeld/krefeld_000000_000108_gtFine_color.png")

## train mask2former on cityscapes dataset

In [None]:
# Remove the work_dirs folder left by earlier runs.
# NOTE(review): presumably where mmdet's training tools write logs and
# checkpoints — confirm before relying on it being disposable.
%cd /content/
%rm -rf work_dirs

In [None]:
%%writefile mask2former_r50_8xb2-lsj-50e_cityscapes-panoptic_train.py
# ---------------------------------------------------------------------------
# Runtime, logging and input-preprocessing settings.
# Plain top-level values; the loops, model and dataloaders are defined below.
# ---------------------------------------------------------------------------
backend_args = None
default_scope = 'mmdet'
log_level = 'INFO'
load_from = None
data_root = './data/cityscapes/'
dataset_type = 'CityscapesPanopticDataset'

# Schedule constants of the full-length recipe. The training loop further down
# sets its own, much shorter max_iters.
image_size = (1024, 1024)
interval = 5000
max_iters = 368750
dynamic_intervals = [(365001, 368750)]
auto_scale_lr = dict(enable=False, base_batch_size=16)
embed_multi = dict(lr_mult=1.0, decay_mult=0.0)

# Every batch is padded to a fixed image_size canvas; segmentation maps are
# padded with 255.
batch_augments = [
    dict(
        type='BatchFixedSizePad',
        size=image_size,
        img_pad_value=0,
        pad_mask=True,
        mask_pad_value=0,
        pad_seg=True,
        seg_pad_value=255),
]
data_preprocessor = dict(
    type='DetDataPreprocessor',
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    bgr_to_rgb=True,
    pad_size_divisor=32,
    pad_mask=True,
    mask_pad_value=0,
    pad_seg=True,
    seg_pad_value=255,
    batch_augments=batch_augments)

default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(
        type='CheckpointHook',
        by_epoch=False,
        interval=interval,
        max_keep_ckpts=3,
        save_last=True),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='DetVisualizationHook'))
env_cfg = dict(
    cudnn_benchmark=False,
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    dist_cfg=dict(backend='nccl'))
log_processor = dict(type='LogProcessor', window_size=50, by_epoch=False)
# Mask2Former with a ResNet-50 backbone. The class counts are set to
# 14 things + 21 stuff = 35 (the COCO recipe this was derived from used
# 80 + 53 = 133).
model = dict(
    type='Mask2Former',
    init_cfg=None,
    data_preprocessor=dict(
        type='DetDataPreprocessor',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        bgr_to_rgb=True,
        pad_size_divisor=32,
        pad_mask=True,
        mask_pad_value=0,
        pad_seg=True,
        seg_pad_value=255,
        batch_augments=[
            dict(
                type='BatchFixedSizePad',
                size=(1024, 1024),
                img_pad_value=0,
                pad_mask=True,
                mask_pad_value=0,
                pad_seg=True,
                seg_pad_value=255),
        ]),
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    panoptic_head=dict(
        type='Mask2FormerHead',
        in_channels=[256, 512, 1024, 2048],
        strides=[4, 8, 16, 32],
        feat_channels=256,
        out_channels=256,
        num_things_classes=14,  # COCO: 80
        num_stuff_classes=21,  # COCO: 53
        num_queries=100,
        num_transformer_feat_level=3,
        enforce_decoder_input_project=False,
        pixel_decoder=dict(
            type='MSDeformAttnPixelDecoder',
            num_outs=3,
            norm_cfg=dict(type='GN', num_groups=32),
            act_cfg=dict(type='ReLU'),
            encoder=dict(
                num_layers=6,
                layer_cfg=dict(
                    self_attn_cfg=dict(
                        embed_dims=256,
                        num_heads=8,
                        num_levels=3,
                        num_points=4,
                        dropout=0.0,
                        batch_first=True),
                    ffn_cfg=dict(
                        embed_dims=256,
                        feedforward_channels=1024,
                        num_fcs=2,
                        ffn_drop=0.0,
                        act_cfg=dict(type='ReLU', inplace=True)))),
            positional_encoding=dict(num_feats=128, normalize=True)),
        positional_encoding=dict(num_feats=128, normalize=True),
        transformer_decoder=dict(
            return_intermediate=True,
            num_layers=9,
            layer_cfg=dict(
                self_attn_cfg=dict(
                    embed_dims=256, num_heads=8, dropout=0.0,
                    batch_first=True),
                cross_attn_cfg=dict(
                    embed_dims=256, num_heads=8, dropout=0.0,
                    batch_first=True),
                ffn_cfg=dict(
                    embed_dims=256,
                    feedforward_channels=2048,
                    num_fcs=2,
                    ffn_drop=0.0,
                    act_cfg=dict(type='ReLU', inplace=True))),
            init_cfg=None),
        loss_cls=dict(
            type='CrossEntropyLoss',
            use_sigmoid=False,
            loss_weight=2.0,
            reduction='mean',
            # 35 per-class weights of 1.0 followed by one down-weighted 0.1
            # entry (presumably the extra "no object" slot; the COCO recipe
            # had 133 + 1 entries here).
            class_weight=[1.0] * 35 + [0.1]),
        loss_mask=dict(
            type='CrossEntropyLoss',
            use_sigmoid=True,
            reduction='mean',
            loss_weight=5.0),
        loss_dice=dict(
            type='DiceLoss',
            use_sigmoid=True,
            activate=True,
            reduction='mean',
            naive_dice=True,
            eps=1.0,
            loss_weight=5.0)),
    panoptic_fusion_head=dict(
        type='MaskFormerFusionHead',
        num_things_classes=14,  # COCO: 80
        num_stuff_classes=21,  # COCO: 53
        loss_panoptic=None,
        init_cfg=None),
    train_cfg=dict(
        num_points=12544,
        oversample_ratio=3.0,
        importance_sample_ratio=0.75,
        assigner=dict(
            type='HungarianAssigner',
            match_costs=[
                dict(type='ClassificationCost', weight=2.0),
                dict(type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True),
                dict(type='DiceCost', weight=5.0, pred_act=True, eps=1.0),
            ]),
        sampler=dict(type='MaskPseudoSampler')),
    test_cfg=dict(
        panoptic_on=True,
        semantic_on=False,
        instance_on=True,
        max_per_image=100,
        iou_thr=0.8,
        filter_low_score=True))
# Class counts used by this config (the COCO recipe used 80 + 53 = 133).
num_things_classes = 14
num_stuff_classes = 21
num_classes = num_things_classes + num_stuff_classes  # 35

# AdamW with gradient clipping. The backbone trains at a tenth of the base
# learning rate; the embedding parameters are exempt from weight decay.
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=dict(
        type='AdamW',
        lr=0.0001,
        weight_decay=0.05,
        eps=1e-08,
        betas=(0.9, 0.999)),
    paramwise_cfg=dict(
        custom_keys=dict(
            backbone=dict(lr_mult=0.1, decay_mult=1.0),
            level_embed=dict(lr_mult=1.0, decay_mult=0.0),
            query_embed=dict(lr_mult=1.0, decay_mult=0.0),
            query_feat=dict(lr_mult=1.0, decay_mult=0.0)),
        norm_decay_mult=0.0),
    clip_grad=dict(max_norm=0.01, norm_type=2))

# Step schedule of the full-length recipe: x0.1 at each milestone (iterations).
param_scheduler = dict(
    type='MultiStepLR',
    by_epoch=False,
    begin=0,
    end=368750,
    milestones=[327778, 355092],
    gamma=0.1)

resume = False
test_cfg = dict(type='TestLoop')
# Test-time data flow: load, resize (aspect ratio kept), load the panoptic
# ground truth, pack.
test_pipeline = [
    dict(type='LoadImageFromFile', backend_args=None),
    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
    dict(type='LoadPanopticAnnotations', backend_args=None),
    dict(
        type='PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'scale_factor')),
]

# Testing reads the converted Cityscapes *val* split.
test_dataloader = dict(
    batch_size=1,
    num_workers=2,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type='CityscapesPanopticDataset',
        data_root='./data/cityscapes/',
        ann_file='annotations/cityscapes_panoptic_val.json',
        data_prefix=dict(
            img='leftImg8bit/cityscapes_panoptic_val/',
            seg='gtFine/cityscapes_panoptic_val/'),
        test_mode=True,
        pipeline=test_pipeline,
        backend_args=None))

test_evaluator = [
    dict(
        type='CocoPanopticMetric',
        ann_file='./data/cityscapes/annotations/cityscapes_panoptic_val.json',
        seg_prefix='./data/cityscapes/gtFine/cityscapes_panoptic_val',
        backend_args=None),
    # Disabled: instance-level bbox/segm evaluation.
    # dict(
    #     type='CocoMetric',
    #     ann_file='data/cityscapes/annotations/cityscapes_instances_val.json',
    #     metric=['bbox', 'segm'],
    #     backend_args=None),
]
# Short iteration-based run: 1000 iterations instead of the 368750 of the
# full recipe. val_interval (5000) exceeds max_iters, so no validation is
# scheduled within this run.
train_cfg = dict(
    type='IterBasedTrainLoop',
    max_iters=1000,
    val_interval=5000,
    dynamic_intervals=[(365001, 368750)])

# Training augmentation: random flip, random rescale (0.1x-2.0x of 1024x1024),
# then a fixed 1024x1024 crop.
train_pipeline = [
    dict(type='LoadImageFromFile', to_float32=True, backend_args=None),
    dict(
        type='LoadPanopticAnnotations',
        with_bbox=True,
        with_mask=True,
        with_seg=True,
        backend_args=None),
    dict(type='RandomFlip', prob=0.5),
    dict(
        type='RandomResize',
        scale=(1024, 1024),
        ratio_range=(0.1, 2.0),
        keep_ratio=True),
    dict(
        type='RandomCrop',
        crop_size=(1024, 1024),
        crop_type='absolute',
        recompute_bbox=True,
        allow_negative_crop=True),
    dict(type='PackDetInputs'),
]

train_dataloader = dict(
    batch_size=1,  # the original recipe used 2
    num_workers=2,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    batch_sampler=dict(type='AspectRatioBatchSampler'),
    dataset=dict(
        type='CityscapesPanopticDataset',
        data_root='./data/cityscapes/',
        ann_file='annotations/cityscapes_panoptic_train.json',
        data_prefix=dict(
            img='leftImg8bit/cityscapes_panoptic_train/',
            seg='gtFine/cityscapes_panoptic_train/'),
        filter_cfg=dict(filter_empty_gt=True, min_size=32),
        pipeline=train_pipeline,
        backend_args=None))
# Validation uses the same converted val split and pipeline steps as testing.
val_cfg = dict(type='ValLoop')
val_dataloader = dict(
    batch_size=1,
    num_workers=2,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type='CityscapesPanopticDataset',
        data_root='./data/cityscapes/',
        ann_file='annotations/cityscapes_panoptic_val.json',
        data_prefix=dict(
            img='leftImg8bit/cityscapes_panoptic_val/',
            seg='gtFine/cityscapes_panoptic_val/'),
        test_mode=True,
        pipeline=[
            dict(type='LoadImageFromFile', backend_args=None),
            dict(type='Resize', scale=(1333, 800), keep_ratio=True),
            dict(type='LoadPanopticAnnotations', backend_args=None),
            dict(
                type='PackDetInputs',
                meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'scale_factor')),
        ],
        backend_args=None))
val_evaluator = [
    dict(
        type='CocoPanopticMetric',
        ann_file='./data/cityscapes/annotations/cityscapes_panoptic_val.json',
        seg_prefix='./data/cityscapes/gtFine/cityscapes_panoptic_val/',
        backend_args=None),
    # Disabled: instance-level bbox/segm evaluation.
    # dict(
    #     type='CocoMetric',
    #     ann_file='data/cityscapes/annotations/cityscapes_instances_val.json',
    #     metric=['bbox', 'segm'],
    #     backend_args=None),
]

# Visualisations go through the local backend only.
vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
    type='DetLocalVisualizer',
    name='visualizer',
    vis_backends=vis_backends)


In [None]:
%%writefile /content/mmdetection/mmdet/datasets/cityscapes_panoptic_dataset.py
from mmdet.registry import DATASETS
from .api_wrappers import COCOPanoptic
from .coco_panoptic import CocoPanopticDataset


@DATASETS.register_module()
class CityscapesPanopticDataset(CocoPanopticDataset):
    """Cityscapes in COCO-panoptic format, registered for use from configs.

    Inherits all behaviour from ``CocoPanopticDataset``; only the class
    metadata below is overridden.

    NOTE(review): the metadata is written by hand as 35 classes
    (14 thing + 21 stuff). The converter script in this notebook only emits
    labels whose ``ignoreInEval`` is false and derives ``isthing`` from
    ``hasInstances`` — confirm that the generated annotation categories and
    their thing/stuff split agree with these tuples.
    """

    METAINFO = {
        # All class names; 'palette' below has one colour per entry, in this order.
        'classes':
        (
         'unlabeled', 'ego vehicle', 'rectification border', 'out of roi', 'static', 'dynamic',
         'ground', 'road', 'sidewalk', 'parking', 'rail track', 'building', 'wall', 'fence',
         'guard rail', 'bridge', 'tunnel', 'pole', 'polegroup', 'traffic light', 'traffic sign',
         'vegetation', 'terrain', 'sky', 'person', 'rider', 'car', 'truck', 'bus', 'caravan',
         'trailer', 'train', 'motorcycle', 'bicycle', 'license plate'
        ),
        # Subset of 'classes' treated as things (14 names).
        'thing_classes': (
          'ego vehicle', 'static', 'dynamic', 'person', 'rider', 'car', 'truck', 'bus',
          'caravan', 'trailer', 'train', 'motorcycle', 'bicycle', 'license plate'
        ),
        # Remaining names, treated as stuff (21 names).
        'stuff_classes': (
          'unlabeled', 'rectification border', 'out of roi', 'ground', 'road', 'sidewalk',
          'parking', 'rail track', 'building', 'wall', 'fence', 'guard rail', 'bridge',
          'tunnel', 'pole', 'polegroup', 'traffic light', 'traffic sign', 'vegetation',
          'terrain', 'sky'
        ),
        # One RGB colour per entry of 'classes' (35 tuples); several entries share a colour.
        'palette':
        [
         (0,  0,  0), (0,  0,  0), (0,  0,  0), (0,  0,  0), (0,  0,  0), (111, 74,  0),
         (81,  0, 81), (128, 64, 128), (244, 35, 232), (250, 170, 160), (230, 150, 140),
         (70, 70, 70), (102, 102, 156), (190, 153, 153), (180, 165, 180), (150, 100, 100),
         (150, 120, 90), (153, 153, 153), (153, 153, 153), (250, 170, 30), (220, 220,  0),
         (107, 142, 35), (152, 251, 152), (70, 130, 180), (220, 20, 60), (255,  0,  0),
         (0,  0, 142), (0,  0, 70), (0, 60, 100), (0,  0, 90), (0,  0, 110), (0, 80, 100),
         (0,  0, 230), (119, 11, 32), (0,  0, 142)
        ]
    }
    # Annotation files are read through the panoptic flavour of the COCO API wrapper.
    COCOAPI = COCOPanoptic
    # ann_id is not unique in coco panoptic dataset.
    ANN_ID_UNIQUE = False


In [None]:
%%writefile /content/mmdetection/mmdet/datasets/__init__.py
# Copyright (c) OpenMMLab. All rights reserved.
# NOTE: this file is written by a notebook cell and replaces the __init__.py
# of the cloned mmdetection checkout, so that CityscapesPanopticDataset is
# importable from mmdet.datasets together with the stock datasets.
# NOTE(review): the import list below is a hard-coded snapshot and the clone
# is not pinned to a revision -- confirm it still matches the checked-out
# mmdetection version before overwriting.
from .ade20k import (ADE20KInstanceDataset, ADE20KPanopticDataset,
                     ADE20KSegDataset)
from .base_det_dataset import BaseDetDataset
from .base_semseg_dataset import BaseSegDataset
from .base_video_dataset import BaseVideoDataset
from .cityscapes import CityscapesDataset
from .coco import CocoDataset
from .coco_caption import CocoCaptionDataset
from .coco_panoptic import CocoPanopticDataset
from .coco_semantic import CocoSegDataset
from .crowdhuman import CrowdHumanDataset
from .dataset_wrappers import ConcatDataset, MultiImageMixDataset
from .deepfashion import DeepFashionDataset
from .dsdl import DSDLDetDataset
from .isaid import iSAIDDataset
from .lvis import LVISDataset, LVISV1Dataset, LVISV05Dataset
from .mot_challenge_dataset import MOTChallengeDataset
from .objects365 import Objects365V1Dataset, Objects365V2Dataset
from .openimages import OpenImagesChallengeDataset, OpenImagesDataset
from .refcoco import RefCocoDataset
from .reid_dataset import ReIDDataset
from .samplers import (AspectRatioBatchSampler, ClassAwareSampler,
                       GroupMultiSourceSampler, MultiSourceSampler,
                       TrackAspectRatioBatchSampler, TrackImgSampler)
from .utils import get_loading_pipeline
from .v3det import V3DetDataset
from .voc import VOCDataset
from .wider_face import WIDERFaceDataset
from .xml_style import XMLDataset
from .youtube_vis_dataset import YouTubeVISDataset
# Custom dataset added by this notebook (module cityscapes_panoptic_dataset).
from .cityscapes_panoptic_dataset import CityscapesPanopticDataset

# Public names of the package; the last entry is the custom dataset above.
__all__ = [
    'XMLDataset', 'CocoDataset', 'DeepFashionDataset', 'VOCDataset',
    'CityscapesDataset', 'LVISDataset', 'LVISV05Dataset', 'LVISV1Dataset',
    'WIDERFaceDataset', 'get_loading_pipeline', 'CocoPanopticDataset',
    'MultiImageMixDataset', 'OpenImagesDataset', 'OpenImagesChallengeDataset',
    'AspectRatioBatchSampler', 'ClassAwareSampler', 'MultiSourceSampler',
    'GroupMultiSourceSampler', 'BaseDetDataset', 'CrowdHumanDataset',
    'Objects365V1Dataset', 'Objects365V2Dataset', 'DSDLDetDataset',
    'BaseVideoDataset', 'MOTChallengeDataset', 'TrackImgSampler',
    'ReIDDataset', 'YouTubeVISDataset', 'TrackAspectRatioBatchSampler',
    'ADE20KPanopticDataset', 'CocoCaptionDataset', 'RefCocoDataset',
    'BaseSegDataset', 'ADE20KSegDataset', 'CocoSegDataset',
    'ADE20KInstanceDataset', 'iSAIDDataset', 'V3DetDataset', 'ConcatDataset', 'CityscapesPanopticDataset'
]

In [None]:
import locale


def _utf8_preferred_encoding(do_setlocale=True):
    """Report UTF-8 as the preferred encoding, whatever the real locale is.

    Accepts (and ignores) ``do_setlocale`` so the replacement has the same
    call signature as the stdlib ``locale.getpreferredencoding``; callers
    that pass ``False`` would otherwise fail with ``TypeError``.
    """
    return "UTF-8"


# Override the stdlib lookup for the rest of the session.
# NOTE(review): presumably a workaround for the runtime reporting a non-UTF-8
# locale, which breaks `!` shell commands -- confirm it is still needed.
locale.getpreferredencoding = _utf8_preferred_encoding

In [None]:
import torch


with torch.no_grad():
    torch.cuda.empty_cache()
!nvidia-smi

In [None]:
!python mmdetection/tools/train.py mask2former_r50_8xb2-lsj-50e_cityscapes-panoptic_train.py

In [None]:
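# Optional cleanup (disabled): uncomment to delete previous demo outputs before re-running the demo cells.
# !rm -rf /content/outputs/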

In [None]:
!python /content/mmdetection/demo/image_demo.py \
		/content/mmdetection/demo/demo.jpg \
        /content/work_dirs/mask2former_r50_8xb2-lsj-50e_cityscapes-panoptic_train/mask2former_r50_8xb2-lsj-50e_cityscapes-panoptic_train.py \
        --weights /content/work_dirs/mask2former_r50_8xb2-lsj-50e_cityscapes-panoptic_train/iter_1000.pth \
        --device cpu

In [None]:
from IPython.display import Image

# Render the visualisation of the demo image; the path is relative to the
# current working directory.
demo_vis_path = "outputs/vis/demo.jpg"
Image(demo_vis_path)

In [None]:
!python /content/mmdetection/demo/image_demo.py \
		/content/data/cityscapes/leftImg8bit/train/krefeld/krefeld_000000_000108_leftImg8bit.png \
        /content/work_dirs/mask2former_r50_8xb2-lsj-50e_cityscapes-panoptic_train/mask2former_r50_8xb2-lsj-50e_cityscapes-panoptic_train.py \
        --weights /content/work_dirs/mask2former_r50_8xb2-lsj-50e_cityscapes-panoptic_train/iter_1000.pth \
        --device cpu

In [None]:
from IPython.display import Image

# Render the visualisation of the krefeld image; the path is relative to the
# current working directory.
krefeld_vis_path = "outputs/vis/krefeld_000000_000108_leftImg8bit.png"
Image(krefeld_vis_path)

In [None]:
# Run mmdetection's test script with the config stored in the training work
# dir and the iter_1000 checkpoint. The script path is relative to the
# current working directory.
!python mmdetection/tools/test.py \
        /content/work_dirs/mask2former_r50_8xb2-lsj-50e_cityscapes-panoptic_train/mask2former_r50_8xb2-lsj-50e_cityscapes-panoptic_train.py \
        /content/work_dirs/mask2former_r50_8xb2-lsj-50e_cityscapes-panoptic_train/iter_1000.pth

In [None]:
# Train Mask2Former (R50) with the COCO-panoptic config shipped in the
# mmdetection checkout.
# NOTE(review): the COCO download cell appears further down in this notebook
# and has its train2017 download commented out -- confirm the data this
# config expects is in place before running this cell.
%cd /content
!python mmdetection/tools/train.py \
        mmdetection/configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py

In [None]:
# Delete /content/data and everything extracted into it so far.
%cd /content

%rm -rf data

In [None]:
%cd /content

%mkdir data
%cd data
%mkdir coco
%cd coco
%mkdir images
%cd images

# !wget -c http://images.cocodataset.org/zips/train2017.zip
!wget -c http://images.cocodataset.org/zips/val2017.zip
!wget -c http://images.cocodataset.org/zips/test2017.zip
# !wget -c http://images.cocodataset.org/zips/unlabeled2017.zip

# !unzip train2017.zip
!unzip val2017.zip
!unzip test2017.zip
# !unzip unlabeled2017.zip

# %rm train2017.zip
%rm val2017.zip
%rm test2017.zip
# %rm unlabeled2017.zip

%cd ../
!wget -c http://images.cocodataset.org/annotations/annotations_trainval2017.zip
!wget -c http://images.cocodataset.org/annotations/stuff_annotations_trainval2017.zip
!wget -c http://images.cocodataset.org/annotations/image_info_test2017.zip
!wget -c http://images.cocodataset.org/annotations/image_info_unlabeled2017.zip
!wget http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip

!unzip annotations_trainval2017.zip
!unzip stuff_annotations_trainval2017.zip
!unzip image_info_test2017.zip
!unzip image_info_unlabeled2017.zip
!unzip panoptic_annotations_trainval2017.zip

%rm annotations_trainval2017.zip
%rm stuff_annotations_trainval2017.zip
%rm image_info_test2017.zip
%rm image_info_unlabeled2017.zip
%rm panoptic_annotations_trainval2017.zip