# Mask RCNN - Performing Instance Segementation for Video

In [1]:
# Importing Required Dependencies
import os
import cv2
import math
import time
import random
import sys
import numpy as np
import torch
import torchvision
from PIL import Image
from torchvision import transforms


# Load the Working Device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
torch.backends.cudnn.benchmark = True

# Import MRCNN Model
mask_rcnn = torchvision.models.detection.maskrcnn_resnet50_fpn(weights='DEFAULT')
mask_rcnn.eval().to(device=device)

Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /root/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth


100%|██████████| 170M/170M [00:01<00:00, 176MB/s]


MaskRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(in

In [2]:
# COCO Dateset for Class Labels
# These are the classes that are available in the COCO-Dataset
COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

# Create a color map for each COCO class
COCO_COLORS = {
    idx: tuple([random.randint(0, 255) for _ in range(3)])      # Key
    for idx in range(len(COCO_INSTANCE_CATEGORY_NAMES))         # Value
}

def color_for_label(label_idx):
    return COCO_COLORS.get(label_idx, (255, 255, 255))  # Returns the color of each idx associated, else white

def apply_color_mask(img_rgb, mask, color, alpha=0.5):
    # Outputs the blended Image with opacity factor alpha
    # alpha = 0 (no mask, fully transparent)
    out = img_rgb.copy()
    colored = np.zeros_like(out, dtype=np.uint8)
    colored[mask == 1] = color
    return cv2.addWeighted(out, 1, colored, alpha, 0)

In [3]:
CLASS_IDS = list(range(1, 81))  # All COCO classes
CONF_THRESHOLD = 0.5 # @param {"type":"number","placeholder":"Enter Minimum Confidence Threshold"}
MASK_THRES = 0.5 # @param {"type":"number","placeholder":"Enter Minimum Mask Threshold"}
max_long_side = 1280 # @param {"type":"integer","placeholder":"Enter Max Long Side"}

In [4]:
# Making Directories to Store Input and Output Data
video_dir_input = 'video_dir_input'           # Storing Input Video
os.makedirs(video_dir_input,exist_ok=True)

video_dir_output = 'video_dir_output'         # Storing Output Video
os.makedirs(video_dir_output,exist_ok=True)

In [10]:
INPUT_VIDEO = os.path.join(video_dir_input,'NightLife2.mp4')
OUTPUT_VIDEO = os.path.join(video_dir_output,'NightLife2_output.mp4')

In [6]:
# Utilities
def letterbox_resize(image,max_long_side=640):
  if max_long_side is None:
    return Image,1.0
  h,w = image.shape[:2]
  long_side = max(h,w)
  if long_side <= max_long_side:
    return image,1.0
  scale = max_long_side/float(long_side)
  new_h,new_w = int(round(scale * h)),int(round(scale * w))
  image = cv2.resize(image,(new_w,new_h),interpolation=cv2.INTER_AREA) # cv2 Documentation Requires Width x Heigh
  return image,scale


In [7]:
# Helper Function to Read, and Write the Video Output
def video_readwrite(path_in,path_out,max_long_side):
  assert os.path.isfile(path_in), f"Cannot find input video at {path_in}"
  cap = cv2.VideoCapture(path_in)
  assert cap.isOpened(), "Failed to open input file."

  src_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
  src_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
  fps = cap.get(cv2.CAP_PROP_FPS)

  if fps is None or fps <=0 or math.isnan(fps):
    fps = 25.0

  dummy = np.zeros((src_h,src_w,3),dtype=np.uint8)
  temp,_ = letterbox_resize(dummy,max_long_side)
  dst_h,dst_w = temp.shape[0],temp.shape[1]

  # Creating the Output
  fourcc = cv2.VideoWriter_fourcc(*'mp4v')
  out = cv2.VideoWriter(path_out,fourcc,fps,(dst_w,dst_h))
  assert out.isOpened(), "Failed to output writer"

  return cap,out


In [8]:
# Pre-processing the Input Frames to convert to Tensor
# NOTE: Aviod Normalization in pre-processing as the ResNet Model Already performs the Normalization
preprocess = transforms.Compose([
    transforms.ToTensor()
])

In [11]:
cap, out = video_readwrite(INPUT_VIDEO, OUTPUT_VIDEO, max_long_side)
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) if cap.get(cv2.CAP_PROP_FRAME_COUNT) > 0 else None

print(f"Device: {device}")
print(f"Total Frames: {total_frames}")

processed, t0 = 0, time.time()

# Main processing loop with inference mode for efficiency
with torch.inference_mode():
    while True:
        # Read frame from video
        ret, frame = cap.read()
        if not ret:
            print("Can't receive frame (stream end?). Exiting ...")
            break

        # Preprocess frame
        # Resize frame while maintaining aspect ratio with letterboxing
        # NOTE: Default Color Code of CV2 is BGR
        frame_bgr, _ = letterbox_resize(frame, max_long_side=max_long_side)
        H, W = frame_bgr.shape[:2]  # Get the first 2 dimensions

        # Scale thickness and font based only on frame *height*
        base = H / 640.0   # relative scale; 640px height as reference
        box_th   = max(1, int(round(2 * base)))   # usually 1–3
        font_sc  = 0.5 * base                     # readable but not oversized
        text_th  = max(1, int(round(1 * base)))   # text stroke thickness

        # Convert BGR to RGB for model input
        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)

        # Apply preprocessing transforms (normalization, tensor conversion)
        inp = preprocess(Image.fromarray(frame_rgb)).to(device)

        # Model inference
        with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):
            outputs = mask_rcnn([inp])[0]  # Get first (and only) batch item

        # Extract model outputs
        boxes = outputs['boxes'].detach().cpu().numpy()              # Bounding boxes [N, 4]
        labels = outputs['labels'].detach().cpu().numpy()            # Class labels [N]
        scores = outputs['scores'].detach().cpu().numpy()            # Confidence scores [N]
        raw_masks = outputs['masks'].detach().cpu().numpy()          # Segmentation masks [N,1, H, W]
        masks = (raw_masks[:,0] > MASK_THRES).astype(np.uint8)       # [N,H,W] 0/1 Binary

        out_img = frame_rgb.copy()      # Draw on this

        # Process each detected objects
        for i in range(len(boxes)):
            if float(scores[i]) < CONF_THRESHOLD:
                continue

            # Map Index to Label
            label_idx = int(labels[i])
            pred_name = COCO_INSTANCE_CATEGORY_NAMES[label_idx]

            # Resize mask to match frame dimensions if needed
            masks_i = masks[i]
            if masks_i.shape[:2] != (H, W):
                masks_i = cv2.resize(masks_i, (W, H), interpolation=cv2.INTER_NEAREST)

            # Get fixed color for the class
            color = color_for_label(label_idx)
            blended_img = apply_color_mask(out_img, masks_i, color, alpha=0.5)

            # Draw Bounding Box
            x1,y1,x2,y2 = boxes[i].astype(int)
            #x1,y1 = max(0,x1),max(0,y1)
            #x2,y2 = max(W -1,x2),max(H -1,y2)
            box_color = (0,255,0) # Green for added Visibility
            cv2.rectangle(blended_img,(x1,y1),(x2,y2),box_color,thickness=box_th)

            # Draw Text
            label_text = f"{pred_name}: {scores[i]:.2f}"
            cv2.putText(
                blended_img,
                label_text,
                (x1,max(15, y1 - 5)),
                cv2.FONT_HERSHEY_SIMPLEX,
                font_sc,
                (0,255,0),
                thickness=text_th
            )

            out_img = blended_img

        # Convert back to BGR for Writer to write
        out_frame = cv2.cvtColor(out_img,cv2.COLOR_RGB2BGR)
        out.write(out_frame)
        processed += 1

        # Step 2j: Progress reporting
        if processed % 50 == 0 and total_frames:
                print(f"Processed: {processed}/{total_frames}")

# Step 3: Cleanup and final stats
cap.release()
out.release()

dt = time.time() - t0
fps_eff = processed / dt if dt > 0 else 0.0
print(f"Done. Saved to: {OUTPUT_VIDEO}")
print(f"Processing speed: {fps_eff:.2f} FPS")

Device: cuda
Total Frames: 532


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


Processed: 50/532
Processed: 100/532
Processed: 150/532
Processed: 200/532
Processed: 250/532
Processed: 300/532
Processed: 350/532
Processed: 400/532
Processed: 450/532
Processed: 500/532
Can't receive frame (stream end?). Exiting ...
Done. Saved to: video_dir_output/NightLife2_output.mp4
Processing speed: 1.52 FPS
