# The City x 2 People Trainer
Based on Detectron2 tutorial

Notes:
*   If you have Pro+, remember to set up background execution in the Runtime...Change Runtime Type menu.


Mount Google Drive for persistent storage.  This is where training images should be.

Here is the folder I've been using for testing, which you'll need to add to your Drive and then update the paths below, potentially. https://drive.google.com/drive/folders/1L9Qs21Du0ZQCVJ5xavITEPqAbXZyjmbb?usp=sharing

In [None]:
# Mount Google Drive at /content/train. The dataset, model and video paths used
# later in this notebook are all rooted at /content/train/MyDrive.
# Background on Colab file handling: https://neptune.ai/blog/google-colab-dealing-with-files
from google.colab import drive
drive.mount('/content/train')  # optional extra argument: force_remount=True

Mounted at /content/train


# Install detectron2
Get the right packages and do basic imports.

In [None]:
!python -m pip install pyyaml==5.1
# Detectron2 has not released pre-built binaries for the latest pytorch (https://github.com/facebookresearch/detectron2/issues/4053)
# so we install from source instead. This takes a few minutes.
!python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'

# Install pre-built detectron2 that matches pytorch version, if released:
# See https://detectron2.readthedocs.io/tutorials/install.html for instructions
#!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/{CUDA_VERSION}/{TORCH_VERSION}/index.html

# exit(0)  # After installation, you may need to "restart runtime" in Colab. This line can also restart runtime

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyyaml==5.1
  Downloading PyYAML-5.1.tar.gz (274 kB)
[K     |████████████████████████████████| 274 kB 32.6 MB/s 
[?25hBuilding wheels for collected packages: pyyaml
  Building wheel for pyyaml (setup.py) ... [?25l[?25hdone
  Created wheel for pyyaml: filename=PyYAML-5.1-cp38-cp38-linux_x86_64.whl size=44089 sha256=f1c986a07e842c301fa51ca957023b2c358d2c4f1773e0e9b3222a7b0d201de9
  Stored in directory: /root/.cache/pip/wheels/52/dd/2b/10ff8b0ac81b93946bb5fb9e6749bae2dac246506c8774e6cf
Successfully built pyyaml
Installing collected packages: pyyaml
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 6.0
    Uninstalling PyYAML-6.0:
      Successfully uninstalled PyYAML-6.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dask 202

In [None]:
import torch, detectron2
!nvcc --version
# Keep only "major.minor" of the installed torch version.
TORCH_VERSION = ".".join(torch.__version__.split(".")[:2])
# Text after the last "+" in the version string (the captured output below shows
# "cu116"); if the string has no "+", this is simply the whole version string.
CUDA_VERSION = torch.__version__.split("+")[-1]
print("torch: ", TORCH_VERSION, "; cuda: ", CUDA_VERSION)
print("detectron2:", detectron2.__version__)

# Some basic setup:
# Setup detectron2 logger
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

# import some common libraries
import numpy as np
import os, json, cv2, random
from google.colab.patches import cv2_imshow

# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog, DatasetCatalog

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0
torch:  1.13 ; cuda:  cu116
detectron2: 0.6


# Set up Cx2 helper functions

In [None]:

# This function builds a VGG format annotation file for a single file and single class from OpenCV contours

def getVGGIA(dict_in, contours, class_name, file_name, height, width):
  """Add one image's polygon annotations, in VGG annotation style, to ``dict_in``.

  Contours whose scaled area (``cv2.contourArea(c) * res * res``) is larger than
  ``surf`` are simplified with ``cv2.approxPolyDP`` and stored as polygon regions
  that all carry the same ``class_name``.

  Args:
    dict_in: Annotation dictionary to extend; it is modified in place.
    contours: Sequence of OpenCV contours found in the image.
    class_name: Stored under ``region_attributes['class']`` for every region.
    file_name: Path of the image file; its basename plus its size on disk form
      the key of the new entry.
    height: Stored as the entry's ``height``.
    width: Stored as the entry's ``width``.

  Returns:
    ``dict_in`` with the new entry added, or ``None`` (after printing a message)
    when no contour is larger than the area threshold.
  """
  # based on https://gist.githubusercontent.com/vintel38/57ccd513f072e050cc14c5ac9267de33/raw/3fd31d1efda286d191588ddfa2391726b0c094d6/annot.py
  res = 0.6
  eps = 0.001  # tuning this improves the contour accuracy but generates larger files
  surf = 100

  # Drop small contours first, then simplify the survivors.
  big_enough = [c for c in contours if cv2.contourArea(c) * res * res > surf]
  polygons = [cv2.approxPolyDP(c, eps * cv2.arcLength(c, True), True) for c in big_enough]

  if not polygons:
    print("error in VGGIA"+file_name)
    return None

  # One region per polygon, keyed by its position as a string.
  regions = {}
  for position, polygon in enumerate(polygons):
    points = polygon[:, 0]
    regions[str(position)] = {
      'shape_attributes': {
        'name': 'polygon',
        # .tolist() because numpy arrays are not JSON serializable:
        # https://stackoverflow.com/questions/26646362/numpy-array-is-not-json-serializable
        'all_points_x': points[:, 0].tolist(),
        'all_points_y': points[:, 1].tolist(),
      },
      'region_attributes': {'class': class_name},
    }

  base_name = os.path.basename(file_name)
  size_on_disk = os.path.getsize(file_name)
  dict_in[base_name + str(size_on_disk)] = {
    'filename': base_name,
    'size': str(size_on_disk),
    'regions': regions,
    'file_attributes': {},
    # image dimensions are stored alongside the regions (jb)
    'height': height,
    'width': width,
  }
  return dict_in

We first download an image from our Ul Qoma test set, then run the pre-trained model. The intention is to show how we can find a bbox for "person" that we can then use as input to the trainer. While this is a green-screen scene, the approach should work on any scene in which there are only Ul Qoman people.

# Load and register dataset

Munge the VGGIA format into a detectron dataset. Register it with detectron2, following the [detectron2 custom dataset tutorial](https://detectron2.readthedocs.io/tutorials/datasets.html).

Here, the dataset is in its own custom format, so we write a function to parse it and convert it into detectron2's standard format. Users should write such a function whenever a dataset is in a custom format. See the tutorial for more details.


In [None]:
# NOTE(review): every path used below starts with /content/train (the mountpoint
# of the first mount cell), so this second mount at /content/drive looks unused.
# Confirm before removing.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# if your dataset is in COCO format, this cell can be replaced by the following three lines:
# from detectron2.data.datasets import register_coco_instances
# register_coco_instances("my_dataset_train", {}, "json_annotation_train.json", "path/to/image/dir")
# register_coco_instances("my_dataset_val", {}, "json_annotation_val.json", "path/to/image/dir")
import sys
from detectron2.structures import BoxMode

def get_cx2_dicts(classnum, img_dir, annots, limit=None):
    """Convert a VIA-style annotation JSON file into detectron2 dataset dicts.

    Every polygon found in the file is given the same category id, so one call
    handles one single-class annotation file.

    Args:
        classnum: Category id assigned to every polygon in this file.
        img_dir: Directory that each entry's ``filename`` is joined onto.
        annots: Path to the annotation JSON file.
        limit: Optional maximum number of images to load (useful while
            debugging). ``None`` (the default) loads every image.

    Returns:
        A list with one dict per image, holding ``file_name``, ``image_id``,
        ``height``, ``width`` and ``annotations``. Each annotation has an
        XYXY_ABS ``bbox``, a single-polygon ``segmentation`` and ``category_id``.

    Raises:
        KeyError: If an entry lacks ``filename``, ``height``, ``width``,
            ``regions`` or the polygon point lists.
    """
    with open(annots) as f:
        imgs_anns = json.load(f)

    dataset_dicts = []
    for idx, v in enumerate(imgs_anns.values()):
        if limit is not None and idx >= limit:
            break

        # height/width come from the annotation file so images need not be opened here.
        record = {
            "file_name": os.path.join(img_dir, v["filename"]),
            "image_id": idx,
            "height": v["height"],
            "width": v["width"],
        }

        # Regions may arrive as a dict keyed by index or as a plain list; accept both.
        regions = v["regions"]
        region_list = regions.values() if isinstance(regions, dict) else regions

        objs = []
        for region in region_list:
            shape = region["shape_attributes"]
            px = shape["all_points_x"]
            py = shape["all_points_y"]
            # Flattened [x0, y0, x1, y1, ...], shifted by half a pixel to pixel centres.
            poly = [coord + 0.5 for point in zip(px, py) for coord in point]

            objs.append({
                "bbox": [np.min(px), np.min(py), np.max(px), np.max(py)],
                "bbox_mode": BoxMode.XYXY_ABS,
                "segmentation": [poly],
                "category_id": classnum,
            })
        record["annotations"] = objs
        dataset_dicts.append(record)
    return dataset_dicts

# for d in ["train", "val"]:
#     DatasetCatalog.register("cx2_" + d, lambda d=d: get_cx2_dicts("balloon/" + d))
#     MetadataCatalog.get("cx2_" + d).set(thing_classes=["balloon"])
# cx2_metadata = MetadataCatalog.get("cx2_train")

# Every dataset shares one label list so a category id means the same thing
# everywhere: 0 -> person_beszel, 1 -> person_ul_qoma.
_CX2_CLASSES = ("person_beszel", "person_ul_qoma")
_UL_QOMA_FRAMES = "/content/train/MyDrive/TrainingClips_10.27.22/ToTrain/ExtractedFrames-UlQoma"
_UL_QOMA_ANNOTS = "/content/train/MyDrive/TrainingClips_10.27.22/via_region_data__ul_qoma.json"
_BESZEL_FRAMES = "/content/train/MyDrive/TrainingClips_10.27.22/ToTrain/ExtractedFrames-Bezel"
_BESZEL_ANNOTS = "/content/train/MyDrive/TrainingClips_10.27.22/via_region_data__beszel.json"


def _load_ul_qoma():
    """Load the Ul Qoma frames as category 1."""
    return get_cx2_dicts(1, _UL_QOMA_FRAMES, _UL_QOMA_ANNOTS)


def _load_beszel():
    """Load the Beszel frames as category 0."""
    return get_cx2_dicts(0, _BESZEL_FRAMES, _BESZEL_ANNOTS)


def _load_cx2():
    """Load both sets: Beszel records first, then Ul Qoma."""
    return [*_load_beszel(), *_load_ul_qoma()]


def _register_cx2(name, loader):
    """(Re)register ``name`` with ``loader`` and return its metadata object.

    Stale catalog entries are dropped first so the cell can be re-run. Each
    catalog is cleared independently, so a missing dataset entry no longer
    leaves an old metadata entry behind.
    """
    DatasetCatalog.pop(name, None)
    MetadataCatalog.pop(name, None)
    # Registering only stores the loader; it is not called here.
    DatasetCatalog.register(name, loader)
    MetadataCatalog.get(name).set(thing_classes=list(_CX2_CLASSES))
    return MetadataCatalog.get(name)


d = "train"  # not needed by the loaders; kept in case another cell reads it

print("Registering Ul Qoma")
cx2_metadata_uq = _register_cx2("person_ul_qoma", _load_ul_qoma)
print("Registering Beszel")
cx2_metadata_b = _register_cx2("person_beszel", _load_beszel)

print("Registering combined")
cx2_metadata = _register_cx2("person_cx2", _load_cx2)

## TODO - Revise to consolidate to a single dataset.
##


Registering Ul Qoma
Registering Beszel
Registering combined


# Test new model
The detectron sample has some image tests (or they can be derived from above) but we're going to show here how to test on a video sample directly.

https://stackoverflow.com/questions/60663073/how-can-i-properly-run-detectron2-on-videos

In [None]:
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()
# import some common libraries
import numpy as np
import tqdm
import cv2
# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.video_visualizer import VideoVisualizer
from detectron2.utils.visualizer import ColorMode, Visualizer
from detectron2.data import MetadataCatalog
import time
import os

in_video = "/content/train/MyDrive/TrainingClips_10.27.22/Location01/ID_Mixed_01.mp4"
out_video = '/content/train/MyDrive/TrainingClips_10.27.22/Location01/ID_Mixed_01-out.mp4'
model_dir = "/content/train/MyDrive/TrainingClips_10.27.22/model"
frame_dir = '/content/train/MyDrive/TrainingClips_10.27.22/out/ID_Mixed_01-Frames'
mask_dir = '/content/train/MyDrive/TrainingClips_10.27.22/out/ID_Mixed_01-Masks'

# cv2.imwrite does not create missing folders (it only reports failure through
# its return value), so make sure the per-frame output folders exist up front.
for _out_dir in (frame_dir, mask_dir):
    os.makedirs(_out_dir, exist_ok=True)

#num_frames = 7500 # 7500 # Create a cut-off for debugging
metadata = cx2_metadata

# Extract video properties
video = cv2.VideoCapture(in_video)
if not video.isOpened():
    # Fail early: an unopened capture reports zero width/height/frame count and
    # the run would otherwise finish "successfully" with empty output.
    raise RuntimeError("Could not open input video: " + in_video)
width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
frames_per_second = video.get(cv2.CAP_PROP_FPS)
num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))

# Initialize video writer
video_writer = cv2.VideoWriter(out_video, fourcc=cv2.VideoWriter_fourcc(*"mp4v"), fps=float(frames_per_second), frameSize=(width, height), isColor=True)

# Initialize predictor
cfg = get_cfg()
cfg.OUTPUT_DIR = model_dir
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.DATASETS.TEST = ()
cfg.DATALOADER.NUM_WORKERS = 2
# Two classes: person_beszel and person_ul_qoma
# (see https://detectron2.readthedocs.io/tutorials/datasets.html#update-the-config-for-new-datasets)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 2
# Load the fine-tuned weights from model_dir. The earlier model-zoo checkpoint
# assignment was immediately overwritten by this line, so it has been dropped.
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")  # path to the model we just trained
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.4   # set a custom testing threshold (0.6 -- orig)
predictor = DefaultPredictor(cfg)
# Initialize visualizer
v = VideoVisualizer(metadata, ColorMode.IMAGE)

# runOnVideo keeps its own local mask state; this module-level name is retained
# only in case another cell references it.
prevMask = None
# Set what you want to inpaint: True -> mask class 0 (person_beszel),
# False -> mask class 1 (person_ul_qoma).
beszel = True

def runOnVideo(video, maxFrame):
    """Run the predictor on up to ``maxFrame`` frames and yield per-frame results.

    Relies on the module-level ``predictor``, ``v`` (the video visualizer) and
    ``beszel`` (``True`` selects class 0, ``False`` selects class 1).

    For every frame, the masks of all detections of the selected class are
    merged into one binary mask. When a frame has no detection of that class,
    the most recent mask is reused; before any detection has been seen, an
    all-zero single-channel mask is used instead.

    Args:
        video: Object whose ``read()`` returns ``(hasFrame, frame)``, such as
            an opened ``cv2.VideoCapture``.
        maxFrame: Maximum number of frames to read.

    Yields:
        dict with ``"visualization"`` (frame with predictions drawn on it),
        ``"frame"`` (the frame as read), ``"mask"`` (uint8 array, 255 inside
        the merged mask and 0 elsewhere) and ``"idx"`` (1-based frame number).
    """
    prev_mask = None  # last mask that came from a real detection
    for readFrames in range(maxFrame):
        hasFrame, frame = video.read()
        if not hasFrame:
            print("does not have frame")
            break

        outputs = predictor(frame)
        instances = outputs["instances"].to("cpu")  # move to CPU once per frame
        # The frame is drawn on as-is; converting BGR -> RGB here tinted the output blue.
        visualization = v.draw_instance_predictions(frame, instances)

        target_class = 0 if beszel else 1
        wanted = instances[instances.pred_classes == target_class]

        if len(wanted) > 0:
            # Union of all instance masks. A logical OR keeps overlapping people
            # at 1 so the *255 below cannot overflow uint8.
            mask = np.uint8(np.asarray(wanted.pred_masks).any(axis=0))
            prev_mask = mask
        else:
            if prev_mask is None:
                # Same shape and dtype as a real mask: (H, W) uint8.
                prev_mask = np.zeros(frame.shape[:2], dtype=np.uint8)
            mask = prev_mask
            print("used prev mask")

        gray = mask * 255

        visualization = visualization.get_image()
        yield {"visualization": visualization, "frame": frame, "mask": gray, "idx": readFrames + 1}

# Enumerate the frames of the video: append the annotated frame to the output
# video and save the raw frame and its mask as numbered PNGs.
try:
    for out in tqdm.tqdm(runOnVideo(video, num_frames), total=num_frames):
        video_writer.write(out["visualization"])
        frame_name = "%06d.png" % out["idx"]
        for folder, image in ((frame_dir, out["frame"]), (mask_dir, out["mask"])):
            target = os.path.join(folder, frame_name)
            # cv2.imwrite reports failure only through its return value
            # (for example when the folder does not exist), so check it
            # instead of silently producing no files.
            if not cv2.imwrite(target, image):
                raise OSError("cv2.imwrite failed for " + target)
finally:
    # Release resources even if the loop is interrupted, so the output video
    # is finalized and the capture handle is closed.
    video.release()
    video_writer.release()
    cv2.destroyAllWindows()


[12/14 06:52:13 d2.checkpoint.c2_model_loading]: Following weights matched with model:
| Names in Model                                  | Names in Checkpoint                                                                                  | Shapes                                          |
|:------------------------------------------------|:-----------------------------------------------------------------------------------------------------|:------------------------------------------------|
| backbone.bottom_up.res2.0.conv1.*               | backbone.bottom_up.res2.0.conv1.{norm.bias,norm.running_mean,norm.running_var,norm.weight,weight}    | (64,) (64,) (64,) (64,) (64,64,1,1)             |
| backbone.bottom_up.res2.0.conv2.*               | backbone.bottom_up.res2.0.conv2.{norm.bias,norm.running_mean,norm.running_var,norm.weight,weight}    | (64,) (64,) (64,) (64,) (64,64,3,3)             |
| backbone.bottom_up.res2.0.conv3.*               | backbone.bottom_up.res2.0.conv3.{norm.bia

100%|██████████| 722/722 [05:55<00:00,  2.03it/s]
