# Extracting frames and bounding boxes

---
## Setup
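*   You need to mount your drive and create a folder named `ataxia_dataset/` in it containing all the videos. Each video must be an mp4 file named with a number (e.g. `0.mp4`, `1.mp4`, ...), because the number is used to name the output folders.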
---

### CoLab setup

In [None]:
# Mount Google Drive so that the dataset folder is reachable from this runtime.
from google.colab import drive

ROOT = '/content/drive'
drive.mount(ROOT)

### Imports

In [None]:
import os
import cv2
import json
import time
import shutil
import subprocess
import numpy as np
from tqdm import tqdm
from PIL import Image
from os.path import join
import matplotlib.pyplot as plt
from multiprocessing import Pool

import torch
import torchvision
from torchvision import transforms as T


# Input: the raw videos inside the mounted Google Drive.
RAW_VIDEOS_PATH = "/content/drive/MyDrive/ataxia_dataset/"
# Step 1 output: the extracted frames, one sub-folder per video.
FRAMES_PATH = "/content/frames/"
# Step 2 output: the detections, one json file per video.
JSONS_PATH = "/content/jsons/"

# Run the detector on the GPU whenever the runtime provides one.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

---
## 1. Extract frames using `ffmpeg`
*   We will extract frames from all videos at 30 fps and store them temporarily on CoLab.
*   Takes about 17m.
*   Advice: Run this section on a CPU-only runtime, then follow the instructions at the end of the section to save the frames before switching to a GPU runtime.
---

#### Actual extraction code

In [None]:
# Making Directory
!mkdir "$FRAMES_PATH"

In [None]:
def process_video(fname: str):
  """
  Extract the frames of one video at 30 FPS with `ffmpeg`.

  `fname` is a file name inside `RAW_VIDEOS_PATH` and must look like
  `<number>.mp4` (e.g. `1.mp4`). The frames are saved as
  `FRAMES_PATH/<number zero-padded to 3 digits>/output_xxx.jpg`.

  Files that are not named `<number>.mp4`, and videos whose output folder
  already exists, are skipped with a message. If `ffmpeg` exits with an
  error, the partially written output folder is removed so that a re-run
  retries the video instead of reporting it as already processed.

  Always returns None; the function is called for its side effects.
  """
  # ignore other files if present in the directory; look at the real
  # extension so that names such as `1.mp4.bak` are not taken for videos
  stem, ext = os.path.splitext(fname)
  if ext.lower() != ".mp4":
    print(f"{fname} is not a video, skipping...")
    return None

  # 1.mp4 -> 001 conversion, zero-padded to ensure sorting
  try:
    video_num = f"{int(stem):03}"
  except ValueError:
    # skip instead of raising: one stray file should not abort the whole run
    print(f"{fname} is not named <number>.mp4, skipping...")
    return None

  out_dir = os.path.join(FRAMES_PATH, video_num)
  if os.path.exists(out_dir):
    # video_num is already a zero-padded string, no numeric format spec here
    print(f"Already processed {video_num}, skipping...")
    return None

  # make the output directory for this video (and FRAMES_PATH itself if missing)
  os.makedirs(out_dir)

  # Break the video into frames at 30 FPS
  command = ["ffmpeg",
             "-i",
             os.path.join(RAW_VIDEOS_PATH, fname), # e.g. raw_videos_path/num.mp4
             "-vf",
             "fps=30",
             os.path.join(out_dir, "output_%03d.jpg")] # e.g. /path/001/output_001.jpg
  result = subprocess.run(command, stderr=subprocess.PIPE)
  if result.returncode != 0:
    # do not leave an incomplete folder behind: it would be mistaken for a
    # finished extraction by the "already processed" check above
    shutil.rmtree(out_dir, ignore_errors=True)
    error_tail = result.stderr.decode(errors="replace")[-500:]
    print(f"ffmpeg failed on {fname} (exit code {result.returncode}):\n{error_tail}")
  return None

In [None]:
# Hand every entry of the dataset folder to `process_video`
# (entries that are not videos are skipped by the function itself)
videos = os.listdir(RAW_VIDEOS_PATH)
# two workers: CoLab allows 2 CPUs in the free tier
with Pool(processes=2) as pool:
  results = pool.imap_unordered(process_video, videos)
  # drain the iterator so that every video gets processed; tqdm shows the progress
  for _done in tqdm(results, total=len(videos)):
    pass

#### Example frame

In [None]:
# Example: show the first extracted frame of video `000` (before any detection)
person0 = join(FRAMES_PATH, '000')
person0_frames = sorted(os.listdir(person0))

img_ex_path = join(person0, person0_frames[0])
print(img_ex_path)

# the frame is read with cv2 and converted from BGR to RGB for matplotlib
img_ex_origin = cv2.imread(img_ex_path)
img_ex = cv2.cvtColor(img_ex_origin, cv2.COLOR_BGR2RGB)

plt.imshow(img_ex)
plt.axis('off')
plt.show()

At this point, it is advisable to download / save the extracted frames as a zip so as to not run this extraction again in case of any errors in the next stage (the file size is about 700MB).

In [None]:
# Zip all extracted frames and copy the archive into the dataset folder on Drive,
# so that the extraction does not have to be repeated.
!zip frames.zip -r "$FRAMES_PATH"
shutil.copy("frames.zip", RAW_VIDEOS_PATH) # the frames will be in your drive.

### Extract processed frames
If you followed the advice and ran the extraction on a CPU-only runtime, you now need to switch to a GPU runtime for Faster R-CNN. Run the previous cell first to save the frames to your Drive, then restore them on the new runtime with the next cell.

In [None]:
# # uncomment the lines below to unzip the frames extracted earlier
# shutil.copy(RAW_VIDEOS_PATH + "frames.zip", "/content/")
# !unzip frames.zip
# !mv content/frames/ /content/

---
## 2. Object Detection with Faster R-CNN

*  We will use a pretrained Faster R-CNN model using ResNet50 as a backbone with FPN.
*  Takes about 1h20m.
---

### Example usage

In [None]:
# Fetch torchvision's pretrained Faster R-CNN (ResNet50 backbone with FPN)
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
# Move it to DEVICE (cuda speeds up by ~8x)
model = model.to(DEVICE)
# We only run inference: switch the model to evaluation mode
model.eval()

In [None]:
# Define the class names given by PyTorch's official Docs
# The position in the list is the label id predicted by the model, so the
# order must not be changed: e.g. index 1 is 'person', the only class kept by
# `get_prediction`. The 'N/A' entries are placeholders that keep the positions
# aligned (presumably ids the model never predicts).

COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

In [None]:
# Example: path of the first extracted frame of video `000`, the input of the demo below
person0 = join(FRAMES_PATH, '000')
person0_frames = sorted(os.listdir(person0))

img_ex_path = join(person0, person0_frames[0])
print(img_ex_path)

In [None]:
# example prediction
threshold = 0.8
# Load the image as 3-channel RGB, the same way `get_prediction` does
img = Image.open(img_ex_path).convert('RGB')
# Define the PyTorch transform (image -> tensor)
transform = T.Compose([T.ToTensor()])
# Apply the transform to the image
img = transform(img).to(DEVICE)
start = time.time()
# Pass the image to the model; inference only, so no gradients are tracked
with torch.no_grad():
  pred = model([img])
end = time.time()
print(f"Took {end - start:.2f} seconds")

`pred` is a list with one entry per input image: if the model is given a batch of 2 images, `pred` has length 2, and so on. **Importantly**, the predictions for each image are always sorted in **descending** order of score.


In [None]:
print(f"Batch-size={len(pred)}.")
print(f"An object is represented as a dictionary with keys: {pred[0].keys()}.")
print(f"The model found: {len(pred[0]['boxes'])} predictions.")
print(f"pred[0]['boxes']: {pred[0]['boxes']}")
print(f"pred[0]['labels']: {pred[0]['labels']}")
print(f"pred[0]['scores']: {pred[0]['scores']}")

# Map every predicted label id to its class name
pred_class = [COCO_INSTANCE_CATEGORY_NAMES[i] for i in pred[0]['labels'].cpu().numpy()]
print(f"Classes detected: {pred_class}")
# Parse the bounding boxes (outputted by model as [x1, y1, x2, y2]) into
# [(x1, y1), (x2, y2)] corner pairs of plain Python ints
pred_boxes = [[(int(x1), int(y1)), (int(x2), int(y2))]
              for x1, y1, x2, y2 in pred[0]['boxes'].detach().cpu().numpy().astype(np.int32)]
pred_score = list(pred[0]['scores'].detach().cpu().numpy())
# Get the last index with score greater than threshold. The scores are sorted
# in descending order, so that index is (number of scores above threshold) - 1;
# it is -1 when nothing passes, which makes the slices below empty instead of
# raising an IndexError.
pred_t = sum(1 for score in pred_score if score > threshold) - 1
pred_boxes = pred_boxes[:pred_t+1]
pred_class = pred_class[:pred_t+1]
print(f"Classes w/ scores > threshold: {pred_class}")

In [None]:
# This will display the bounding boxes overlaid on the image
img = cv2.imread(img_ex_path) # Read image with cv2
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # Convert to RGB
for box, class_name in zip(pred_boxes, pred_class):
    cv2.rectangle(img, box[0], box[1], color=(0, 255, 0), thickness=3) # Draw Rectangle with the coordinates
    cv2.putText(img, class_name, box[0], cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 0), thickness=3) # Write the prediction class
# Display the output image once, after all boxes have been drawn (showing it
# inside the loop produced one figure per box and none without detections)
plt.figure(figsize=(15,20))
plt.imshow(img)
plt.xticks([])
plt.yticks([])
plt.show()

In [None]:
# Putting the steps above together: a function that returns the prediction for one image.
# It processes a single image per call; the model itself also accepts a batch of several images.
def get_prediction(img_path: str, threshold: float):
  """
  Run the detector on one image and keep only the confident 'person' detections.

  Relies on the notebook-level `model`, `transform` and `DEVICE`.

  Args:
    img_path: path of the image file to run the model on.
    threshold: a detection is kept only if its score is strictly greater than this.

  Returns:
    A tuple `(pred_boxes, pred_class, pred_score)` of three lists with one
    entry per kept detection: the box as an `(x1, y1, x2, y2)` tuple of
    np.int32, the class name, and the score.
  """
  img = Image.open(img_path).convert('RGB') # Load the image
  img = transform(img).to(DEVICE) # Apply the transform to the image
  # Inference only: do not track gradients while running the model
  with torch.no_grad():
    pred = model([img]) # Pass the image to the model

  # Mask out to only keep people (label 1) and confidence > threshold here itself
  mask = (pred[0]['labels'] == 1) & (pred[0]['scores'] > threshold)
  boxes = pred[0]['boxes'][mask]
  labels = pred[0]['labels'][mask]
  scores = pred[0]['scores'][mask]

  # Class name of every kept detection
  pred_class = [COCO_INSTANCE_CATEGORY_NAMES[i] for i in labels.cpu().numpy()]
  # Bounding boxes, the type conversion to np.int32 is necessary for cv2 later
  pred_boxes = [tuple(box) for box in boxes.detach().cpu().numpy().astype(np.int32)]
  # Score of every kept detection
  pred_score = list(scores.detach().cpu().numpy())

  return pred_boxes, pred_class, pred_score

*   The picture above is an example of applying Detection Network (in our case, Faster R-CNN).
*   Since our goal is to track a person in the video, we only keep the 'person' class in our predictions.
*   We need a prediction result (bounding box coordinates, class label, prediction score) for every extracted frame.

### Actual extraction code

In [None]:
# Making Directory
!mkdir "$JSONS_PATH"

In [None]:
# Extract bboxes, labels and scores and store as a json in proper format
# Data Structure format from - https://github.com/mlvlab/COSE474/blob/master/3_MOT_detinfo.json
# One `<video>.json` is written per video folder, mapping every frame file name
# to the list of detections found in that frame.
for video in tqdm(sorted(os.listdir(FRAMES_PATH))):
  person_data = join(FRAMES_PATH, video)
  person_json = {}

  data_list = sorted(os.listdir(person_data))

  for frame in data_list:
    cur_frame = join(person_data, frame)
    pred_boxes, pred_class, pred_score = get_prediction(cur_frame, 0.9)
    # numpy scalars are converted to plain int / float so that json can serialise them
    detections = [{"bbox": [int(coord) for coord in bbox],
                   "labels": 1, # label will always be 1 as we are only detecting people.
                   "scores": float(score)} for bbox, score in zip(pred_boxes, pred_score)]
    person_json[frame] = detections

  # `with` closes the file even if the dump fails
  with open(join(JSONS_PATH, video + ".json"), "w") as f:
    json.dump(person_json, f, indent=4)

We zip the results and copy the archive to the dataset folder on your Drive (you can download it as well for safe-keeping, this file is quite small).

In [None]:
!zip -r bboxes.zip /content/jsons
shutil.copy("bboxes.zip", RAW_VIDEOS_PATH)

## Thank you!
Now we can move on to further processing, using these frames and jsons to actually track the patient in the video.

#### Extra! Make a video with the bounding boxes (I used this for the tutorial)

In [None]:
# # uncomment the lines below to unzip the bboxes extracted earlier
# shutil.copy(RAW_VIDEOS_PATH + "bboxes.zip", "/content/")
# !unzip bboxes.zip
# !mv content/jsons/ /content/

In [None]:
BOX_VIDS_PATH = "/content/box_vids/"
!mkdir "$BOX_VIDS_PATH"

In [None]:
# Draw the stored bounding boxes on every frame and save the result as
# BOX_VIDS_PATH/<video>/<frame file name>.
for video in tqdm(sorted(os.listdir(FRAMES_PATH))):
  person_data = join(FRAMES_PATH, video)
  with open(join(JSONS_PATH, video + ".json")) as f:
    person_json = json.load(f)

  # one output folder per video
  os.makedirs(join(BOX_VIDS_PATH, video), exist_ok=True)

  data_list = sorted(os.listdir(person_data))

  for frame in data_list:
    pred_boxes = person_json[frame]
    # The image stays in cv2's BGR order from read to write; the box colour
    # (0, 255, 0) is green in both BGR and RGB, so no conversion is needed.
    img = cv2.imread(join(person_data, frame))
    for detection in pred_boxes:
      x1, y1, x2, y2 = detection['bbox']
      cv2.rectangle(img, (x1, y1), (x2, y2), color=(0, 255, 0), thickness=3) # Draw Rectangle with the coordinates
    # Save every frame exactly once, including frames without any detection,
    # so that the numbered `output_xxx.jpg` sequence has no gaps.
    cv2.imwrite(join(BOX_VIDS_PATH, video, frame), img)

In [None]:
# make a video with ffmpeg: the box-annotated frames of video `001` are stitched
# together at 30 fps into `output_video.mp4`
# (the input folder is hard-coded; edit the path to render another video)
!ffmpeg -framerate 30 -i /content/box_vids/001/output_%03d.jpg -c:v libx264 -pix_fmt yuv420p output_video.mp4

# Done