In [None]:
# If the notebook is used by itself, uncomment
# !git clone https://github.com/HugoCasa/dlav-group1.git
# %cd dlav-group1/m2

: 

In [24]:
!pip install onnxruntime mediapipe lap cython_bbox

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting cython_bbox
  Downloading cython_bbox-0.1.3.tar.gz (41 kB)
[K     |████████████████████████████████| 41 kB 203 kB/s 
Building wheels for collected packages: cython-bbox
  Building wheel for cython-bbox (setup.py) ... [?25l[?25hdone
  Created wheel for cython-bbox: filename=cython_bbox-0.1.3-cp37-cp37m-linux_x86_64.whl size=58445 sha256=3b33998dbb23110f6425d7e68f03ac74831b07a6ae327efafa7fc11be0786a35
  Stored in directory: /root/.cache/pip/wheels/51/82/21/5def8bc98ae4ea436d7f0decb7194d20d7e3e6d0578a4129d7
Successfully built cython-bbox
Installing collected packages: cython-bbox
Successfully installed cython-bbox-0.1.3


In [17]:
# import dependencies
from IPython.display import display, Javascript, Image
from google.colab.output import eval_js
from base64 import b64decode, b64encode
import PIL
import io

import copy
import time
import cv2

from helpers import *

from Detector.detector import ObjectDetector
from Tracker.tracker import MultiObjectTracker


%load_ext autoreload
%autoreload 2

## Models

## Helper Functions
Below are a few helpers for converting between image data types and formats, and for creating the webcam video stream using JavaScript.

In [28]:
# function to convert the JavaScript object into an OpenCV image
def js_to_image(js_reply):
  """
  Turn the data URL captured in the browser into an OpenCV frame.

  Params:
          js_reply: string of the form "<header>,<base64 payload>"; the second
                    comma-separated field is taken as the encoded image
  Returns:
          img: whatever cv2.imdecode yields for the payload with flags=1
               (colour decoding, i.e. a BGR image in OpenCV)
  """
  # keep only the base64 payload that follows the data-URL header
  encoded_payload = js_reply.split(',')[1]
  raw_jpeg = b64decode(encoded_payload)
  # cv2.imdecode works on a flat uint8 buffer
  jpeg_buffer = np.frombuffer(raw_jpeg, dtype=np.uint8)
  return cv2.imdecode(jpeg_buffer, flags=1)

# function to convert OpenCV Rectangle bounding box image into base64 byte string to be overlayed on video stream
def bbox_to_bytes(bbox_array):
  """
  Encode an overlay array as a PNG data URL.

  Params:
          bbox_array: Numpy array (pixels) holding the overlay; it is handed to
                      PIL.Image.fromarray with mode 'RGBA'
  Returns:
        bytes: string 'data:image/png;base64,<payload>' containing the PNG
  """
  png_buffer = io.BytesIO()
  # interpret the array as an RGBA picture and serialise it as PNG in memory
  overlay = PIL.Image.fromarray(bbox_array, 'RGBA')
  overlay.save(png_buffer, format='png')
  # base64 text of the PNG bytes, prefixed with the data-URL header
  encoded_png = b64encode(png_buffer.getvalue()).decode('utf-8')
  return 'data:image/png;base64,' + encoded_png

In [20]:
# JavaScript to properly create our live video stream using our webcam as input
def video_stream():
  """
  Display the JavaScript that drives the webcam preview in the cell output.

  The script defines `stream_frame(label, imgData)`. On each call it:
    * builds the preview DOM on first use (status label, a <video> fed by
      getUserMedia, an absolutely positioned <img> used as overlay, and a red
      "click to stop" instruction),
    * writes `label` into the status label and `imgData` into the overlay
      image when they are non-empty,
    * waits for the next animation frame, draws the video onto a 640x480
      canvas and resolves with that canvas as a JPEG data URL (quality 0.8).

  `stream_frame` resolves to an object with the keys 'create', 'show' and
  'capture' (Date.now() differences, i.e. milliseconds per phase) and 'img'.
  'img' is '' when the shutdown flag was raised while a capture was pending.
  Clicking the video, the overlay or the instruction text raises that flag;
  the next `stream_frame` call then stops the camera track, removes the DOM
  and returns ''.

  Returns:
          None; the script is only displayed.
  """
  js = Javascript('''
    var video;
    var div = null;
    var stream;
    var captureCanvas;
    var imgElement;
    var labelElement;
    
    var pendingResolve = null;
    var shutdown = false;
    
    function removeDom() {
       stream.getVideoTracks()[0].stop();
       video.remove();
       div.remove();
       video = null;
       div = null;
       stream = null;
       imgElement = null;
       captureCanvas = null;
       labelElement = null;
    }
    
    function onAnimationFrame() {
      if (!shutdown) {
        window.requestAnimationFrame(onAnimationFrame);
      }
      if (pendingResolve) {
        var result = "";
        if (!shutdown) {
          captureCanvas.getContext('2d').drawImage(video, 0, 0, 640, 480);
          result = captureCanvas.toDataURL('image/jpeg', 0.8)
        }
        var lp = pendingResolve;
        pendingResolve = null;
        lp(result);
      }
    }
    
    async function createDom() {
      if (div !== null) {
        return stream;
      }

      div = document.createElement('div');
      div.style.border = '2px solid black';
      div.style.padding = '3px';
      div.style.width = '100%';
      div.style.maxWidth = '600px';
      document.body.appendChild(div);
      
      const modelOut = document.createElement('div');
      modelOut.innerHTML = "<span>Status:</span>";
      labelElement = document.createElement('span');
      labelElement.innerText = 'No data';
      labelElement.style.fontWeight = 'bold';
      modelOut.appendChild(labelElement);
      div.appendChild(modelOut);
           
      video = document.createElement('video');
      video.style.display = 'block';
      video.width = div.clientWidth - 6;
      video.setAttribute('playsinline', '');
      video.onclick = () => { shutdown = true; };
      stream = await navigator.mediaDevices.getUserMedia(
          {video: { facingMode: "environment"}});
      div.appendChild(video);

      imgElement = document.createElement('img');
      imgElement.style.position = 'absolute';
      imgElement.style.zIndex = 1;
      imgElement.onclick = () => { shutdown = true; };
      div.appendChild(imgElement);
      
      const instruction = document.createElement('div');
      instruction.innerHTML = 
          '<span style="color: red; font-weight: bold;">' +
          'When finished, click here or on the video to stop this demo</span>';
      div.appendChild(instruction);
      instruction.onclick = () => { shutdown = true; };
      
      video.srcObject = stream;
      await video.play();

      captureCanvas = document.createElement('canvas');
      captureCanvas.width = 640; //video.videoWidth; 640
      captureCanvas.height = 480; //video.videoHeight; 480
      window.requestAnimationFrame(onAnimationFrame);
      
      return stream;
    }
    async function stream_frame(label, imgData) {
      if (shutdown) {
        removeDom();
        shutdown = false;
        return '';
      }

      var preCreate = Date.now();
      stream = await createDom();
      
      var preShow = Date.now();
      if (label != "") {
        labelElement.innerHTML = label;
      }
            
      if (imgData != "") {
        var videoRect = video.getClientRects()[0];
        imgElement.style.top = videoRect.top + "px";
        imgElement.style.left = videoRect.left + "px";
        imgElement.style.width = videoRect.width + "px";
        imgElement.style.height = videoRect.height + "px";
        imgElement.src = imgData;
      }
      
      var preCapture = Date.now();
      var result = await new Promise(function(resolve, reject) {
        pendingResolve = resolve;
      });
      shutdown = false;
      
      return {'create': preShow - preCreate, 
              'show': preCapture - preShow, 
              'capture': Date.now() - preCapture,
              'img': result};
    }
    ''')

  # render the script in the cell output so the browser executes it
  display(js)
  
def video_frame(label, bbox):
  """
  Ask the browser for the next webcam frame while updating label and overlay.

  Params:
          label: text/HTML shown in the status label ('' leaves it unchanged)
          bbox: data URL of the overlay image ('' leaves the overlay unchanged)
  Returns:
          data: whatever the JavaScript `stream_frame` call evaluates to
  """
  # local import keeps this cell self-contained
  import json

  # json.dumps yields a correctly quoted and escaped JavaScript string literal,
  # so quotes, backslashes or newlines in the arguments can no longer break
  # (or inject code into) the evaluated call.
  js_call = 'stream_frame({}, {})'.format(json.dumps(str(label)), json.dumps(str(bbox)))
  data = eval_js(js_call)
  return data

## Resulting Milestone 2
Below is the result of what we have done for Milestone 2.

In [38]:
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
USE_GPU = False                    # forwarded to every detector / tracker below
CAP_FPS = 20                       # passed to both trackers (presumably the expected frame rate)
TARGET_GESTURE_ID = 2              # gesture class id that designates the person to follow
INIT_TIME_SEC = 10                 # re-id keeps running this long after the target is first chosen
IOU_THRESHOLD_SIMILAR_BBOX = 0.5   # minimum matching score for two boxes to count as the same person
FRAME_WIDTH, FRAME_HEIGHT = 640, 480  # size of the capture canvas drawn by the JavaScript in video_stream()


# start streaming video from webcam
video_stream()
# label for video
label_html = 'Capturing...'
# initialize bounding box overlay to empty
bbox = ''
count = 0

# Hand Gesture Detection
gesture_detector = ObjectDetector(
    name="hand_gesture",
    target_id=None,
    use_gpu=USE_GPU,
)

# People Detection
people_detector = ObjectDetector(
    name="yolox",
    target_id=1,  # Detect people only
    use_gpu=USE_GPU,
)

# Multi People Tracking
tracker = MultiObjectTracker(
    "bytetrack",
    CAP_FPS,
    use_gpu=USE_GPU,
)

# Person Re-identification
person_reid = MultiObjectTracker(
    "person_reid",
    CAP_FPS,
    use_gpu=USE_GPU,
)

t_target_id = None            # track id of the target in `tracker`, None until a target is chosen
pr_target_id = None           # track id of the target in `person_reid`, None until matched
first_detection_time = None   # time.time() at which the target was chosen
target_bbox = []


while True:
    js_reply = video_frame(label_html, bbox)
    # '' means the user clicked to stop. A reply whose 'img' is empty means the
    # click arrived while a capture was pending: there is no frame to decode,
    # and js_to_image would fail on it.
    if not js_reply or not js_reply["img"]:
        break

    start_time = time.time()

    # convert JS response to OpenCV Image
    frame = js_to_image(js_reply["img"])

    # create transparent overlay (RGBA) for the debug drawings
    debug_image = np.zeros([FRAME_HEIGHT, FRAME_WIDTH, 4], dtype=np.uint8)

    # Person Detection
    d_bboxes, d_scores, d_class_ids = people_detector(frame)

    # Multi People Tracking
    track_ids, t_bboxes, t_scores, t_class_ids = tracker(
        frame,
        d_bboxes,
        d_scores,
        d_class_ids,
    )

    if t_target_id is None:
        # No target yet: look for a person showing the target gesture twice.
        hg_bboxes, hg_scores, hg_class_ids = gesture_detector(frame)

        # Draw gesture detection
        draw_debug_info_detector(debug_image, hg_bboxes, hg_scores, hg_class_ids)

        # If at least two target gestures are detected
        if hg_class_ids.count(TARGET_GESTURE_ID) > 1:

            # Compute center coords of target gestures
            hand_centers = [
                calc_center(hg_box)
                for hg_box, hg_id in zip(hg_bboxes, hg_class_ids)
                if hg_id == TARGET_GESTURE_ID
            ]

            # For each tracked person, count the target gestures inside their box
            for t_box, t_id in zip(t_bboxes, track_ids):
                count_gesture_in_box = sum(
                    1 for hand_center in hand_centers
                    if in_bounding_box(hand_center, t_box)
                )
                # If more than one, set that person as the target.
                # NOTE(review): the loop keeps going, so if several people
                # qualify in the same frame the last one wins.
                if count_gesture_in_box > 1:
                    t_target_id = t_id
                    first_detection_time = time.time()
    else:
        target_bbox = retrieve_target_bbox(t_target_id, track_ids, t_bboxes)

        target_detected_in_frame = len(target_bbox) != 0

        # Re-id runs during the first INIT_TIME_SEC after the target was chosen,
        # and whenever the tracker does not report the target in this frame.
        in_init_window = (
            first_detection_time is not None
            and time.time() - first_detection_time <= INIT_TIME_SEC
        )
        if in_init_window or not target_detected_in_frame:

            pr_track_ids, pr_bboxes, pr_scores, pr_class_ids = person_reid(
                frame,
                d_bboxes,
                d_scores,
                d_class_ids,
            )

            if pr_target_id is None and target_detected_in_frame and len(pr_bboxes) > 0:
                # Initialise the re-id target: pick the re-id box that best
                # matches the tracker's target box. Requires a non-empty
                # target_bbox, hence the target_detected_in_frame guard.
                best_matching_idx, IOU_score = compute_best_matching_bbox_idx(target_bbox, pr_bboxes)
                if IOU_score > IOU_THRESHOLD_SIMILAR_BBOX:
                    pr_target_id = pr_track_ids[best_matching_idx]

            elif not target_detected_in_frame:
                # The tracker lost the target: if re-id still sees it, hand the
                # target over to the tracker box that best matches the re-id box.
                if pr_target_id in pr_track_ids and len(t_bboxes) > 0:
                    best_matching_idx, IOU_score = compute_best_matching_bbox_idx(
                        pr_bboxes[pr_track_ids.index(pr_target_id)], t_bboxes
                    )
                    if IOU_score > IOU_THRESHOLD_SIMILAR_BBOX:
                        t_target_id = track_ids[best_matching_idx]

            else:
                draw_target_bbox(debug_image, target_bbox)

    elapsed_time = time.time() - start_time

    debug_image = draw_debug_info(
        debug_image,
        elapsed_time,
        track_ids,
        t_bboxes,
        t_scores,
        t_class_ids,
        t_target_id,
    )

    # make every pixel that was drawn on opaque; untouched pixels stay transparent
    debug_image[:, :, 3] = (debug_image.max(axis=2) > 0).astype(np.uint8) * 255
    # convert overlay into a data URL so the next frame request shows it
    bbox = bbox_to_bytes(debug_image)


<IPython.core.display.Javascript object>