# Prototyping of Model Pipeline

## Prototyping Backlog

* Core
    * https://github.com/ahmetgunduz/Real-time-GesRec
    * https://github.com/okankop/Efficient-3DCNNs
* Mediapipe Gesture Detection
    * The default model is not good enough for our purposes (doesn't have support for number hand gestures or swiping)
    * https://storage.googleapis.com/mediapipe-assets/gesture_recognizer/model_card_hand_gesture_classification_with_faireness_2022.pdf
    * https://developers.google.com/mediapipe/solutions/vision/gesture_recognizer#models
    * https://developers.google.com/mediapipe/solutions/vision/gesture_recognizer/customize
* Pure OpenCV and Segmentation Approach (No ML)
    * https://github.com/Gogul09/gesture-recognition
    * Limited to counting number of fingers.
* Gesture + Fingertip Detection
    * https://github.com/MahmudulAlam/Unified-Gesture-and-Fingertip-Detection
    * Not ready-made; does not work out of the box.
    * Missing swipe + 9, 10 gestures from EgoGestures
* https://github.com/anantSinghCross/realtime-hand-gesture-recognition/blob/master/captureHand.py
    * LSTM + Dense Layers
    * 30% error rate
* https://github.com/CLiu13/GestureOS
    * Using gestures and basic CV to transform images
* https://github.com/dennishnf/cnn-hand-gesture-interface
    * 2D CNN w/ C++
* https://www.hackster.io/mjrobot/tinyml-motion-recognition-using-raspberry-pi-pico-6b6071
    * simple gestures
* https://www.youtube.com/watch?v=nzHqep1Vrjo
    * Pose estimation + gesture recognition with CV
    
## Plan

1. Test models and get sample inputs to work.
2. Build the input pipeline for reading video, slicing the video and feeding the results into the model.
3. Combine the input pipeline with the test models.

# MediaPipe

In [1]:
import torch
import numpy as np
import cv2
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from time import time

# Short aliases for the MediaPipe Tasks classes used in this cell, resolved
# through the `python` / `vision` modules imported above (these are the same
# objects that `mp.tasks` and `mp.tasks.vision` expose).
BaseOptions = python.BaseOptions
GestureRecognizer = vision.GestureRecognizer
GestureRecognizerOptions = vision.GestureRecognizerOptions
GestureRecognizerResult = vision.GestureRecognizerResult
VisionRunningMode = vision.RunningMode

# Gesture recognizer task bundle; note this is an absolute path on the
# author's machine, so it must be adjusted when running elsewhere.
model_path = '/home/joseph/Coding/ml_projects/gesture-controlled-lamp/gesture_recognizer.task'
base_options = BaseOptions(model_asset_path=model_path)

# Create a gesture recognizer instance with the live stream mode:


def print_result(result: GestureRecognizerResult, output_image: mp.Image, timestamp_ms: int):
    """Print the name of every gesture category in a recognition result.

    Registered as ``result_callback`` on the recognizer options, so it is the
    function the recognizer invokes with each result.

    Args:
        result: Recognition result; ``result.gestures`` is iterated as a
            sequence of per-entry category lists.
        output_image: Image associated with the result (unused here; an
            earlier attempt to draw the label onto it was left disabled).
        timestamp_ms: Timestamp associated with the result (unused here).
    """
    # Flatten the nested gesture lists and report each category in order.
    categories = (
        category
        for candidates in result.gestures
        for category in candidates
    )
    for category in categories:
        print("Gesture: ", category.category_name)


# Configure the recognizer for live-stream mode: results are delivered
# asynchronously to `print_result` instead of being returned by the call.
options = GestureRecognizerOptions(
    base_options=base_options,
    running_mode=VisionRunningMode.LIVE_STREAM,
    result_callback=print_result)

# Open the default webcam (device 0) as the live frame source.
cap = cv2.VideoCapture(0)

try:
    if not cap.isOpened():
        raise RuntimeError("Could not open webcam (device 0)")

    # Creating the recognizer inside the try block guarantees the camera is
    # released even if the model fails to load.
    with GestureRecognizer.create_from_options(options) as recognizer:
        # recognize_async() requires increasing timestamps; wall-clock
        # milliseconds can repeat (two frames within the same ms) or step
        # backwards, so each timestamp is forced past the previous one.
        last_timestamp_ms = -1
        while True:
            # Read each frame from the webcam; stop cleanly if the read fails
            # (camera unplugged, stream ended) instead of crashing on None.
            ok, frame = cap.read()
            if not ok:
                print("Failed to read a frame from the webcam; stopping.")
                break

            # OpenCV delivers BGR frames, while the image is declared as SRGB,
            # so convert the channel order before handing it to MediaPipe.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            mp_image = mp.Image(
                image_format=mp.ImageFormat.SRGB, data=rgb_frame)

            frame_timestamp_ms = max(int(time() * 1000), last_timestamp_ms + 1)
            last_timestamp_ms = frame_timestamp_ms
            recognizer.recognize_async(mp_image, frame_timestamp_ms)

            # Display the original (BGR) frame, which is what imshow expects.
            cv2.imshow("Output", frame)

            # Press 'q' in the preview window to quit.
            if cv2.waitKey(1) == ord('q'):
                break
finally:
    # release the webcam and destroy all active windows
    print("Cleaning up...")
    cap.release()
    cv2.destroyAllWindows()

W20230418 22:24:34.098016 72342 gesture_recognizer_graph.cc:128] Hand Gesture Recognizer contains CPU only ops. Sets HandGestureRecognizerGraph acceleration to Xnnpack.
I20230418 22:24:34.103335 72342 hand_gesture_recognizer_graph.cc:249] Custom gesture classifier is not defined.
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  None
Gesture:  

Gesture:  None
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gesture:  Open_Palm
Gestu

# Jester Models

* Theory:
    * https://zhuogege1943.com/2019/06/16/Going-with-small-and-fast-networks-1/
    * https://machinethink.net/blog/mobile-architectures/

In [1]:
import os
# Move the working directory up one level from where the notebook runs
# (presumably so the project's `infer` module can be imported below).
# NOTE: the path is relative, so re-running this cell moves up again.
os.chdir(os.pardir)
# Last expression of the cell: echoes the resulting working directory.
os.getcwd()

'/home/joseph/Coding/ml_projects/gesture-controlled-lamp'

In [5]:
from torchsummary import summary
from infer import *
from dataclasses import dataclass

# Build the MobileNetV2 gesture model; the cell output shows it loading a
# pretrained Jester checkpoint. `create_mobilenetv2` is presumably provided by
# the star import from `infer` — confirm.
model = create_mobilenetv2()
# Print a per-layer summary for one input of size (3, 16, 56, 56); presumably
# (channels, frames, height, width) — TODO confirm against `infer`.
summary(model, input_size=(3, 16, 56, 56))

loading pretrained model /home/joseph/Desktop/jester_mobilenetv2_0.7x_RGB_16_best.pth
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv3d-1       [-1, 22, 16, 28, 28]           1,782
       BatchNorm3d-2       [-1, 22, 16, 28, 28]              44
             ReLU6-3       [-1, 22, 16, 28, 28]               0
            Conv3d-4       [-1, 22, 16, 28, 28]             594
       BatchNorm3d-5       [-1, 22, 16, 28, 28]              44
             ReLU6-6       [-1, 22, 16, 28, 28]               0
            Conv3d-7       [-1, 11, 16, 28, 28]             242
       BatchNorm3d-8       [-1, 11, 16, 28, 28]              22
  InvertedResidual-9       [-1, 11, 16, 28, 28]               0
           Conv3d-10       [-1, 66, 16, 28, 28]             726
      BatchNorm3d-11       [-1, 66, 16, 28, 28]             132
            ReLU6-12       [-1, 66, 16, 28, 28]               0
           Conv3d

In [8]:
import cv2
import numpy as np
import torch
# from scipy import stats

# Initialize the webcam for Hand Gesture Recognition Python project
cap = cv2.VideoCapture(0)

input_dim = 112  # side length (pixels) passed as reshape_size for each frame
clip_len = 16    # number of consecutive frames gathered before each prediction
try:
    if not cap.isOpened():
        raise RuntimeError("Could not open webcam (device 0)")

    i = 0
    # Rolling clip buffer: one (3, input_dim, input_dim) slot per frame.
    gathered_img = np.zeros((clip_len, 3, input_dim, input_dim))
    # Label index drawn on screen; stays at 0 until the first prediction.
    pred_label = 0
    while True:
        # Read each frame from the webcam; stop cleanly if the read fails
        # instead of passing None into the preprocessing helper.
        ok, frame = cap.read()
        if not ok:
            print("Failed to read a frame from the webcam; stopping.")
            break

        # The helper returns the frame in two forms: the first is stored in
        # the clip buffer, the second is the image drawn on and displayed.
        reshaped, frame = preprocess_mobilenetv2_from_cv2(frame, reshape_size=(input_dim, input_dim))
        slot = i % clip_len
        gathered_img[slot] = reshaped

        # Predict once the LAST slot has been filled, so the clip holds
        # clip_len consecutive frames in temporal order. (Triggering on
        # slot 0 would overwrite the oldest frame with the newest first.)
        if slot == clip_len - 1:
            input_tensor = preprocess_mobilenetv2_queued(gathered_img)
            # Inference only: skip autograd bookkeeping.
            # NOTE(review): assumes `model` is already in eval mode — confirm.
            with torch.no_grad():
                pred = model(input_tensor)
            pred_label = int(torch.argmax(pred))
            # Fresh buffer, in case the tensor built by the helper shares
            # memory with the previous one.
            gathered_img = np.zeros((clip_len, 3, input_dim, input_dim))

        # show the latest prediction on the frame
        cv2.putText(frame, JESTER_LABELS[pred_label], (10, 50), cv2.FONT_HERSHEY_SIMPLEX,
                    1, (0, 0, 255), 2, cv2.LINE_AA)
        cv2.imshow("Output", frame)

        # Press 'q' in the preview window to quit.
        if cv2.waitKey(1) == ord('q'):
            break
        i += 1
finally:
    # release the webcam and destroy all active windows
    print("Cleaning up...")
    cap.release()
    cv2.destroyAllWindows()

Cleaning up...


### Async Version

In [18]:
import asyncio
import cv2
import numpy as np
import torch
# from scipy import stats


# Initialize the webcam for Hand Gesture Recognition Python project
cap = cv2.VideoCapture(0)

clip_len = 16  # number of consecutive frames gathered into one clip
try:
    if not cap.isOpened():
        raise RuntimeError("Could not open webcam (device 0)")

    i = 0
    # Clip buffer; allocated from the first frame so any camera resolution
    # works (a fixed (240, 320) buffer only fits a 640x480 camera).
    gathered_img = None
    # Placeholder for the predicted label index; nothing updates it yet
    # because the prediction step below is still a TODO.
    pred_label = 0
    while True:
        # Read each frame from the webcam; stop cleanly if the read fails
        # instead of crashing on `frame.shape` of None.
        ok, frame = cap.read()
        if not ok:
            print("Failed to read a frame from the webcam; stopping.")
            break

        # Halve the resolution and mirror horizontally. cv2.resize takes
        # (width, height), hence (y // 2, x // 2).
        x, y, c = frame.shape
        frame = cv2.flip(cv2.resize(frame, (y // 2, x // 2)), 1)

        # Channels-first copy of the frame for the clip buffer.
        chw_frame = frame.transpose(2, 0, 1)
        if gathered_img is None:
            gathered_img = np.zeros((clip_len,) + chw_frame.shape)

        slot = i % clip_len
        gathered_img[slot] = chw_frame

        # Build the model input once the LAST slot has been filled, so the
        # clip holds clip_len consecutive frames in temporal order.
        # (Triggering on slot 0 would put the newest frame first.)
        if slot == clip_len - 1:
            # (frames, C, H, W) -> (C, frames, H, W), then add a batch axis.
            reshaped = gathered_img.transpose(1, 0, 2, 3)
            reshaped = np.expand_dims(reshaped, axis=0)
            # .float() copies into a new float32 tensor, so the buffer can be
            # reused for the next clip without affecting this tensor.
            input_tensor = torch.from_numpy(reshaped).float().div(255)
            # TODO: run the model on `input_tensor` without blocking the
            # display loop and draw the predicted label on the frame.
            # NOTE(review): a coroutine that calls the model synchronously
            # will not run concurrently with this blocking loop; a worker
            # thread or executor is the likely fit — confirm the approach.

        # Show the uint8 frame directly; dividing by 255 only produced a
        # float copy that imshow renders the same way.
        cv2.imshow("Output", frame)

        # Press 'q' in the preview window to quit.
        if cv2.waitKey(1) == ord('q'):
            break
        i += 1
finally:
    # release the webcam and destroy all active windows
    print("Cleaning up...")
    cap.release()
    cv2.destroyAllWindows()

Cleaning up...


In [10]:
    # Scratch cell: fetch the current asyncio event loop; later cells use it
    # via the `loop` name.
    loop = asyncio.get_event_loop()


In [16]:
# Scratch experiment: create a bare Future attached to `loop`, then ask the
# loop to stop.
# NOTE(review): if `loop` is the notebook kernel's own running event loop,
# stop() may halt the kernel's event processing — confirm this is intended.
fut = loop.create_future()
loop.stop()

In [None]:
loop.