# ASL Detection

(optimized the script from Sign_Language_Detection_with_Action_Recognition.ipynb)

## 1. Install / import dependencies

### install

<span style="color: red;">Only run this when you first set up your environment</span>

In [None]:
!pip install tensorflow opencv-python mediapipe sklearn matplotlib
# not installing tensorflow-gpu since I'm not using GPU

### import

In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp

## 2. Find and draw keypoints using MediaPipe (MP) holistic

### shortcuts to some MP methods
* holistic model method
* drawing utilities

In [2]:
# short aliases for the two MediaPipe modules used by the cells below
mp_holistic = mp.solutions.holistic # holistic model
mp_drawing = mp.solutions.drawing_utils # drawing utilities

### define mediapipe_detection()
A function to find the MP landmarks on an image

In [3]:
def mediapipe_detection(image, model):
    """Run the MP detection model on a single frame from the feed.

    image: the frame we are scrutinizing, in OpenCV's BGR channel order
    model: the MP detection model; its process() method is called on the RGB frame
    returns: (image, results) -- the frame converted back to BGR, and whatever
             model.process() returned for it
    """
    # MP wants RGB, OpenCV delivers BGR
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # the frame is read-only only for the duration of the model call
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    # hand back a BGR frame so it can be drawn on and shown with OpenCV
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), results

### define draw_landmarks()
A function that will draw the MP landmarks over an image

In [4]:
def draw_landmarks(image, results):
    """Draw the face, pose and hand landmarks found by mediapipe_detection() over an image.

    image: the image from the feed we will be drawing on top of (modified in place;
           nothing is returned)
    results: the landmark results we found with mediapipe_detection()
    """
    spec = mp_drawing.DrawingSpec

    # one row per body set:
    # (landmark list, connection set, style for the landmark points, style for the connections)
    # colors are 8-bit channel values, so every component must stay within 0-255
    layers = (
        # face -- FACE_CONNECTIONS is no longer used.  Rather, use FACEMESH_TESSELATION
        (results.face_landmarks, mp_holistic.FACEMESH_TESSELATION,
         spec(color=(80,110,10), thickness=1, circle_radius=3),
         spec(color=(80,255,121), thickness=1, circle_radius=1)),
        # pose
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
         spec(color=(80,22,10), thickness=2, circle_radius=4),
         spec(color=(255,170,75), thickness=2, circle_radius=2)),
        # left hand
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         spec(color=(121,22,76), thickness=2, circle_radius=4),
         spec(color=(121,44,250), thickness=2, circle_radius=2)),
        # right hand
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         spec(color=(245,117,66), thickness=2, circle_radius=4),
         spec(color=(245,66,230), thickness=2, circle_radius=2)),
    )

    for landmarks, connections, landmark_style, connection_style in layers:
        mp_drawing.draw_landmarks(image, landmarks, connections, landmark_style, connection_style)
    

### try a live demonstration of media pipe

<span style="color: red;">Change the VideoCapture number 0, 1, or 2.  0 is usually the built-in camera, 1, 2 are for peripherals.</span>

In [None]:
# Use this code to view a live demo of media pipe landmarks mapped to subject
# Press 'q' in the video window to stop.

cap = cv2.VideoCapture(0) # 0, 1, 2
try:
    # set the mediapipe model
    with mp_holistic.Holistic(min_detection_confidence = 0.5, min_tracking_confidence = 0.5) as holistic:
        while cap.isOpened():
            # read feed; ret is False when no frame could be grabbed
            ret, frame = cap.read()
            if not ret:
                # without this check a failed read would hand None to cv2.cvtColor and crash
                print('Could not read a frame from the camera; stopping the demo.')
                break
            # make detection
            image, results = mediapipe_detection(frame, holistic)
            # draw landmarks and connections
            draw_landmarks(image, results)
            # show to screen
            cv2.imshow('OpenCV Feed', image)
            # break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # always release the camera and close the window(s), even if something above raised
    cap.release()
    cv2.waitKey(1)
    cv2.destroyAllWindows()
    # NOTE(review): the extra waitKey(1) calls presumably give the GUI a chance to
    # actually close the window -- confirm on your platform
    cv2.waitKey(1)

## 3. Extract keypoint values
### save number of landmarks and dimensions for each body set

<span style="color: red;">Check the validity of these numbers periodically, since Google may change them.</span>

In [5]:
# landmark bookkeeping per body set, using the numbers from the MP documentation
#   num: how many landmarks the body set has
#   dim: how many values are stored per landmark
# 'hand' describes a single hand; the same entry is used for left and right
num_mp_lmks = dict(
    face=dict(num=468, dim=3),  # dim: x,y,z
    pose=dict(num=33, dim=4),   # dim: x,y,z,visibility
    hand=dict(num=21, dim=3),   # dim: x,y,z
)

### calculate total number of keypoints for each body set

In [6]:
# give every body set a 'total' entry: landmarks x values per landmark
for body_set in num_mp_lmks.values():
    body_set['total'] = body_set['num'] * body_set['dim']
num_mp_lmks

{'face': {'num': 468, 'dim': 3, 'total': 1404},
 'pose': {'num': 33, 'dim': 4, 'total': 132},
 'hand': {'num': 21, 'dim': 3, 'total': 63}}

### calculate final total of all keypoints

In [7]:
# values per frame: every body set counted once, except 'hand' which is counted
# twice because there is a left and a right hand
num_keypoints = sum(
    info['total'] * (2 if name == 'hand' else 1)
    for name, info in num_mp_lmks.items()
)
num_keypoints

1662

### define extract_keypoints()
This will extract every single keypoint in a video frame

In [8]:
def extract_keypoints(results):
    """Return a single flat array with every keypoint value in a frame.

    results: the media pipe detection results
    returns: pose, face, left hand and right hand values concatenated in that order.
             A body set that was not detected contributes zeros, sized by the
             'total' entry of num_mp_lmks for that body set.
    """
    def _flat(landmark_list, fields, body_set):
        # no detection for this body set -> zero placeholder of the expected length
        if not landmark_list:
            return np.zeros(num_mp_lmks[body_set]['total'])
        # one row per landmark, one column per field, flattened to 1-D
        return np.array([[getattr(point, field) for field in fields]
                         for point in landmark_list.landmark]).flatten()

    xyz = ('x', 'y', 'z')
    return np.concatenate([
        _flat(results.pose_landmarks, xyz + ('visibility',), 'pose'),
        _flat(results.face_landmarks, xyz, 'face'),
        _flat(results.left_hand_landmarks, xyz, 'hand'),
        _flat(results.right_hand_landmarks, xyz, 'hand'),
    ])

## 4. Set up data collection settings and directories

### path to save testing/training data

<span style="color: red;">Set up the correct path</span>

In [9]:
#path definitions:

# root folder for the exported keypoint data (numpy arrays, one file per frame)
# this is a relative path, so it is resolved against the current working directory
DATA_PATH = os.path.join('ASL', 'MP_Data')

### labels

<span style="color: red;">Edit the labels</span>

In [10]:
# ASL actions that we try to detect
# the order matters: a label's position in this array is the number it gets in label_map
actions = np.array([
    'hello',
    'thanks',
    'iloveyou',
    'neutral',
    'not_hotdog'
])
actions

array(['hello', 'thanks', 'iloveyou', 'neutral', 'not_hotdog'],
      dtype='<U10')

### number of videos per label

<span style="color: red;">Edit this number</span>

In [11]:
# number of videos (sequences) recorded and loaded per action
num_sequences = 30

### number of frames per video

<span style="color: red;">Edit this number</span>

In [12]:
# number of frames in every video sequence; also the first dimension of the model's input_shape
sequence_length = 15

### create data folders

<span style="color: red;">Run this depending on above changes</span>

In [None]:
# create folders
    # 1 folder per action
        # 1 folder per sequence
            # sequence_length frames worth of data will be saved in each sequence folder

for action in actions:
    for sequence in range(num_sequences):
        # exist_ok=True makes re-running this cell harmless for folders that already exist,
        # while real problems (e.g. missing permissions) still raise instead of being hidden
        os.makedirs(os.path.join(DATA_PATH, action, str(sequence)), exist_ok=True)

## 5. Collect keypoint values for training and testing

<span style="color: red;">Run this when you need to collect data</span>

<span style="color: red;">Change the VideoCapture number 0, 1, or 2.  0 is usually the built-in camera, 1, 2 are for peripherals.</span>

In [None]:
# Records num_sequences videos of sequence_length frames for every action and saves the
# keypoints of each frame under DATA_PATH/<action>/<sequence>/<frame_num>.npy
# Press 'q' in the video window to stop the whole collection run.
cap = cv2.VideoCapture(0) # 0, 1, 2
try:
    # set the mediapipe model
    with mp_holistic.Holistic(min_detection_confidence = 0.5, min_tracking_confidence = 0.5) as holistic:
        # a single flat loop over every (action, sequence, frame) combination, visited in the
        # same order as three nested loops would -- this way one `break` really stops
        # the collection instead of only skipping to the next sequence
        collection_plan = (
            (action, sequence, frame_num)
            for action in actions                      # loop through actions
            for sequence in range(num_sequences)       # loop through sequences (aka videos)
            for frame_num in range(sequence_length)    # loop through video length
        )
        for action, sequence, frame_num in collection_plan:

            # read feed; ret is False when no frame could be grabbed
            ret, frame = cap.read()
            if not ret:
                # without this check a failed read would hand None to cv2.cvtColor and crash
                print('Could not read a frame from the camera; stopping collection.')
                break

            # make detection
            image, results = mediapipe_detection(frame, holistic)

            # draw landmarks and connections
            draw_landmarks(image, results)

            # collection pauses and messaging (only on the first frame of a video)
            if frame_num == 0:
                if sequence == 0:
                    # first video of a new action: announce the action and pause
                    cv2.putText(image, ('GET SET FOR {}'.format(action)),
                                (120,200), cv2.FONT_HERSHEY_SIMPLEX, 2.5, (150,0,0), 4, cv2.LINE_AA)
                    # show to screen
                    cv2.imshow('OpenCV Feed', image)
                    cv2.waitKey(2000)
                else:
                    cv2.putText(image, ('STARTING {} IN 2 SECONDS!'.format(action)),
                                (120,200), cv2.FONT_HERSHEY_SIMPLEX, 2, (0,150,0), 2, cv2.LINE_AA)

            # status line, drawn on every frame
            cv2.putText(image, ('Collecting frames for {} Video Number {}'.format(action, sequence)),
                        (30,30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,150), 1, cv2.LINE_AA)
            # show to screen
            cv2.imshow('OpenCV Feed', image)
            if frame_num == 0:
                # pause before the video starts so the subject can get ready
                cv2.waitKey(2000)

            # export keypoints (np.save appends the .npy extension to the path)
            keypoints = extract_keypoints(results)
            npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
            np.save(npy_path, keypoints)

            # break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # always release the camera and close the window(s), even if something above raised
    cap.release()
    cv2.waitKey(1)
    cv2.destroyAllWindows()
    cv2.waitKey(1)

### for releasing camera and closing windows

<span style="color: red;">Use the following just as needed</span>

In [None]:
# use this if you need to clear cv2 windows
# (for example after a capture cell was interrupted before reaching its own cleanup)

# release the camera held by `cap`
cap.release()
cv2.waitKey(1)
# close the video capture window(s)
cv2.destroyAllWindows()
cv2.waitKey(1)

## 6. Pre-process data and create labels and features

### import

In [13]:
# import dependencies
from sklearn.model_selection import train_test_split
#helps us split up data for testing and training

from tensorflow.keras.utils import to_categorical
# to convert an np array of values to np array of 0s, 1s

### create label map

In [14]:
# map each action name to its position in `actions`
label_map = {
    action_name: class_index
    for class_index, action_name in enumerate(actions)
}
label_map

{'hello': 0, 'thanks': 1, 'iloveyou': 2, 'neutral': 3, 'not_hotdog': 4}

### define data and label arrays
* the __sequences array__ holds the __MP data__ for __each video__
* the __label array__ holds the __correct labels__ for __each video__

In [15]:
# NOTE: the loading loop appends to these lists, so re-run this cell before
#       re-running the loading loop or the data will be appended a second time
sequences, labels = [],[] #empty lists. Think of sequences as x-data, label as y-data

### fetch the data and labels

In [16]:
# bring in the saved data from disk
# every recorded video adds one entry to `sequences` (the MP data of all its frames)
# and one entry to `labels` (the number of its action in label_map)
for action in actions:
    action_dir = os.path.join(DATA_PATH, action)
    for seq in range(num_sequences):
        frame_files = (
            os.path.join(action_dir, str(seq), '{}.npy'.format(frame_num))
            for frame_num in range(sequence_length)
        )
        sequences.append([np.load(frame_file) for frame_file in frame_files])
        labels.append(label_map[action])

### define X and y
* X is the sequences data (as a numpy array)
* y is the correct labels (as a numpy array)

In [17]:
# X: one entry per video; expected shape (number of videos, sequence_length, num_keypoints)
X = np.array(sequences) # make a numpy array from sequences
# y: one row per video with a 1 in the column of its label number and 0 elsewhere
y = to_categorical(labels).astype(int) # labels as binary flag array

### split data / labels into train / test
* splitting is done randomly
* each time you run you will get a new grouping

<span style="color: red;">Decide what percentage you want to be training data</span>

In [18]:
# randomly split data into testing and training
# no random_state is passed, so the grouping changes every time this cell runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05) #5% will be test data

## 7. Build and train LSTM neural network

### import packages

In [19]:
# import required packages
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

### set up log to view with TensorBoard

<span style="color: red;">Set up the correct path</span>

In [20]:
# set up the log path to then view the log with TensorBoard
# (relative path, resolved against the current working directory)
LOG_DIR = os.path.join('ASL', 'logs')
# callback handed to model.fit() so training progress is written to LOG_DIR
tb_callback = TensorBoard(log_dir = LOG_DIR)

<a id="model_rebuild"></a>
### build or rebuild the model
This is the model's shape.

When [loading a model](#load_model) from disk, make sure to:
1. run this model rebuild first
2. [compile](#compile_model) the model
3. then you can [load the model](#load_model)

In [21]:
# build the Sequential API model as one stack of layers
# input: each sequence (video) is sequence_length frames by num_keypoints values per frame,
# so changing those two variables is all that is needed to resize the input
model = Sequential([
    # --- LSTM layers ---
    # 1st layer: 64 LSTM units, returns the sequence (True) so the next layer can use it
    LSTM(64, return_sequences=True, activation='relu', input_shape=(sequence_length, num_keypoints)),
    # 2nd layer: 128 LSTM units, also returns the sequence
    LSTM(128, return_sequences=True, activation='relu'),
    # 3rd layer: 64 LSTM units, won't be returning the sequence
    LSTM(64, return_sequences=False, activation='relu'),

    # --- Dense layers ---
    # 1st layer: 64 densely connected neural network neurons
    Dense(64, activation='relu'),
    # 2nd layer: 32 Dense NN neurons
    Dense(32, activation='relu'),

    # --- Actions layer ---
    # one unit per action (actions.shape[0] is the number of labels)
    # choosing softmax because the values for all actions will add up to 1
    Dense(actions.shape[0], activation='softmax'),
])

2022-09-17 22:56:47.008310: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


<a id="compile_model"></a>
### compile the model
After you have defined the model's shape, you will need to compile the model.

When [loading a model](#load_model) from disk, make sure to:
1. run the [model rebuild](#model_rebuild) first
2. run this model compiler
3. then you can [load the model](#load_model)

In [22]:
# categorical cross-entropy / categorical accuracy go with the one-hot labels in y
# and the softmax output layer of the model
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

### train the model

<span style="color: red;">WARNING: THIS WILL REWRITE YOUR DATA. Edit the number of epochs</span>

In [None]:
# train on the training split; tb_callback writes the training log for TensorBoard
model.fit(X_train, y_train, epochs=1000, callbacks=[tb_callback])

### view the model summary

In [23]:
# print every layer with its output shape and parameter count
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 15, 64)            442112    
                                                                 
 lstm_1 (LSTM)               (None, 15, 128)           98816     
                                                                 
 lstm_2 (LSTM)               (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 5)                 165       
                                                                 
Total params: 596,741
Trainable params: 596,741
Non-trai

## 8. Make predictions
Make __predictions__ on __X_test__ and __compare__ them to the __correct answers__ in __y_test__
### run a prediction on some data

In [25]:
# X_test is our test data
# y_test is the correct labels for X_test

# use the model to predict what our test data is
# (one row of per-action scores for every sample in X_test)
prediction_results = model.predict(X_test)



### view the predictions

In [26]:
# view the prediction results: print the highest-scoring action for every test sample
for sample_scores in prediction_results:
    print(actions[np.argmax(sample_scores)])

not_hotdog
iloveyou
thanks
neutral
neutral
hello
hello
neutral


### view the actual labels

In [27]:
# match our predictions with the actual labels:
# each row is one test sample; the column holding the 1 is the label's position in `actions`
y_test

array([[0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0]])

### let's remind ourselves of our labels

In [28]:
# remind ourselves of the actions (position in this array = column index in y_test)
actions

array(['hello', 'thanks', 'iloveyou', 'neutral', 'not_hotdog'],
      dtype='<U10')

## 9. Save your model (aka "weights")

### save to disk

* <span style="color: red;">Name your model</span>

* <span style="color: red;">Define your file path</span>

In [None]:
# save the model I trained
# model_name becomes the file name: ASL/<model_name>.h5 (relative to the working directory)
model_name = 'TBD'
model.save(os.path.join('ASL', '{}.h5'.format(model_name)))

### delete your model

<span style="color: red;">You should do this first before you re-train or load another model</span>

In [None]:
# now I can delete the model
del model

# check that it was successfully deleted:
# a NameError on the next line is the expected outcome, because the name no longer exists
model

<a id="load_model"></a>
### loading a saved model:

<span style="color: red;">Make sure to follow these steps in order</span>

1. run the [model rebuild](#model_rebuild)
2. [compile](#compile_model) the model
3. reload the saved model

* <span style="color: red;">Set saved model name</span>

* <span style="color: red;">Define the model's file path</span>

In [24]:
# file to load: ASL/<saved_model_name>.h5 (relative to the working directory)
saved_model_name = 'model_3'
# load_weights() needs `model` to exist already, which is why the model is rebuilt first
model.load_weights(os.path.join('ASL', '{}.h5'.format(saved_model_name)))

# check that it was successfully loaded:
model

<keras.engine.sequential.Sequential at 0x14b508880>

## 10. Evaluation using confusion matrix and accuracy

In [None]:
### TBD ###

## 11. Test in Real Time

<span style="color: red;">Change the VideoCapture number 0, 1, or 2.  0 is usually the built-in camera, 1, 2 are for peripherals.</span>

In [None]:
# preview what a single-sample model input looks like: X_test[0] with a new leading axis
np.expand_dims(X_test[0], axis=0)

In [None]:
# run the model on a batch that contains only X_test[0]
model.predict(np.expand_dims(X_test[0], axis=0))

In [None]:
# 1. Detection variables
sequence = [] #to collect our sequence_length number of frames. Once we get sequence_length number of frames we will start predicting
sentence = [] #concatenate our history of detections together
predictions = [] #history of predicted label numbers (must exist before the loop appends to it)
threshold = 0.5 #render results only if above a certain threshold

# Press 'q' in the video window to stop.
cap = cv2.VideoCapture(0) # 0, 1, 2
try:
    # set the mediapipe model
    with mp_holistic.Holistic(min_detection_confidence = 0.5, min_tracking_confidence = 0.5) as holistic:
        while cap.isOpened():

            # read feed; ret is False when no frame could be grabbed
            ret, frame = cap.read()
            if not ret:
                # without this check a failed read would hand None to cv2.cvtColor and crash
                print('Could not read a frame from the camera; stopping.')
                break

            # make detection
            image, results = mediapipe_detection(frame, holistic)

            # draw landmarks and connections
            draw_landmarks(image, results)

            # 2. Prediction logic
            # keep a sliding window holding only the most recent sequence_length frames
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[(-1*sequence_length):]

            if len(sequence) == sequence_length:
                # the model expects a batch, so add a leading axis and take the only result
                res = model.predict(np.expand_dims(sequence, axis=0))[0]
                best = np.argmax(res)
                predictions.append(best)

                # 3. Vis logic
                # record the action only if it is confident enough and differs from
                # the last recorded one
                if res[best] > threshold: # checking if result is above our threshold
                    if not sentence or actions[best] != sentence[-1]:
                        sentence.append(actions[best])

                # keep only the 5 most recent detections
                if len(sentence) > 5:
                    sentence = sentence[-5:]

            # banner with the latest detection; nothing has been detected during the
            # first frames, so fall back to an empty label instead of indexing an empty list
            latest = sentence[-1] if sentence else ''
            cv2.rectangle(image, (0,0), (640,40), (245, 17, 16), -1)
            cv2.putText(image, ' {}'.format(latest), (3,30),
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            # show to screen
            cv2.imshow('OpenCV Feed', image)

            # break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # always release the camera and close the window(s), even if something above raised
    cap.release()
    cv2.waitKey(1)
    cv2.destroyAllWindows()
    cv2.waitKey(1)

[/Users/build/sandbox/BuildRepos/Sol-Windows-1/External/VirtualMonitor-macOS/MersiveRelayCam/RelayPlugin/VtcComLib/VtcComLib.cpp] initializer()
[/Users/build/sandbox/BuildRepos/Sol-Windows-1/External/VirtualMonitor-macOS/MersiveRelayCam/RelayPlugin/avCam/avCam/avCamLib.mm] initializer()
	09-17-2022  22:59:31.053 @@@@===RelayStream:: set kCMIOStreamPropertyFormatDescription sub=2vuy w=1280 h=720
	09-17-2022  22:59:31.799 LocalCamera::passResolution set resolution to 1280x720
	09-17-2022  22:59:31.575 Local Camera start, ret=0


INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [None]:
# manual cleanup, for when the real-time cell stopped before reaching its own cleanup
# end the video capture setting
cap.release()
cv2.waitKey(1)
# close the video capture window(s)
cv2.destroyAllWindows()
cv2.waitKey(1)

In [None]:
# display the model object to confirm that `model` is currently defined
model