Import Dependencies

In [1]:
!pip3 install mediapipe==0.10.9

Collecting mediapipe==0.10.9
  Downloading mediapipe-0.10.9-cp310-cp310-macosx_11_0_universal2.whl.metadata (9.6 kB)
Collecting absl-py (from mediapipe==0.10.9)
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting flatbuffers>=2.0 (from mediapipe==0.10.9)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting matplotlib (from mediapipe==0.10.9)
  Downloading matplotlib-3.8.4-cp310-cp310-macosx_11_0_arm64.whl.metadata (5.8 kB)
Collecting opencv-contrib-python (from mediapipe==0.10.9)
  Downloading opencv_contrib_python-4.9.0.80-cp37-abi3-macosx_11_0_arm64.whl.metadata (20 kB)
Collecting protobuf<4,>=3.11 (from mediapipe==0.10.9)
  Downloading protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Collecting sounddevice>=0.4.4 (from mediapipe==0.10.9)
  Downloading sounddevice-0.4.6-py3-none-macosx_10_6_x86_64.macosx_10_6_universal2.whl.metadata (1.4 kB)
Collecting contourpy>=1.0.1 (from matplotlib->mediapipe==0.10.9)
  Downloading

In [2]:
!pip install tensorflow opencv-python scikit-learn matplotlib

Collecting tensorflow
  Downloading tensorflow-2.16.1-cp310-cp310-macosx_12_0_arm64.whl.metadata (4.1 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.4.2-cp310-cp310-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Using cached gast-0.5.4-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting h5py>=3.10.0 (from tensorflow)
  Downloading h5py-3.11.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (2.5 kB)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-macosx_11_0_arm64.whl.metadata (5.2 kB)
Collecting ml-dtypes~=0.3.1 (from tensorflow)
  Downloading ml_dtypes-0.3.2-cp310-cp310-macosx_10_9_universal2.whl.metadata (20 kB)
Collecting opt-einsum>=2.3.2 (from tenso

In [20]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp

Access Webcam and Make Detections

In [21]:
mp_holistic = mp.solutions.holistic # Makes detections
mp_drawing = mp.solutions.drawing_utils # Draws detections

In [22]:
# Make landmark detections for hands, face, etc.
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # Convert color from BGR->RGB
    image.flags.writeable = False
    results = model.process(image) # Make prediction on the current image/frame
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # Convert color from RGB->BGR
    return image, results

In [23]:
# Draw the landmarks onto the rendered image
def draw_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION)
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

In [24]:
# Draw the landmarks onto the rendered image with optional style parameters
def draw_styled_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION,
                              mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1),
                              mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)
                             )
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4),
                              mp_drawing.DrawingSpec(color=(80,256,121), thickness=2, circle_radius=2)
                             )
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
                              mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             )
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
                              mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             )

In [63]:
# Set up videocapture and loop through frames
cap = cv2.VideoCapture(0) #Device value 0 should correspond to the webcam
# Check if the camera opened successfully
if not cap.isOpened():
    print("Error: Could not open video capture device.")
    exit()

# Set the mediapipe model
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    try:
        while cap.isOpened():
            ret, frame = cap.read() # Capture the return value and frame at each point in camera
            # Make sure that the frame is read correctly
            if not ret:
                print("Can't receive frame (stream end?). Exiting ...")
                break
    
            # Make detections
            image, results = mediapipe_detection(frame, holistic)
            print(results)

            # Draw landmarks on frame
            draw_styled_landmarks(image, results)
            
            cv2.imshow('OpenCV Feed', image) # Show the current frame to the user and names the feed "OpenCV Feed"
        
            # Break out of the capture feed gracefully if the q key is pressed
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break     
    finally:
        cap.release()
        cv2.destroyAllWindows()
        # There is a known bug on Mac where destroyAllWindows doesn't work unless a certain amount of time is spent waiting after
        for i in range (1,5):
            cv2.waitKey(1)

I0000 00:00:1714414956.732212       1 gl_context.cc:344] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M1 Pro


<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

KeyboardInterrupt: 

Extract Keypoint Values

In [62]:
def extract_keypoints(results):
    # If there is no data for the body part in that frame then we want to create an array of zeros of the same size
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh])

Setup Folders for Collection

In [26]:
DATA_PATH = os.path.join('MP_Data') # Path for the exported data
actions = np.array(['hello', 'thanks', 'iloveyou']) # These are the sign language actions that I want the model to be able to recognize
no_sequences = 30 # Number of videos that we will show for each action
sequence_length = 30 # Number of frames that we will use to detect an action

In [12]:
for action in actions:
    for sequence in range(no_sequences):
        try:
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
        except:
            pass

Collect Keypoint Values for Training and Testing Purposes

In [14]:
# Set up videocapture and loop through frames
cap = cv2.VideoCapture(0) #Device value 0 should correspond to the webcam
# Check if the camera opened successfully
if not cap.isOpened():
    print("Error: Could not open video capture device.")
    exit()

# Set the mediapipe model
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    try:
        # For each action, capture 30 videos and 30 frames in each video
        for action in actions:
            for sequence in range(no_sequences):
                for frame_num in range(sequence_length):
                    
                    ret, frame = cap.read() # Capture the return value and frame at each point in camera
                    # Make sure that the frame is read correctly
                    if not ret:
                        print("Can't receive frame (stream end?). Exiting ...")
                        break
            
                    # Make detections
                    image, results = mediapipe_detection(frame, holistic)
                    print(results)
        
                    # Draw landmarks on frame
                    draw_styled_landmarks(image, results)

                    # Let user know what data is being collected and add 2 second break between each video
                    if frame_num == 0:
                        cv2.putText(image, 'STARTING COLLECTION', (120, 200),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 4, cv2.LINE_AA)
                        cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                        cv2.imshow('OpenCV Feed', image) # Show the current frame to the user and names the feed "OpenCV Feed"
                        cv2.waitKey(2000)
                    else:
                        cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                        cv2.imshow('OpenCV Feed', image) # Show the current frame to the user and names the feed "OpenCV Feed"

                    # Save the frame keypoints to its corresponding file
                    keypoints = extract_keypoints(results)
                    npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                    np.save(npy_path, keypoints)
        
                    # Break out of the capture feed gracefully if the q key is pressed
                    if cv2.waitKey(10) & 0xFF == ord('q'):
                        break     
    finally:
        cap.release()
        cv2.destroyAllWindows()
        # There is a known bug on Mac where destroyAllWindows doesn't work unless a certain amount of time is spent waiting after
        for i in range (1,5):
            cv2.waitKey(1)

I0000 00:00:1707533367.565616       1 gl_context.cc:344] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M1 Pro


<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

Preproces Data and Create Labels && Features

In [27]:
from sklearn.model_selection import train_test_split # Allow us to partition data into training and testing data
from tensorflow.keras.utils import to_categorical

In [28]:
label_map = {label:num for num, label in enumerate(actions)}

In [29]:
sequences, labels = [], [] # Sequences represents features (x-data) and labels represents label/category (y-data)
for action in actions:
    for sequence in range(no_sequences):
        window = []
        for frame_num in range(sequence_length):
            res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num)))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

In [30]:
X = np.array(sequences)

In [31]:
y = to_categorical(labels).astype(int)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)

Build and Train LSTM (Long Short Term Memory) Neural Network

In [33]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [34]:
# Set up the log directory for TensorBoard
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

In [35]:
model = Sequential()
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(30, 1662)))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))

In [36]:
# TODO: Experiment with different optimizers
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [None]:
model.fit(X_train, y_train, epochs=2000, callbacks=[tb_callback])

Epoch 1/2000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step - categorical_accuracy: 1.0000 - loss: 3.5062e-09
Epoch 2/2000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - categorical_accuracy: 0.5849 - loss: 9.4840
Epoch 3/2000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - categorical_accuracy: 0.3053 - loss: 235.7444
Epoch 4/2000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - categorical_accuracy: 0.3914 - loss: 251.1769
Epoch 5/2000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - categorical_accuracy: 0.3249 - loss: 1571.7939
Epoch 6/2000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - categorical_accuracy: 0.2525 - loss: 9947.0020 
Epoch 7/2000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - categorical_accuracy: 0.4853 - loss: 3616.3542
Epoch 8/2000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

Make Predictions

In [64]:
model.summary()

In [38]:
model.predict(X_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 205ms/step


array([[0.3404329 , 0.33993948, 0.3196276 ],
       [0.3404329 , 0.33993948, 0.3196276 ],
       [0.3404329 , 0.33993948, 0.3196276 ],
       [0.3404329 , 0.33993948, 0.3196276 ],
       [0.34043294, 0.33993948, 0.31962764]], dtype=float32)

In [39]:
y_test

array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1]])

Save Model Weights

In [47]:
# Save the model: Uncomment only if you make changes that improve the model
#model.save('action.keras')

In [50]:
# Load the model
model.load_weights('action.h5')

Evaluate Model Accuracy With Confusion Matrix

In [51]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [52]:
yhat = model.predict(X_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step


In [53]:
# Convert the predictions from their one-hot encoded (e.g. [1,0,0], [0,1,0]) prediction to corresponding categorical label (e.g. 0, 1, 2)
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(yhat, axis=1).tolist()

In [54]:
multilabel_confusion_matrix(ytrue, yhat)

array([[[4, 0],
        [0, 1]],

       [[4, 0],
        [0, 1]],

       [[2, 0],
        [0, 3]]])

In [55]:
accuracy_score(ytrue, yhat)

1.0

Test the Model in Real Time

In [None]:
from scipy import stats

In [65]:
# Color things for a good output image, might delete late
colors = [(245, 117, 16), (117, 245, 16), (16, 117, 245)]
def prob_vis(res, actions, input_frame, colors):
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
        cvs.putText(output_frame, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
    return output_frame

In [69]:
# Delclare variables that are used for making detections
sequence = []
sentence = []
predictions = []
threshold = 0.5

# Set up videocapture and loop through frames
cap = cv2.VideoCapture(0) #Device value 0 should correspond to the webcam
# Check if the camera opened successfully
if not cap.isOpened():
    print("Error: Could not open video capture device.")
    exit()

# Set the mediapipe model
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    try:
        while cap.isOpened():
            ret, frame = cap.read() # Capture the return value and frame at each point in camera
            # Make sure that the frame is read correctly
            if not ret:
                print("Can't receive frame (stream end?). Exiting ...")
                break
    
            # Make detections
            image, results = mediapipe_detection(frame, holistic)

            # Draw landmarks on frame
            draw_styled_landmarks(image, results)

            # Prediction logic keeps the most recent 30 frames for each prediction
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-30:]

            # We only want to start making predictions once we have 30 frames of data to use
            if len(sequence) == 30:
                res = model.predict(np.expand_dims(sequence, axis=0))[0] # Need to use expand_dims because our model is expecting a row indicating number of sequences
                predictions.append(np.argmax(res))

                # Visualization Logic
                # Only make predictions if it has been confident in that prediction for 10 slides in a row
                if np.unique(predictions[-10:])[0] == np.argmax(res):
                    # We only want to make a prediction if the model is confident in that sign
                    if res[np.argmax(res)] > threshold:
                        if len(sentence) > 0:
                            # We don't want to add an action to the sentence until it is a new action (not double counting actions)
                            if actions[np.argmax(res)] != sentence[-1]:
                                sentence.append(actions[np.argmax(res)])
                        else:
                            sentence.append(actions[np.argmax(res)])
    
                # Make sure that we do not end with giant sentences
                if len(sentence) > 5:
                    sentence = sentence[-5:]

            # Render rectangle around prediction
            cv2.rectangle(image, (0,0), (2000, 40), (500, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3,30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
            
            cv2.imshow('OpenCV Feed', image) # Show the current frame to the user and names the feed "OpenCV Feed"
        
            # Break out of the capture feed gracefully if the q key is pressed
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break     
    finally:
        cap.release()
        cv2.destroyAllWindows()
        # There is a known bug on Mac where destroyAllWindows doesn't work unless a certain amount of time is spent waiting after
        for i in range (1,5):
            cv2.waitKey(1)

I0000 00:00:1714417178.998581       1 gl_context.cc:344] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M1 Pro


<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti