# Import and Install General Dependencies

In [22]:
!pip3 install mediapipe==0.10.9



In [23]:
!pip install tensorflow opencv-python scikit-learn matplotlib



In [73]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp

# Detect and Extract Key Features

In [None]:
# Used to detect the features which will be extracted
mp_holistic = mp.solutions.holistic

In [None]:
# Find key features on the current frame
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # Convert color from BGR->RGB
    image.flags.writeable = False
    results = model.process(image) # Make prediction on the current image/frame
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # Convert color from RGB->BGR
    return image, results

In [None]:
# Create an array of features when they are detected
def extract_keypoints(results):
    # If there is no data for the body part in that frame then we want to create an array of zeros of the same size
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh])

# Define Recognized Actions and Data Location

In [77]:
DATA_PATH = os.path.join('MP_Data')
actions = np.array(['hello', 'thanks', 'iloveyou'])
no_sequences = 30 # Number of videos that we will show for each action
sequence_length = 30 # Number of frames that we will use to detect an action

In [33]:
# Create required directory structure for each recognized action
#for action in actions:
#    for sequence in range(no_sequences):
#        try:
#            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
#        except:
#            pass

# Preprocess Data, Create Labels, and Create Features

In [78]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [79]:
label_map = {label:num for num, label in enumerate(actions)}

In [80]:
# NOTE: Sequences represent extracted features (x-data) and labels represent label/category (y-data)
sequences, labels = [], []

# TODO: This function will have to significantly change to accomidate Microsoft data
# For each recognized action go through each of its corresponding videos
for action in actions:
    for sequence in range(no_sequences):
        window = []
        # For each video, go through load the extracted feature data at every frame
        for frame_num in range(sequence_length):
            res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num)))
            window.append(res)
        # For each video add the feature data to sequences and it's correct characterization to labels
        sequences.append(window)
        labels.append(label_map[action])

In [138]:
# Split up the data into testing and training data
X = np.array(sequences)
y = to_categorical(labels).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)

# Build and Train the Nueural Network

In [132]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input
from keras import callbacks
# TODO: Decide if I want to remove the logs for final submission
from tensorflow.keras.callbacks import TensorBoard
import matplotlib.pyplot as plt
import pandas as pd

In [139]:
# TODO: Remove this for final submission?
# Set up the log directory for TensorBoard
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

In [140]:
# Define the model architecture
model = Sequential()
model.add(Input(shape=(30, 1662)))
model.add(LSTM(64, return_sequences=True, activation='relu'))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))

In [141]:
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [142]:
# Train the model on training data
model.fit(X_train, y_train, epochs=160, callbacks=[tb_callback])

Epoch 1/160
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 34ms/step - categorical_accuracy: 0.4619 - loss: 1.7349
Epoch 2/160
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - categorical_accuracy: 0.4500 - loss: 8.4827
Epoch 3/160
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - categorical_accuracy: 0.3796 - loss: 31.6166
Epoch 4/160
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - categorical_accuracy: 0.3659 - loss: 44.4033
Epoch 5/160
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - categorical_accuracy: 0.3424 - loss: 21.4760
Epoch 6/160
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - categorical_accuracy: 0.2818 - loss: 20.9088
Epoch 7/160
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - categorical_accuracy: 0.2525 - loss: 16.0127
Epoch 8/160
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step -

<keras.src.callbacks.history.History at 0x36707ff10>

In [46]:
# Save the model unimportant for the grader
#model.save('CustomData_160Epochs.keras')
#model.load_weights('action.keras')

# Evaluate Model Performance with Testing Data

In [144]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [145]:
# Make predictions on the testing data
yhat = model.predict(X_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 215ms/step


In [146]:
# Convert the predictions from their one-hot encoded (e.g. [1,0,0], [0,1,0]) prediction to corresponding categorical label (e.g. 0, 1, 2)
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(yhat, axis=1).tolist()

In [150]:
# Use these metrics to evaluate performance of model
print('Confusion Matrix:')
print(multilabel_confusion_matrix(ytrue, yhat))
print('--------------------------------------------------------------')
print(f'Accuracy on testing data: {accuracy_score(ytrue, yhat)}')

Confusion Matrix:
[[[2 0]
  [0 3]]

 [[4 0]
  [0 1]]

 [[4 0]
  [0 1]]]
--------------------------------------------------------------
Accuracy on testing data: 1.0


# Test the Model in Real Time

In [151]:
from scipy import stats

In [None]:
# Delclare variables that are used for making detections
sequence = []
sentence = []
predictions = []
threshold = 0.5

# Set up videocapture and loop through frames
cap = cv2.VideoCapture(0) #Device value 0 should correspond to the webcam
# Check if the camera opened successfully
if not cap.isOpened():
    print("Error: Could not open video capture device.")
    exit()

# Set the mediapipe model
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    try:
        while cap.isOpened():
            ret, frame = cap.read() # Capture the return value and frame at each point in camera
            # Make sure that the frame is read correctly
            if not ret:
                print("Can't receive frame (stream end?). Exiting ...")
                break
    
            # Make detections
            image, results = mediapipe_detection(frame, holistic)

            # Draw landmarks on frame
            draw_styled_landmarks(image, results)

            # Prediction logic keeps the most recent 30 frames for each prediction
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-30:]

            # We only want to start making predictions once we have 30 frames of data to use
            if len(sequence) == 30:
                res = model.predict(np.expand_dims(sequence, axis=0))[0] # Need to use expand_dims because our model is expecting a row indicating number of sequences
                predictions.append(np.argmax(res))

                # Visualization Logic
                # Only make predictions if it has been confident in that prediction for 10 slides in a row
                if np.unique(predictions[-10:])[0] == np.argmax(res):
                    # We only want to make a prediction if the model is confident in that sign
                    if res[np.argmax(res)] > threshold:
                        if len(sentence) > 0:
                            # We don't want to add an action to the sentence until it is a new action (not double counting actions)
                            if actions[np.argmax(res)] != sentence[-1]:
                                sentence.append(actions[np.argmax(res)])
                        else:
                            sentence.append(actions[np.argmax(res)])
    
                # Make sure that we do not end with giant sentences
                if len(sentence) > 5:
                    sentence = sentence[-5:]

            # Render rectangle around prediction
            cv2.rectangle(image, (0,0), (2000, 40), (500, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3,30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
            
            cv2.imshow('OpenCV Feed', image) # Show the current frame to the user and names the feed "OpenCV Feed"
        
            # Break out of the capture feed gracefully if the q key is pressed
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break     
    finally:
        cap.release()
        cv2.destroyAllWindows()
        # There is a known bug on Mac where destroyAllWindows doesn't work unless a certain amount of time is spent waiting after
        for i in range (1,5):
            cv2.waitKey(1)

I0000 00:00:1714627206.026420       1 gl_context.cc:344] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M1 Pro
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 226ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1