import cv2
import mediapipe as mp
import os
from matplotlib import pyplot as plt
import time
import numpy as np


In [2]:

mp_holistic = mp.solutions.holistic # Holistic model(i.e. full body detection tool)
mp_drawing= mp.solutions.drawing_utils # drawing utilities
## convert image from bgr to rgb because mediapipe needs it to be in rgb
def mediapipe_detection(image, model):
    """Run ``model`` on a BGR frame and return the frame plus the model output.

    Args:
        image: Frame in OpenCV's BGR channel order.
        model: Object exposing ``process(rgb_image)`` (here a MediaPipe
            Holistic instance).

    Returns:
        A ``(image, results)`` tuple: the frame converted back to BGR, and
        whatever ``model.process`` returned.
    """
    # The model is fed RGB, so swap the channel order first.
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Mark the buffer read-only for the duration of the prediction, then
    # make it writeable again afterwards.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    # Hand a BGR frame back so the caller can draw on it and display it.
    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, results
def draw_landmarks(image, results):
    """Draw the left- and right-hand landmarks from ``results`` onto ``image``.

    Each hand is drawn with ``mp_holistic.HAND_CONNECTIONS`` and two
    ``DrawingSpec`` objects of color (0, 0, 0) and thickness 2. Face and
    pose landmarks are not drawn. Nothing is returned.
    """
    both_hands = (results.left_hand_landmarks, results.right_hand_landmarks)
    for hand_landmarks in both_hands:
        mp_drawing.draw_landmarks(
            image,
            hand_landmarks,
            mp_holistic.HAND_CONNECTIONS,
            mp_drawing.DrawingSpec(color=(0, 0, 0), thickness=2),
            mp_drawing.DrawingSpec(color=(0, 0, 0), thickness=2),
        )

In [3]:

#print(mp_holistic.POSE_CONNECTIONS) 
# Live preview: show the webcam feed with hand landmarks drawn on top
# until "q" is pressed or the camera stops delivering frames.
cap = cv2.VideoCapture(0)
try:
    # Sets up the MediaPipe Holistic model for the duration of the preview.
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            # ret says whether a frame could be read; frame is the pixel array.
            ret, frame = cap.read()
            if not ret:
                # A failed read yields no frame; stop instead of passing
                # None into the colour conversion.
                break

            # Run detection (BGR -> RGB -> model -> BGR).
            image, results = mediapipe_detection(frame, holistic)
            print(results)

            draw_landmarks(image, results)

            # Show the annotated frame in the "Live Feed" window.
            cv2.imshow("Live Feed", image)
            if cv2.waitKey(10) & 0xFF == ord("q"):
                break
finally:
    # Always free the camera and close the window, even if a frame
    # raised an exception part-way through the loop.
    cap.release()
    cv2.destroyAllWindows()
# Reads the frame and finds the landmarks in the frame 
##print(len(results.left_hand_landmarks.landmark)) 
#results 
#draw_landmarks(frame, results) 
#plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) 


<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

In [4]:

# Flattens the x, y, z values of both hands' landmarks into a single array
# for later use by the model; a hand that was not detected is represented
# by an array of 21*3 zeros.
def get_keypoints(results):
    """Return left-hand then right-hand landmark coordinates as one 1-D array.

    Args:
        results: Object with ``left_hand_landmarks`` and
            ``right_hand_landmarks`` attributes. Each is either falsy
            (hand not detected) or exposes a ``landmark`` iterable whose
            items have ``x``, ``y`` and ``z`` attributes.

    Returns:
        ``np.concatenate([left, right])`` where each part is the flattened
        ``[x, y, z]`` rows of that hand, or ``np.zeros(21*3)`` when the
        hand is missing.
    """
    def _hand_vector(hand):
        # Missing hand -> zero placeholder so the output length stays fixed.
        if not hand:
            return np.zeros(21 * 3)
        coords = [[point.x, point.y, point.z] for point in hand.landmark]
        return np.array(coords).flatten()

    left_vector = _hand_vector(results.left_hand_landmarks)
    right_vector = _hand_vector(results.right_hand_landmarks)
    return np.concatenate([left_vector, right_vector])
#print(get_keypoints(results).shape) 


In [5]:

# Root directory under which the recorded keypoint arrays are stored.
DATA_PATH = os.path.join("MP_Data")
# Gesture classes to record and recognise.
actions = np.array(("Peace", "Good", "Bad"))
# Number of recordings ("videos") captured per gesture.
no_sequences = 30
# Number of frames in each recording.
sequence_length = 30

In [6]:

# Create one folder per (gesture, recording) pair: DATA_PATH/<action>/<sequence>.
for action in actions:
    for sequence in range(no_sequences):
        # exist_ok=True makes re-running this cell harmless while still
        # surfacing real failures (permissions, a file in the way, ...)
        # instead of silently swallowing every exception.
        os.makedirs(os.path.join(DATA_PATH, action, str(sequence)), exist_ok=True)

In [7]:
from itertools import product

# Records the training data: for every gesture, `no_sequences` recordings of
# `sequence_length` frames each. The keypoints of each frame are saved to
# DATA_PATH/<action>/<sequence>/<frame_num>.npy.
cap = cv2.VideoCapture(0)
try:
    # Sets up the MediaPipe Holistic model for the whole collection run.
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        # A single flat loop over (action, sequence, frame) lets one `break`
        # abort the entire collection when "q" is pressed.
        for action, sequence, frame_num in product(actions, range(no_sequences), range(sequence_length)):
            # ret says whether a frame could be read; frame is the pixel array.
            ret, frame = cap.read()
            if not ret:
                # Without a frame there is nothing to detect or save; fail
                # loudly rather than passing None into the colour conversion.
                raise RuntimeError(
                    "Could not read a frame from the camera while collecting "
                    "{} sequence {} frame {}".format(action, sequence, frame_num)
                )

            # Run detection (BGR -> RGB -> model -> BGR).
            image, results = mediapipe_detection(frame, holistic)
            print(results)

            draw_landmarks(image, results)

            is_first_frame = frame_num == 0
            if is_first_frame:
                cv2.putText(image, "STARTING COLLECTION", (120, 200),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 4, cv2.LINE_AA)
            # Status line on every frame. The thickness (4) is passed
            # explicitly so cv2.LINE_AA lands in the lineType slot rather
            # than being interpreted as the thickness.
            cv2.putText(image, "Collecting frames for {} Video Number {}".format(action, sequence), (15, 12),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 4, cv2.LINE_AA)

            # Show the annotated frame in the "Live Feed" window.
            cv2.imshow("Live Feed", image)
            if is_first_frame:
                # Pause *after* showing the frame so the "STARTING COLLECTION"
                # banner is actually visible while the user gets into position.
                cv2.waitKey(2000)

            # Extract keypoints and save them; np.save appends ".npy".
            keypoints = get_keypoints(results)
            npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
            np.save(npy_path, keypoints)

            # Polled once per frame so the window refreshes and "q" is
            # noticed promptly.
            if cv2.waitKey(10) & 0xFF == ord("q"):
                break
finally:
    # Always free the camera and close the window, even on error.
    cap.release()
    cv2.destroyAllWindows()
# Reads the frame and finds the landmarks in the frame 
##print(len(results.left_hand_landmarks.landmark)) 
#results 
#draw_landmarks(frame, results) 
#plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) 


<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

In [8]:
from sklearn.model_selection import train_test_split
from keras._tf_keras.keras.utils import to_categorical


In [9]:
# Maps each gesture name to its integer class index (its position in `actions`).
label_map = dict(zip(actions, range(len(actions))))

In [10]:
label_map

{'Peace': 0, 'Good': 1, 'Bad': 2}

In [11]:
# Load every saved frame back from disk. `sequences` gets one list of
# per-frame keypoint arrays per recording; `labels` gets the matching
# integer class index.
sequences = []
labels = []
for action in actions:
    action_label = label_map[action]
    for sequence in range(no_sequences):
        sequence_dir = os.path.join(DATA_PATH, action, str(sequence))
        window = []
        for frame_num in range(sequence_length):
            res = np.load(os.path.join(sequence_dir, f"{frame_num}.npy"))
            window.append(res)
        sequences.append(window)
        labels.append(action_label)

In [12]:
np.array(sequences).shape

(90, 30, 126)

In [13]:
X = np.array(sequences)

In [14]:
X.shape

(90, 30, 126)

In [15]:
y = to_categorical(labels).astype(int)

In [16]:
y

array([[1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0,

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.05)

In [18]:
y_test.shape

(5, 3)

In [47]:
#Builds a sequential NN
#from keras._tf_keras.keras.models import Sequential
from keras.api import Sequential
# LSTM layer and
# from keras._tf_keras.keras.layers import LSTM, Dense
from keras.api.layers import LSTM, Dense
# Allows logging in tensorboard
#from keras._tf_keras.keras.callbacks import TensorBoard
from keras.api.callbacks import TensorBoard


In [48]:
log_dir = os.path.join("Logs")
tb_callback = TensorBoard(log_dir= log_dir)

In [49]:
from keras.api.layers import Input

# Three stacked LSTM layers followed by dense layers; the final softmax has
# one unit per gesture in `actions`.
# The input shape is declared with an explicit Input layer rather than an
# `input_shape=` argument on the first LSTM, which Keras 3 warns against
# for Sequential models.
model = Sequential([
    # (frames per sequence, features per frame); 126 matches the keypoint
    # vector length shown by X.shape.
    Input(shape=(sequence_length, 126)),
    LSTM(64, return_sequences=True, activation="relu"),
    LSTM(128, return_sequences=True, activation="relu"),
    # Last recurrent layer returns only its final output for the dense head.
    LSTM(32, return_sequences=False, activation="relu"),
    Dense(64, activation="relu"),
    Dense(32, activation="relu"),
    Dense(actions.shape[0], activation="softmax"),
])

  super().__init__(**kwargs)


In [50]:
X.shape

(90, 30, 126)

In [53]:
model.compile(optimizer= "Adam", loss = "categorical_crossentropy", metrics = ["categorical_accuracy"])

In [54]:
model.fit(X_train, y_train, epochs=2000, callbacks=[tb_callback])

Epoch 1/2000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 93ms/step - categorical_accuracy: 0.3033 - loss: 1.0967
Epoch 2/2000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - categorical_accuracy: 0.3289 - loss: 1.0904
Epoch 3/2000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - categorical_accuracy: 0.3954 - loss: 1.0755
Epoch 4/2000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - categorical_accuracy: 0.4071 - loss: 1.0150
Epoch 5/2000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - categorical_accuracy: 0.4345 - loss: 1.0749
Epoch 6/2000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - categorical_accuracy: 0.5010 - loss: 0.9482
Epoch 7/2000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - categorical_accuracy: 0.6478 - loss: 0.8695
Epoch 8/2000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/st

<keras.src.callbacks.history.History at 0x1a1bf153ef0>

In [56]:
model.summary()

In [57]:
model.predict(X_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 878ms/step


array([[1.97169738e-05, 1.87252991e-11, 9.99980330e-01],
       [2.35258703e-04, 9.97805417e-01, 1.95932202e-03],
       [8.49717200e-01, 1.05747625e-01, 4.45351973e-02],
       [2.60049510e-10, 9.99951720e-01, 4.82448741e-05],
       [9.99533296e-01, 3.65056403e-05, 4.30248823e-04]], dtype=float32)

In [59]:
model.save("my_model.keras")

In [60]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [61]:
yhat = model.predict(X_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step


In [None]:
# Real-time recognition: keep a rolling window of the most recent frames'
# keypoints and classify it once the window is full.
sequence = []
# Minimum softmax probability required before a prediction is reported.
threshold = 0.4

cap = cv2.VideoCapture(0)
try:
    # Sets up the MediaPipe Holistic model for the duration of the session.
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            # ret says whether a frame could be read; frame is the pixel array.
            ret, frame = cap.read()
            if not ret:
                # A failed read yields no frame; stop instead of passing
                # None into the colour conversion.
                break

            # Run detection (BGR -> RGB -> model -> BGR).
            image, results = mediapipe_detection(frame, holistic)
            print(results)

            draw_landmarks(image, results)

            # Append the newest frame at the END and keep the last
            # `sequence_length` entries, so the window is in chronological
            # order (oldest first) like the recorded training sequences.
            keypoints = get_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-sequence_length:]

            if len(sequence) == sequence_length:
                # verbose=0 suppresses the per-call progress bar, which would
                # otherwise be printed for every frame.
                res = model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                best = np.argmax(res)
                # Only report gestures the model is reasonably confident about.
                if res[best] > threshold:
                    print(actions[best])

            # Show the annotated frame in the "Live Feed" window.
            cv2.imshow("Live Feed", image)
            if cv2.waitKey(10) & 0xFF == ord("q"):
                break
finally:
    # Always free the camera and close the window, even on error.
    cap.release()
    cv2.destroyAllWindows()