In [1]:
import os
import numpy as np
import cv2
import mediapipe as mp

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

from scipy import stats

from gtts import gTTS
from io import BytesIO
from pygame import mixer

2024-09-22 15:50:26.748121: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-22 15:50:26.946422: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-22 15:50:26.946973: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


pygame 2.6.0 (SDL 2.28.4, Python 3.10.14)
Hello from the pygame community. https://www.pygame.org/contribute.html


# Keypoint functions and variables initialization

In [2]:
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [3]:
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                  # Image is no longer writeable
    results = model.process(image)                 # Make prediction
    image.flags.writeable = True                   # Image is now writeable 
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR COVERSION RGB 2 BGR
    return image, results

In [4]:
def draw_styled_landmarks(image, results):
    # Draw face connections
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS, 
                             mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1), 
                             mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)
                             ) 
    # Draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             ) 
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             ) 
    # Draw right hand connections  
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             ) 

In [5]:
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh])

# Model Creation and Training

In [10]:
# Actions that we try to detect
actions = np.array(['explain', 'help', 'toilet', 'thank you', 'normal'])

# Sixty videos worth of data
no_sequences = 60

# Videos are going to be 30 frames in length
sequence_length = 30

# Folder start
start_folder = 0

In [11]:
# To observe training logs using tensorboard
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

In [12]:
model = Sequential()
model.add(LSTM(64, return_sequences=True, activation='elu', input_shape=(30,1662)))
model.add(LSTM(128, return_sequences=True, activation='elu'))
model.add(LSTM(64, return_sequences=False, activation='elu'))
model.add(Dense(64, activation='elu'))
model.add(Dense(32, activation='elu'))
model.add(Dense(actions.shape[0], activation='softmax'))

In [13]:
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [16]:
""" No need to run this part of code
"""
# model.fit(X_train, y_train, epochs=2000, callbacks=[tb_callback])

' No need to run this part of code\n'

# Load the Model from memory

In [23]:
from tensorflow.keras.models import load_model

model = load_model('../models/action6.h5')

In [17]:
model.load_weights('action5.h5')

In [18]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 30, 64)            442112    
                                                                 
 lstm_4 (LSTM)               (None, 30, 128)           98816     
                                                                 
 lstm_5 (LSTM)               (None, 64)                49408     
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dense_3 (Dense)             (None, 32)                2080      
                                                                 
 dense_4 (Dense)             (None, 5)                 165       
                                                                 
Total params: 596,741
Trainable params: 596,741
Non-tr

# Test Model in Real-Time

In [19]:
colors = [(245,117,16), (117,245,16), (16,117,245), (117,18,246), (255,179,0)]
def prob_viz(res, actions, input_frame, colors):
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        print(f"prob: {prob}")
        cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
        cv2.putText(output_frame, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)
        
    return output_frame

In [20]:
def speak(text):
    mp3_fp = BytesIO()
    tts = gTTS(text, lang='en')
    tts.write_to_fp(mp3_fp)
    return mp3_fp

In [21]:
# 1. New detection variables
sequence = []
sentence = []
predictions = []
threshold = 0.7

cap = cv2.VideoCapture(0)
# Set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()

        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        print(results)
        
        # Draw landmarks
        draw_styled_landmarks(image, results)
        
        # 2. Prediction logic
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]
        
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            print(actions[np.argmax(res)])
            predictions.append(np.argmax(res))
            
            
        #3. Viz logic
            if np.unique(predictions[-15:])[0]==np.argmax(res): 
                if res[np.argmax(res)] > threshold: 
                    valid_action = False
                    
                    if len(sentence) > 0:
                        if actions[np.argmax(res)] != sentence[-1]:
                            valid_action = True
                    else:
                        valid_action = True
                        
                if valid_action :
                    sentence.append(actions[np.argmax(res)])
                    mixer.init()
                    sound = speak(sentence[-1])
                    sound.seek(0)
                    mixer.music.load(sound, "mp3")
                    mixer.music.play()

            if len(sentence) > 5: 
                sentence = sentence[-5:]

            # Viz probabilities
            image = prob_viz(res, actions, image, colors)
            
        cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3,30), 
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        
        # Show to screen
        cv2.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()



























INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1727000518.875580    4877 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1727000518.909572    4877 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1727000518.912688    4876 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1727000518.926343    4877 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1727000518.926351    4881 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti



<class 'mediapipe.python.solution_base.SolutionOutputs'>
normal
prob: 8.152399764647638e-30
prob: 1.7071513898337554e-34
prob: 6.053610178627351e-31
prob: 0.0
prob: 1.0
<class 'mediapipe.python.solution_base.SolutionOutputs'>
normal
prob: 5.261946830927086e-27
prob: 4.520474022556084e-31
prob: 4.908040351758918e-28
prob: 0.0
prob: 1.0
<class 'mediapipe.python.solution_base.SolutionOutputs'>
normal
prob: 6.699634740417388e-24
prob: 2.2669283595856345e-27
prob: 7.652299358556309e-25
prob: 1.105697221347773e-36
prob: 1.0
<class 'mediapipe.python.solution_base.SolutionOutputs'>
normal
prob: 8.580817861042787e-21
prob: 1.299621756739111e-23
prob: 1.6635436165299259e-21
prob: 3.96221764941837e-31
prob: 1.0
<class 'mediapipe.python.solution_base.SolutionOutputs'>
normal
prob: 3.0172971251145734e-17
prob: 2.4651814976124913e-19
prob: 1.787798527364537e-17
prob: 1.5725895800002689e-24
prob: 1.0
<class 'mediapipe.python.solution_base.SolutionOutputs'>
normal
prob: 3.7945135315103906e-13
prob: 2.



<class 'mediapipe.python.solution_base.SolutionOutputs'>
thank you
prob: 8.498498849809555e-21
prob: 5.855169458003975e-14
prob: 1.8875684813853292e-20
prob: 1.0
prob: 1.001472013713743e-18
<class 'mediapipe.python.solution_base.SolutionOutputs'>
thank you
prob: 7.3962983162016e-24
prob: 9.336331722651064e-15
prob: 1.4801372512758556e-24
prob: 1.0
prob: 1.295069994606106e-21
<class 'mediapipe.python.solution_base.SolutionOutputs'>
thank you
prob: 7.783898860398508e-26
prob: 2.6000735107154817e-15
prob: 7.613478404074735e-27
prob: 1.0
prob: 2.619056386201572e-23
<class 'mediapipe.python.solution_base.SolutionOutputs'>
thank you
prob: 6.78359365029432e-26
prob: 2.8897020274689864e-15
prob: 6.591279530511543e-27
prob: 1.0
prob: 3.3616906734653566e-23
<class 'mediapipe.python.solution_base.SolutionOutputs'>
thank you
prob: 1.7410499388416976e-24
prob: 9.901312862865049e-15
prob: 3.6304855507277924e-25
prob: 1.0
prob: 1.2449294401121811e-21
<class 'mediapipe.python.solution_base.SolutionOut

<class 'mediapipe.python.solution_base.SolutionOutputs'>
thank you
prob: 3.2048613389750744e-23
prob: 3.997549971710042e-14
prob: 1.8588688393913524e-23
prob: 1.0
prob: 4.3398822475290204e-20
<class 'mediapipe.python.solution_base.SolutionOutputs'>
thank you
prob: 3.0277339823047314e-23
prob: 3.751019677158962e-14
prob: 1.7416753477673568e-23
prob: 1.0
prob: 3.9871359220671835e-20
<class 'mediapipe.python.solution_base.SolutionOutputs'>
thank you
prob: 2.3423664213667418e-23
prob: 3.028851244791207e-14
prob: 1.3500530822072566e-23
prob: 1.0
prob: 2.956213939176383e-20
<class 'mediapipe.python.solution_base.SolutionOutputs'>
thank you
prob: 2.9491022205384734e-23
prob: 3.396723413607791e-14
prob: 1.9086358650912114e-23
prob: 1.0
prob: 3.6822969250040915e-20
<class 'mediapipe.python.solution_base.SolutionOutputs'>
thank you
prob: 3.60196161243321e-23
prob: 4.1970743422598336e-14
prob: 2.2437810540916426e-23
prob: 1.0
prob: 4.3784275716042844e-20
<class 'mediapipe.python.solution_base.Sol



<class 'mediapipe.python.solution_base.SolutionOutputs'>
toilet
prob: 0.0004254271916579455
prob: 3.2026234442206203e-10
prob: 0.9995734095573425
prob: 1.4179658835988818e-10
prob: 1.2179320947325323e-06
<class 'mediapipe.python.solution_base.SolutionOutputs'>
toilet
prob: 2.5194120878077597e-13
prob: 3.2247164753605078e-15
prob: 1.0
prob: 4.4375368315899624e-14
prob: 7.257298061114081e-12
<class 'mediapipe.python.solution_base.SolutionOutputs'>
toilet
prob: 9.09319006810877e-14
prob: 1.988229213903468e-15
prob: 1.0
prob: 2.891340866815334e-14
prob: 4.498945053305059e-12
<class 'mediapipe.python.solution_base.SolutionOutputs'>
toilet
prob: 5.3885512893393165e-15
prob: 3.2886554455982683e-16
prob: 1.0
prob: 6.19461833513261e-15
prob: 7.245372379317827e-13
<class 'mediapipe.python.solution_base.SolutionOutputs'>
toilet
prob: 7.231720605311328e-16
prob: 9.178156525383e-17
prob: 1.0
prob: 1.972004297798812e-15
prob: 1.7812602792505877e-13
<class 'mediapipe.python.solution_base.SolutionOutp



<class 'mediapipe.python.solution_base.SolutionOutputs'>
toilet
prob: 5.2989365402436814e-17
prob: 1.6219762769439918e-17
prob: 1.0
prob: 4.294601567575928e-16
prob: 2.766435186127552e-14
<class 'mediapipe.python.solution_base.SolutionOutputs'>
toilet
prob: 5.2699888583990207e-17
prob: 1.5763489943555668e-17
prob: 1.0
prob: 4.223860816892597e-16
prob: 2.7583843073704893e-14
<class 'mediapipe.python.solution_base.SolutionOutputs'>
toilet
prob: 5.2380811934504e-17
prob: 1.532519671546587e-17
prob: 1.0
prob: 4.1356116304917237e-16
prob: 2.725021373644037e-14
<class 'mediapipe.python.solution_base.SolutionOutputs'>
toilet
prob: 1.676747875768068e-17
prob: 4.931411756844123e-18
prob: 1.0
prob: 1.5542212843125077e-16
prob: 1.21118291448747e-14
<class 'mediapipe.python.solution_base.SolutionOutputs'>
toilet
prob: 4.380854403199241e-18
prob: 1.494524598354551e-18
prob: 1.0
prob: 6.518201094019089e-17
prob: 4.586569483450631e-15
<class 'mediapipe.python.solution_base.SolutionOutputs'>
toilet
pr