In [24]:
import cv2 as cv
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
from mediapipe.python.solutions.pose import PoseLandmark
from mediapipe.python.solutions.drawing_utils import DrawingSpec
import math
from scipy import stats
import asyncio

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from keras.utils import to_categorical
import tensorflow as tf
import numpy as np
import os
import cv2 as cv
import mediapipe as mp
from matplotlib import pyplot as plt
from mediapipe.python.solutions.pose import PoseLandmark
from collections import deque

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, TimeDistributed
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import plot_model
from tensorflow.keras.regularizers import l2

In [1]:
# Short aliases for the MediaPipe solution modules. The helpers in this cell
# read mp_holistic, mp_drawing and mp_pose; mp_drawing_styles is defined but
# not referenced by the code visible in this notebook.
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_pose = mp.solutions.pose

def media_pipe_detection(image, model):
    """Run ``model.process`` on a BGR frame and return the frame with the results.

    Args:
        image: Frame in BGR channel order (as produced by ``cv.VideoCapture``).
        model: Object exposing ``process(rgb_image)``; in this notebook a
            MediaPipe Holistic instance.

    Returns:
        ``(bgr_image, results)`` where ``bgr_image`` is a fresh, writeable
        BGR array and ``results`` is whatever ``model.process`` returned.
    """
    rgb_frame = cv.cvtColor(image, cv.COLOR_BGR2RGB)

    # The frame is marked read-only only for the duration of the model call.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    return cv.cvtColor(rgb_frame, cv.COLOR_RGB2BGR), results

def draw_land_marks(image, results):
    """Draw the upper-body pose skeleton and both hands onto ``image`` in place.

    Every pose connection that touches a face, hip, knee, ankle, heel or
    foot landmark is dropped, so only connections among the remaining
    (shoulder / arm / hand) pose landmarks are drawn. Hand landmarks are
    drawn with the default drawing specs.

    Args:
        image: Frame to draw on (modified in place).
        results: Holistic results exposing ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.
    """
    hidden_ids = {
        landmark.value
        for landmark in (
            # Face
            PoseLandmark.NOSE,
            PoseLandmark.LEFT_EYE_INNER,
            PoseLandmark.LEFT_EYE,
            PoseLandmark.LEFT_EYE_OUTER,
            PoseLandmark.RIGHT_EYE_INNER,
            PoseLandmark.RIGHT_EYE,
            PoseLandmark.RIGHT_EYE_OUTER,
            PoseLandmark.LEFT_EAR,
            PoseLandmark.RIGHT_EAR,
            PoseLandmark.MOUTH_LEFT,
            PoseLandmark.MOUTH_RIGHT,
            # Lower body
            PoseLandmark.LEFT_HIP,
            PoseLandmark.RIGHT_HIP,
            PoseLandmark.LEFT_KNEE,
            PoseLandmark.RIGHT_KNEE,
            PoseLandmark.LEFT_ANKLE,
            PoseLandmark.RIGHT_ANKLE,
            PoseLandmark.LEFT_HEEL,
            PoseLandmark.RIGHT_HEEL,
            PoseLandmark.LEFT_FOOT_INDEX,
            PoseLandmark.RIGHT_FOOT_INDEX,
        )
    }

    # Keep a connection only when neither endpoint is a hidden landmark.
    visible_connections = [
        pair for pair in mp_pose.POSE_CONNECTIONS if hidden_ids.isdisjoint(pair)
    ]

    mp_drawing.draw_landmarks(image, results.pose_landmarks, connections=visible_connections)
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

def draw_styled_handmarks(image, results):
    """Draw pose and hand landmarks onto ``image`` with per-part colours.

    Each part is drawn with two ``DrawingSpec`` objects: the first with
    ``circle_radius=4`` and the second with ``circle_radius=2``, both with
    ``thickness=2``.

    Args:
        image: Frame to draw on (modified in place).
        results: Holistic results exposing ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.
    """
    make_spec = mp_drawing.DrawingSpec

    # (landmarks, connections, first-spec colour, second-spec colour)
    layers = (
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS, (80, 22, 10), (80, 44, 121)),
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, (121, 22, 76), (121, 44, 250)),
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, (245, 117, 66), (245, 66, 230)),
    )

    for landmarks, connections, first_color, second_color in layers:
        mp_drawing.draw_landmarks(
            image,
            landmarks,
            connections,
            make_spec(color=first_color, thickness=2, circle_radius=4),
            make_spec(color=second_color, thickness=2, circle_radius=2),
        )
    
# Shoulder distances below this are treated as degenerate (coinciding shoulders).
_MIN_SHOULDER_LENGTH = 1e-6


def _shoulder_relative_xy(landmarks, origin_x, origin_y, scale):
    """Return flattened ``(x, y)`` pairs shifted to the origin and divided by ``scale``."""
    return np.array(
        [[(lm.x - origin_x) / scale, (lm.y - origin_y) / scale] for lm in landmarks]
    ).flatten()


def extract_keypoints_normalize(results):
    """Build the 108-value feature vector for one frame.

    Landmark ``x``/``y`` coordinates are translated so the midpoint between
    pose landmarks 11 and 12 (the shoulders) is the origin, and divided by
    the distance between those two landmarks. When no pose is detected the
    origin is ``(0, 0)`` and the scale is ``1``, so hand coordinates pass
    through unchanged.

    Layout of the returned vector:
        * 24 values: pose landmarks 11..22 (x, y each)
        * 42 values: left hand, 21 landmarks (x, y each)
        * 42 values: right hand, 21 landmarks (x, y each)

    Any part that was not detected is filled with zeros.

    Args:
        results: Holistic results exposing ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``; each is
            either falsy or has a ``landmark`` sequence of objects with
            ``x`` and ``y`` attributes.

    Returns:
        1-D ``numpy.ndarray`` of length 108.
    """
    midpoint_shoulder_x, midpoint_shoulder_y = 0, 0
    shoulder_length = 1

    if results.pose_landmarks:
        pose_points = results.pose_landmarks.landmark
        left_shoulder = pose_points[11]
        right_shoulder = pose_points[12]

        midpoint_shoulder_x = (left_shoulder.x + right_shoulder.x) / 2
        midpoint_shoulder_y = (left_shoulder.y + right_shoulder.y) / 2

        shoulder_length = math.hypot(
            left_shoulder.x - right_shoulder.x,
            left_shoulder.y - right_shoulder.y,
        )
        if shoulder_length < _MIN_SHOULDER_LENGTH:
            # Coinciding shoulders would divide by zero (or blow the features
            # up); fall back to the same unit scale used when no pose is found.
            shoulder_length = 1

        pose = _shoulder_relative_xy(
            pose_points[11:23], midpoint_shoulder_x, midpoint_shoulder_y, shoulder_length
        )
    else:
        pose = np.zeros(12 * 2)

    if results.left_hand_landmarks:
        left_hand = _shoulder_relative_xy(
            results.left_hand_landmarks.landmark,
            midpoint_shoulder_x, midpoint_shoulder_y, shoulder_length,
        )
    else:
        left_hand = np.zeros(21 * 2)

    if results.right_hand_landmarks:
        right_hand = _shoulder_relative_xy(
            results.right_hand_landmarks.landmark,
            midpoint_shoulder_x, midpoint_shoulder_y, shoulder_length,
        )
    else:
        right_hand = np.zeros(21 * 2)

    return np.concatenate([pose, left_hand, right_hand])

def extract_coordinate(results):
    """Print the raw ``x``/``y`` of every tracked landmark (debug helper).

    Output order is: pose landmarks 11..22, then the right hand, then the
    left hand. Parts that were not detected are skipped. Each landmark is
    printed on its own line followed by a blank line.

    Args:
        results: Holistic results exposing ``pose_landmarks``,
            ``right_hand_landmarks`` and ``left_hand_landmarks``.
    """
    pose = results.pose_landmarks
    right_hand = results.right_hand_landmarks
    left_hand = results.left_hand_landmarks

    sections = (
        ("POSE LANDMARK", pose.landmark[11:23] if pose else ()),
        ("RIGHT HAND LANDMARK", right_hand.landmark if right_hand else ()),
        ("LEFT HAND LANDMARK", left_hand.landmark if left_hand else ()),
    )

    for title, points in sections:
        for point in points:
            print(f"{title} x: {point.x}, y: {point.y}\n")

In [32]:
# Root folder of the recorded dataset: My_Datasets/<action>/<sequence>/<frame>-norm.npy
DATA_PATH = os.path.join('My_Datasets')

# 10-class variant, kept for reference:
# actions = np.array(["maaf", "tolong", "nama", "saya", "rumah", "start", "standby", "delete", "dimana", "translate"])

# Class list used for testing the best model.
actions = np.array(["maaf", "tolong", "nama", "saya", "siapa", "rumah", "start", "standby", "delete"])

# Not read in this cell: the loop below uses whatever sequence folders exist on disk.
no_sequences = 30

# Number of frames loaded per sequence.
sequence_length = 30

# Class name -> integer label, in the order of `actions`.
label_map = {label: num for num, label in enumerate(actions)}

sequences, labels = [], []
for action in actions:
    action_dir = os.path.join(DATA_PATH, action)

    # Only purely numeric sub-folders are sequences. Stray entries such as
    # `.ipynb_checkpoints` or `.DS_Store` previously crashed `.astype(int)`.
    # Sorting numerically makes the sample order reproducible across runs,
    # and using the folder name as-is keeps zero-padded names working.
    sequence_names = sorted(
        (
            name for name in os.listdir(action_dir)
            if name.isdigit() and os.path.isdir(os.path.join(action_dir, name))
        ),
        key=int,
    )

    for sequence in sequence_names:
        window = []
        for frame_num in range(sequence_length):
            # Un-normalized alternative: "{}.npy".format(frame_num)
            res = np.load(os.path.join(action_dir, sequence, "{}-norm.npy".format(frame_num)))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])
        

In [36]:
# # SIMPLE MODEL -> MAIN MODEL
# model = Sequential()

# model.add(TimeDistributed(Dense(units=256, activation='tanh'), input_shape=(30, 108)))
# model.add(LSTM(128, return_sequences=False, activation='tanh', input_shape=(30,108)))
# model.add(Dropout(0.5))
# model.add(Dense(128, activation='relu'))
# model.add(Dropout(0.5))
# model.add(Dense(actions.shape[0], activation='softmax'))

In [37]:
# model = Sequential()

# model.add(TimeDistributed(Dense(units=256, activation='tanh'), input_shape=(30, 108)))
# model.add(LSTM(128, return_sequences=True, activation='tanh'))
# model.add(Dropout(0.5))
# model.add(LSTM(64, return_sequences=False, activation='tanh'))
# model.add(Dropout(0.5))
# model.add(Dense(64, activation='relu'))
# model.add(Dropout(0.2))
# model.add(Dense(actions.shape[0], activation='softmax'))

# model.summary()

In [38]:
# model = Sequential()

# model.add(TimeDistributed(Dense(units=128, activation='tanh'), input_shape=(30, 108)))
# model.add(LSTM(128, return_sequences=True, activation='tanh'))
# model.add(Dropout(0.5))
# model.add(LSTM(64, return_sequences=False, activation='tanh'))
# model.add(Dropout(0.5))
# model.add(Dense(32, activation='relu'))
# model.add(Dropout(0.2))
# model.add(Dense(actions.shape[0], activation='softmax'))

# model.summary()

# Sequence classifier: a per-frame dense projection, two stacked LSTMs and a
# small dense head. Input is (30, 108); the output has one softmax unit per
# entry in `actions`.
model = Sequential([
    # Applies the same 128-unit tanh layer to every frame independently.
    TimeDistributed(Dense(units=128, activation='tanh'), input_shape=(30, 108)),
    # First LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(128, return_sequences=True, activation='tanh'),
    Dropout(0.5),
    # Second LSTM returns only its final output.
    LSTM(64, return_sequences=False, activation='tanh'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(actions.shape[0], activation='softmax'),
])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_distributed_1 (TimeDi  (None, 30, 128)           13952     
 stributed)                                                      
                                                                 
 lstm_2 (LSTM)               (None, 30, 128)           131584    
                                                                 
 dropout_3 (Dropout)         (None, 30, 128)           0         
                                                                 
 lstm_3 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_4 (Dropout)         (None, 64)                0         
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                      

In [39]:
# MODEL NAMING FORMAT:
# [number of classes]_lstm_[date]_[version/iteration]
#   example: 10_lstm_25032024_1

number_of_classes = len(actions)
current_date = '25032024'  # changed manually (presumably DDMMYYYY; confirm)
version = 1

model_filename = f"{number_of_classes}_lstm_{current_date}_{version}"

# Previously the model was saved unconditionally and then reloaded from the
# same path. On a fresh kernel `model` has just been built, so that
# overwrote the stored checkpoint with untrained weights and made the reload
# a no-op. Now an existing checkpoint is loaded and never overwritten; a new
# file is only written when none exists under this name (bump `version` to
# save a new iteration).
if os.path.exists(model_filename):
    model.load_weights(model_filename)
    print(f"Loaded weights from '{model_filename}'.")
else:
    model.save(model_filename)
    print(f"No checkpoint at '{model_filename}'; saved the current model there.")

In [None]:
# Bar colours passed to prob_viz, indexed by class position.
# The names are the original author's labels for each tuple.
# NOTE(review): OpenCV drawing calls usually read colour tuples as BGR, so the
# on-screen colour may not match the label. Confirm before relying on it.
colors = [
    (245, 117, 16),   # Orange
    (117, 245, 16),   # Lime Green
    (16, 117, 245),   # Bright Blue
    (245, 16, 117),   # Pink
    (16, 245, 117),   # Teal
    (117, 16, 245),   # Purple
    (245, 245, 16),   # Yellow
    (128, 0, 128),    # Purple
    (255, 192, 203),  # Light Pink
    (0, 255, 255),    # Cyan
    (255, 165, 0),    # Orange4
    (128, 128, 128),  # Gray
]
# The palette ends by repeating its first two entries (Orange, Lime Green).
colors += colors[:2]

def prob_viz(res, actions, input_frame, colors, frame_height=480, frame_width=640, opacity=0.4):
    """Overlay one semi-transparent horizontal probability bar per class.

    Bars are stacked top to bottom with a 4-pixel gap; each bar's length is
    ``prob * frame_width`` and its class name is written inside it.

    Args:
        res: Sequence of class probabilities, aligned with ``actions``.
        actions: Sequence of class names.
        input_frame: Image array; it is copied, never modified.
        colors: Sequence of colour tuples. It is reused cyclically when there
            are more classes than colours.
        frame_height: Height in pixels used to size the bars.
        frame_width: Width in pixels that a probability of 1.0 maps to.
        opacity: Blend weight of the bars over the frame (0..1).

    Returns:
        A new image with the bars and labels drawn. With no classes or no
        colours, an unmodified copy of ``input_frame``.
    """
    output_frame = input_frame.copy()

    num_actions = len(actions)
    if num_actions == 0 or len(colors) == 0:
        # Nothing to draw; this also avoids dividing by zero below.
        return output_frame

    space_height = 4
    total_space_height = (num_actions + 1) * space_height
    bar_height = (frame_height - total_space_height) // num_actions

    font_scale = max(0.4, bar_height / 25)
    font_thickness = max(1, int(font_scale * 1.5))

    # Pre-compute the geometry so the rectangles and the text are drawn in
    # two separate passes.
    bars = []
    for num, (label, prob) in enumerate(zip(actions, res)):
        bar_top = space_height + num * (bar_height + space_height)
        bar_bottom = bar_top + bar_height
        bar_right = int(prob * frame_width)
        # Wrap around the palette instead of raising IndexError.
        bars.append((label, bar_top, bar_bottom, bar_right, colors[num % len(colors)]))

    # The bars never overlap, so one overlay and one blend give the same
    # picture as blending per bar, without copying the frame once per class.
    overlay = output_frame.copy()
    for _, bar_top, bar_bottom, bar_right, color in bars:
        cv.rectangle(overlay, (0, bar_top), (bar_right, bar_bottom), color, -1)
    cv.addWeighted(overlay, opacity, output_frame, 1 - opacity, 0, output_frame)

    # Labels are drawn after blending so they stay fully opaque.
    for label, _, bar_bottom, _, _ in bars:
        cv.putText(output_frame, label, (10, bar_bottom - space_height // 2),
                   cv.FONT_HERSHEY_SIMPLEX, font_scale, (255, 255, 255), font_thickness, cv.LINE_AA)

    return output_frame

In [42]:
# Held by speak_async for the whole synthesize-and-play step, so only one
# utterance is processed at a time.
speak_lock = asyncio.Lock()

In [43]:
import asyncio
import os
from gtts import gTTS
import tempfile
import sounddevice as sd
import soundfile as sf
import pygame

async def speak_async(words, on_done=None):
    """Synthesize ``words`` with gTTS (``lang='id'``) and play the audio.

    Synthesis and playback are blocking, so both run in the default
    executor while this coroutine holds ``speak_lock``; concurrent callers
    therefore speak one after another without stalling the event loop.

    Args:
        words: Text to speak. Empty or whitespace-only text is skipped,
            because gTTS refuses to synthesize it.
        on_done: Optional zero-argument callable. It is invoked from the
            worker thread after playback finishes, or immediately when
            there is nothing to speak. It is not invoked if synthesis or
            playback raises.

    Raises:
        Whatever gTTS, soundfile or sounddevice raise; the temporary audio
        file is removed in that case as well.
    """
    if not words or not words.strip():
        if on_done:
            on_done()
        return

    def synthesize_and_play():
        # Reserve a unique path and close the handle before gTTS writes to it.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as tmpfile:
            filename = tmpfile.name
        try:
            gTTS(text=words, lang='id').save(filename)
            data, fs = sf.read(filename, dtype='float32')
            sd.play(data, fs)
            sd.wait()  # Block this worker thread until playback ends
        finally:
            # Always remove the temp file, even when synthesis or playback fails.
            os.unlink(filename)
        if on_done:
            on_done()

    async with speak_lock:
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, synthesize_and_play)
        print("Audio has been played.")

In [44]:
def reset_speaking_flag():
    """Set the module-level ``hasSpoken`` flag to False and print a ready message."""
    global hasSpoken
    hasSpoken = False
    print("Ready to speak again.")

def displaySentence(image, sentence):
    """Draw the most recent words as a banner across the top of ``image``.

    Args:
        image: Frame to draw on (modified in place).
        sentence: Sequence of words; only the last five are shown. The
            caller's sequence is not modified.
    """
    # Use the frame's real width so the banner also spans frames that are
    # not 640 pixels wide.
    width = image.shape[1]
    recent_words = sentence[-5:]

    cv.rectangle(image, (0, 0), (width, 40), (245, 117, 16), -1)
    cv.putText(image, ' '.join(recent_words), (3, 30),
               cv.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv.LINE_AA)

def displayProgramStatus(image, programStatus):
    """Draw the latest program status as a banner along the bottom of ``image``.

    Args:
        image: Frame to draw on (modified in place).
        programStatus: Sequence of status strings; only the last one is
            shown. An empty sequence renders just ``"Status: "``.
    """
    height, width = image.shape[:2]
    latest_status = programStatus[-1:]

    # The banner spans the frame's real width; it was hard-coded to 640 and
    # left part of wider frames uncovered.
    cv.rectangle(image, (0, height - 40), (width, height), (0, 255, 0), -1)
    cv.putText(image, "Status: " + ' '.join(latest_status), (3, height - 10),
               cv.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv.LINE_AA)

async def main():
    """Run the live sign-recognition loop on the default webcam.

    Per frame: detect landmarks, build the normalized keypoint vector, and
    once 30 frames are buffered classify the window with ``model``. A
    prediction is acted on only when the same class has been predicted for
    ``stable_frames`` consecutive windows and its probability exceeds
    ``threshold``.

    Gesture protocol:
        * "standby" arms the system.
        * While armed, "delete" speaks the collected sentence, provided
          nothing is being spoken already, and disarms.
        * While armed, any word other than "standby"/"start"/"delete" that
          differs from the last collected word is appended to the sentence,
          and the system disarms.

    Press ``q`` in the OpenCV window to stop. The camera and the window are
    released even when the loop ends with an exception.

    Side effects: rebinds the module-level ``hasSpoken`` and
    ``programStatus`` names.
    """
    global hasSpoken
    global programStatus
    programStatus = []

    sequence = []
    sentence = []
    predictions = []
    threshold = 0.5
    stable_frames = 10  # consecutive identical predictions required
    control_words = ("standby", "start", "delete")  # never added to the sentence

    hasSpoken = False
    inStandby = False

    cap = cv.VideoCapture(0)
    try:
        with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret or frame is None:
                    # A failed read used to crash inside cv.cvtColor.
                    print("Failed to read a frame from the camera; stopping.")
                    break

                image, results = media_pipe_detection(frame, holistic)
                draw_styled_handmarks(image, results)

                keypoints = extract_keypoints_normalize(results)
                sequence.append(keypoints)
                sequence = sequence[-30:]

                if len(sequence) == 30:
                    sequence_array = np.array(sequence)
                    if sequence_array.shape == (30, 108):
                        res = model.predict(np.expand_dims(sequence_array, axis=0), verbose=0)[0]
                        best = int(np.argmax(res))
                        action = actions[best]
                        print(res)
                        print(action)

                        # Only the most recent window of predictions matters.
                        predictions.append(best)
                        predictions = predictions[-stable_frames:]

                        # `np.unique(...)[0]` picked the smallest recent class
                        # index, which is not a stability check. Require
                        # every recent prediction to agree instead.
                        is_stable = (
                            len(predictions) == stable_frames
                            and all(previous == best for previous in predictions)
                        )

                        if is_stable and res[best] > threshold:
                            if action == "standby":
                                inStandby = True
                                programStatus.append("standby")

                            # The original compared the list slice
                            # `programStatus[-1:]` with a string, which is
                            # always unequal; compare the last element.
                            last_status_is_translate = bool(programStatus) and programStatus[-1] == "translate"

                            if inStandby and not last_status_is_translate:
                                if action == "delete" and not hasSpoken:
                                    inStandby = False
                                    hasSpoken = True
                                    programStatus.append("translate")

                                    # Show the "translate" status before
                                    # speech blocks the loop; it used to be
                                    # drawn but never displayed.
                                    displaySentence(image, sentence)
                                    displayProgramStatus(image, programStatus)
                                    cv.imshow('OpenCV Feed', image)
                                    cv.waitKey(1)

                                    if sentence:
                                        # Pass the callback itself. Calling it
                                        # here reset the flag before speaking
                                        # and handed `None` to speak_async.
                                        await speak_async(' '.join(sentence), reset_speaking_flag)
                                    else:
                                        # Nothing collected yet, so nothing to speak.
                                        reset_speaking_flag()
                                elif action not in control_words and (not sentence or action != sentence[-1]):
                                    inStandby = False
                                    programStatus.append("not-standby")
                                    sentence.append(action)
                        # Optional per-class probability bars:
                        # image = prob_viz(res, actions, image, colors)

                displaySentence(image, sentence)
                displayProgramStatus(image, programStatus)

                cv.imshow('OpenCV Feed', image)

                if cv.waitKey(10) & 0xFF == ord('q'):
                    break
    finally:
        # Always free the camera and close the window, even on errors.
        cap.release()
        cv.destroyAllWindows()
 
# NOTE(review): a bare top-level `await` is not valid in a plain Python
# script; this cell relies on the notebook kernel allowing `await` at cell
# level. Outside a notebook this would need `asyncio.run(main())`.
if __name__ == "__main__":
    await main()

[0.01038265 0.03515745 0.02279743 0.0086606  0.5840523  0.01064958
 0.02025453 0.30590388 0.00214159]
siapa
[0.01025801 0.0345662  0.02247949 0.00822416 0.6161249  0.01055004
 0.02030826 0.27538437 0.00210451]
siapa
[0.0100301  0.03374841 0.02235952 0.00777252 0.64881736 0.01048018
 0.01997665 0.24470487 0.00211037]
siapa
[0.00947456 0.03161704 0.02175078 0.00714081 0.67239815 0.00966371
 0.01817722 0.22780043 0.00197729]
siapa
[0.00924232 0.03108013 0.02146699 0.00687806 0.6872616  0.00935408
 0.01749926 0.21532604 0.00189157]
siapa
[0.00934886 0.0313508  0.02145969 0.00684386 0.69491136 0.00952642
 0.01753413 0.20712171 0.00190311]
siapa
[0.00953064 0.03172865 0.02114968 0.00668614 0.70323116 0.00977985
 0.01765239 0.19833364 0.00190782]
siapa
[0.00980813 0.03210601 0.02069195 0.00635482 0.71460766 0.01010615
 0.01767156 0.18674508 0.00190864]
siapa
[0.01016123 0.03263687 0.02027082 0.00598392 0.7261708  0.01052982
 0.01778322 0.17454351 0.00191974]
siapa
[0.01034789 0.03285571 0.019

In [45]:
# sequence = deque(maxlen=30)
# sentence = []
# predictions = []
# threshold = 0.5 
# processed_frames = 0

# cap = cv.VideoCapture(0)
# with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
#     while cap.isOpened():
#         ret, frame = cap.read()

#         image, results = media_pipe_detection(frame, holistic)
#         draw_styled_handmarks(image, results)

#         keypoints = extract_keypoints_normalize(results)
#         sequence.append(keypoints)
#         # sequence = sequence[-30:]
        
#         if len(sequence) == 30 and (processed_frames == 0 or processed_frames >= 15):
#             sequence_array = np.array(sequence)
#             if sequence_array.shape == (30, 108):
#                 res = model.predict(np.expand_dims(sequence_array, axis=0))[0]
#                 print(actions[np.argmax(res)])
#                 print(res)
#                 print("")
#                 predictions.append(np.argmax(res))
                
#                 # if np.unique(predictions[-10:])[0] == np.argmax(res):
#                 #     if res[np.argmax(res)] > threshold:
#                 #         if len(sentence) > 0: 
#                 #             if actions[np.argmax(res)] != sentence[-1]:
#                 #                 sentence.append(actions[np.argmax(res)])
#                 #         else:
#                 #             sentence.append(actions[np.argmax(res)])

#                 # if len(sentence) > 5: 
#                 #     sentence = sentence[-5:]
                
#                 # image = prob_viz(res, actions, image, colors)
#                 image = prob_viz(res, actions[np.argmax(res)], image, colors)

#                 processed_frames = 0 

#         processed_frames += 1
        
#         cv.imshow('OpenCV Feed', image)

#         if cv.waitKey(10) & 0xFF == ord('q'):
#             break

#     cap.release()
#     cv.destroyAllWindows()