In [31]:
import cv2 as cv
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
from mediapipe.python.solutions.pose import PoseLandmark
from mediapipe.python.solutions.drawing_utils import DrawingSpec
import math
from scipy import stats
import asyncio

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from keras.utils import to_categorical
import tensorflow as tf
import numpy as np
import os
import cv2 as cv
import mediapipe as mp
from matplotlib import pyplot as plt
from mediapipe.python.solutions.pose import PoseLandmark
from collections import deque

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, TimeDistributed, Bidirectional
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import plot_model
from tensorflow.keras.regularizers import l2

In [32]:
# Short aliases for the MediaPipe solution modules used by the helpers in this notebook.
mp_holistic = mp.solutions.holistic  # Holistic model plus HAND_CONNECTIONS / POSE_CONNECTIONS
mp_drawing = mp.solutions.drawing_utils  # draw_landmarks and DrawingSpec
mp_drawing_styles = mp.solutions.drawing_styles  # not referenced by the cells visible here
mp_pose = mp.solutions.pose  # source of POSE_CONNECTIONS filtered in draw_land_marks


def media_pipe_detection(image, model):
    """Run ``model.process`` on a BGR frame and hand back a BGR frame plus the results.

    Args:
        image: Frame in OpenCV's BGR channel order.
        model: Object exposing ``process(rgb_image)`` (a Holistic instance in this notebook).

    Returns:
        ``(bgr_image, results)`` where ``bgr_image`` is a fresh BGR array and
        ``results`` is whatever ``model.process`` returned.
    """
    rgb_frame = cv.cvtColor(image, cv.COLOR_BGR2RGB)

    # The frame is flagged read-only only for the duration of the model call.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    return cv.cvtColor(rgb_frame, cv.COLOR_RGB2BGR), results


def draw_land_marks(image, results):
    """Draw a reduced pose skeleton and both hands onto ``image`` in place.

    Every pose connection that touches a face, hip, knee, ankle, heel or foot
    landmark is dropped, so only connections between the remaining landmarks
    (shoulders, elbows, wrists and the pose model's hand points) are passed on.
    The full ``results.pose_landmarks`` list is still handed to the drawing
    utility; only the connection list is filtered.

    Args:
        image: Frame to draw on.
        results: Holistic results with ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.
    """
    hidden_landmarks = (
        # Face
        PoseLandmark.NOSE,
        PoseLandmark.LEFT_EYE_INNER,
        PoseLandmark.LEFT_EYE,
        PoseLandmark.LEFT_EYE_OUTER,
        PoseLandmark.RIGHT_EYE_INNER,
        PoseLandmark.RIGHT_EYE,
        PoseLandmark.RIGHT_EYE_OUTER,
        PoseLandmark.LEFT_EAR,
        PoseLandmark.RIGHT_EAR,
        PoseLandmark.MOUTH_LEFT,
        PoseLandmark.MOUTH_RIGHT,
        # Lower body
        PoseLandmark.LEFT_HIP,
        PoseLandmark.RIGHT_HIP,
        PoseLandmark.LEFT_KNEE,
        PoseLandmark.RIGHT_KNEE,
        PoseLandmark.LEFT_ANKLE,
        PoseLandmark.RIGHT_ANKLE,
        PoseLandmark.LEFT_HEEL,
        PoseLandmark.RIGHT_HEEL,
        PoseLandmark.LEFT_FOOT_INDEX,
        PoseLandmark.RIGHT_FOOT_INDEX,
    )
    hidden_indices = {member.value for member in hidden_landmarks}

    # Keep an edge only when neither endpoint is one of the hidden landmarks.
    custom_pose_connections = [
        edge for edge in mp_pose.POSE_CONNECTIONS
        if not any(endpoint in hidden_indices for endpoint in edge)
    ]

    mp_drawing.draw_landmarks(
        image, results.pose_landmarks, connections=custom_pose_connections)

    for hand_landmarks in (results.left_hand_landmarks, results.right_hand_landmarks):
        mp_drawing.draw_landmarks(
            image, hand_landmarks, mp_holistic.HAND_CONNECTIONS)


def draw_styled_handmarks(image, results):
    """Draw pose, left-hand and right-hand landmarks on ``image`` with per-part colors.

    Each layer is drawn with two ``DrawingSpec`` objects: the first styles the
    landmark points, the second styles the connections.

    Args:
        image: Frame to draw on (modified in place by the drawing utility).
        results: Holistic results with ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.
    """
    make_spec = mp_drawing.DrawingSpec

    # (landmarks, connections, landmark style, connection style), in draw order.
    styled_layers = (
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
         make_spec(color=(80, 22, 10), thickness=2, circle_radius=4),
         make_spec(color=(80, 44, 121), thickness=2, circle_radius=2)),
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         make_spec(color=(121, 22, 76), thickness=2, circle_radius=4),
         make_spec(color=(121, 44, 250), thickness=2, circle_radius=2)),
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         make_spec(color=(245, 117, 66), thickness=2, circle_radius=4),
         make_spec(color=(245, 66, 230), thickness=2, circle_radius=2)),
    )

    for landmark_list, connections, point_style, line_style in styled_layers:
        mp_drawing.draw_landmarks(
            image, landmark_list, connections, point_style, line_style)


# Shoulder distances below this are treated as degenerate (coincident shoulders).
_MIN_SHOULDER_LENGTH = 1e-6


def _normalized_xy(landmarks, origin_x, origin_y, scale):
    """Return landmarks as a flat ``[x0, y0, x1, y1, ...]`` array, shifted by the origin and divided by ``scale``."""
    return np.array([[(point.x - origin_x) / scale,
                      (point.y - origin_y) / scale] for point in landmarks]).flatten()


def extract_keypoints_normalize(results):
    """Build the 108-value feature vector for one frame.

    Coordinates are expressed relative to the midpoint between the two
    shoulders (pose landmarks 11 and 12) and divided by the shoulder-to-shoulder
    distance. Layout of the returned vector:

    * 24 values: x/y of pose landmarks 11..22
    * 42 values: x/y of the 21 left-hand landmarks
    * 42 values: x/y of the 21 right-hand landmarks

    A missing part is filled with zeros. When no pose is detected the origin
    stays at (0, 0) and the scale at 1, so any detected hand is emitted in raw
    coordinates.

    Args:
        results: Object with ``pose_landmarks``, ``left_hand_landmarks`` and
            ``right_hand_landmarks``; each is falsy or exposes ``.landmark``,
            a sequence of points with ``x`` and ``y``.

    Returns:
        1-D ``numpy`` array (length 108 when hands carry 21 landmarks each).
    """
    midpoint_shoulder_x, midpoint_shoulder_y = 0, 0
    shoulder_length = 1

    if results.pose_landmarks:
        pose_points = results.pose_landmarks.landmark
        left_shoulder = pose_points[11]
        right_shoulder = pose_points[12]

        midpoint_shoulder_x = (left_shoulder.x + right_shoulder.x) / 2
        midpoint_shoulder_y = (left_shoulder.y + right_shoulder.y) / 2

        # Same formula as before on purpose: features must stay numerically
        # identical to the ones the stored sequences were built with.
        shoulder_length = math.sqrt(
            (left_shoulder.x - right_shoulder.x) ** 2 + (left_shoulder.y - right_shoulder.y) ** 2)

        # Coincident shoulders used to raise ZeroDivisionError; fall back to
        # the same unit scale that is used when no pose is detected at all.
        if shoulder_length < _MIN_SHOULDER_LENGTH:
            shoulder_length = 1

        pose = _normalized_xy(pose_points[11:23], midpoint_shoulder_x,
                              midpoint_shoulder_y, shoulder_length)
    else:
        pose = np.zeros(12 * 2)

    if results.left_hand_landmarks:
        left_hand = _normalized_xy(results.left_hand_landmarks.landmark,
                                   midpoint_shoulder_x, midpoint_shoulder_y, shoulder_length)
    else:
        left_hand = np.zeros(21 * 2)

    if results.right_hand_landmarks:
        right_hand = _normalized_xy(results.right_hand_landmarks.landmark,
                                    midpoint_shoulder_x, midpoint_shoulder_y, shoulder_length)
    else:
        right_hand = np.zeros(21 * 2)

    return np.concatenate([pose, left_hand, right_hand])


def extract_coordinate(results):
    """Print the raw x/y of pose landmarks 11..22 and of each detected hand.

    Output order is pose, then right hand, then left hand; a part that was not
    detected prints nothing. Nothing is returned.

    Args:
        results: Object with ``pose_landmarks``, ``right_hand_landmarks`` and
            ``left_hand_landmarks``; each is falsy or exposes ``.landmark``.
    """
    pose_group = results.pose_landmarks
    if pose_group:
        for x, y in ((point.x, point.y) for point in pose_group.landmark[11:23]):
            print(f"POSE LANDMARK x: {x}, y: {y}\n")

    right_group = results.right_hand_landmarks
    if right_group:
        for x, y in ((point.x, point.y) for point in right_group.landmark):
            print(f"RIGHT HAND LANDMARK x: {x}, y: {y}\n")

    left_group = results.left_hand_landmarks
    if left_group:
        for x, y in ((point.x, point.y) for point in left_group.landmark):
            print(f"LEFT HAND LANDMARK x: {x}, y: {y}\n")

In [33]:
# Dataset root. Expected layout, as read by the loop below:
#   My_Datasets/<action>/<sequence number>/<frame number>-norm.npy
DATA_PATH = os.path.join('My_Datasets')

# actions = np.array(["maaf", "tolong", "nama", "saya", "rumah", "start", "standby", "delete", "dimana", "translate"])

# FOR TESTING THE BEST MODEL
actions = np.array(["standby",
                    "maaf",
                    "tolong",
                    "nama",
                    "siapa",
                    "rumah",
                    "delete",
                    "translate",
                    "dimana",
                    "berapa",
                    "ini",
                    ])

no_sequences = 30

# Number of frames loaded per sequence.
sequence_length = 30

# Action name -> integer class id (its position in `actions`).
label_map = {label: num for num, label in enumerate(actions)}

sequences, labels = [], []
for action in actions:
    action_dir = os.path.join(DATA_PATH, action)

    # Only purely numeric sub-directories are sequences. Stray entries such as
    # ".DS_Store" or ".ipynb_checkpoints" previously crashed the int cast, and
    # os.listdir order is arbitrary, so filter and sort numerically to make
    # the load robust and reproducible.
    sequence_names = sorted(
        (entry for entry in os.listdir(action_dir)
         if entry.isdecimal() and os.path.isdir(os.path.join(action_dir, entry))),
        key=int)

    for sequence in sequence_names:
        window = []
        for frame_num in range(sequence_length):
            # res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num)))
            res = np.load(os.path.join(
                action_dir, sequence, "{}-norm.npy".format(frame_num)))

            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

In [34]:
# Fill color for each probability bar drawn by prob_viz, indexed by bar position.
# NOTE(review): the color names below read the tuples as (R, G, B), but the frames
# drawn on come back from media_pipe_detection in BGR order, so OpenCV will
# presumably render them with red and blue swapped -- confirm which is intended.
colors = [
    (245, 117, 16),  # Orange
    (117, 245, 16),  # Lime Green
    (16, 117, 245),  # Bright Blue
    (245, 16, 117),  # Pink
    (16, 245, 117),  # Teal
    (117, 16, 245),  # Purple
    (245, 245, 16),   # Yellow
    (128, 0, 128),   # Purple
    (255, 192, 203),  # Light Pink
    (0, 255, 255),   # Cyan
    (255, 165, 0),   # Orange
    (128, 128, 128),  # Gray
    (245, 117, 16),  # Orange
    (117, 245, 16),  # Lime Green
    (245, 117, 16),  # Orange
    (16, 245, 117),  # Teal
    (255, 192, 203),  # Light Pink
    (0, 255, 255),   # Cyan
    (255, 165, 0),   # Orange
    (245, 16, 117),  # Pink
    (16, 245, 117),  # Teal
    (117, 16, 245),  # Purple
]


def prob_viz(res, actions, input_frame, colors, frame_height=480, frame_width=640, opacity=0.4):
    """Overlay one semi-transparent horizontal bar per class probability.

    Bars are stacked top to bottom; bar ``i`` is ``int(res[i] * frame_width)``
    pixels wide and labelled with ``actions[i]``. The input frame is not
    modified.

    Args:
        res: Iterable of probabilities, one per action.
        actions: Sequence of label strings; its length sets the bar height.
        input_frame: Image array to draw on (a copy is made).
        colors: Sequence of color tuples. It is cycled when there are more
            bars than colors; a gray fallback is used when it is empty.
        frame_height: Height used to size the bars. Defaults to 480; pass the
            real frame height when the frame is a different size.
        frame_width: Width that a probability of 1.0 maps to. Defaults to 640.
        opacity: Blend weight of the colored bar over the frame.

    Returns:
        A new image array with the bars and labels drawn. With no actions the
        unmodified copy is returned (this used to raise ZeroDivisionError).
    """
    output_frame = input_frame.copy()

    num_actions = len(actions)
    if num_actions == 0:
        # Nothing to draw, and the bar height below would divide by zero.
        return output_frame

    # Cycle the palette instead of raising IndexError past its end.
    palette = colors if len(colors) > 0 else [(128, 128, 128)]

    space_height = 4
    total_space_height = (num_actions + 1) * space_height

    bar_height = (frame_height - total_space_height) // num_actions

    font_scale = max(0.4, bar_height / 25)
    font_thickness = max(1, int(font_scale * 1.5))

    for num, prob in enumerate(res):
        bar_top = space_height + num * (bar_height + space_height)
        bar_bottom = bar_top + bar_height

        # Draw the filled bar on a copy so it can be alpha-blended back.
        overlay = output_frame.copy()
        cv.rectangle(overlay, (0, bar_top),
                     (int(prob * frame_width), bar_bottom), palette[num % len(palette)], -1)

        # Blend the overlay with the current frame, writing into output_frame.
        cv.addWeighted(overlay, opacity, output_frame,
                       1 - opacity, 0, output_frame)

        # Label the bar near its bottom-left corner.
        cv.putText(output_frame, actions[num], (10, bar_bottom - space_height // 2),
                   cv.FONT_HERSHEY_SIMPLEX, font_scale, (255, 255, 255), font_thickness, cv.LINE_AA)

    return output_frame

In [35]:
# Shared lock acquired by speak_async so that only one call runs its body at a time.
speak_lock = asyncio.Lock()

In [36]:
import asyncio
import os
from gtts import gTTS
import tempfile
import sounddevice as sd
import soundfile as sf
import pygame


async def speak_async(words, on_done=None):
    """Synthesize ``words`` with gTTS (language code ``'id'``) and play the audio.

    Calls are serialized through ``speak_lock``. Both the gTTS synthesis and
    the blocking playback run in the event loop's default executor, so the
    loop itself is never blocked. The temporary MP3 is removed even when
    synthesis, decoding or playback fails.

    Args:
        words: Text to speak.
        on_done: Optional zero-argument callable invoked after successful
            playback. It runs in the executor's worker thread, not on the
            event loop.

    Raises:
        Whatever gTTS, soundfile or sounddevice raise; ``on_done`` is not
        called in that case.
    """

    def synthesize_and_play():
        # Reserve a unique path and close the handle before gTTS writes to it;
        # writing to a still-open temporary file is not portable.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as tmpfile:
            filename = tmpfile.name

        try:
            gTTS(text=words, lang='id').save(filename)

            data, fs = sf.read(filename, dtype='float32')
            sd.play(data, fs)
            sd.wait()  # Wait until file is played
        finally:
            # Always clean up, otherwise a failed playback leaks the temp file.
            try:
                os.unlink(filename)
            except FileNotFoundError:
                pass

        if on_done:
            on_done()

    async with speak_lock:
        # Run the blocking synthesis and playback in a separate thread.
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, synthesize_and_play)
        print("Audio has been played.")

In [37]:
# Sequence classifier. Input is 30 frames x 108 features; a per-frame dense
# projection feeds three stacked LSTMs, then two small dense layers and a
# softmax with one unit per entry in `actions`.
model = Sequential([
    TimeDistributed(
        Dense(units=128, activation='tanh'), input_shape=(30, 108)),
    LSTM(64, return_sequences=True, activation='tanh'),
    Dropout(0.5),
    LSTM(128, return_sequences=True, activation='tanh'),
    Dropout(0.5),
    # Last recurrent layer returns only its final state.
    LSTM(64, return_sequences=False, activation='tanh'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(actions.shape[0], activation='softmax'),
])

model.summary()

# model.load_weights('model-bimbingan7v4_2.h5')

# Load the saved weights into the architecture built above.
model.load_weights('model/model_11_lstm_11042024_97.h5')

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_distributed_3 (TimeDi  (None, 30, 128)           13952     
 stributed)                                                      
                                                                 
 lstm_7 (LSTM)               (None, 30, 64)            49408     
                                                                 
 dropout_11 (Dropout)        (None, 30, 64)            0         
                                                                 
 lstm_8 (LSTM)               (None, 30, 128)           98816     
                                                                 
 dropout_12 (Dropout)        (None, 30, 128)           0         
                                                                 
 lstm_9 (LSTM)               (None, 64)                49408     
                                                      

In [38]:
# Real-time inference from the default webcam.
#   sequence    - sliding window of the last 30 per-frame feature vectors
#   sentence    - last (up to 5) accepted words, without consecutive repeats
#   predictions - class index predicted for every evaluated window
#   threshold   - minimum probability for a prediction to be accepted
# Press "q" in the preview window to stop.
sequence = []
sentence = []
predictions = []
threshold = 0.5

cap = cv.VideoCapture(0)
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # A failed read returns frame=None, which would crash cvtColor.
                print("Failed to read a frame from the camera; stopping.")
                break

            image, results = media_pipe_detection(frame, holistic)
            draw_styled_handmarks(image, results)

            keypoints = extract_keypoints_normalize(results)
            sequence.append(keypoints)
            sequence = sequence[-30:]

            if len(sequence) == 30:
                sequence_array = np.array(sequence)
                # On an unexpected shape nothing is predicted or drawn; the old
                # else-branch drew `res` from an earlier iteration or cell.
                if sequence_array.shape == (30, 108):
                    res = model.predict(np.expand_dims(sequence_array, axis=0))[0]
                    best = np.argmax(res)
                    print(actions[best])
                    print(res)
                    print("")
                    predictions.append(best)

                    # Accept a word only while the recent predictions all agree.
                    # np.unique(...)[0] is the smallest recent class index, not
                    # an agreement check, so it let class 0 through unstably and
                    # delayed every other class.
                    recent = predictions[-10:]
                    is_stable = all(previous == best for previous in recent)

                    if is_stable and res[best] > threshold:
                        if len(sentence) == 0 or actions[best] != sentence[-1]:
                            sentence.append(actions[best])

                    if len(sentence) > 5:
                        sentence = sentence[-5:]

                    image = prob_viz(res, actions, image, colors)

            cv.imshow('OpenCV Feed', image)

            if cv.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Release the camera and close the window even if the loop raised,
    # otherwise the device stays locked until the kernel is restarted.
    cap.release()
    cv.destroyAllWindows()

standby
[9.9861884e-01 1.4411878e-06 1.6166511e-04 4.7594731e-05 7.3611061e-04
 1.1355637e-04 1.1465610e-04 6.0555656e-05 6.9699599e-05 5.5029494e-05
 2.1023654e-05]

standby
[9.9741369e-01 5.2405767e-06 2.1862962e-04 9.0320202e-05 1.4010014e-03
 2.7093384e-04 1.3428093e-04 1.6026663e-04 1.3599408e-04 1.2623047e-04
 4.3479708e-05]

standby
[9.7960073e-01 1.3599423e-04 5.3192273e-04 3.0185017e-04 1.2385285e-02
 2.0761092e-03 3.4137335e-04 1.7091032e-03 1.1180900e-03 1.2501621e-03
 5.4947654e-04]

standby
[9.7566926e-01 1.6094443e-04 5.7010993e-04 3.1605293e-04 1.5221776e-02
 2.2589373e-03 3.6379966e-04 2.0431450e-03 1.2808105e-03 1.4695246e-03
 6.4552552e-04]

standby
[0.4980403  0.00426541 0.004107   0.000892   0.36042473 0.02099037
 0.00207196 0.01296023 0.01161649 0.01866439 0.0659672 ]

ini
[9.8549752e-03 2.2107826e-03 6.8034874e-03 1.8208205e-04 4.3171561e-01
 2.4261749e-03 1.3950759e-03 1.0594631e-03 1.0706656e-02 9.8931082e-03
 5.2375257e-01]

ini
[1.3307764e-04 3.4209856e-04 1.3

In [39]:
# TEST WITH WINDOWING (15 FRAMES)

# sequence = []


# sentence = []


# predictions = []


# threshold = 0.5


# # Setup camera


# cap = cv.VideoCapture(0)


# # Initialize MediaPipe Holistic


# mp_holistic = mp.solutions.holistic  # Holistic model


# with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:


#     while cap.isOpened():


#         ret, frame = cap.read()


#         if not ret:


#             print("Ignoring empty camera frame.")


#             continue


#         image, results = media_pipe_detection(frame, holistic)


#         draw_styled_handmarks(image, results)


#         keypoints = extract_keypoints_normalize(results)


#         sequence.append(keypoints)


#         if len(sequence) == 30:


#             sequence_array = np.array(sequence)


#             if sequence_array.shape == (30, 108):  # Expected shape


#                 res = model.predict(np.expand_dims(sequence_array, axis=0))[0]


#                 print(actions[np.argmax(res)])
#                 print(res)


#                 print("")


#                 predictions.append(np.argmax(res))


#                 if np.unique(predictions[-10:])[0] == np.argmax(res):


#                     if res[np.argmax(res)] > threshold:


#                         if len(sentence) > 0 and actions[np.argmax(res)] != sentence[-1]:


#                             sentence.append(actions[np.argmax(res)])


#                         elif len(sentence) == 0:


#                             sentence.append(actions[np.argmax(res)])


#                 sequence = sequence[15:]


#                 sentence = sentence[-5:]


#         cv.imshow('OpenCV Feed', image)


#         if cv.waitKey(10) & 0xFF == ord('q'):


#             break

#     cap.release()


#     cv.destroyAllWindows()

In [40]:
# sequence = deque(maxlen=30)
# sentence = []
# predictions = []
# threshold = 0.5
# processed_frames = 0

# cap = cv.VideoCapture(0)
# with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
#     while cap.isOpened():
#         ret, frame = cap.read()

#         image, results = media_pipe_detection(frame, holistic)
#         draw_styled_handmarks(image, results)

#         keypoints = extract_keypoints_normalize(results)
#         sequence.append(keypoints)
#         # sequence = sequence[-30:]

#         if len(sequence) == 30 and (processed_frames == 0 or processed_frames >= 15):
#             sequence_array = np.array(sequence)
#             if sequence_array.shape == (30, 108):
#                 res = model.predict(np.expand_dims(sequence_array, axis=0))[0]
#                 print(actions[np.argmax(res)])
#                 print(res)
#                 print("")
#                 predictions.append(np.argmax(res))

#                 # if np.unique(predictions[-10:])[0] == np.argmax(res):
#                 #     if res[np.argmax(res)] > threshold:
#                 #         if len(sentence) > 0:
#                 #             if actions[np.argmax(res)] != sentence[-1]:
#                 #                 sentence.append(actions[np.argmax(res)])
#                 #         else:
#                 #             sentence.append(actions[np.argmax(res)])

#                 # if len(sentence) > 5:
#                 #     sentence = sentence[-5:]

#                 # image = prob_viz(res, actions, image, colors)
#                 image = prob_viz(res, actions[np.argmax(res)], image, colors)

#                 processed_frames = 0

#         processed_frames += 1

#         cv.imshow('OpenCV Feed', image)

#         if cv.waitKey(10) & 0xFF == ord('q'):
#             break

#     cap.release()
#     cv.destroyAllWindows()