In [26]:
import cv2 as cv
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
from mediapipe.python.solutions.pose import PoseLandmark
from mediapipe.python.solutions.drawing_utils import DrawingSpec
import math

In [2]:
import tensorflow as tf

# Report how many GPUs TensorFlow can see, then list every physical device.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))
print(tf.config.list_physical_devices())


Num GPUs Available:  0
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]


In [27]:
# Short aliases for the MediaPipe solution modules used by the cells below:
# holistic (the tracking model), drawing_utils / drawing_styles (rendering
# helpers) and pose (source of the pose connection set).
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_pose = mp.solutions.pose

In [28]:
#detect using mediapipe model
def media_pipe_detection(image, model):
    """Run a MediaPipe model on a single BGR frame.

    Args:
        image: Frame in BGR channel order; it is converted to RGB before
            being handed to the model.
        model: Object exposing ``process(rgb_image)``.

    Returns:
        A ``(bgr_image, results)`` tuple: the frame converted back to BGR and
        whatever ``model.process`` returned.
    """
    rgb_frame = cv.cvtColor(image, cv.COLOR_BGR2RGB)

    # The buffer is flagged read-only for the duration of the model call and
    # made writable again right after.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    return cv.cvtColor(rgb_frame, cv.COLOR_RGB2BGR), results

In [29]:
#draw landmarks without style
def draw_land_marks(image, results):
    """Draw pose and hand landmarks on ``image`` with the default styling.

    Pose connections that touch any face, hip or leg landmark are dropped
    before drawing; both hands use the full hand connection set. Face
    landmarks are not drawn.

    Args:
        image: Frame that ``mp_drawing.draw_landmarks`` draws onto.
        results: Object exposing ``pose_landmarks``, ``left_hand_landmarks``
            and ``right_hand_landmarks``.
    """
    hidden_landmarks = (
        # face
        PoseLandmark.NOSE,
        PoseLandmark.LEFT_EYE_INNER,
        PoseLandmark.LEFT_EYE,
        PoseLandmark.LEFT_EYE_OUTER,
        PoseLandmark.RIGHT_EYE_INNER,
        PoseLandmark.RIGHT_EYE,
        PoseLandmark.RIGHT_EYE_OUTER,
        PoseLandmark.LEFT_EAR,
        PoseLandmark.RIGHT_EAR,
        PoseLandmark.MOUTH_LEFT,
        PoseLandmark.MOUTH_RIGHT,
        # hips and legs
        PoseLandmark.LEFT_HIP,
        PoseLandmark.RIGHT_HIP,
        PoseLandmark.LEFT_KNEE,
        PoseLandmark.RIGHT_KNEE,
        PoseLandmark.LEFT_ANKLE,
        PoseLandmark.RIGHT_ANKLE,
        PoseLandmark.LEFT_HEEL,
        PoseLandmark.RIGHT_HEEL,
        PoseLandmark.LEFT_FOOT_INDEX,
        PoseLandmark.RIGHT_FOOT_INDEX,
    )
    hidden_ids = {landmark.value for landmark in hidden_landmarks}

    # Keep only the connections whose two endpoints are both still visible.
    custom_pose_connections = [
        edge for edge in mp_pose.POSE_CONNECTIONS
        if hidden_ids.isdisjoint(edge)
    ]

    mp_drawing.draw_landmarks(image, results.pose_landmarks, connections=custom_pose_connections)
    for hand_landmarks in (results.left_hand_landmarks, results.right_hand_landmarks):
        mp_drawing.draw_landmarks(image, hand_landmarks, mp_holistic.HAND_CONNECTIONS)

In [30]:
#draw landmarks with style (different color)
def draw_styled_handmarks(image, results):
    """Draw pose and hand landmarks on ``image`` with a distinct colour per part.

    The pose, the left hand and the right hand are drawn in that order, each
    with its own landmark colour (thickness 2, radius 4) and connection colour
    (thickness 2, radius 2). Face landmarks are not drawn.

    Args:
        image: Frame that ``mp_drawing.draw_landmarks`` draws onto.
        results: Object exposing ``pose_landmarks``, ``left_hand_landmarks``
            and ``right_hand_landmarks``.
    """
    make_spec = mp_drawing.DrawingSpec

    # (landmarks, connections, landmark colour, connection colour)
    layers = (
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS, (80, 22, 10), (80, 44, 121)),
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, (121, 22, 76), (121, 44, 250)),
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, (245, 117, 66), (245, 66, 230)),
    )

    for landmarks, connections, point_color, line_color in layers:
        mp_drawing.draw_landmarks(
            image,
            landmarks,
            connections,
            make_spec(color=point_color, thickness=2, circle_radius=4),
            make_spec(color=line_color, thickness=2, circle_radius=2),
        )

In [31]:
# def extract_keypoints_normalize(results):
#     # face = np.array([[res.x, res.y] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*2)

#     # pose = np.array([[res.x, res.y] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*2)
#     shoulder_width = 1
#     midpoint_shoulder_x = 1
#     midpoint_shoulder_y = 1

#     if results.pose_landmarks:
#         selected_pose_landmarks = results.pose_landmarks.landmark[11:23]
        
#         midpoint_shoulder_x = (selected_pose_landmarks[0].x + selected_pose_landmarks[1].x) / 2
#         midpoint_shoulder_y = (selected_pose_landmarks[0].y + selected_pose_landmarks[1].y) / 2

#         shoulder_width = math.sqrt(pow((selected_pose_landmarks[1].x - selected_pose_landmarks[0].x) , 2) + pow((selected_pose_landmarks[1].y - selected_pose_landmarks[0].y) ,2))

#         pose = np.array([[(res.x - midpoint_shoulder_x) / shoulder_width, (res.y - midpoint_shoulder_y) / shoulder_width] for res in selected_pose_landmarks]).flatten()
#     else:
#         pose = np.zeros(22*2)

        
#     left_hand = np.array([[(res.x - midpoint_shoulder_x) / shoulder_width, (res.y - midpoint_shoulder_y) / shoulder_width] for res in  results.left_hand_landmarks]).flatten() if results.left_hand_landmarks else np.zeros(21*2)
#     right_hand = np.array([[(res.x - midpoint_shoulder_x) / shoulder_width, (res.y - midpoint_shoulder_y) / shoulder_width] for res in results.right_hand_landmarks]).flatten() if results.right_hand_landmarks else np.zeros(21*2)
   
#     # return np.concatenate([pose, face, left_hand, right_hand])
#     return np.concatenate([pose, left_hand, right_hand])

def extract_keypoints_normalize(results):
    """Build a shoulder-normalised (x, y) feature vector for one frame.

    Every coordinate is translated so the midpoint between the two shoulders
    (pose landmarks 11 and 12) is the origin, then divided by the distance
    between the shoulders. When no pose is detected the origin stays at
    (0, 0) and the scale at 1, so hand coordinates pass through unchanged.

    Args:
        results: Object exposing ``pose_landmarks``, ``left_hand_landmarks``
            and ``right_hand_landmarks``; each is either falsy or has a
            ``landmark`` sequence of points with ``x`` and ``y`` attributes.

    Returns:
        1-D ``numpy`` array of length 108: 12 pose landmarks (indices 11-22),
        then 21 left-hand and 21 right-hand landmarks, two values each.
        Missing parts are zero-filled, so the length is always the same.
    """
    num_pose_points = 12   # pose landmarks 11..22 inclusive
    num_hand_points = 21

    origin_x, origin_y = 0.0, 0.0
    scale = 1.0

    def normalise(points):
        # Flattened [x0, y0, x1, y1, ...] relative to the shoulder frame.
        return np.array([[(point.x - origin_x) / scale,
                          (point.y - origin_y) / scale] for point in points]).flatten()

    if results.pose_landmarks:
        pose_points = results.pose_landmarks.landmark
        left_shoulder, right_shoulder = pose_points[11], pose_points[12]

        origin_x = (left_shoulder.x + right_shoulder.x) / 2
        origin_y = (left_shoulder.y + right_shoulder.y) / 2

        shoulder_length = math.sqrt((left_shoulder.x - right_shoulder.x) ** 2
                                    + (left_shoulder.y - right_shoulder.y) ** 2)
        # Coincident shoulders would otherwise divide by zero; keep scale 1.
        if shoulder_length > 0:
            scale = shoulder_length

        pose = normalise(pose_points[11:23])
    else:
        # Must match the 12 selected landmarks (24 values), not 22, or the
        # feature vector no longer has the length the model was built for.
        pose = np.zeros(num_pose_points * 2)

    if results.left_hand_landmarks:
        left_hand = normalise(results.left_hand_landmarks.landmark)
    else:
        left_hand = np.zeros(num_hand_points * 2)

    if results.right_hand_landmarks:
        right_hand = normalise(results.right_hand_landmarks.landmark)
    else:
        right_hand = np.zeros(num_hand_points * 2)

    return np.concatenate([pose, left_hand, right_hand])

def extract_keypoints(results):
    """Build the raw (un-normalised) (x, y) feature vector for one frame.

    Args:
        results: Object exposing ``pose_landmarks``, ``left_hand_landmarks``
            and ``right_hand_landmarks``; each is either falsy or has a
            ``landmark`` sequence of points with ``x`` and ``y`` attributes.

    Returns:
        1-D ``numpy`` array of length 108: 12 pose landmarks (indices 11-22),
        then 21 left-hand and 21 right-hand landmarks, two values each.
        Missing parts are zero-filled, so the length is always the same.
    """
    num_pose_points = 12   # pose landmarks 11..22 inclusive
    num_hand_points = 21

    def flatten_xy(points):
        # Flattened [x0, y0, x1, y1, ...] in landmark order.
        return np.array([[point.x, point.y] for point in points]).flatten()

    if results.pose_landmarks:
        pose = flatten_xy(results.pose_landmarks.landmark[11:23])
    else:
        # Must match the 12 selected landmarks (24 values), not 22, or the
        # feature vector no longer has a fixed length.
        pose = np.zeros(num_pose_points * 2)

    if results.left_hand_landmarks:
        left_hand = flatten_xy(results.left_hand_landmarks.landmark)
    else:
        left_hand = np.zeros(num_hand_points * 2)

    if results.right_hand_landmarks:
        right_hand = flatten_xy(results.right_hand_landmarks.landmark)
    else:
        right_hand = np.zeros(num_hand_points * 2)

    return np.concatenate([pose, left_hand, right_hand])

In [32]:
def extract_coordinate(results):
    """Print the x/y coordinates of the tracked landmarks, one per line.

    Output order is: pose landmarks 11-22, then the right hand, then the left
    hand. Any part that was not detected is skipped.

    Args:
        results: Object exposing ``pose_landmarks``, ``right_hand_landmarks``
            and ``left_hand_landmarks``; each is either falsy or has a
            ``landmark`` sequence of points with ``x`` and ``y`` attributes.
    """
    pose = results.pose_landmarks
    if pose:
        for point in pose.landmark[11:23]:
            x, y = point.x, point.y
            print(f"POSE LANDMARK x: {x}, y: {y}\n")

    right_hand = results.right_hand_landmarks
    if right_hand:
        for point in right_hand.landmark:
            x, y = point.x, point.y
            print(f"RIGHT HAND LANDMARK x: {x}, y: {y}\n")

    left_hand = results.left_hand_landmarks
    if left_hand:
        for point in left_hand.landmark:
            x, y = point.x, point.y
            print(f"LEFT HAND LANDMARK x: {x}, y: {y}\n")

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from keras.utils import to_categorical
import tensorflow as tf
import numpy as np
import os
import cv2 as cv
import mediapipe as mp
from matplotlib import pyplot as plt
from mediapipe.python.solutions.pose import PoseLandmark

In [34]:
# Dataset configuration (for the upgraded sequences).
# Alternative dataset root, currently disabled:
# DATA_PATH = os.path.join('Sign_Data_Upgrade')
DATA_PATH = os.path.join('My_Datasets')

# Sign labels; the position of each label in this array becomes its class id.
# (Indonesian: sorry, please/help, thank you, name, I/me, you, who.)
actions = np.array(['maaf', 'tolong', "terimakasih", "nama", "saya", "kamu", "siapa"])

# actions = np.array(['terimakasih'])

# actions = np.array(['maaf', 'tolong', "terimakasih", "nama", "saya", "kamu", "siapa"])

# Number of videos (sequences) used
no_sequences = 30

# Each video contains 30 frames
sequence_length = 30

In [35]:
# Map each action label to its integer class id (its position in `actions`).
label_map = {action_name: class_id for class_id, action_name in enumerate(actions)}

In [36]:
# Load every recorded sequence from disk: `sequences` gets one list of
# `sequence_length` per-frame feature arrays per recording, and `labels` gets
# the matching class id at the same index.
sequences, labels = [], []
for action in actions:
    action_dir = os.path.join(DATA_PATH, action)

    # Sequence folders are named by integer id. Skip anything else (hidden
    # files, checkpoint folders) instead of crashing on the int conversion,
    # and sort so the load order does not depend on the filesystem listing.
    sequence_ids = sorted(int(entry) for entry in os.listdir(action_dir) if entry.isdecimal())

    for sequence in sequence_ids:
        window = []
        for frame_num in range(sequence_length):
            # res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num)))
            res = np.load(os.path.join(action_dir, str(sequence), "{}-norm.npy".format(frame_num)))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

In [37]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, TimeDistributed
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint

In [39]:
# SIMPLE MODEL -> MAIN MODEL
# Layer stack: a per-frame Dense projection, one LSTM that collapses the
# sequence, then a small dense head with dropout and a softmax over the actions.
# Input is (30, 108).
# NOTE(review): `input_shape` on the LSTM looks redundant because it is not the
# first layer -- confirm before removing, since weights are loaded into this model.
model = Sequential([
    TimeDistributed(Dense(units=128, activation='tanh'), input_shape=(30, 108)),
    LSTM(64, return_sequences=False, activation='tanh', input_shape=(30,108)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(actions.shape[0], activation='softmax'),
])

In [12]:
# # SIMPLE MODEL -> MAIN MODEL
# model = Sequential()

# model.add(LSTM(64, return_sequences=False, activation='tanh', input_shape=(30,108)))
# model.add(Dropout(0.5))
# model.add(Dense(64, activation='relu'))
# model.add(Dropout(0.1))
# model.add(Dense(actions.shape[0], activation='softmax'))

In [40]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_distributed_2 (TimeDi  (None, 30, 128)           13952     
 stributed)                                                      
                                                                 
 lstm_2 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_4 (Dropout)         (None, 64)                0         
                                                                 
 dense_7 (Dense)             (None, 64)                4160      
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                                 
 dense_8 (Dense)             (None, 7)                 455       
                                                      

In [41]:
# Load weights from 'test.h5' (path relative to the working directory) into
# the model built above.
model.load_weights('test.h5')

In [42]:
from scipy import stats

In [46]:
# One bar colour per action, indexed by class id in prob_viz.
# The tuples are handed to OpenCV drawing calls on a BGR frame, so they are
# read as (B, G, R); the names below describe how each colour appears on screen.
colors = [
    (245, 117, 16),  # Bright Blue
    (117, 245, 16),  # Teal
    (16, 117, 245),  # Orange
    (245, 16, 117),  # Purple
    (16, 245, 117),  # Lime Green
    (117, 16, 245),  # Pink
    (245, 245, 16)   # Cyan
]

# def prob_viz(res, actions, input_frame, colors):
#     output_frame = input_frame.copy()
#     for num, prob in enumerate(res):
#         cv.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
#         cv.putText(output_frame, actions[num], (0, 85+num*40), cv.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv.LINE_AA)
        
#     return output_frame

def prob_viz(res, actions, input_frame, colors, frame_height=480, frame_width=640):
    """Overlay one horizontal probability bar per action on a copy of the frame.

    The bars are stacked vertically, separated by a 4-pixel gap, and share
    ``frame_height`` equally. Each bar is labelled with its action name.

    Args:
        res: Iterable of per-action probabilities; bar ``i`` is
            ``int(prob * frame_width)`` pixels wide.
        actions: Sequence of labels, indexed in step with ``res``.
        input_frame: Image to annotate; it is copied, never modified.
        colors: Non-empty sequence of colour tuples. Colours are reused
            cyclically when there are more bars than colours.
        frame_height: Vertical space in pixels divided between the bars.
        frame_width: Width in pixels of a bar whose probability is 1.0.

    Returns:
        The annotated copy of ``input_frame``. When ``actions`` is empty the
        copy is returned without any drawing.
    """
    output_frame = input_frame.copy()

    num_actions = len(actions)
    if num_actions == 0:
        # Nothing to draw; also avoids dividing the height by zero below.
        return output_frame

    space_height = 4
    total_space_height = (num_actions + 1) * space_height
    bar_height = (frame_height - total_space_height) // num_actions

    # Scale the label with the bar so the text stays inside it.
    font_scale = max(0.4, bar_height / 25)
    font_thickness = max(1, int(font_scale * 1.5))

    for num, prob in enumerate(res):
        bar_top = space_height + num * (bar_height + space_height)
        bar_bottom = bar_top + bar_height
        # Wrap around the palette rather than failing on a short colour list.
        bar_color = colors[num % len(colors)]

        cv.rectangle(output_frame, (0, bar_top), (int(prob * frame_width), bar_bottom), bar_color, -1)
        cv.putText(output_frame, actions[num], (10, bar_bottom - space_height // 2),
                   cv.FONT_HERSHEY_SIMPLEX, font_scale, (255, 255, 255), font_thickness, cv.LINE_AA)

    return output_frame


In [47]:
# Realtime inference: read webcam frames, keep a sliding window of per-frame
# features, classify the window and overlay the class probabilities.
# Press 'q' in the preview window to stop.
sequence = []      # most recent per-frame feature vectors (at most sequence_length)
sentence = []      # recognised actions, consecutive duplicates collapsed (at most 5)
predictions = []   # class ids of the most recent predictions (at most 10)
threshold = 0.5    # minimum probability for a prediction to be accepted

cap = cv.VideoCapture(0)
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():

            ret, frame = cap.read()
            if not ret:
                # No frame was delivered; `frame` is not usable, so stop
                # instead of passing it on to the colour conversion.
                break

            image, results = media_pipe_detection(frame, holistic)

            draw_styled_handmarks(image, results)

            keypoints = extract_keypoints_normalize(results)
            # keypoints = extract_keypoints(results)
            # print(keypoints.shape)

            sequence.append(keypoints)
            sequence = sequence[-sequence_length:]

            if len(sequence) == sequence_length:
                res = model.predict(np.expand_dims(sequence, axis=0))[0]
                best = int(np.argmax(res))
                print(res)
                print(actions[best])

                predictions.append(best)
                predictions = predictions[-10:]   # only the recent window is ever inspected

                # Accept a prediction only when every recent prediction agrees
                # with it. (np.unique(...)[0] is merely the smallest class id
                # seen in the window, not evidence of agreement.)
                is_stable = all(previous == best for previous in predictions)

                if is_stable and res[best] > threshold:
                    if not sentence or actions[best] != sentence[-1]:
                        sentence.append(actions[best])

                if len(sentence) > 5:
                    sentence = sentence[-5:]

                image = prob_viz(res, actions, image, colors)

            # cv.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
            # cv.putText(image, ' '.join(sentence), (3,30),
            #                cv.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv.LINE_AA)

            cv.imshow('OpenCV Feed', image)

            if cv.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the preview window, including after an
    # exception or a kernel interrupt.
    cap.release()
    cv.destroyAllWindows()

[5.3235836e-04 2.8263661e-03 9.7230500e-01 1.7436707e-03 2.8515221e-03
 9.9398419e-03 9.8012993e-03]
terimakasih
[5.3300872e-04 2.8227209e-03 9.7231638e-01 1.7431341e-03 2.8538296e-03
 9.9421907e-03 9.7886855e-03]
terimakasih
[5.3249719e-04 2.8234494e-03 9.7231323e-01 1.7444289e-03 2.8551202e-03
 9.9406652e-03 9.7906757e-03]
terimakasih
[5.3210050e-04 2.8630949e-03 9.7211218e-01 1.7551793e-03 2.9029602e-03
 9.9520972e-03 9.8823169e-03]
terimakasih
[5.2818900e-04 2.9221070e-03 9.7194189e-01 1.7679760e-03 2.9483864e-03
 9.9359974e-03 9.9553606e-03]
terimakasih
[5.31894795e-04 2.96949432e-03 9.71679807e-01 1.78758544e-03
 2.99526844e-03 1.00205345e-02 1.00154281e-02]
terimakasih
[5.39500616e-04 2.97441194e-03 9.71602857e-01 1.80143374e-03
 3.00335465e-03 1.00891385e-02 9.98932868e-03]
terimakasih
[5.4306153e-04 2.9652403e-03 9.7158271e-01 1.8064586e-03 3.0028815e-03
 1.0139825e-02 9.9598281e-03]
terimakasih
[5.4622878e-04 2.9766860e-03 9.7146386e-01 1.8152024e-03 3.0071994e-03
 1.0198568e