# **Import and Install Dependencies**

In [4]:
import cv2 as cv
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
from mediapipe.python.solutions.pose import PoseLandmark
from mediapipe.python.solutions.drawing_utils import DrawingSpec

# **Access using Using MediaPipe**

In [5]:
#define mediapipe holistic and drawing utils
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_pose = mp.solutions.pose

In [6]:
#detect using mediapipe model
def media_pipe_detection(image, model):
    image = cv.cvtColor(image, cv.COLOR_BGR2RGB) 
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv.cvtColor(image, cv.COLOR_RGB2BGR) 
    return image, results

In [7]:
#draw landmarks without style
def draw_land_marks(image, results):
    # mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION)
    
    # mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)

    custom_pose_connections = list(mp_pose.POSE_CONNECTIONS)
    
    excluded_landmarks = [
        PoseLandmark.NOSE,
        PoseLandmark.LEFT_EYE_INNER,
        PoseLandmark.LEFT_EYE,
        PoseLandmark.LEFT_EYE_OUTER,
        PoseLandmark.RIGHT_EYE_INNER,
        PoseLandmark.RIGHT_EYE,
        PoseLandmark.RIGHT_EYE_OUTER,
        PoseLandmark.LEFT_EAR,
        PoseLandmark.RIGHT_EAR,
        PoseLandmark.MOUTH_LEFT,
        PoseLandmark.MOUTH_RIGHT,
        PoseLandmark.LEFT_HIP,
        PoseLandmark.RIGHT_HIP,
        PoseLandmark.LEFT_KNEE,
        PoseLandmark.RIGHT_KNEE,
        PoseLandmark.LEFT_ANKLE,
        PoseLandmark.RIGHT_ANKLE,
        PoseLandmark.LEFT_HEEL,
        PoseLandmark.RIGHT_HEEL,
        PoseLandmark.LEFT_FOOT_INDEX,
        PoseLandmark.RIGHT_FOOT_INDEX
    ]

    for landmark in excluded_landmarks:
        custom_pose_connections = [connection_tuple for connection_tuple in custom_pose_connections if landmark.value not in connection_tuple]

    mp_drawing.draw_landmarks(image, results.pose_landmarks, connections=custom_pose_connections)
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

In [8]:
#draw landmarks with style (different color)
def draw_styled_handmarks(image, results):
    # mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION , 
    #                          mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1), 
    #                          mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)
    #                          ) 
   
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2),
                             ) 
      
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             ) 
     
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             ) 

In [None]:
cap = cv.VideoCapture(0) 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()

        image, results = media_pipe_detection(frame, holistic)
        draw_land_marks(image, results)

        # FOR BLACK BACKGROUND
        # image, results = media_pipe_detection(frame, holistic)
        # black_bg = np.zeros((frame.shape[0], frame.shape[1], frame.shape[2]))
        # draw_land_marks(black_bg, results)
        
        cv.imshow('Media Pipe Test', image)
        
        if(cv.waitKey(10) & 0xFF == ord(' ')):
            break

cap.release()
cv.destroyAllWindows()

In [None]:
plt.imshow(cv.cvtColor(image, cv.COLOR_BGR2RGB))

In [None]:
print(frame.shape)

In [None]:
cap = cv.VideoCapture(0) 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()

        black_bg = np.zeros((frame.shape[0], frame.shape[1], frame.shape[2]))

        image, results = media_pipe_detection(frame, holistic)
        
        draw_land_marks(black_bg, results)

        cv.imshow('Media Pipe Test', black_bg)
        
        if(cv.waitKey(10) & 0xFF == ord('q')):
            break

cap.release()
cv.destroyAllWindows()

# **Extract Keypoint Values**

In [None]:
# len(pose)

# only use x and y coordinates
# body pose only for shoulder to arm only
# currently NOT implementing face pose

12*2 + 21*2*2

In [9]:
def extract_keypoints(results):
    # face = np.array([[res.x, res.y] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*2)

    # pose = np.array([[res.x, res.y] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*2)

    if results.pose_landmarks:
        selected_pose_landmarks = results.pose_landmarks.landmark[11:23]
        pose = np.array([[res.x, res.y] for res in selected_pose_landmarks]).flatten()
    else:
        pose = np.zeros(22*2)

        
    left_hand = np.array([[res.x, res.y] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*2)
    right_hand = np.array([[res.x, res.y] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*2)
   
    # return np.concatenate([pose, face, left_hand, right_hand])
    return np.concatenate([pose, left_hand, right_hand])

In [10]:
def extract_coordinate(results):
    # if results.face_landmarks:
    #     for res in results.face_landmarks.landmark:
    #         x = res.x
    #         y = res.y
    #         visibility = res.visibility
    #         print(f"FACE LANDMARK x: {x}, y: {y}\n")    
        
    # NORMAL POSE LANDMARK
    # if results.pose_landmarks:
    #     for res in results.pose_landmarks.landmark:
    #         x = res.x
    #         y = res.y
    #         print(f"POSE LANDMARK x: {x}, y: {y}\n")

    if results.pose_landmarks:
        selected_pose_landmarks = results.pose_landmarks.landmark[11:23]
        for res in selected_pose_landmarks:
            x = res.x
            y = res.y
            print(f"POSE LANDMARK x: {x}, y: {y}\n")
        
    if results.right_hand_landmarks:
        for res in results.right_hand_landmarks.landmark:
            x = res.x
            y = res.y
            print(f"RIGHT HAND LANDMARK x: {x}, y: {y}\n")
    if results.left_hand_landmarks:
        for res in results.left_hand_landmarks.landmark:
            x = res.x
            y = res.y
            print(f"LEFT HAND LANDMARK x: {x}, y: {y}\n")    

In [None]:
extract_coordinate(results)

In [None]:
extract_keypoints(results)

In [None]:
len(extract_keypoints(results))

# **Setup Folders for Collection**

In [None]:
# DATA_PATH = os.path.join('Sign_Data')

# actions = np.array(['maaf', 'tolong'])

# # menggunakan 30 video
# no_sequences = 30

# # setiap video berisi 30 frame
# sequence_length = 30

# start_folder = 30

In [18]:
#FOR UPGRADED SEQUENCE
DATA_PATH = os.path.join('Sign_Data_Upgrade')

actions = np.array(['maaf', 'tolong'])

# menggunakan 60 video
no_sequences = 40

# setiap video berisi 30 frame
sequence_length = 30

In [None]:
for action in actions: 
    for sequence in range(no_sequences):
        try: 
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
        except:
            pass

In [None]:
# FOR TESTING TEMP DATA
TEMP_DATA_PATH = os.path.join('data_temp')

test_actions = np.array(['test'])

In [None]:
for action in test_actions: 
    for sequence in range(no_sequences):
        try: 
            os.makedirs(os.path.join(TEMP_DATA_PATH, action, str(sequence)))
        except:
            print("Hello")
            pass

In [None]:
# FOR CHANGING FOLDER NUMBER
DATA_PATH_CHANGE = os.path.join(r'C:\Users\krisn\OneDrive\Desktop\Learning\machine-learning-study\testing-space\Sign_Data_2')

temp = 59

for action in np.array(['tolong']):
    for new_sequence in range(sequence):
        PATH_OLD = os.path.join(DATA_PATH_CHANGE, action, str(temp))
        PATH_NEW = os.path.join(DATA_PATH_CHANGE, action, str(new_sequence+30))

        print(PATH_OLD + "\n")
        print(PATH_NEW)
        print("------------")
        os.rename(PATH_OLD, PATH_NEW)
        temp += 1

# **Collect Keypoint Values for Training and Testing**

In [None]:
cap = cv.VideoCapture(0)
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:

    for action in actions:
        for sequence in range(no_sequences):
            for frame_num in range(sequence_length):

                ret, frame = cap.read()

                if frame_num == 0: 
                    cv.putText(image, 'STARTING COLLECTION', (120,200), 
                               cv.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv.LINE_AA)
                    cv.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12), 
                               cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2, cv.LINE_AA)
                    cv.imshow('OpenCV Feed', image)
                    cv.waitKey(2000)
                else: 
                    cv.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12), 
                               cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2, cv.LINE_AA)
                    cv.imshow('OpenCV Feed', image)
                black_bg = np.zeros((frame.shape[0], frame.shape[1], frame.shape[2]))

                image, results = media_pipe_detection(frame, holistic)

                draw_land_marks(image, results)
                draw_land_marks(black_bg, results)

                cv.imwrite(os.path.join(DATA_PATH, action, str(sequence), f"{frame_num}.jpg"), image)
                cv.imwrite(os.path.join(DATA_PATH, action, str(sequence), f"{frame_num}-black.jpg"), black_bg)
                
                keypoints = extract_keypoints(results)
                npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                np.save(npy_path, keypoints)

                if cv.waitKey(10) & 0xFF == ord('q'):
                    break
                    
    cap.release()
    cv.destroyAllWindows()

In [None]:
#FOR CREATING TEMP DATA
cap = cv.VideoCapture(0)
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:

    for action in test_actions:
        for sequence in range(no_sequences):
            for frame_num in range(sequence_length):

                ret, frame = cap.read()

                black_bg = np.zeros((frame.shape[0], frame.shape[1], frame.shape[2]))

                image, results = media_pipe_detection(frame, holistic)

                draw_land_marks(image, results)
                draw_land_marks(black_bg, results)
                
                if frame_num == 0: 
                    cv.putText(image, 'STARTING COLLECTION', (120,200), 
                               cv.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv.LINE_AA)
                    cv.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12), 
                               cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2, cv.LINE_AA)
                    cv.imshow('OpenCV Feed', image)
                    cv.waitKey(2000)
                else: 
                    cv.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12), 
                               cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2, cv.LINE_AA)
                    cv.imshow('OpenCV Feed', image)
                
                keypoints = extract_keypoints(results)
                npy_path = os.path.join(TEMP_DATA_PATH, action, str(sequence), str(frame_num))
                np.save(npy_path, keypoints)
                
                cv.imwrite(os.path.join(TEMP_DATA_PATH, action, str(sequence), f"{frame_num}.jpg"), image)
                cv.imwrite(os.path.join(TEMP_DATA_PATH, action, str(sequence), f"{frame_num}-black.jpg"), black_bg)

                if cv.waitKey(10) & 0xFF == ord('q'):
                    break
                    
    cap.release()
    cv.destroyAllWindows()

In [None]:
cap.release()
cv.destroyAllWindows()

# **Preprocess Data and Create Labels and Features**

In [19]:
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
import tensorflow as tf
import keras

In [20]:
label_map = {label:num for num, label in enumerate(actions)}

In [21]:
label_map

{'maaf': 0, 'tolong': 1}

In [22]:
sequences, labels = [], []
for action in actions:
    for sequence in np.array(os.listdir(os.path.join(DATA_PATH, action))).astype(int):
        window = []
        for frame_num in range(sequence_length):
            res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num)))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

In [None]:
np.array(sequences).shape

In [None]:
np.array(labels).shape

In [None]:
X = np.array(sequences)

In [None]:
X.shape

In [None]:
y = to_categorical(labels).astype(int)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

# **Build and Train LSTM Neural Network**

In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import TensorBoard

In [None]:
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

In [None]:
# SIMPLE MODEL
model = Sequential()

model.add(LSTM(32, return_sequences=True, activation='tanh', input_shape=(30,108)))
model.add(LSTM(32, return_sequences=False, activation='tanh'))
model.add(Dense(16, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))

In [None]:
# COMPLEX MODEL
model = Sequential()

model.add(LSTM(64, return_sequences=True, activation='tanh', input_shape=(30,108)))
model.add(LSTM(128, return_sequences=True, activation='tanh'))
model.add(LSTM(64, return_sequences=False, activation='tanh'))
# model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))

In [25]:
# COMPLEX MODEL - USED
model = Sequential()

model.add(LSTM(128, return_sequences=True, activation='tanh', input_shape=(30,108)))
model.add(LSTM(64, return_sequences=True, activation='tanh'))
model.add(LSTM(32, return_sequences=False, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(actions.shape[0], activation='softmax'))

In [None]:
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [None]:
model.summary()

In [None]:
model.fit(X_train, y_train, epochs=25, callbacks=[tb_callback])

In [None]:
model.summary()

In [None]:
plt.plot(model.history['acc'])
plt.plot(model.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

# **Make Predictions**

In [None]:
res = model.predict(X_test)

In [None]:
actions[np.argmax(res[10])]

In [None]:
actions[np.argmax(y_test[10])]

# **Save Weights**

In [None]:
model.save('action.h5')

In [None]:
# del model

In [26]:
model.load_weights('action.h5')

# **Evaluation using Confusion Matrix and Accuracy**

In [27]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_multilabel_classification
from sklearn.tree import DecisionTreeClassifier

In [None]:
yhat = model.predict(X_test)

In [None]:
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(yhat, axis=1).tolist()

In [None]:
multilabel_confusion_matrix(ytrue, yhat)

In [None]:
accuracy_score(ytrue, yhat)

# **Test in Real Time**

In [28]:
from scipy import stats

In [29]:
colors = [(245,117,16), (117,245,16), (16,117,245)]
def prob_viz(res, actions, input_frame, colors):
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        cv.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
        cv.putText(output_frame, actions[num], (0, 85+num*40), cv.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv.LINE_AA)
        
    return output_frame

In [None]:
# plt.figure(figsize=(18,18))
# plt.imshow(prob_viz(res, actions, image, colors))

In [31]:
sequence = []
sentence = []
predictions = []
threshold = 0.55

cap = cv.VideoCapture(0)
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        ret, frame = cap.read()

        image, results = media_pipe_detection(frame, holistic)
        
        draw_styled_handmarks(image, results)
        
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]
        
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            print(actions[np.argmax(res)])
            predictions.append(np.argmax(res))
            
            if np.unique(predictions[-10:])[0]==np.argmax(res): 
                if res[np.argmax(res)] > threshold: 
                    
                    if len(sentence) > 0: 
                        if actions[np.argmax(res)] != sentence[-1]:
                            sentence.append(actions[np.argmax(res)])
                    else:
                        sentence.append(actions[np.argmax(res)])

            if len(sentence) > 5: 
                sentence = sentence[-5:]
                
            image = prob_viz(res, actions, image, colors)
            
        cv.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
        cv.putText(image, ' '.join(sentence), (3,30), 
                       cv.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv.LINE_AA)
        
        cv.imshow('OpenCV Feed', image)

        if cv.waitKey(10) & 0xFF == ord('q'):
            break
    cap.release()
    cv.destroyAllWindows()

tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
maaf
maaf
maaf
maaf
maaf
maaf
maaf
maaf
maaf
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tolong
tol

  a = asanyarray(a)


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).

: 