# Combining dynamic gestures training data 

In [2]:
from pathlib import Path
import shutil

# merge main dataset with smaller datasets for each individual word
def merge_datasets(first_filepath, second_filepath):
    folder_A = Path(first_filepath)  
    folder_B = Path(second_filepath) 

    existing_folders = [int(f.name) for f in folder_A.iterdir() if f.is_dir()]
    max_index = max(existing_folders) if existing_folders else -1  

    for subfolder in sorted(folder_B.iterdir()):
        if subfolder.is_dir():  
            new_index = max_index + 1  
            new_name = folder_A / str(new_index)  
            shutil.move(str(subfolder), str(new_name))
            max_index += 1  


# Adding to dataset for dynamic gestures

In [3]:
import numpy as np
X_train = []
y_train = []  

# building x_train, y_train for a specific folder_path containing either dynamic/static data (e.g mp_data/ mp_data_num_alphabet), which will be labelled with the label argument
def append_training_data(folder_path,label):
    folder_path = Path(folder_path)
    for action_folder in folder_path.iterdir():
        if not action_folder.is_dir():
            continue
        data_samples = []
        for subfolder in action_folder.iterdir():
            if subfolder.is_dir() and subfolder.name != ".DS_Store":  
                sequence_arr = []
                for file_path in sorted(subfolder.glob("*.npy")):  
                    data = np.load(file_path)
                    sequence_arr.append(data)
                if sequence_arr:
                    data_samples.append(np.stack(sequence_arr))
        if data_samples:
            new_data = np.stack(data_samples)
            X_train.append(new_data)
            num_sequences = new_data.shape[0]
            y_train.extend([label] * num_sequences)

append_training_data("mp_data",1)
append_training_data("mp_data_num_alphabet",0)
if len(X_train) > 0:
    X_train = np.concatenate(X_train, axis=0)  
    y_train = np.array(y_train)  

print(f"Final X_train shape: {X_train.shape}")  # (num_samples, num_frames, feature_dim)
print(f"Final y_train shape: {y_train.shape}")  # (num_samples,)



Final X_train shape: (1840, 60, 1662)
Final y_train shape: (1840,)


In [4]:
import numpy as np

# Count occurrences of each label (works if labels are 0 and 1)
category_counts = np.bincount(y_train)

# Print results
print(f"Static Gestures (0): {category_counts[0]}")
print(f"Dynamic Gestures (1): {category_counts[1]}")

Static Gestures (0): 1080
Dynamic Gestures (1): 760


In [3]:
print(f"X_train shape: {X_train.shape}")  
print(f"y_train shape: {y_train.shape}")  
print(f"Unique labels in y_train: {np.unique(y_train)}") 


X_train shape: (1840, 60, 1662)
y_train shape: (1840,)
Unique labels in y_train: [0 1]


In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Define the LSTM model
model = Sequential([
    LSTM(128, return_sequences=False, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dropout(0.3),  # Prevents overfitting
    Dense(64, activation="relu"),
    Dense(1, activation="sigmoid")  # Sigmoid for binary classification
])

# Compile the model
model.compile(loss="binary_crossentropy", optimizer=Adam(learning_rate=0.001), metrics=["accuracy"])

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Save the trained model
model.save("lstm_binary_model.h5")


  super().__init__(**kwargs)


Epoch 1/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 160ms/step - accuracy: 0.6537 - loss: 0.6165 - val_accuracy: 1.0000 - val_loss: 0.1360
Epoch 2/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 118ms/step - accuracy: 0.8549 - loss: 0.3787 - val_accuracy: 0.9620 - val_loss: 0.2641
Epoch 3/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 160ms/step - accuracy: 0.8728 - loss: 0.3046 - val_accuracy: 0.9429 - val_loss: 0.3476
Epoch 4/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 179ms/step - accuracy: 0.8859 - loss: 0.2717 - val_accuracy: 0.9973 - val_loss: 0.1230
Epoch 5/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 328ms/step - accuracy: 0.9086 - loss: 0.2546 - val_accuracy: 0.9918 - val_loss: 0.1446
Epoch 6/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 176ms/step - accuracy: 0.9175 - loss: 0.2304 - val_accuracy: 0.9973 - val_loss: 0.1082
Epoch 7/50
[1m46/46[0m 



In [None]:
import cv2
import mediapipe as mp
import numpy as np
from tensorflow.keras.models import load_model

# Load LSTM Model
model = load_model("lstm_binary_model.h5")

# Initialize MediaPipe Holistic Model
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils
holistic = mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5)

# Parameters
sequence = []  
sequence_length = 60  
feature_dim = 1662  # Correct feature size for Holistic model
predictions_buffer = []

cap = cv2.VideoCapture(0)

def extract_keypoints(results):
    """
    Extracts keypoints from MediaPipe Holistic model.
    Returns a NumPy array of shape (1662,).
    """
    pose = np.array([[lm.x, lm.y, lm.z, lm.visibility] for lm in results.pose_landmarks.landmark]
                    ) if results.pose_landmarks else np.zeros((33, 4))

    lh = np.array([[lm.x, lm.y, lm.z] for lm in results.left_hand_landmarks.landmark]
                  ) if results.left_hand_landmarks else np.zeros((21, 3))
    
    rh = np.array([[lm.x, lm.y, lm.z] for lm in results.right_hand_landmarks.landmark]
                  ) if results.right_hand_landmarks else np.zeros((21, 3))

    face = np.array([[lm.x, lm.y, lm.z] for lm in results.face_landmarks.landmark]
                    ) if results.face_landmarks else np.zeros((468, 3))

    keypoints = np.concatenate([pose.flatten(), lh.flatten(), rh.flatten(), face.flatten()])

    if keypoints.shape[0] != 1662:
        print(f"⚠ Warning! Extracted keypoints shape = {keypoints.shape[0]}, expected (1662,)")
        keypoints = np.zeros(1662)  # Fallback to zeros if shape mismatch occurs

    return keypoints  # Shape: (1662,)


while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = holistic.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

    keypoints = extract_keypoints(results)
    sequence.append(keypoints)

    if len(sequence) > sequence_length:
        sequence.pop(0)  # Keep last 60 frames

    # Predict when enough frames are collected
    if len(sequence) == sequence_length:
        input_data = np.array(sequence).reshape(1, sequence_length, feature_dim)
        prediction = model.predict(input_data)[0]
        confidence = round(float(prediction), 2)  # Convert to float and round to 2 decimal places

        predictions_buffer.append((1 if prediction > 0.5 else 0, confidence))

        if len(predictions_buffer) > 10:
            predictions_buffer.pop(0)

        # Use majority vote for stability
        final_prediction = max(set([p[0] for p in predictions_buffer]), key=[p[0] for p in predictions_buffer].count)
        avg_confidence = np.mean([p[1] for p in predictions_buffer])  # Average confidence

        class_map = {0: "Static Gesture", 1: "Dynamic Gesture"}
        predicted_class = class_map[final_prediction]

        # Display prediction & confidence on frame
        text = f"Prediction: {predicted_class} ({avg_confidence:.2f})"
        cv2.putText(image, text, (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

    # Draw landmarks
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION)

    cv2.imshow("Live Gesture Recognition", image)

    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()


I0000 00:00:1740406583.367541 3546706 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1740406583.502511 3566707 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1740406583.520270 3566711 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1740406583.522818 3566709 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1740406583.522955 3566711 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1740406583.523517 3566706 inference_feedback_manager.cc:114] Feedback manager requires a mod

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 206ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step


  confidence = round(float(prediction), 2)  # Convert to float and round to 2 decimal places
W0000 00:00:1740406589.297934 3566705 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26

KeyboardInterrupt: 

: 

In [2]:
data = np.load("/Users/limjunkiat/Desktop/sku/T8/38 CDS/Project/Data_Science_Project/classification_model/mp_data/hello/0/0.npy")
data.shape

(1662,)