In [3]:
##          DATA SHAPE DEFINITION           ##

import numpy as np

# Define the parameters for the data shape
num_handsigns = 4
videos_per_handsign = 50
frames_per_video = 30
num_landmarks = 51
num_coordinates = 3

# Generate dummy data
data = [np.random.rand(videos_per_handsign, frames_per_video, num_landmarks, num_coordinates) for _ in range(num_handsigns)]

# Convert the list to a numpy array with shape (num_handsigns, videos_per_handsign, frames_per_video, num_landmarks, num_coordinates)
data_array = np.array(data)

# Save the data array to a .npy file
np.save('handsigns_data.npy', data_array)




In [4]:
##          PROCESS VIDEO DATASET FUNC DEFINITIONS         ##
import cv2
import mediapipe as mp
import numpy as np
import os
from tqdm import tqdm

def normalize_landmarks(landmarks, head_idx=0):
    # Normalize based on the head as the origin (using the head_idx landmark as the reference point)
    origin = landmarks[head_idx]
    normalized_landmarks = landmarks - origin
    return normalized_landmarks

def rotate_landmarks(landmarks, angle):
    # Apply rotation to the 3D landmarks
    rotation_matrix = np.array([
        [np.cos(angle), -np.sin(angle), 0],
        [np.sin(angle), np.cos(angle), 0],
        [0, 0, 1]
    ])
    rotated_landmarks = np.dot(landmarks, rotation_matrix)
    return rotated_landmarks

def extract_landmarks(image, hands_results, pose_results, apply_rotation=False, rotation_angle=0.1, head_idx=0):
    landmarks = []
    
    # Extract left hand landmarks (21 landmarks)
    if hands_results.multi_hand_landmarks and len(hands_results.multi_hand_landmarks) > 0:
        hand_landmarks = [(lm.x, lm.y, lm.z) for lm in hands_results.multi_hand_landmarks[0].landmark]
        if apply_rotation:
            hand_landmarks = rotate_landmarks(np.array(hand_landmarks), rotation_angle)
        landmarks.extend(hand_landmarks)
    else:
        landmarks.extend([(0, 0, 0)] * 21)
    
    # Extract right hand landmarks (21 landmarks)
    if hands_results.multi_hand_landmarks and len(hands_results.multi_hand_landmarks) > 1:
        hand_landmarks = [(lm.x, lm.y, lm.z) for lm in hands_results.multi_hand_landmarks[1].landmark]
        if apply_rotation:
            hand_landmarks = rotate_landmarks(np.array(hand_landmarks), rotation_angle)
        landmarks.extend(hand_landmarks)
    else:
        landmarks.extend([(0, 0, 0)] * 21)
    
    # Extract selected body landmarks (9 landmarks)
    selected_body_landmarks = [0, 11, 12, 13, 14, 15, 16, 23, 24]  # Landmarks for nose, arms, and shoulders
    if pose_results.pose_landmarks:
        for idx in selected_body_landmarks:
            lm = pose_results.pose_landmarks.landmark[idx]
            landmarks.append((lm.x, lm.y, lm.z))
    else:
        landmarks.extend([(0, 0, 0)] * 9)

    landmarks = np.array(landmarks)
    landmarks = normalize_landmarks(landmarks, head_idx)  # Normalize landmarks with the head as the origin
    
    return landmarks

def process_video(video_path, apply_rotation=False, rotation_angle=0.1, head_idx=0):
    cap = cv2.VideoCapture(video_path)
    frames = []
    
    mp_hands = mp.solutions.hands
    mp_pose = mp.solutions.pose

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    indices = np.linspace(0, total_frames - 1, frames_per_video, dtype=int)
    frame_set = set(indices)
    frame_count = 0
    
    with mp_hands.Hands(static_image_mode=False, max_num_hands=2, min_detection_confidence=0.5) as hands, \
         mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.5) as pose:
        
        while cap.isOpened() and len(frames) < frames_per_video:
            ret, frame = cap.read()
            if not ret:
                break
            
            if frame_count in frame_set:
                # Convert the BGR image to RGB
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                
                # Process the image and extract landmarks
                hands_results = hands.process(image)
                pose_results = pose.process(image)
                
                # Extract landmarks
                landmarks = extract_landmarks(image, hands_results, pose_results, apply_rotation, rotation_angle, head_idx)
                
                frames.append(landmarks)
            
            frame_count += 1
    
    cap.release()
    
    # Pad if we don't have enough frames
    if len(frames) < frames_per_video:
        frames.extend([np.zeros((51, 3))] * (frames_per_video - len(frames)))
    
    return np.array(frames)

def process_dataset(root_path, apply_rotation=False, rotation_angle=0.1, head_idx=0):
    data = []
    
    for handsign in tqdm(range(num_handsigns), desc="Processing handsigns"):
        handsign_path = os.path.join(root_path, f"handsign_{handsign+1}")  # Changed to match your folder naming
        if not os.path.exists(handsign_path):
            print(f"Warning: Directory {handsign_path} does not exist. Skipping.")
            data.append(np.zeros((videos_per_handsign, frames_per_video, 51, 3)))  # 51 landmarks total
            continue
        
        videos = [f for f in os.listdir(handsign_path) if f.endswith(('.mp4', '.avi', '.mov'))]
        videos = videos[:videos_per_handsign]  # Limit to videos_per_handsign
        
        handsign_data = []
        for video in tqdm(videos, desc=f"Processing videos for handsign {handsign}", leave=False):
            video_path = os.path.join(handsign_path, video)
            video_data = process_video(video_path, apply_rotation, rotation_angle, head_idx)
            handsign_data.append(video_data)
        
        # Pad if we don't have enough videos
        if len(handsign_data) < videos_per_handsign:
            handsign_data.extend([np.zeros((frames_per_video, 51, 3))] * (videos_per_handsign - len(handsign_data)))
        
        data.append(np.array(handsign_data))
    
    return np.array(data)





In [5]:
##          PROCESS VIDEOS DATASET FUNC CALLING         ##

if __name__ == "__main__":
    root_path = "TestDataset"  # Replace with your dataset root path
    apply_rotation_augmentation = False  # Set this to False to skip rotation augmentation
    rotation_angle = 0.1  # Set the desired rotation angle
    head_idx = 0  # Set the index of the head landmark (change as per dataset)
    
    data_array = process_dataset(root_path, apply_rotation_augmentation, rotation_angle, head_idx)
    
    # Save the data array to a .npy file
    np.save('handsigns_data.npy', data_array)
    print("Data saved to handsigns_data.npy")




Processing handsigns:   0%|          | 0/4 [00:00<?, ?it/s]

Processing videos for handsign 0:   2%|▏         | 1/50 [00:02<02:25,  2.98s/it][A
Processing videos for handsign 0:   4%|▍         | 2/50 [00:05<02:24,  3.00s/it][A
Processing videos for handsign 0:   6%|▌         | 3/50 [00:08<02:17,  2.92s/it][A
Processing videos for handsign 0:   8%|▊         | 4/50 [00:11<02:13,  2.89s/it][A
Processing videos for handsign 0:  10%|█         | 5/50 [00:14<02:06,  2.81s/it][A
Processing videos for handsign 0:  12%|█▏        | 6/50 [00:17<02:03,  2.81s/it][A
Processing videos for handsign 0:  14%|█▍        | 7/50 [00:19<02:00,  2.80s/it][A
Processing videos for handsign 0:  16%|█▌        | 8/50 [00:22<01:56,  2.77s/it][A
Processing videos for handsign 0:  18%|█▊        | 9/50 [00:25<01:53,  2.76s/it][A
Processing videos for handsign 0:  20%|██        | 10/50 [00:28<01:49,  2.75s/it][A
Processing videos for handsign 0:  22%|██▏       | 11/50 [00:30<01:46,  2.72s/it][A
Processing vi

Data saved to handsigns_data.npy





In [7]:
##          MODEL DEFINITION            ##
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Conv1D, Dense, Reshape, Dropout, BatchNormalization, Input

# Define the model
model = Sequential([
    # Input layer with the original shape
    Input(shape=(frames_per_video, num_landmarks, num_coordinates)),
    
    # Reshape layer to convert input shape from (30, 51, 3) to (30, 153)
    Reshape((frames_per_video, num_landmarks * num_coordinates)),
    
    # Convolution layers for feature extraction
    Conv1D(64, kernel_size=3, activation='relu', padding='same'),
    BatchNormalization(),
    Conv1D(128, kernel_size=3, activation='relu', padding='same'),
    BatchNormalization(),
    
    # No flatten layer here; keeping the output 3D for the LSTM
    # Output shape at this point will be (frames_per_video, features)
    
    # LSTM layer to process temporal information
    LSTM(128, return_sequences=False),  # No need to return sequences as this is the final LSTM layer
    Dropout(0.5),
    
    # Fully connected layers
    Dense(128, activation='relu'),
    Dropout(0.5),
    
    # Output layer with softmax activation for classification
    Dense(num_handsigns, activation='softmax')  # Assuming num_handsigns is defined globally
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()


In [8]:
##          MODEL TRAINING          ##

# Load the data from the .npy file
data_array = np.load('handsigns_data.npy')

# X remains unchanged
X = data_array  # Shape: (4, 50, 30, 51, 3)

# Assuming you have labels for your handsigns, define y as follows:
y = np.array([i for i in range(num_handsigns) for _ in range(videos_per_handsign)])

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape the data for training
X_train_reshaped = X_train.reshape(-1, frames_per_video, num_landmarks, num_coordinates)
X_test_reshaped = X_test.reshape(-1, frames_per_video, num_landmarks, num_coordinates)

# Train the model
history = model.fit(X_train_reshaped, y_train, validation_data=(X_test_reshaped, y_test), epochs=10, batch_size=16)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_reshaped, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

ValueError: Found input variables with inconsistent numbers of samples: [4, 200]