In [1]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
%pip install mediapipe
import mediapipe as mp

Collecting mediapipe
  Downloading mediapipe-0.10.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.7 kB)
Collecting protobuf<5,>=4.25.3 (from mediapipe)
  Downloading protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.5.0-py3-none-any.whl.metadata (1.4 kB)
Downloading mediapipe-0.10.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.9/35.9 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sounddevice-0.5.0-py3-none-any.whl (32 kB)
Installing collected packages: protobuf, sounddevice, mediapipe
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.

In [2]:
from google.colab import drive
drive.mount('/content/drive')

dataset_path = '/content/drive/My Drive/Datasets/Greetings'

Mounted at /content/drive


In [3]:
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results

mp_holistic = mp.solutions.holistic

def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, lh, rh])

def convert_video_to_pose_embedded_np_array(video_path, frames_to_extract=45):
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        raise ValueError(f"Unable to open video file: {video_path}")

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    np_array = []

    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        if total_frames >= frames_to_extract:
            frame_indices = np.linspace(0, total_frames - 1, frames_to_extract, dtype=int)
            for i in frame_indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, i)
                ret, frame = cap.read()
                if not ret:
                    break
                image, results = mediapipe_detection(frame, holistic)
                keypoints = extract_keypoints(results)
                np_array.append(keypoints)
        else:
            key_points_shape = None
            for i in range(total_frames):
                ret, frame = cap.read()
                if not ret:
                    break
                image, results = mediapipe_detection(frame, holistic)
                keypoints = extract_keypoints(results)
                if key_points_shape is None:
                    key_points_shape = keypoints.shape
                np_array.append(keypoints)

            for _ in range(frames_to_extract - total_frames):
                np_array.append(np.zeros(shape=key_points_shape))

    cap.release()
    return np.array(np_array)

def process_dataset(dataset_path, frames_to_extract=45):
    labels = []
    data = []
    label_map = {folder: idx for idx, folder in enumerate(os.listdir(dataset_path))}

    for folder in os.listdir(dataset_path):
        folder_path = os.path.join(dataset_path, folder)
        if not os.path.isdir(folder_path):
            continue
        for video_file in os.listdir(folder_path):
            video_path = os.path.join(folder_path, video_file)
            np_array = convert_video_to_pose_embedded_np_array(video_path, frames_to_extract)
            data.append(np_array)
            labels.append(label_map[folder])

    data = np.array(data)
    labels = np.array(labels)
    return data, labels, label_map

In [4]:
frames_to_extract = 45

data, labels, label_map = process_dataset(dataset_path, frames_to_extract)

X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, random_state=42)

input_shape = (frames_to_extract, data.shape[2])

In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, BatchNormalization
from tensorflow.keras.optimizers import Adam

model = Sequential()

model.add(Bidirectional(LSTM(128, return_sequences=True, activation='relu'), input_shape=input_shape))
model.add(Dropout(0.5))
model.add(BatchNormalization())

model.add(Bidirectional(LSTM(128, return_sequences=True, activation='relu')))
model.add(Dropout(0.5))
model.add(BatchNormalization())

model.add(Bidirectional(LSTM(64, return_sequences=False, activation='relu')))
model.add(Dropout(0.5))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(32, activation='relu'))

model.add(Dense(len(label_map), activation='softmax'))

model.compile(optimizer=Adam(learning_rate=0.0001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [33]:
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=16, epochs=200)

Epoch 1/200
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 46ms/step - accuracy: 0.6476 - loss: 0.9309 - val_accuracy: 0.6776 - val_loss: 1.1353
Epoch 2/200
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - accuracy: 0.6793 - loss: 0.8431 - val_accuracy: 0.6382 - val_loss: 1.7909
Epoch 3/200
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - accuracy: 0.6588 - loss: 0.8501 - val_accuracy: 0.6316 - val_loss: 2.2244
Epoch 4/200
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 38ms/step - accuracy: 0.6619 - loss: 0.8758 - val_accuracy: 0.6513 - val_loss: 1.8254
Epoch 5/200
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - accuracy: 0.6828 - loss: 0.7962 - val_accuracy: 0.5987 - val_loss: 2.7199
Epoch 6/200
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step - accuracy: 0.6777 - loss: 0.7707 - val_accuracy: 0.6118 - val_loss: 4.1298
Epoch 7/200
[1m38/38[0m [

In [34]:
def predict_on_video(model, video_path, frames_to_extract=45):
    np_array = convert_video_to_pose_embedded_np_array(video_path, frames_to_extract)
    np_array = np.expand_dims(np_array, axis=0)  # Expand dimensions to match input shape
    prediction = model.predict(np_array)
    predicted_class = np.argmax(prediction, axis=1)
    return predicted_class[0]

# Test the model on a single input video
test_video_path = '/content/drive/My Drive/Datasets/Greetings/48. Hello/MVI_0029.MOV'  # Replace with your test video path
predicted_class = predict_on_video(model, test_video_path, frames_to_extract)
print(f"Predicted class: {list(label_map.keys())[list(label_map.values()).index(predicted_class)]}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Predicted class: 48. Hello


In [35]:
model.save('/content/drive/My Drive/Weights/SignLanguage/sign_language.h5')
model.save_weights('/content/drive/My Drive/Weights/SignLanguage/sign_language_weights.weights.h5')

