Import dependencies

In [1]:
import landmark_detector as ld
import os
import random
import numpy as np
import keras
from tqdm.notebook import tqdm
from keras import layers

Set variables

In [2]:
# Full vocabulary of signs listed for the dataset.
words = [
    'deaf', 'eat', 'fish', 'friend', 'like', 'milk', 'nice',
    'no', 'orange', 'teacher', 'want', 'what', 'where', 'yes',
]
# Subset of signs used by this notebook; a word's position here is its class label.
select_words = [
    'no',
    'eat',
    'teacher',
    'want',
    'fish',
]
# Dataset splits; each one is a sub-directory below `path`.
modes = ['train', 'val', 'test']
# Dataset root, relative to the notebook's working directory.
path = '../preprocessing/dataset/'
# Length of one per-frame feature vector
# (presumably 2 hands x 21 landmarks x 3 coordinates -- TODO confirm).
num_features = 126
# Base name of the files written under ../models/.
model_name = 'draft_model'
# Frame rate recorded in the model's .env file.
fps = 20

Get landmark data from MediaPipe


In [4]:
def get_data(mode, words, path, detector_path):
    """Extract per-frame landmark features for every video of one dataset split.

    Scans ``<path>/<mode>/<word>/`` for ``.mp4`` files for each entry of
    ``words`` and runs ``ld.get_landmarks`` on every video.

    Args:
        mode: Name of the split sub-directory (e.g. ``'train'``).
        words: Words to load; a word's index in this list is its class label.
        path: Dataset root directory.
        detector_path: Model path handed to ``ld.get_detector``.

    Returns:
        Tuple ``(X, y, num_videos, highest_frame, bad_videos)``:
        ``X`` is a list with one entry per usable video, each a list of
        flattened per-frame feature arrays; ``y`` holds the matching integer
        labels; ``num_videos`` is ``len(X)``; ``highest_frame`` is the largest
        frame count seen among usable videos (never smaller than the longest
        entry of ``X``); ``bad_videos`` counts videos for which no landmarks
        were returned.

    Videos that raise during processing are reported on stdout and skipped;
    they are not included in ``bad_videos``.
    """
    detector = ld.get_detector(detector_path)

    training_X = []
    training_y = []

    num_videos = 0
    highest_frame = 0
    bad_videos = 0

    for word in tqdm(words, desc=mode):
        word_path = os.path.join(path, mode, word)
        # The label is the same for every video of this word; look it up once.
        label = words.index(word)

        # os.listdir returns entries in arbitrary order; sort so the sample
        # order (and therefore training batches) is reproducible across runs.
        video_files = sorted(f for f in os.listdir(word_path) if f.endswith('.mp4'))

        for video_file in tqdm(video_files, desc=word):
            video_path = os.path.join(word_path, video_file)

            try:
                landmarks, current_frames = ld.get_landmarks(video_path, detector)

                if len(landmarks) == 0:
                    bad_videos += 1
                    continue

                video_X = [np.array(frame).flatten() for frame in landmarks]

                # Downstream padding indexes one row per entry of video_X, so
                # the reported maximum must cover len(video_X) as well as the
                # frame count reported by the detector helper.
                highest_frame = max(highest_frame, current_frames, len(video_X))

                training_X.append(video_X)
                training_y.append(label)
                num_videos += 1

            except Exception as e:
                # Best effort: one unreadable video must not abort the whole split.
                print(f"Error processing video {video_file}: {e}")
                continue

    return training_X, training_y, num_videos, highest_frame, bad_videos

# Load the training split for the selected words.
training_X, training_y, num_videos, highest_frame, bad_videos = get_data(
    'train',
    select_words,
    path,
    '../models/hand_landmarker.task',
)

# Summarise what was loaded.
print('Number of videos:', num_videos)
print('Highest frame:', highest_frame)
print('Videos with no landmarkers detected: ', bad_videos)


I0000 00:00:1733931393.718252  241459 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1 Pro
W0000 00:00:1733931393.734725  242285 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1733931393.744334  242285 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


train:   0%|          | 0/5 [00:00<?, ?it/s]

no:   0%|          | 0/32 [00:00<?, ?it/s]

eat:   0%|          | 0/43 [00:00<?, ?it/s]

teacher:   0%|          | 0/37 [00:00<?, ?it/s]

want:   0%|          | 0/38 [00:00<?, ?it/s]

fish:   0%|          | 0/42 [00:00<?, ?it/s]

Number of videos: 167
Highest frame: 161
Videos with no landmarkers detected:  25


Padding and Masking X

In [5]:
def padX(X, num_videos, highest_frame, num_features):
    """Pad variable-length videos into one dense array and build a validity mask.

    Args:
        X: Sequence of videos; each video is a sequence of 1-D per-frame
            feature vectors (lists or arrays) of length <= ``num_features``.
        num_videos: Number of leading entries of ``X`` to convert.
        highest_frame: Target number of frames per video.
        num_features: Target number of features per frame.

    Returns:
        Tuple ``(padded_X, mask)``, both float arrays of shape
        ``(num_videos, highest_frame, num_features)``. ``padded_X`` holds the
        input values with zeros everywhere else; ``mask`` is 1 where
        ``padded_X`` holds an input value and 0 where it is padding.

    Raises:
        ValueError: If a frame has more than ``num_features`` values.

    Videos longer than ``highest_frame`` are clipped to their first
    ``highest_frame`` frames. This matters when a validation or test split is
    padded to the training maximum and contains a longer video, which
    previously raised ``IndexError``.
    """
    padded_X = np.zeros((num_videos, highest_frame, num_features))
    mask = np.ones((num_videos, highest_frame, num_features))

    for i in range(num_videos):
        # Clip to the target length so the row index below can never overflow.
        video = X[i][:highest_frame]

        for j, frame in enumerate(video):
            n = len(frame)
            # Copy the available features; the remainder of the row stays zero.
            padded_X[i, j, :n] = frame
            mask[i, j, n:] = 0

        # Frames past the end of the video are pure padding (no-op when full).
        mask[i, len(video):, :] = 0

    return padded_X, mask

# Convert the ragged training videos into one dense (videos, frames, features) array.
padded_X, mask = padX(
    training_X,
    num_videos,
    highest_frame,
    num_features,
)
print(padded_X.shape)

(167, 161, 126)


Create model

In [6]:
# Sequence classifier over padded landmark frames: all-zero (padded) frames are
# masked out, an LSTM summarises the sequence, and a small dense head outputs
# one probability per selected word.
# (An earlier SimpleRNN variant was tried here and replaced by the LSTM.)
model = keras.Sequential([
    keras.Input(shape=(highest_frame, num_features)),
    layers.Masking(mask_value=0.0),
    layers.LSTM(64),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(len(select_words), activation='softmax'),
])

model.summary()

Train model

In [7]:
# Load the validation split and pad it to the *training* sequence length so it
# matches the model's fixed input shape.
val_X, val_y, num_val_videos, highest_frame_val, bad_videos = get_data('val', select_words, path, '../models/hand_landmarker.task')
padded_val_X, val_mask = padX(val_X, num_val_videos, highest_frame, num_features)

# Labels are integer class indices, hence the sparse categorical loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(padded_X, np.array(training_y), epochs=10, validation_data=(padded_val_X, np.array(val_y)))

model.save(f'../models/{model_name}.keras')

# Store the preprocessing parameters next to the model file.
with open(f"../models/{model_name}.env", "w") as file:
    file.write(f"MAX_FRAMES={highest_frame}\n")
    file.write(f"NUM_FEATURES={num_features}\n")
    # Single quotes inside the f-string: reusing the enclosing double quote is
    # a SyntaxError on Python versions before 3.12.
    file.write(f"WORDS={','.join(select_words)}\n")
    file.write(f"FPS={fps}\n")

I0000 00:00:1733931670.047556  241459 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1 Pro
W0000 00:00:1733931670.062890  249170 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1733931670.073577  249170 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


val:   0%|          | 0/5 [00:00<?, ?it/s]

no:   0%|          | 0/13 [00:00<?, ?it/s]

eat:   0%|          | 0/10 [00:00<?, ?it/s]

teacher:   0%|          | 0/11 [00:00<?, ?it/s]

want:   0%|          | 0/5 [00:00<?, ?it/s]

fish:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 1/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 87ms/step - accuracy: 0.2995 - loss: 1.6223 - val_accuracy: 0.1351 - val_loss: 1.5919
Epoch 2/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.2730 - loss: 1.5429 - val_accuracy: 0.2973 - val_loss: 1.4989
Epoch 3/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.3606 - loss: 1.4760 - val_accuracy: 0.4324 - val_loss: 1.4495
Epoch 4/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.3834 - loss: 1.4045 - val_accuracy: 0.4054 - val_loss: 1.4066
Epoch 5/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.4829 - loss: 1.3138 - val_accuracy: 0.4324 - val_loss: 1.3253
Epoch 6/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.4390 - loss: 1.3088 - val_accuracy: 0.4595 - val_loss: 1.2972
Epoch 7/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━

Evaluate model

In [8]:
# Load the held-out test split and pad it to the training sequence length.
test_X, test_y, num_test_videos, highest_frame_test, bad_videos = get_data('test', select_words, path, '../models/hand_landmarker.task')
padded_test_X, test_mask = padX(test_X, num_test_videos, highest_frame, num_features)

# The model is compiled with metrics=['accuracy'], so evaluate() returns
# [loss, accuracy] rather than a single loss value.
results = model.evaluate(padded_test_X, np.array(test_y))
test_loss, test_accuracy = results

print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

I0000 00:00:1733931754.320265  241459 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1 Pro
W0000 00:00:1733931754.339085  252493 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1733931754.347562  252498 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


test:   0%|          | 0/5 [00:00<?, ?it/s]

no:   0%|          | 0/8 [00:00<?, ?it/s]

eat:   0%|          | 0/5 [00:00<?, ?it/s]

teacher:   0%|          | 0/10 [00:00<?, ?it/s]

want:   0%|          | 0/6 [00:00<?, ?it/s]

fish:   0%|          | 0/4 [00:00<?, ?it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.4348 - loss: 1.2578
Test loss: [1.2577627897262573, 0.43478259444236755]


Use model

In [9]:
detector = ld.get_detector('../models/hand_landmarker.task')

prediction_X = []
prediction_y = []

# Pick one random test clip of the word 'teacher'.
test_path = path + 'test/teacher/'
files = os.listdir(test_path)
mp4_files = [f for f in files if f.endswith('.mp4')]
random_file = random.choice(mp4_files)
video_path = os.path.join(test_path, random_file)

video_X = []
landmarks, frame_count = ld.get_landmarks(video_path, detector)
if len(landmarks) == 0:
    print('No landmarks detected')
else:
    # The model input is fixed at highest_frame steps, so a clip longer than
    # the training maximum is clipped instead of producing a mismatched shape.
    for frame in range(min(len(landmarks), highest_frame)):
        features = np.array(landmarks[frame]).flatten()
        # Right-pad short feature vectors with zeros up to num_features.
        features = np.pad(features, (0, num_features - len(features)), 'constant')
        video_X.append(features)
    # Fill the remaining time steps with all-zero frames.
    for i in range(highest_frame-len(video_X)):
        temp = np.zeros((num_features))
        video_X.append(temp)

    prediction_X.append(video_X)
    prediction_y.append(select_words.index('teacher'))

    print(np.shape(prediction_X))
    probabilities = model.predict(np.array(prediction_X))
    print(probabilities)

    # Map the most probable class index back to its word and show the expected one.
    predicted_word = select_words[int(np.argmax(probabilities[0]))]
    print('Predicted:', predicted_word, '| expected:', select_words[prediction_y[0]])

I0000 00:00:1733931813.589535  241459 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1 Pro
W0000 00:00:1733931813.610589  253796 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1733931813.622089  253795 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


(1, 161, 126)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[[0.13081026 0.2460382  0.3506925  0.13141038 0.14104864]]
