Import dependencies

In [26]:
import landmark_detector as ld
import os
import numpy as np
import json
import cv2 as cv

import tensorflow as tf
import keras
from keras import layers

Set variables

In [None]:
words = ['deaf', 'eat', 'fish', 'friend', 'like', 'milk', 'nice', 'no', 'orange', 'teacher', 'want', 'what', 'where', 'yes']
select_words = ['no', 'eat', 'teacher']
modes = ['train', 'val', 'test']
path = '../preprocessing/dataset/'
num_features = 126

Check highest number of frames

In [53]:
def get_max_frame(modes, select_words, path):
    highest_frame = 0

    for mode in modes:
        for word in select_words:
            i = 1
            video_path = path + mode + '/' + word + '/0001.mp4'
            while os.path.exists(video_path):
                try:
                    cap = cv.VideoCapture(video_path)
                    if not cap.isOpened():
                        raise FileNotFoundError("The video file not found")
                    curr_frame= int(cap.get(cv.CAP_PROP_FRAME_COUNT))
                    if curr_frame > highest_frame:
                        highest_frame = curr_frame
                    i += 1
                except:
                    break
                video_path = path + mode + '/' + word + '/' + str(i).zfill(4) + '.mp4'

    return highest_frame

highest_frame = get_max_frame(modes, select_words, path)
print('Highest frame count:', highest_frame)


Highest frame count: 42


Data schema:

(number of videos,max frames, 2, 21, 3)


In [54]:
def get_data(mode, words, path, detector_path):
    detector = ld.get_detector(detector_path)

    training_X = []
    training_y = []

    num_videos = 0

    for word in words:
        i = 1
        video_path = path + mode + '/' + word + '/0001.mp4'
        while os.path.exists(video_path):
            try:
                video_X = []
                landmarks = ld.get_landmarks(video_path, detector)
                for frame in range(len(landmarks)):
                    features = np.array(landmarks[frame]).flatten()
                    video_X.append(features)
                training_X.append(video_X)
                training_y.append(words.index(word))
                i += 1
                num_videos += 1
            except Exception as e:
                print(e)
                break
            video_path = path + mode + '/' + word + '/' + str(i).zfill(4) + '.mp4'

    return training_X, training_y, num_videos

training_X, training_y, num_videos = get_data('train', select_words, path, '../models/hand_landmarker.task')

print('Number of videos:', num_videos)
print(training_X)
print(training_y)
    

Number of videos: 118
[[array([ 5.39468527e-01,  6.34025156e-01,  6.37598561e-08,  6.18700683e-01,
        6.09508097e-01,  7.84634426e-03,  6.55369878e-01,  5.70102513e-01,
        5.42990584e-03,  6.78646207e-01,  5.42455554e-01, -4.86166216e-03,
        6.91332579e-01,  5.20948052e-01, -1.07166274e-02,  5.82029581e-01,
        4.97460902e-01,  1.67569816e-02,  6.56631529e-01,  4.80033338e-01,
       -8.54443014e-03,  6.70501411e-01,  5.05593359e-01, -2.61138696e-02,
        6.60215557e-01,  5.26126087e-01, -3.21097076e-02,  5.74771106e-01,
        5.03408313e-01,  1.02922390e-03,  6.64904654e-01,  4.86882001e-01,
       -1.83722116e-02,  6.78297400e-01,  5.14250934e-01, -2.52131838e-02,
        6.66194916e-01,  5.34571171e-01, -2.59987433e-02,  5.79059839e-01,
        5.18237829e-01, -1.79288127e-02,  6.71368003e-01,  5.08837283e-01,
       -3.45865563e-02,  6.83238029e-01,  5.38284957e-01, -3.02553084e-02,
        6.67639256e-01,  5.60410380e-01, -2.24736165e-02,  5.93773305e-01,
 

Normalize X

In [25]:
'''

def  featureNormalize(X, num_features):
    X_norm = X.copy()
    mu = np.zeros(num_features)
    sigma = np.zeros(num_features)

    for i in range(num_features):
        lst = [frame[i] for video in X for frame in video if len(frame) > i]
        mu[i] = np.mean(lst)
        sigma[i] = np.std(lst)
        for video in X_norm:
            for frame in video:
                if len(frame) > i:
                    frame[i] -= mu[i]
                    frame[i] /= sigma[i]

    return X_norm, mu, sigma

normalized_X, mu, sigma = featureNormalize(training_X, num_features)
print(normalized_X)


'''

'\n\ndef  featureNormalize(X, num_features):\n    X_norm = X.copy()\n    mu = np.zeros(num_features)\n    sigma = np.zeros(num_features)\n\n    for i in range(num_features):\n        lst = [frame[i] for video in X for frame in video if len(frame) > i]\n        mu[i] = np.mean(lst)\n        sigma[i] = np.std(lst)\n        for video in X_norm:\n            for frame in video:\n                if len(frame) > i:\n                    frame[i] -= mu[i]\n                    frame[i] /= sigma[i]\n\n    return X_norm, mu, sigma\n\nnormalized_X, mu, sigma = featureNormalize(training_X, num_features)\nprint(normalized_X)\n\n\n'

Padding and Masking X

In [55]:
def padX(X, num_videos, highest_frame, num_features):
    padded_X = np.zeros((num_videos, highest_frame, num_features))
    mask = np.ones((num_videos, highest_frame, num_features)) 
    for i in range(num_videos):
        video = X[i]
        for j in range(len(video)):
            frame = video[j]
            if len(frame) < num_features:
                padded_X[i, j, :] = np.pad(frame, (0, num_features - len(frame)), 'constant')
                mask[i, j, len(frame):] = 0
            else:
                padded_X[i, j, :] = frame
        if len(video) < highest_frame:
            mask[i, len(video):, :] = 0

    return padded_X, mask

padded_X, mask = padX(training_X, num_videos, highest_frame, num_features)
print(padded_X)

[[[ 5.39468527e-01  6.34025156e-01  6.37598561e-08 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 5.81657350e-01  6.59672081e-01 -3.04205514e-08 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 6.96257055e-01  7.73334503e-01 -4.23549665e-07 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  ...
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]]

 [[ 2.85132617e-01  4.56699103e-01 -2.08529291e-10 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 3.37124765e-01  6.62779331e-01 -2.83490976e-07 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 2.86495477e-01  6.92171931e-01 -1.70053028e-07 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  ...
  [ 0.00000000e+00  0.0

Create model

In [56]:
model = keras.Sequential()

model.add(keras.Input(shape=(highest_frame, num_features)))
#model.add(keras.layers.SimpleRNN(len(select_words), activation='relu'))
model.add(layers.Masking(mask_value=0.0))
model.add(layers.LSTM(64))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(len(select_words), activation='softmax'))


model.summary()

Train model

In [57]:
# training

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(padded_X, np.array(training_y), epochs=10)

model.save('../models/draft_model.keras')

Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.3718 - loss: 1.0946
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.3921 - loss: 1.0822
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.4307 - loss: 1.0752
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.4575 - loss: 1.0415
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.4784 - loss: 1.0358
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.5510 - loss: 1.0000
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5370 - loss: 1.0013
Epoch 8/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5058 - loss: 0.9844
Epoch 9/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

Validate model

In [60]:
val_X, val_y, num_val_videos = get_data('val', select_words, path, '../models/hand_landmarker.task')
padded_val_X, val_mask = padX(val_X, num_val_videos, highest_frame, num_features)
results = model.evaluate(padded_val_X, np.array(val_y))

print('Validation loss:', results)

test_X, test_y, num_test_videos = get_data('test', select_words, path, '../models/hand_landmarker.task')
padded_test_X, test_mask = padX(test_X, num_test_videos, highest_frame, num_features)
results = model.evaluate(padded_test_X, np.array(test_y))

print('Test loss:', results)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.5000 - loss: 0.9916
Validation loss: [0.9915978312492371, 0.5]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.1765 - loss: 1.0698
Test loss: [1.0698089599609375, 0.1764705926179886]


Validate model when only considering videos that return results when applying mediapipe

In [72]:
def get_data_clean(mode, words, path, detector_path):
    detector = ld.get_detector(detector_path)

    training_X = []
    training_y = []

    num_videos = 0

    for word in words:
        i = 1
        video_path = path + mode + '/' + word + '/0001.mp4'
        while os.path.exists(video_path):
            try:
                video_X = []
                landmarks = ld.get_landmarks(video_path, detector)
                if len(landmarks) == 0:
                    i += 1
                    break
                for frame in range(len(landmarks)):
                    features = np.array(landmarks[frame]).flatten()
                    video_X.append(features)
                training_X.append(video_X)
                training_y.append(words.index(word))
                i += 1
                num_videos += 1
            except Exception as e:
                print(e)
                break
            video_path = path + mode + '/' + word + '/' + str(i).zfill(4) + '.mp4'

    return training_X, training_y, num_videos

clean_val_X, clean_val_y, num_clean_val_videos = get_data_clean('val', select_words, path, '../models/hand_landmarker.task')
padded_clean_val_X, clean_val_mask = padX(clean_val_X, num_clean_val_videos, highest_frame, num_features)
results = model.evaluate(padded_clean_val_X, np.array(clean_val_y))

print('Clean validation loss:', results)

clean_test_X, clean_test_y, num_clean_test_videos = get_data_clean('test', select_words, path, '../models/hand_landmarker.task')
padded_clean_test_X, clean_test_mask = padX(clean_test_X, num_clean_test_videos, highest_frame, num_features)
results = model.evaluate(padded_clean_test_X, np.array(clean_test_y))

print('Clean test loss:', results)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.8333 - loss: 0.8222
Clean validation loss: [0.8221733570098877, 0.8333333134651184]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.2500 - loss: 0.9249
Clean test loss: [0.9248908758163452, 0.25]


Use model

In [89]:
detector = ld.get_detector('../models/hand_landmarker.task')

prediction_X = []
prediction_y = []

video_path = path + 'test/teacher/0004.mp4'

video_X = []
landmarks = ld.get_landmarks(video_path, detector)
if len(landmarks) == 0:
    print('No landmarks detected')
else:
    for frame in range(len(landmarks)):
        features = np.array(landmarks[frame]).flatten()
        features = np.pad(features, (0, num_features - len(features)), 'constant')
        video_X.append(features)
    for i in range(42-len(video_X)):
        temp = np.zeros((num_features))
        video_X.append(temp)

    prediction_X.append(video_X)
    prediction_y.append(words.index('teacher'))


    print(np.shape(prediction_X))
    print(model.predict(np.array(prediction_X)))

(1, 42, 126)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[[0.01992831 0.03217128 0.9479004 ]]
