Import dependencies

In [1]:
import landmark_detector as ld
import os
import numpy as np
import json
import cv2 as cv

import tensorflow as tf
import keras
from keras import layers

Set variables

In [2]:
words = ['deaf', 'eat', 'fish', 'friend', 'like', 'milk', 'nice', 'no', 'orange', 'teacher', 'want', 'what', 'where', 'yes']
select_words = ['deaf', 'eat', 'fish']
path = '../preprocessing/dataset/train/'
num_features = 126

Get the number of videos

In [3]:
num_videos = 0

for word in select_words:
    i = 1
    video_path = path + word + '/0001.mp4'
    while os.path.exists(video_path):
        try:
            num_videos += 1
            i += 1
        except:
            break
        video_path = path + word + '/' + str(i).zfill(4) + '.mp4'

print('Number of videos:', num_videos)

Number of videos: 118


Check highest number of frames

In [4]:
highest_frame = 0

for word in select_words:
    i = 1
    video_path = path + word + '/0001.mp4'
    while os.path.exists(video_path):
        try:
            cap = cv.VideoCapture(video_path)
            if not cap.isOpened():
                raise FileNotFoundError("The video file not found")
            curr_frame= int(cap.get(cv.CAP_PROP_FRAME_COUNT))
            if curr_frame > highest_frame:
                highest_frame = curr_frame
            i += 1
        except:
            break
        video_path = path + word + '/' + str(i).zfill(4) + '.mp4'

print('Highest frame count:', highest_frame)


Highest frame count: 42


Data schema:

(number of videos,max frames, 2, 21, 3)


In [15]:
detector = ld.get_detector('../models/hand_landmarker.task')

#training_X = np.zeros((num_videos, highest_frame, num_features))
training_X = []
training_y = np.zeros(num_videos)

video = 0

for word in select_words:
    i = 1
    video_path = path + word + '/0001.mp4'
    while os.path.exists(video_path):
        try:
            video_X = []
            landmarks = ld.get_landmarks(video_path, detector)
            for frame in range(len(landmarks)):
                features = np.array(landmarks[frame]).flatten()
                video_X.append(features)
                #training_X[video,frame,:] = np.pad(features, (0, num_features - len(features)), 'constant')
            training_X.append(video_X)
            training_y[video] = words.index(word)
            video += 1
            i += 1
        except Exception as e:
            print(e)
            break
        video_path = path + word + '/' + str(i).zfill(4) + '.mp4'

print(training_X)
print(training_y)

[[array([ 5.39468527e-01,  6.34025156e-01,  6.37598561e-08,  6.18700683e-01,
        6.09508097e-01,  7.84634426e-03,  6.55369878e-01,  5.70102513e-01,
        5.42990584e-03,  6.78646207e-01,  5.42455554e-01, -4.86166216e-03,
        6.91332579e-01,  5.20948052e-01, -1.07166274e-02,  5.82029581e-01,
        4.97460902e-01,  1.67569816e-02,  6.56631529e-01,  4.80033338e-01,
       -8.54443014e-03,  6.70501411e-01,  5.05593359e-01, -2.61138696e-02,
        6.60215557e-01,  5.26126087e-01, -3.21097076e-02,  5.74771106e-01,
        5.03408313e-01,  1.02922390e-03,  6.64904654e-01,  4.86882001e-01,
       -1.83722116e-02,  6.78297400e-01,  5.14250934e-01, -2.52131838e-02,
        6.66194916e-01,  5.34571171e-01, -2.59987433e-02,  5.79059839e-01,
        5.18237829e-01, -1.79288127e-02,  6.71368003e-01,  5.08837283e-01,
       -3.45865563e-02,  6.83238029e-01,  5.38284957e-01, -3.02553084e-02,
        6.67639256e-01,  5.60410380e-01, -2.24736165e-02,  5.93773305e-01,
        5.40576220e-01,

Normalize X

In [18]:
def  featureNormalize(X, num_features):
    X_norm = X.copy()
    mu = np.zeros(num_features)
    sigma = np.zeros(num_features)

    for i in range(num_features):
        lst = [frame[i] for video in X for frame in video if len(frame) > i]
        mu[i] = np.mean(lst)
        sigma[i] = np.std(lst)
        for video in X_norm:
            for frame in video:
                if len(frame) > i:
                    frame[i] -= mu[i]
                    frame[i] /= sigma[i]

    return X_norm, mu, sigma

normalized_X, mu, sigma = featureNormalize(training_X, num_features)
print(normalized_X)

[[array([ 1.19618916e+00,  3.01348484e-01, -4.12243579e-02,  1.75007202e+00,
        3.84737312e-01,  1.18316337e+00,  1.97371756e+00,  3.55441814e-01,
        1.19562380e+00,  2.04042409e+00,  3.26171802e-01,  8.74509407e-01,
        1.99404893e+00,  2.87085108e-01,  8.19298115e-01,  1.40410226e+00,
       -5.56421193e-04,  1.73589334e+00,  1.85991468e+00,  7.43121812e-02,
        1.04603946e+00,  1.85082484e+00,  2.54999746e-01,  6.42698701e-01,
        1.67463889e+00,  3.79079843e-01,  5.53450371e-01,  1.36140902e+00,
       -4.80356033e-02,  1.38505366e+00,  1.82869406e+00, -1.24937162e-02,
        8.68020559e-01,  1.80980516e+00,  1.03408778e-01,  7.12602292e-01,
        1.64875654e+00,  1.82416118e-01,  7.05188918e-01,  1.38562415e+00,
       -6.06391476e-02,  7.32184315e-01,  1.87987600e+00, -9.44308273e-03,
        3.80363348e-01,  1.88301494e+00,  1.15759833e-01,  4.64529769e-01,
        1.71121697e+00,  2.11079645e-01,  6.37193531e-01,  1.46598952e+00,
       -3.87758064e-02,

Create model

In [None]:
model = keras.Sequential()

model.add(keras.Input(shape=(highest_frame, num_features)))

model.add(keras.SimpleRNN(len(select_words), activation='relu'))