# Action Recognition - LSTM Model Implementation Study

This script implements and tunes an LSTM model for ASL action recognition.

Created by:
- Marcus Vinicius da Silva Fernandes.

2023-06-12.

#### References:
- https://www.youtube.com/watch?v=pG4sUNDOZFg


### Importing necessary libraries

In [1]:
import numpy as np
import os
import csv

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import TensorBoard
# import keras_tuner as kt

from sklearn.preprocessing import LabelEncoder
# from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

### Accessing the landmarks

Set up the folder paths used to locate the landmarks, as well as the list (CSV file) that maps the name of each video to its corresponding word in English.

In [2]:
# Base folder that holds the label CSV and the per-split landmark folders.
# Alternative location kept for reference:
# landmarks_path = '/Users/marcus/Library/CloudStorage/OneDrive-Personal/Documentos/Loyalist_College/AISC2006/cleaned_extracted_landmarks_xy/'
landmarks_path = 'C:/Users/marcu/OneDrive/Documentos/Loyalist_College/AISC2006/'

# The split folders are derived from the base folder so that relocating the
# dataset only requires editing `landmarks_path`.
train_folder = landmarks_path + 'train_dir/'
valid_folder = landmarks_path + 'valid_dir/'
test_folder = landmarks_path + 'test_dir/'

Creation of the dictionary to associate the videos and the words.

In [3]:
# Read "Updated Dataset2.csv" to map every landmark id to its word and to
# collect the number of frames of every landmark.
id_dict = {}  # landmark id (string) -> word
num_frames = []  # number of frames of each landmark, in CSV row order

with open(landmarks_path + "Updated Dataset2.csv", "r") as csv_file:
    csv_reader = csv.reader(csv_file)
    next(csv_reader)  # the first row is the header
    for row in csv_reader:
        video_id = row[0]
        if int(video_id) <= 10000:
            # ids up to 10000 are left-padded with zeros to five characters
            video_id = '0' * (5 - len(video_id)) + video_id
        id_dict[video_id] = row[1]  # column 1 holds the word
        num_frames.append(int(row[7]))  # column 7 holds the frame count

In [4]:
# Longest and shortest landmark sequences (in frames) across all the landmarks
max_num_frames, min_num_frames = max(num_frames), min(num_frames)

print('Maximum number of frames of all the landmarks =', max_num_frames)
print('Minimum number of frames of all the landmarks =', min_num_frames)

Maximum number of frames of all the landmarks = 149
Minimum number of frames of all the landmarks = 26


### Shaping the data for the LSTM model

Desired number of frames
- Each video will be reshaped to have the number of rows (or frames) equal to the desired number of frames defined below.

In [5]:
# Number of rows (frames) every landmark array is reshaped to before training
NUM_FRAMES = 60

Creation of the X array

- Time-based sampling: we will reduce the dimension of the array to the desired NUM_FRAMES.
- Padding the videos: we will add rows with zeros to increase the dimension of the array to the desired NUM_FRAMES.
- No change: the array already has the desired NUM_FRAMES.

def load_landmarks(path, num_frames=None, labels_by_id=None, max_videos=None):
    """Load every ``.npy`` landmark file in ``path`` as a fixed-length sequence.

    Each loaded array (expected to be 2-D: frames x features) is brought to
    ``num_frames`` rows:

    - more rows than ``num_frames``: rows are sampled at evenly spaced
      positions spread over the whole clip;
    - fewer rows: rows of zeros are appended at the end;
    - exactly ``num_frames`` rows: used unchanged.

    Args:
        path: Folder containing the ``.npy`` files; other files are ignored.
        num_frames: Target number of rows. Defaults to the module-level
            ``NUM_FRAMES``.
        labels_by_id: Mapping from file name (without ``.npy``) to its label.
            Defaults to the module-level ``id_dict``.
        max_videos: Optional cap on the number of files loaded (handy for
            quick experiments); ``None`` loads everything.

    Returns:
        Tuple ``(videos, labels)`` of NumPy arrays aligned by position.

    Raises:
        KeyError: If a ``.npy`` file has no entry in ``labels_by_id``.
    """
    if num_frames is None:
        num_frames = NUM_FRAMES
    if labels_by_id is None:
        labels_by_id = id_dict

    videos, labels = [], []

    # os.listdir() returns entries in arbitrary order; sorting keeps runs reproducible.
    for item in sorted(os.listdir(path)):
        if not item.endswith('.npy'):  # working with npy files only
            continue
        if max_videos is not None and len(videos) >= max_videos:
            break

        label = labels_by_id[item.split('.npy')[0]]
        data = np.load(os.path.join(path, item))  # loading the numpy array into memory
        total = data.shape[0]

        if total > num_frames:  # time-based sampling
            # Integer arithmetic yields num_frames distinct indices spread over
            # [0, total), so the tail of a long clip is no longer discarded.
            indices = np.arange(num_frames) * total // num_frames
            data = data[indices]
        elif total < num_frames:  # padding the videos
            data = np.pad(data, ((0, num_frames - total), (0, 0)), mode='constant')

        videos.append(data)
        labels.append(label)

    return np.array(videos), np.array(labels)

# Load the training and validation splits as (videos, labels) pairs
x_train, y_train = load_landmarks(train_folder)
x_val, y_val = load_landmarks(valid_folder)

In [6]:
def load_landmarks(path, num_frames=None, labels_by_id=None, max_videos=None):
    """Load the labelled ``.npy`` landmark files in ``path`` as fixed-length sequences.

    Files whose name (without ``.npy``) has no entry in ``labels_by_id`` are
    skipped. Each loaded array (expected to be 2-D: frames x features) is
    brought to ``num_frames`` rows:

    - more rows than ``num_frames``: rows are sampled at evenly spaced
      positions spread over the whole clip;
    - fewer rows: rows of zeros are appended at the end;
    - exactly ``num_frames`` rows: used unchanged.

    Args:
        path: Folder containing the ``.npy`` files; other files are ignored.
        num_frames: Target number of rows. Defaults to the module-level
            ``NUM_FRAMES``.
        labels_by_id: Mapping from file name (without ``.npy``) to its label.
            Defaults to the module-level ``id_dict``.
        max_videos: Optional cap on the number of files loaded (handy for
            quick experiments); ``None`` loads everything.

    Returns:
        Tuple ``(videos, labels)`` of NumPy arrays aligned by position.
    """
    if num_frames is None:
        num_frames = NUM_FRAMES
    if labels_by_id is None:
        labels_by_id = id_dict

    videos, labels = [], []

    # os.listdir() returns entries in arbitrary order; sorting keeps runs reproducible.
    for item in sorted(os.listdir(path)):
        if not item.endswith('.npy'):  # working with npy files only
            continue
        video_id = item.split('.npy')[0]
        if video_id not in labels_by_id:  # no known word for this file
            continue
        if max_videos is not None and len(videos) >= max_videos:
            break

        data = np.load(os.path.join(path, item))  # loading the numpy array into memory
        total = data.shape[0]

        if total > num_frames:  # time-based sampling
            # Integer arithmetic yields num_frames distinct indices spread over
            # [0, total), so the tail of a long clip is no longer discarded.
            indices = np.arange(num_frames) * total // num_frames
            data = data[indices]
        elif total < num_frames:  # padding the videos
            data = np.pad(data, ((0, num_frames - total), (0, 0)), mode='constant')

        videos.append(data)
        labels.append(labels_by_id[video_id])

    return np.array(videos), np.array(labels)

# Load the training and validation splits as (videos, labels) pairs
x_train, y_train = load_landmarks(train_folder)
x_val, y_val = load_landmarks(valid_folder)

Creation of the Y array

- One-hot encoding

# One-hot encode the word labels of both splits against one shared class index.
num_train = len(y_train)  # remembered before y_train is rebound below
all_labels = np.concatenate((y_train, y_val), axis=0)

# np.unique returns the sorted distinct labels and, with return_inverse=True,
# the position of every label inside that sorted array. This gives the class
# index of each sample in one vectorised call instead of growing an array
# label by label.
labels_unique, label_indices = np.unique(all_labels, return_inverse=True)

labels_encoded = to_categorical(label_indices).astype(int)

# Split the encoded rows back into the two original splits
y_train = labels_encoded[:num_train]
y_val = labels_encoded[num_train:]

In [7]:
# Integer-encode the word labels with one encoder fitted on both splits,
# then expand the integer codes into one-hot rows.
all_labels = np.concatenate((y_train, y_val), axis=0)

label_encoder = LabelEncoder()
label_encoder.fit(all_labels)
num_classes = len(label_encoder.classes_)  # width of every one-hot row

y_train_encoded, y_val_encoded = (
    label_encoder.transform(split) for split in (y_train, y_val)
)
y_train, y_val = (
    to_categorical(encoded, num_classes=num_classes)
    for encoded in (y_train_encoded, y_val_encoded)
)

### LSTM Architecture definition and tuning

def build_model(hp):
    """Build and compile one candidate LSTM classifier from tunable hyperparameters.

    The network is: Masking (zero rows are masked) -> 1 to 3 LSTM layers, each
    followed by Dropout -> 1 to 3 Dense layers, each followed by Dropout ->
    softmax output. The input shape is read from the module-level ``x_train``
    and the number of output units from ``y_train.shape[1]``.

    Args:
        hp: Hyperparameter source providing ``Int``, ``Float`` and ``Choice``
            (presumably a Keras Tuner ``HyperParameters`` object -- confirm
            against the tuner that calls this function).

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    time_steps, n_features = x_train.shape[1], x_train.shape[2]

    model = Sequential()
    model.add(Masking(mask_value=0, input_shape=(time_steps, n_features)))

    # Recurrent stack: only the last LSTM layer collapses the sequence
    lstm_count = hp.Int('num_lstm_layers', min_value=1, max_value=3)
    for layer_idx in range(lstm_count):
        is_last_lstm = layer_idx == lstm_count - 1
        model.add(LSTM(
            units=hp.Int(f'units_lstm_{layer_idx}', min_value=32, max_value=256, step=32),
            activation=hp.Choice(f'activation_lstm_{layer_idx}', values=['relu', 'tanh']),
            recurrent_dropout=hp.Float(f'recurrent_dropout_{layer_idx}', min_value=0.1, max_value=0.5, step=0.1),
            return_sequences=not is_last_lstm,
        ))
        model.add(Dropout(
            rate=hp.Float(f'dropout_rate_lstm_{layer_idx}', min_value=0.1, max_value=0.5, step=0.1),
        ))

    # Fully connected stack
    dense_count = hp.Int('num_dense_layers', min_value=1, max_value=3)
    for dense_idx in range(dense_count):
        model.add(Dense(
            units=hp.Int(f'units_dense_{dense_idx}', min_value=32, max_value=256, step=32),
            activation=hp.Choice(f'activation_dense_{dense_idx}', values=['relu', 'sigmoid']),
        ))
        model.add(Dropout(
            rate=hp.Float(f'dropout_rate_dense_{dense_idx}', min_value=0.1, max_value=0.5, step=0.1),
        ))

    # One softmax unit per class
    model.add(Dense(units=y_train.shape[1], activation='softmax'))

    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

# Hyperband search over the models produced by build_model, with
# 'val_accuracy' as the objective; results are written under
# keras_tuner_dir/lstm_hyperparameter_tuning.
# NOTE(review): `kt` is not defined by the visible imports -- the line
# `import keras_tuner as kt` is commented out -- so this cell raises
# NameError unless that import is restored.
tuner = kt.Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=50,
    factor=3,
    directory='keras_tuner_dir',
    project_name='lstm_hyperparameter_tuning'
)

# Run the search on the training split, scoring on (x_val, y_val)
tuner.search(x_train, y_train, epochs=50, validation_data=(x_val, y_val))


In [8]:
# best_hps = tuner.get_best_hyperparameters(num_trials=3)[0]

In [9]:
# best_model = tuner.hypermodel.build(best_hps)

In [10]:
# best_model.fit(x_train, to_categorical(y_train), epochs=10, validation_data=(x_val, to_categorical(y_val)))

In [11]:
# Second try - masking layer added
model = Sequential()
model.add(Masking(mask_value=0, input_shape=(x_train.shape[1], x_train.shape[2])))  # Input shape with variable-length sequences
model.add(LSTM(128, activation='relu'))
model.add(Dropout(rate=0.1))
model.add(Dense(y_train.shape[1], activation='softmax'))



In [12]:
# Compile the classifier: Adam optimizer, categorical cross-entropy, accuracy metric
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# TensorBoard callback writing its event files to the 'Logs' folder
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

In [13]:
# Train for 300 epochs with (x_val, y_val) as validation data and the TensorBoard callback attached
history = model.fit(x_train, y_train, epochs=300, validation_data=(x_val, y_val), callbacks=[tb_callback])

Epoch 1/300


2023-07-14 00:30:18.849556: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300