# Action Recognition - LSTM Model Implementation Study

This script implements an LSTM model for ASL. It will be used for study purposes.

Created by:
- Marcus Vinicius da Silva Fernandes.
- Yamini Sharma.

2023-06-05.

#### References:
- https://www.youtube.com/watch?v=pG4sUNDOZFg
- https://numpy.org/doc/stable/reference/generated/numpy.pad.html

### Importing necessary libraries

In [1]:
import numpy as np
import os
import csv
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Masking
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import TensorBoard
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

### Accessing the landmarks

Set up the path of the folder that contains the landmarks and the list (CSV file) that associates the name of each video with the corresponding word in English.

In [None]:
# Directory containing the extracted landmark arrays (.npy files) and the
# dataset CSV file; both are read from this location by the cells below.
landmarks_path = '/Users/marcus/ASL/Training/third_run/landmarks/'

Creation of the dictionary to associate the videos and the words.

In [None]:
# Load dataset_analysis_5_videos.csv to build the association of landmark ids
# to words and to collect the number of frames of each landmark.
id_dict = {}  # zero-padded landmark id -> word
num_frames = []  # number of frames of each landmark
balanced_video_sample = []  # zero-padded ids of every video listed in the CSV

csv_path = os.path.join(landmarks_path, "dataset_analysis_5_videos.csv")

# newline="" lets the csv module do its own line-ending handling, as its
# documentation requires for file objects passed to csv.reader.
with open(csv_path, "r", newline="") as csv_file:
    csv_reader = csv.reader(csv_file)  # reading the data
    next(csv_reader, None)  # skip the header (no error if the file is empty)
    for row in csv_reader:
        if not row:  # csv.reader yields [] for blank lines; nothing to store
            continue
        # Ids are left-padded with zeros to 5 characters so they match the
        # names of the .npy landmark files.
        video_id = row[0].rjust(5, '0')
        balanced_video_sample.append(video_id)
        id_dict[video_id] = row[1]  # storing the content into a dictionary
        num_frames.append(int(row[7]))

In [None]:
# Longest and shortest landmark sequences (in frames) found in the CSV
max_num_frames, min_num_frames = max(num_frames), min(num_frames)

print('Maximum number of frames of all the landmarks =', max_num_frames)
print('Minimum number of frames of all the landmarks =', min_num_frames)

### Shaping the data for the LSTM model

Desired number of frames
- Each video will be reshaped to have the number of rows (or frames) equal to the desired number of frames defined below.

In [None]:
# Number of frames (rows) every landmark array is sampled down to or
# zero-padded up to before being stacked into the model input.
NUM_FRAMES = 30

Creation of the X array

- Time-based sampling: videos with more than NUM_FRAMES frames are subsampled down to NUM_FRAMES frames.
- Padding the videos: videos with fewer than NUM_FRAMES frames get rows of zeros appended until they reach NUM_FRAMES frames.
- No change: videos that already have NUM_FRAMES frames are used as they are.

In [None]:
# Build the list of fixed-length landmark arrays (videos) and the word
# associated with each one (labels); both lists share the same order.
videos, labels = [], []

# A set gives constant-time membership tests while scanning the folder
selected_ids = set(balanced_video_sample)

for item in os.listdir(landmarks_path):
    video_id, extension = os.path.splitext(item)
    # working with npy files of the selected videos only
    if extension != '.npy' or video_id not in selected_ids:
        continue

    data = np.load(os.path.join(landmarks_path, item))  # loading the numpy array into memory
    frame_count = data.shape[0]

    if frame_count > NUM_FRAMES:  # time-based sampling
        # Pick NUM_FRAMES indices evenly spread from the first to the last
        # frame, so the whole clip is covered instead of only its beginning.
        indices = np.linspace(0, frame_count - 1, NUM_FRAMES).astype(int)
        data = data[indices]
    elif frame_count < NUM_FRAMES:  # padding the videos
        # Append all-zero rows after the last frame; columns are untouched.
        data = np.pad(data, ((0, NUM_FRAMES - frame_count), (0, 0)), mode='constant')
    # else: the array already has NUM_FRAMES rows, no change needed

    videos.append(data)
    labels.append(id_dict[video_id])

X = np.array(videos)
print(X.shape)

Creation of the Y array

In [None]:
# Sorted vocabulary of distinct words; a word's position here is its class index
labels_unique = np.unique(labels)

# Replace every label by the index of its word in labels_unique
labels_encoded = [np.where(labels_unique == word)[0][0] for word in labels]

# One-hot encode the class indices as integers
one_hot = to_categorical(labels_encoded)
Y = one_hot.astype(int)
print(Y.shape)

Splitting the data

In [None]:
# Hold out 20% of the samples for testing; the rest is used for training
split = train_test_split(X, Y, test_size=0.20)
X_train, X_test, y_train, y_test = split

# Class index of each one-hot encoded test target
y_true = np.argmax(y_test, axis=1)
print(y_true)

### LSTM model

Model build

In [None]:
# Second try - masking layer added
# Layers, in order:
#   Masking - marks timesteps whose features all equal 0 to be skipped
#             (input shape taken from X: frames per video, features per frame)
#   LSTM    - 32 units with relu activation
#   Dense   - softmax output with one unit per class
model = Sequential([
    Masking(mask_value=0, input_shape=(X.shape[1], X.shape[2])),
    LSTM(32, activation='relu'),
    Dense(y_train.shape[1], activation='softmax'),
])

Model compile

In [None]:
# TensorBoard callback writing its logs to the 'Logs' folder
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

# Adam optimizer with categorical cross-entropy loss (targets are one-hot
# encoded); accuracy is reported as the metric
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

Model load

In [None]:
# from tensorflow.keras.models import load_model
# loaded_model = load_model('ARM_LSTM_third_run_5_videos.h5')

Model fitting

In [None]:
# Train on the training split and validate on the held-out test split after
# each epoch; the TensorBoard callback records the training logs.
# X_train / X_test are the names produced by train_test_split above.
history = model.fit(X_train, y_train, epochs=1, validation_data=(X_test, y_test), callbacks=[tb_callback])

In [None]:
# Print the model's layers, output shapes and parameter counts
model.summary()

Saving the model

### Prediction

In [None]:
# Make predictions: one row of class scores (softmax output) per test sample
predictions = model.predict(X_test)

# Get the predicted class labels (index of the highest score in each row)
predicted_labels = np.argmax(predictions, axis=1)

# Print the predicted labels
print(f'{predicted_labels} = Predictions')
print(f'{y_true} = Target')

# Calculate the accuracy score
accuracy = accuracy_score(y_true, predicted_labels)

# Print the accuracy score
print("Accuracy:", accuracy)

# Class indices are positions in labels_unique (the vocabulary used to build
# Y), so the words must be looked up there rather than in the per-video
# `labels` list.
for expected_idx, predicted_idx in zip(y_true, predicted_labels):
    print(f'Expected result = {labels_unique[expected_idx]}')
    print(f'Model result = {labels_unique[predicted_idx]}')
    print()

In [None]:
# model.save('ARM_LSTM_third_run_5_videos.h5')