In [None]:
"""
File: asl_pipeline.ipynb
Description: Model training pipeline.

Contributors:
Michael Koenig
Parisa Babaei
Adam Faundez Laurokari
Teo Portase

Created: 2024-12-03
Last Modified: 2025-01-02

Project: A Sign From Above
URL: https://git.chalmers.se/courses/dit826/2024/group4

License: MIT License (see LICENSE file for details)
"""

Import dependencies

In [27]:
import data_prep as prep
import random
import numpy as np
import keras
from sklearn.model_selection import train_test_split
from keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix, classification_report

Set variables

In [2]:
words = ['deaf', 'eat', 'fish', 'friend', 'like', 'milk', 'nice', 'no', 'orange', 'teacher', 'want', 'what', 'where', 'white', 'yes']
select_words = ['eat', 'teacher', 'want', 'no']
path = '../../preprocessing/dataset/'
detector_path = '../models/hand_landmarker.task'
num_features = 126
model_name = 'draft_model'
fps = 20

Get data from mediapipe


In [3]:
X, y, num_videos, highest_frame, bad_videos = prep.get_data(select_words, path, detector_path)

print('Number of videos:', num_videos)
print('Highest frame:', highest_frame)
print('Videos with no landmarkers detected: ', bad_videos)


data prep


  0%|          | 0/4 [00:00<?, ?it/s]

eat


eat:   0%|          | 0/98 [00:00<?, ?it/s]

teacher


teacher:   0%|          | 0/101 [00:00<?, ?it/s]

want


want:   0%|          | 0/80 [00:00<?, ?it/s]

no


no:   0%|          | 0/69 [00:00<?, ?it/s]

Number of videos: 320
Highest frame: 230
Videos with no landmarkers detected:  28


Padding and Masking X

In [4]:
padded_X, mask = prep.padX(X, num_videos, highest_frame, num_features)
print(padded_X.shape)

(320, 230, 126)


Split data

In [5]:
X_train, X_test, y_train, y_test = train_test_split(padded_X, y, test_size=0.2, random_state=42)


X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

Create model

In [6]:
model = keras.Sequential()

model.add(keras.Input(shape=(highest_frame, num_features)))
model.add(layers.Masking(mask_value=0.0))
model.add(layers.LSTM(64))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(len(select_words), activation='sigmoid'))


model.summary()

Train model

In [18]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

model.fit(X_train, y_train, epochs=100, callbacks=[early_stopping])

model.save(f'../models/{model_name}.keras')

Epoch 1/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 112ms/step - accuracy: 0.7300 - loss: 0.6695
Epoch 2/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 110ms/step - accuracy: 0.6645 - loss: 0.7886
Epoch 3/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 106ms/step - accuracy: 0.6803 - loss: 0.8125
Epoch 4/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 108ms/step - accuracy: 0.7466 - loss: 0.6410
Epoch 5/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 103ms/step - accuracy: 0.6989 - loss: 0.8231
Epoch 6/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 109ms/step - accuracy: 0.7172 - loss: 0.7565
Epoch 7/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 105ms/step - accuracy: 0.7532 - loss: 0.6724
Epoch 8/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 99ms/step - accuracy: 0.7555 - loss: 0.6258
Epoch 9/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━

Evaluate model

In [35]:
results = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', results[0])
print('Test accuracy:', results[1])

y_pred = model.predict(X_test, verbose=0)
y_pred_bool = np.argmax(y_pred, axis=1)

confusion = confusion_matrix(y_test, y_pred_bool)
print('\nConfusion matrix:')
print(confusion)

print('\nClassification report:')
print(classification_report(y_test, y_pred_bool, target_names=select_words))


Test loss: 0.72332364320755
Test accuracy: 0.78125

Confusion matrix:
[[16  1  3  2]
 [ 2 12  0  1]
 [ 1  1 15  0]
 [ 1  2  0  7]]

Classification report:
              precision    recall  f1-score   support

         eat       0.80      0.73      0.76        22
     teacher       0.75      0.80      0.77        15
        want       0.83      0.88      0.86        17
          no       0.70      0.70      0.70        10

    accuracy                           0.78        64
   macro avg       0.77      0.78      0.77        64
weighted avg       0.78      0.78      0.78        64



Use model

In [37]:
i = random.randint(0,X_test.shape[0]-1)


X_prediction = X_test[i,:,:]
y_prediction = select_words[y_test[i]]

print(model.predict(np.array([X_prediction])))
print("should be", y_prediction)
print("predicted", select_words[np.argmax(model.predict(np.array([X_prediction])))])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[[0.27270207 0.33485556 0.48453373 0.61507744]]
should be no
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
predicted no


Get test accuracy for each word

In [38]:
dic = prep.get_word_accuracy(select_words, model, X_test, y_test)

for key in dic:
    print(f"{key}: {dic[key][0]}/{dic[key][1]}")

eat: 16/22
teacher: 12/15
want: 15/17
no: 7/10


Save model info

In [39]:
with open(f"../models/{model_name}.env", "w") as file:
    file.write(f"MAX_FRAMES={highest_frame}\n")
    file.write(f"NUM_FEATURES={num_features}\n")
    file.write(f"WORDS={",".join(select_words)}\n")
    file.write(f"FPS={fps}\n")
    file.write(f"TEST_ACC={results[1]}\n")
    file.write(f'WORD_ACC="{dic}"\n')