In [8]:
import cv2 as cv
import numpy as np
import mediapipe as mp
import os

from docutils.io import Input
from keras.src.utils.module_utils import tensorflow
from torch.ao.nn.quantized import Dropout

In [10]:
# Short aliases for the MediaPipe holistic solution and its drawing helpers.
mp_holistic = mp.solutions.holistic
mp_draw = mp.solutions.drawing_utils
# Sign labels; a label's position in this array is used as its integer class id
# when the dataset is loaded.
targets = np.array(["hello", "thank you", "name", "country", "time", "good", "morning", "afternoon", "night", "day", "nothing"])

# Initializing Useful Functions

In [11]:
def recognizer(frame, holistic):
    """Run the holistic model on a single BGR frame.

    Args:
        frame: Image in BGR channel order (it is converted to RGB here).
        holistic: Object exposing ``process(image)``.

    Returns:
        Whatever ``holistic.process`` returns for the converted frame.
    """
    rgb_frame = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
    # The converted copy is read-only while the model processes it.
    rgb_frame.setflags(write=False)
    detection = holistic.process(rgb_frame)
    rgb_frame.setflags(write=True)
    return detection
    

In [12]:
def draw_all_landmarks(image, landmarks):
    """Draw the face mesh, both hands and the pose from `landmarks` on `image`.

    Args:
        image: Image passed to ``mp_draw.draw_landmarks`` as the drawing target.
        landmarks: Holistic result exposing ``face_landmarks``,
            ``left_hand_landmarks``, ``right_hand_landmarks`` and
            ``pose_landmarks``.
    """
    # The face mesh gets explicit thin specs; hands and pose use the defaults.
    face_point_spec = mp_draw.DrawingSpec(thickness=1, circle_radius=1)
    face_line_spec = mp_draw.DrawingSpec(thickness=1, circle_radius=1)
    mp_draw.draw_landmarks(
        image,
        landmarks.face_landmarks,
        mp_holistic.FACEMESH_TESSELATION,
        face_point_spec,
        face_line_spec,
    )

    default_overlays = (
        (landmarks.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
        (landmarks.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
        (landmarks.pose_landmarks, mp_holistic.POSE_CONNECTIONS),
    )
    for landmark_list, connections in default_overlays:
        mp_draw.draw_landmarks(image, landmark_list, connections)
    

In [13]:
def _flatten_landmarks(landmark_list, size, fields):
    """Pack `fields` of every landmark into a zero-padded 1-D vector of `size`.

    Args:
        landmark_list: Object with a ``landmark`` iterable, or ``None`` when the
            body part was not detected.
        size: Fixed length of the returned vector.
        fields: Attribute names read from each landmark, in output order.

    Returns:
        ``np.ndarray`` of shape ``(size,)`` and dtype float64. Missing detections
        (or fewer landmarks than expected) leave the remaining entries at zero.

    Raises:
        IndexError: If the landmarks provide more values than `size` can hold.
    """
    flat = np.zeros(size)
    if landmark_list is None:
        return flat
    values = np.array(
        [[getattr(point, name) for name in fields] for point in landmark_list.landmark],
        dtype=float,
    ).ravel()
    if values.size > size:
        raise IndexError(
            f"got {values.size} landmark values but the feature slot holds only {size}"
        )
    flat[:values.size] = values
    return flat


def get_coordinates(result):
    """Convert a holistic result into one fixed-length feature vector.

    Layout of the returned vector (1662 values in total):
        pose        132 values: x, y, z, visibility per landmark
        left hand    63 values: x, y, z per landmark
        right hand   63 values: x, y, z per landmark
        face       1404 values: x, y, z per landmark

    Args:
        result: Object exposing ``pose_landmarks``, ``left_hand_landmarks``,
            ``right_hand_landmarks`` and ``face_landmarks``; each is either
            ``None`` or has a ``landmark`` iterable.

    Returns:
        ``np.ndarray`` of shape ``(1662,)``; parts that were not detected are
        filled with zeros so every frame has the same length.

    Raises:
        IndexError: If a part reports more landmarks than its slot can hold.
    """
    xyz = ("x", "y", "z")
    return np.concatenate((
        _flatten_landmarks(result.pose_landmarks, 132, xyz + ("visibility",)),
        _flatten_landmarks(result.left_hand_landmarks, 63, xyz),
        _flatten_landmarks(result.right_hand_landmarks, 63, xyz),
        _flatten_landmarks(result.face_landmarks, 1404, xyz),
    ))

## Initializing Functions for Data Collection

In [15]:
def create_folders(data, targets, num_of_videos):
    """Create ``data/<target>/<1..num_of_videos>`` folders for sample storage.

    Args:
        data: Root folder of the dataset; created if it does not exist yet.
        targets: Iterable of label names, one sub-folder each.
        num_of_videos: Number of numbered sequence folders per label.

    Existing sequence folders are reported with a warning and left untouched;
    other errors while creating a sequence folder are printed, not raised.
    """
    for target in targets:
        # makedirs + exist_ok: works when the root is missing and when the cell
        # is re-run after the label folder has already been created.
        os.makedirs(os.path.join(data, target), exist_ok=True)
        for i in range(num_of_videos):
            try:
                os.mkdir(os.path.join(data, target, str(i + 1)))
            except FileExistsError:
                print("WARNING: Folder already exists")
            except Exception as e:
                print("ERROR with creating folders for data")
                print(e)

In [8]:
def webcam_to_array(targets, num_of_videos, duration, webcam, data_dir="data"):
    """Record landmark features from the webcam and save one ``.npy`` per frame.

    For every label in `targets`, `num_of_videos` sequences of `duration`
    frames are captured. Each frame is run through the holistic model, shown in
    a preview window with the landmarks drawn on it, and its feature vector is
    saved to ``<data_dir>/<target>/<video number>/<frame index>.npy``.
    Pressing ``q`` stops the recording early.

    Args:
        targets: Iterable of label names to record.
        num_of_videos: Sequences to record per label (folders are numbered
            from 1).
        duration: Frames per sequence.
        webcam: Opened capture object exposing ``read()`` and ``release()``.
        data_dir: Root folder of the dataset. Passed explicitly so the function
            does not depend on a notebook-level variable.

    Returns:
        None.

    Raises:
        RuntimeError: If a frame cannot be read from `webcam`.
    """
    try:
        with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
            for target in targets:
                for i in range(num_of_videos):
                    for t in range(duration):
                        ret, frame = webcam.read()
                        if not ret or frame is None:
                            raise RuntimeError("Could not read a frame from the webcam")

                        result = recognizer(frame, holistic)
                        draw_all_landmarks(frame, result)

                        first_frame = t == 0
                        if first_frame:
                            cv.putText(frame, 'STARTING COLLECTION', (120,200),
                                       cv.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv.LINE_AA)
                        cv.putText(frame, 'Collecting frames for {} Video Number {}'.format(target, i + 1), (15,12),
                                   cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv.LINE_AA)
                        # Show to screen
                        cv.imshow('webcam', frame)
                        if first_frame:
                            # Pause so the signer can get ready for the new sequence.
                            cv.waitKey(1000)

                        coors = get_coordinates(result)
                        path = os.path.join(data_dir, target, str(i + 1), str(t))
                        np.save(path, coors)

                        if cv.waitKey(1) & 0xFF == ord("q"):
                            return None
    finally:
        # Runs on normal completion, on "q" and on errors alike, so the camera
        # and the preview window never stay open.
        webcam.release()
        cv.destroyAllWindows()
        

# Data Collection

In [14]:
# Number of recorded sequences per label.
num_of_videos = 30
# Root folder of the collected dataset.
data = os.path.join("data")
# Number of frames captured per sequence.
duration = 30

In [9]:
# Prepare the folder tree for the "nothing" label only.
create_folders(data, ["nothing"], num_of_videos)

In [11]:
# Record the sequences for the "nothing" label from the default camera (index 0).
webcam = cv.VideoCapture(0)
try:
    webcam_to_array(["nothing"], num_of_videos, duration, webcam)
except Exception as e:
    print(e)
finally:
    # `finally` also covers a kernel interrupt (KeyboardInterrupt is not an
    # Exception), so the camera is freed and the preview window is closed in
    # every case. Releasing an already released capture is harmless.
    webcam.release()
    cv.destroyAllWindows()



# Data Loading

In [15]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [16]:
# Load every saved frame into x_data with shape (sequences, frames, features)
# and the integer class id of each sequence into y_data.
x_data, y_data = [], []
for num, target in enumerate(targets):
    for i in range(num_of_videos):
        # The per-frame array gets its own name: reusing `data` here would
        # overwrite the dataset root path defined in the configuration cell.
        sequence = [
            np.load(os.path.join("data", target, str(i + 1), str(t) + ".npy"))
            for t in range(duration)
        ]
        x_data.append(sequence)
        y_data.append(num)
x_data = np.array(x_data)
y_data = np.array(y_data)
print("X:", x_data.shape)
print("Y:", y_data.shape)

X: (330, 30, 1662)
Y: (330,)


In [17]:
# One-hot encode the integer class ids. The ndim guard makes the cell safe to
# re-run: once y_data is a 2-D one-hot matrix, encoding it again would flatten
# it and produce a meaningless two-column result.
if y_data.ndim == 1:
    y_data = OneHotEncoder().fit_transform(y_data.reshape(-1, 1)).toarray()

In [18]:
# Stratified hold-out split: 5% of the sequences are kept for testing, with a
# fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.05,
    random_state=42,
    shuffle=True,
    stratify=y_data,
)

In [19]:
# Sanity check: (training sequences, number of classes).
y_train.shape

(313, 11)

# TensorFlow Model Creation

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input, Dropout
from tensorflow.keras.callbacks import TensorBoard

In [64]:
# TensorBoard callback that writes the training logs under ./logs.
log_dir = os.path.join('logs')
callback = TensorBoard(log_dir=log_dir)

In [140]:
# Sequence classifier: three stacked LSTMs summarise the frame sequence, a
# dense head with dropout maps the summary to one probability per label.
recurrent_stack = [
    LSTM(128, return_sequences=True, activation='relu'),
    LSTM(256, return_sequences=True, activation='relu'),
    # Last recurrent layer returns only its final state.
    LSTM(128, return_sequences=False, activation='relu'),
]
classifier_head = [
    Dense(128, activation='elu'),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='elu'),
    Dense(targets.size, activation='softmax'),
]
model = Sequential([
    # Input is (frames per sequence, features per frame).
    Input(shape=(x_train.shape[1], x_train.shape[2])),
    *recurrent_stack,
    *classifier_head,
])

In [141]:
# Multi-class setup: one-hot labels with categorical cross-entropy.
model.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

In [142]:
# Print the layer-by-layer overview of the freshly built model.
model.summary()

## Model Training

In [143]:
# Train on the training split and log to TensorBoard.
# NOTE(review): no validation data and no early-stopping callback are passed,
# so the run only ends after 1000 epochs or a manual interrupt.
model.fit(x_train, y_train, epochs=1000, callbacks=[callback])

Epoch 1/1000
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 50ms/step - categorical_accuracy: 0.0914 - loss: 2.5620
Epoch 2/1000
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step - categorical_accuracy: 0.0834 - loss: 2.4156
Epoch 3/1000
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - categorical_accuracy: 0.1211 - loss: 2.3140
Epoch 4/1000
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step - categorical_accuracy: 0.1133 - loss: 2.3130
Epoch 5/1000
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step - categorical_accuracy: 0.1413 - loss: 2.3719
Epoch 6/1000
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 53ms/step - categorical_accuracy: 0.1635 - loss: 2.3080
Epoch 7/1000
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step - categorical_accuracy: 0.1764 - loss: 2.1678
Epoch 8/1000
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

KeyboardInterrupt: 

In [144]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [145]:
%tensorboard --logdir logs/train

Reusing TensorBoard on port 6006 (pid 27084), started 2:09:24 ago. (Use '!kill 27084' to kill it.)

In [146]:
# Make sure the target folder exists before saving, so the cell also works in
# a fresh checkout where "saved_models" has not been created yet.
os.makedirs("saved_models", exist_ok=True)
model.save("saved_models/HandSignLanuguageDetection.keras")

In [147]:
# Drop the in-memory model; the evaluation section rebuilds it from disk.
del model

# Model Evaluation

In [22]:
# Rebuild the network layer by layer so the saved weights can be loaded into
# it; the layer sequence must stay identical to the one used for training.
model = Sequential()
model.add(Input(shape=(x_train.shape[1], x_train.shape[2])))
# Recurrent part: only the last LSTM collapses the sequence to a single vector.
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(256, return_sequences=True, activation='relu'))
model.add(LSTM(128, return_sequences=False, activation='relu'))
# Dense head with dropout, ending in one softmax unit per label.
model.add(Dense(128, activation='elu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='elu'))
model.add(Dense(targets.size, activation='softmax'))

In [23]:
# Restore the trained weights from the saved model file into the rebuilt network.
model.load_weights("saved_models/HandSignLanuguageDetection.keras")

In [24]:
# Print the layer overview of the rebuilt model.
model.summary()

In [25]:
# Model outputs for the held-out sequences (one softmax row per sequence).
pred = model.predict(x_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 398ms/step


In [26]:
# Turn softmax rows / one-hot rows back into integer class ids and show the
# predicted ids above the true ids for a quick visual comparison.
predictions = pred.argmax(axis=1)
actual = y_test.argmax(axis=1)
print(predictions)
print(actual)

[ 6  9  3  8  9  1  4  2  3  4  3  7  7 10  6  5  0]
[ 6  9  2  8  9  1  4  2  3  4  3  7  7 10  6  5  0]


In [28]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import multilabel_confusion_matrix

In [29]:
# Confusion matrix over all classes, computed from true ids and predicted ids.
confusion_matrix(actual, predictions)

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=int64)

In [30]:
# One 2x2 "rest vs. class" confusion matrix per label, printed with its name.
per_class_matrices = multilabel_confusion_matrix(actual, predictions)
for class_idx in range(len(per_class_matrices)):
    print("=========================")
    print(f"Rest vs. \"{targets[class_idx]}\":")
    print(per_class_matrices[class_idx])
    print("=========================\n")

Rest vs. "hello":
[[16  0]
 [ 0  1]]

Rest vs. "thank you":
[[16  0]
 [ 0  1]]

Rest vs. "name":
[[15  0]
 [ 1  1]]

Rest vs. "country":
[[14  1]
 [ 0  2]]

Rest vs. "time":
[[15  0]
 [ 0  2]]

Rest vs. "good":
[[16  0]
 [ 0  1]]

Rest vs. "morning":
[[15  0]
 [ 0  2]]

Rest vs. "afternoon":
[[15  0]
 [ 0  2]]

Rest vs. "night":
[[16  0]
 [ 0  1]]

Rest vs. "day":
[[15  0]
 [ 0  2]]

Rest vs. "nothing":
[[16  0]
 [ 0  1]]



In [31]:
# Per-class precision / recall / F1; rows are labelled with the integer class ids.
print(classification_report(actual, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1
           2       1.00      0.50      0.67         2
           3       0.67      1.00      0.80         2
           4       1.00      1.00      1.00         2
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00         2
           7       1.00      1.00      1.00         2
           8       1.00      1.00      1.00         1
           9       1.00      1.00      1.00         2
          10       1.00      1.00      1.00         1

    accuracy                           0.94        17
   macro avg       0.97      0.95      0.95        17
weighted avg       0.96      0.94      0.94        17

