In [1]:
import cv2 as cv
import mediapipe as mp
import numpy as np
import os

from collections import namedtuple

In [2]:
mpHands = mp.solutions.hands
mpConnect = mp.solutions.hands_connections
mpDraw = mp.solutions.drawing_utils

In [3]:
def get_coordinates(hand: namedtuple) -> np.ndarray:
    """
    Converts NamedTuple of coordinates to numpy array
    :param hand: - output of mediapipe hand recognizer
    :return: flatten numpy array of coordinates
    """
    hand_arr = np.zeros(63)
    i = 0
    for el in hand.landmark:
        hand_arr[i] = el.x
        hand_arr[i + 1] = el.y
        hand_arr[i + 2] = el.z
        i += 3
    return hand_arr

In [4]:
def recognizer(frame: np.ndarray, model: mpHands.Hands) -> namedtuple:
    """
    Recognizing hands landmarks on frame
    :param frame: - np.ndarray representing hands landmarks
    :param model: - MediaPipe solution for hand recognition 
    :return: - NamedTuple of hands landmarks
    """
    image = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
    image.flags.writeable = False
    result = model.process(image).multi_hand_landmarks
    image.flags.writeable = True
    return result

# Data Collection

In [5]:
digits = np.array(["one", "two", "three", "four", "five"])
digits_map = {
    "one": 1,
    "two": 2,
    "three": 3,
    "four": 4,
    "five": 5
}
num_of_samples = 100
data = os.path.join("data")

In [19]:
def create_folders(path: str, targets: np.ndarray) -> None:
    """
    Creates folders for each target if they do not exist
    :param path: - data folder path
    :param targets: - array of classification targets
    :return: 
    """
    for target in targets:
        new_path = os.path.join(path, target)
        os.mkdir(new_path)

In [8]:
create_folders(data, digits)

In [34]:
def collect_images():
    """
    Collect data samples in numpy array format (.npy files)
    :return:
    """
    webcam = cv.VideoCapture(0)
    if not webcam.isOpened():
        print("Error: Camera not found.")
        return
    
    print("Press 's' to start capturing images, and 'q' to quit.")
    try:
        with mpHands.Hands(max_num_hands=1) as mp_hands:
            for digit in digits:
                print(f"Get ready to show '{digit}' gesture.")
                input("Press Enter when ready.")
                count = 0
                while count < num_of_samples:
                    ret, frame = webcam.read()
                    if not ret:
                        print("Failed to grab frame.")
                        break
                    if count == 0:
                        cv.putText(frame, 'Press \'s\' to start capturing '
                                          'images, and \'q\' to quit.', (50,200),
                                   cv.FONT_HERSHEY_SIMPLEX, 0.7, (240, 240, 240), 1, cv.LINE_AA)
                    cv.putText(frame, f'Collecting frames for {digit}'
                                      f' Video Number {count + 1}', (15,12), 
                               cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, 
                               cv.LINE_AA)
                    
                    result = recognizer(frame, mp_hands)
                    if result is not None:
                        mpDraw.draw_landmarks(frame, result[0], 
                                              mpConnect.HAND_CONNECTIONS)
                    
                    cv.imshow("webcam", frame)
                    key = cv.waitKey(1) & 0xFF
                    if key == ord('s') and result is not None:
                        npy_data = get_coordinates(hand=result[0])
                        filename = os.path.join(data, digit, f"{count + 1}.npy")
                        np.save(filename, npy_data)
        
                        print(f"Saved {filename}")
                        count += 1
        
                    elif key == ord('q'):
                        print("Exiting early.")
                        break
    except Exception as e:
        print(e)
    finally:
        webcam.release()
        cv.destroyAllWindows()


In [33]:
collect_images()

Press 's' to start capturing images, and 'q' to quit.
Get ready to show 'one' gesture.
Saved data\one\1.npy
Saved data\one\2.npy
Saved data\one\3.npy
Saved data\one\4.npy
Saved data\one\5.npy
Saved data\one\6.npy
Saved data\one\7.npy
Saved data\one\8.npy
Saved data\one\9.npy
Saved data\one\10.npy
Saved data\one\11.npy
Saved data\one\12.npy
Saved data\one\13.npy
Saved data\one\14.npy
Saved data\one\15.npy
Saved data\one\16.npy
Saved data\one\17.npy
Saved data\one\18.npy
Saved data\one\19.npy
Saved data\one\20.npy
Saved data\one\21.npy
Saved data\one\22.npy
Saved data\one\23.npy
Saved data\one\24.npy
Saved data\one\25.npy
Saved data\one\26.npy
Saved data\one\27.npy
Saved data\one\28.npy
Saved data\one\29.npy
Saved data\one\30.npy
Saved data\one\31.npy
Saved data\one\32.npy
Saved data\one\33.npy
Saved data\one\34.npy
Saved data\one\35.npy
Saved data\one\36.npy
Saved data\one\37.npy
Saved data\one\38.npy
Saved data\one\39.npy
Saved data\one\40.npy
Saved data\one\41.npy
Saved data\one\42.np

# Data Loading

In [64]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [65]:
digits = np.array(["one", "two", "three", "four", "five"])
digits_map = {
    "one": 1,
    "two": 2,
    "three": 3,
    "four": 4,
    "five": 5
}
num_of_samples = 100
data = os.path.join("data")

In [66]:
x_data, y_data = [], []
for digit in digits:
    for i in range(num_of_samples):
        arr = np.load(os.path.join(data, digit, f"{i + 1}.npy"))
        x_data.append(arr)
        y_data.append(digits_map[digit])
x_data = np.array(x_data)
y_data = OneHotEncoder().fit_transform(np.array(y_data).reshape(-1, 1)).toarray()
print("X_DATA:", x_data.shape)
print("Y_DATA:", y_data.shape)

X_DATA: (500, 63)
Y_DATA: (500, 5)


In [67]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.15, random_state=42)

# Model Initialization

In [68]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import Adam

In [85]:
model = nn.Sequential(
        nn.Linear(63, 16),
        nn.ReLU(),
        nn.Linear(16, 5),
        nn.Softmax(dim=1)
    )

In [86]:
optimizer = Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()
epochs = 500

In [87]:
dataset = TensorDataset(torch.tensor(x_data).float(), torch.tensor(y_data).float())
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [88]:
for epoch in range(epochs):
    running_loss = []
    for batch, (inputs, actual) in enumerate(dataloader):
        optimizer.zero_grad()
        output = model(inputs)
        loss = criterion(output, actual)
        running_loss.append(loss.item())
        loss.backward()
        optimizer.step()
        print(f"EPOCH {epoch + 1} === BATCH: {batch + 1} === LOSS: {loss.item()}")
    print(f"EPOCH {epoch + 1} LOSS: {np.mean(running_loss)}")

EPOCH 1 === BATCH: 1 === LOSS: 0.16110463440418243
EPOCH 1 === BATCH: 2 === LOSS: 0.16104494035243988
EPOCH 1 === BATCH: 3 === LOSS: 0.16175904870033264
EPOCH 1 === BATCH: 4 === LOSS: 0.1627247929573059
EPOCH 1 === BATCH: 5 === LOSS: 0.15988093614578247
EPOCH 1 === BATCH: 6 === LOSS: 0.1597270667552948
EPOCH 1 === BATCH: 7 === LOSS: 0.16200926899909973
EPOCH 1 === BATCH: 8 === LOSS: 0.1600249856710434
EPOCH 1 === BATCH: 9 === LOSS: 0.1596592515707016
EPOCH 1 === BATCH: 10 === LOSS: 0.15907110273838043
EPOCH 1 === BATCH: 11 === LOSS: 0.15757450461387634
EPOCH 1 === BATCH: 12 === LOSS: 0.1607411801815033
EPOCH 1 === BATCH: 13 === LOSS: 0.15908324718475342
EPOCH 1 === BATCH: 14 === LOSS: 0.16088569164276123
EPOCH 1 === BATCH: 15 === LOSS: 0.16136077046394348
EPOCH 1 === BATCH: 16 === LOSS: 0.16319260001182556
EPOCH 1 LOSS: 0.16061525139957666
EPOCH 2 === BATCH: 1 === LOSS: 0.16072580218315125
EPOCH 2 === BATCH: 2 === LOSS: 0.1616397202014923
EPOCH 2 === BATCH: 3 === LOSS: 0.15959659218788

In [89]:
torch.save(model.state_dict(), os.path.join("saved_models", "FingerCounter(alp-1).pth"))

In [90]:
del model

# Model Evaluation

In [91]:
import torch
import torch.nn as nn

In [92]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import classification_report

In [93]:
model = nn.Sequential(
        nn.Linear(63, 16),
        nn.ReLU(),
        nn.Linear(16, 5),
        nn.Softmax(dim=1)
    )

In [94]:
model.load_state_dict(torch.load(os.path.join("saved_models", "FingerCounter(alp-1).pth")))

  model.load_state_dict(torch.load(os.path.join("saved_models", "FingerCounter(alp-1).pth")))


<All keys matched successfully>

In [95]:
model.eval()

Sequential(
  (0): Linear(in_features=63, out_features=16, bias=True)
  (1): ReLU()
  (2): Linear(in_features=16, out_features=5, bias=True)
  (3): Softmax(dim=1)
)

In [96]:
pred = model(torch.tensor(x_test).float()).detach().numpy()
print("PRED:", pred.shape)
print("Y_TEST:", y_test.shape)

PRED: (75, 5)
Y_TEST: (75, 5)


In [97]:
predicted = np.argmax(pred, axis=-1) + 1
actual = np.argmax(y_test, axis=-1) + 1

In [98]:
confusion_matrix(actual, predicted)

array([[21,  1,  0,  0,  0],
       [ 0, 11,  0,  0,  0],
       [ 0,  1,  7,  0,  0],
       [ 0,  0,  0, 17,  0],
       [ 0,  0,  0,  0, 17]], dtype=int64)

In [99]:
for i, matrix in enumerate(multilabel_confusion_matrix(actual, predicted)):
    print("=========================")
    print(f"Rest vs. \"{digits[i]}\":")
    print(matrix)
    print("=========================\n")

Rest vs. "one":
[[53  0]
 [ 1 21]]

Rest vs. "two":
[[62  2]
 [ 0 11]]

Rest vs. "three":
[[67  0]
 [ 1  7]]

Rest vs. "four":
[[58  0]
 [ 0 17]]

Rest vs. "five":
[[58  0]
 [ 0 17]]



In [100]:
print(classification_report(actual, predicted))

              precision    recall  f1-score   support

           1       1.00      0.95      0.98        22
           2       0.85      1.00      0.92        11
           3       1.00      0.88      0.93         8
           4       1.00      1.00      1.00        17
           5       1.00      1.00      1.00        17

    accuracy                           0.97        75
   macro avg       0.97      0.97      0.97        75
weighted avg       0.98      0.97      0.97        75

