# ASL Recognition with Pose estimation

In [None]:
import matplotlib.pyplot as plt
import os
import string
import numpy as np
import torchvision
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch
import torchvision.transforms as transforms
import pickle
import random
import seaborn as sn
import pandas as pd
import copy
from sklearn.metrics import confusion_matrix
import cv2

Install and import the MediaPipe Hands API

In [None]:
# IPython shell escapes: show the interpreter version and upgrade pip itself
!python -V
!python -m pip install --upgrade pip

# Install MediaPipe into the current environment before importing it
!pip install mediapipe

import mediapipe as mp
# Short aliases for the MediaPipe drawing helpers and the Hands solution;
# mp_hands is used by dataFromImage to run the hand landmark detector
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_hands = mp.solutions.hands

Define classes

In [None]:
# Label names: the 26 upper-case letters followed by the 'del' and 'space' signs.
# The position of a name in the tuple is the integer label used for training.
classes = tuple(string.ascii_uppercase) + ('del', 'space')

# Same label set extended with one trailing 'null' class
classes_NULL = classes + ('null',)

Helper functions

In [None]:
# Normalizes hand keypoints
def transformPoints(handcoords):
    """Min-max normalize a flat list of interleaved hand keypoint coordinates.

    Args:
        handcoords: Flat sequence ``[x0, y0, x1, y1, ...]``; even positions
            are x values, odd positions are y values.

    Returns:
        A list of the same length in which the x values and the y values are
        each rescaled independently to the range [0, 1]. An axis whose values
        are all identical (zero extent) is mapped to 0.0 instead of raising
        ZeroDivisionError, and an empty input yields an empty list.
    """
    coords = list(handcoords)
    if not coords:
        return []

    def axis_range(values):
        # (minimum, extent) of one axis; an empty axis has no extent
        if not values:
            return 0.0, 0.0
        lowest = min(values)
        return lowest, max(values) - lowest

    x_min, x_span = axis_range(coords[0::2])
    y_min, y_span = axis_range(coords[1::2])

    transformed = []
    for idx, coord in enumerate(coords):
        lowest, span = (y_min, y_span) if idx % 2 else (x_min, x_span)
        # Guard against a degenerate axis (all points share the coordinate)
        transformed.append((coord - lowest) / span if span else 0.0)
    return transformed

In [None]:
# Runs MediaPipe Hands on given image file
def dataFromImage(file):
    """Run MediaPipe Hands on an image file and return normalized keypoints.

    Args:
        file: Path of the image to analyse.

    Returns:
        A ``(coords_tensor, success)`` pair. On success ``coords_tensor`` is a
        float tensor holding the interleaved x/y coordinates of every landmark
        of the selected hand, normalized by ``transformPoints``. When the file
        cannot be decoded as an image or no hand is detected, the pair is
        ``(torch.tensor([0]), False)``.
    """
    image = cv2.imread(file)
    if image is None:
        # cv2.imread returns None (it does not raise) for unreadable or
        # non-image files; report a failed extraction instead of crashing
        # inside cvtColor.
        return torch.tensor([0]), False

    with mp_hands.Hands(
        static_image_mode=True,
        max_num_hands=2,
        min_detection_confidence=0.5) as hands:
        # Convert the BGR image to RGB before processing.
        results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        if not results.multi_hand_landmarks:
            return torch.tensor([0]), False

        # Pick the hand labelled 'Left' (the right hand, since the input is
        # treated as mirrored by default); fall back to the first detection.
        righthand_idx = 0
        for i, handedness in enumerate(results.multi_handedness):
            if handedness.classification[0].label == 'Left':
                righthand_idx = i
                break

        # Flatten the landmarks into [x0, y0, x1, y1, ...]
        handcoords = []
        for point in results.multi_hand_landmarks[righthand_idx].landmark:
            handcoords.append(point.x)
            handcoords.append(point.y)
        coords_tensor = torch.Tensor(transformPoints(handcoords))
    return coords_tensor, True

In [None]:
# Loads the images from the folder structure and runs pose estimation
def loadFromFolders(rootpath, classes):
    """Build a keypoint dataset from a folder-per-class image directory.

    Args:
        rootpath: Directory that contains one sub-folder per class name. A
            trailing path separator is optional.
        classes: Sequence of class (folder) names; the index of a name is the
            integer label stored with its samples.

    Returns:
        A list of ``(coords, label)`` tuples, one for every file on which
        ``dataFromImage`` reported success. Classes whose folder is missing
        are reported on stdout and skipped.
    """
    dataset = []
    for label, cl in enumerate(classes):
        # os.path.join works with or without a trailing separator on rootpath
        path = os.path.join(rootpath, cl)
        if not os.path.isdir(path):
            print("Class", cl, "is not in current dir")
            continue
        # Sort the listing: os.listdir order is arbitrary, and a stable order
        # keeps the seeded dataset splits reproducible between runs.
        for idx, filename in enumerate(sorted(os.listdir(path))):
            f = os.path.join(path, filename)
            # checking if it is a file
            if not os.path.isfile(f):
                continue
            coords, success = dataFromImage(f)
            if success:
                dataset.append((coords, label))
                # Print progress
                if idx % 100 == 0:
                    print(classes[label], ":", idx, " ready")
        print("Class", cl, "is ready")
    return dataset

In [None]:
# Plot and save train statistics
def saveTrainPlots(best_result, statistics, name):
    """Draw the loss and accuracy curves of a training run and save them.

    Args:
        best_result: Indexable whose first item is the validation number of
            the best checkpoint and whose second item is its loss.
        statistics: Indexable holding, in order, the train losses, the
            validation losses, the accuracies and the validation numbers.
        name: Prefix of the output files ``./<name>_losses.png`` and
            ``./<name>_accuracy.png``.
    """
    train_loss, valid_loss, accuracy, validation_num = (
        statistics[k] for k in range(4)
    )
    best_x, best_y = best_result[0], best_result[1]

    # Horizontal shift of the annotation, proportional to the series length
    textoffset = int(len(validation_num) * 0.0375)
    text = f'Val.: {best_x!s} Loss.: {round(best_y, 3)!s}'

    # Loss curves with the best checkpoint marked and annotated
    plt.figure(dpi=500)
    plt.xlabel('Validation number')
    plt.ylabel('Loss')
    curves = (
        (train_loss, 'r', 'Train loss'),
        (valid_loss, 'b', 'Validation loss'),
    )
    for series, fmt, legend in curves:
        plt.plot(validation_num, series, fmt, label=legend)
    plt.plot(best_x, best_y, 'ko')
    plt.text(best_x + textoffset, best_y, text,
             backgroundcolor=(0.9, 0.9, 0.9))
    plt.legend()
    plt.savefig('./' + name + '_losses.png')

    # Accuracy curve
    plt.figure(dpi=500)
    plt.xlabel('Validation number')
    plt.ylabel('Accuracy')
    plt.plot(validation_num, accuracy, 'g')
    plt.savefig('./' + name + '_accuracy.png')

In [None]:
# Plot and save test confusion matrix
def saveConfMtx(truth_array, pred_array, modelName, NULL):
    """Plot the test confusion matrix as a heatmap and save it as a PNG.

    Args:
        truth_array: Ground-truth integer labels.
        pred_array: Predicted integer labels, aligned with ``truth_array``.
        modelName: Prefix of the output file ``./<modelName>conf_mtx.png``.
        NULL: When true the axes use ``classes_NULL``, otherwise ``classes``.
    """
    cl = classes_NULL if NULL else classes
    outputconfmtx = './'+modelName+'conf_mtx.png'
    # Pin the label set so the matrix is always len(cl) x len(cl). Without it
    # a class absent from both arrays shrinks the matrix and the DataFrame
    # construction below fails with a shape mismatch.
    cf_matrix = confusion_matrix(truth_array, pred_array,
                                 labels=list(range(len(cl))))
    df_cm = pd.DataFrame(cf_matrix, index=list(cl), columns=list(cl))
    plt.figure(figsize = (12,7))
    sn.heatmap(df_cm, annot=True, fmt="1.0f")
    plt.savefig(outputconfmtx)

In [None]:
# Print test accuracy
def printAccuracy(truth_array, pred_array):
    """Print the share of matching labels as a floored integer percentage.

    Args:
        truth_array: Ground-truth labels.
        pred_array: Predicted labels, indexed in parallel with ``truth_array``.
    """
    total = len(truth_array)
    correct = sum(
        1 for pos, expected in enumerate(truth_array)
        if expected == pred_array[pos]
    )
    print(f'Accuracy of the network on the test images: {100 * correct // total} %')

Import or generate the dataset

In [None]:
generatedata = False # Needs to be true for running MediaPipe Hands

# If keypoint coordinate tensors are not extracted yet
if generatedata:
    dataset_letters_path = "/kaggle/input/asl-alphabet/asl_alphabet_train/asl_alphabet_train/"
    dataset_null_path = "/kaggle/input/nothing/"

    dataset_letters = loadFromFolders(dataset_letters_path, classes)
    # NOTE(review): labels come from the index of the folder name in the
    # tuple passed here, so with `classes` no sample can receive the 'null'
    # label of classes_NULL - confirm whether classes_NULL was intended.
    dataset_null = loadFromFolders(dataset_null_path, classes)

    # Both lists are written back-to-back into one file; the context manager
    # closes the handle even if pickling fails.
    with open('dataset_letters-null.dat', 'wb') as file:
        pickle.dump(dataset_letters, file)
        pickle.dump(dataset_null, file)

# Load keypoint dataset from file if already extracted and saved
else:
    # NOTE: unpickling can execute arbitrary code; only load a file that
    # comes from a trusted source.
    with open('/kaggle/input/datatensors/dataset_letters-null.dat', 'rb') as file:
        # Read in the same order the two lists were dumped
        dataset_letters = pickle.load(file)
        dataset_null = pickle.load(file)

In [None]:
# Print number of elements for each class
# One counter per class name, indexed by the integer label of each sample
classrep = [0 for _ in classes_NULL]
for coords, label in dataset_letters + dataset_null:
    classrep[label] += 1

# Report the tally next to the class name it belongs to
for label, num in enumerate(classrep):
    print(classes_NULL[label] + ': ' + str(num))

# Creating models and training them

Define models

In [None]:
class SmallNet(nn.Module):
    """Fully connected classifier: 42 inputs, 400-400-100 hidden units.

    The linear layers are registered as ``fc1`` .. ``fc4``; the last one maps
    to ``numofclasses`` raw scores (no softmax is applied).
    """

    def __init__(self, numofclasses):
        super().__init__()
        widths = (42, 400, 400, 100, numofclasses)
        # Create fc1..fc4 in order, each connecting two consecutive widths
        for number, (fan_in, fan_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f'fc{number}', nn.Linear(fan_in, fan_out))

    def forward(self, x):
        # ReLU after every hidden layer, none after the output layer
        for hidden in (self.fc1, self.fc2, self.fc3):
            x = F.relu(hidden(x))
        return self.fc4(x)

class LargeNet(nn.Module):
    """Fully connected classifier: 42 inputs, 500-1000-1000-1000-500 hidden units.

    The linear layers are registered as ``fc1`` .. ``fc6``; the last one maps
    to ``numofclasses`` raw scores (no softmax is applied).
    """

    def __init__(self, numofclasses):
        super().__init__()
        widths = (42, 500, 1000, 1000, 1000, 500, numofclasses)
        # Create fc1..fc6 in order, each connecting two consecutive widths
        for number, (fan_in, fan_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f'fc{number}', nn.Linear(fan_in, fan_out))

    def forward(self, x):
        # ReLU after every hidden layer, none after the output layer
        for hidden in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5):
            x = F.relu(hidden(x))
        return self.fc6(x)

Set training parameters

In [None]:
import torch.optim as optim

# Loss shared by the training and validation code
criterion = nn.CrossEntropyLoss()

# Use the first CUDA device when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

print(device)

Prepare the dataset and the model for training

In [None]:
def prepareDataAndModel(modelsize, NULL):
    """Create the network, its optimizer and the train/validation/test loaders.

    Args:
        modelsize: ``'small'`` for SmallNet or ``'large'`` for LargeNet.
        NULL: When true the 'null' samples and class are included.

    Returns:
        ``(model, modelName, optimizer, trainloader, validationloader,
        testloader)``. ``modelName`` is ``modelsize`` with ``'_NULL'``
        appended when ``NULL`` is true.

    Raises:
        ValueError: If ``modelsize`` is neither ``'small'`` nor ``'large'``.
    """
    # PREPARE MODEL
    numofclasses = len(classes_NULL) if NULL else len(classes)
    if modelsize == 'small':
        net = SmallNet(numofclasses)
    elif modelsize == 'large':
        net = LargeNet(numofclasses)
    else:
        # Fail with a clear message instead of an UnboundLocalError below
        raise ValueError(
            f"Unknown modelsize {modelsize!r}; expected 'small' or 'large'")
    optimizer = optim.Adam(net.parameters(), lr=0.001)
    model = net.to(device)

    # PREPARE DATASET
    batch_size = 16
    if NULL:
        dataset = dataset_letters + dataset_null
        modelName = modelsize + '_NULL'
    else:
        dataset = dataset_letters
        modelName = modelsize

    # Split train-valid-test 80-10-10 with fixed seeds so every configuration
    # sees the same partition of a given dataset.
    n_train = int(0.8 * len(dataset))
    trainset, heldout = random_split(
        dataset, [n_train, len(dataset) - n_train],
        generator=torch.Generator().manual_seed(42))
    n_valid = int(0.5 * len(heldout))
    validationset, testset = random_split(
        heldout, [n_valid, len(heldout) - n_valid],
        generator=torch.Generator().manual_seed(42))

    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
    # Evaluation does not need shuffling; a fixed order keeps the reported
    # validation loss deterministic for a given model state.
    validationloader = torch.utils.data.DataLoader(validationset, batch_size=batch_size, shuffle=False)
    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False)
    return model, modelName, optimizer, trainloader, validationloader, testloader

Functions for training

In [None]:
def validation(model, device, valid_loader, loss_function):
    """Evaluate ``model`` on every batch of ``valid_loader``.

    The model is switched to eval mode and gradients are disabled.

    Args:
        model: Network to evaluate.
        device: Device the batches are moved to.
        valid_loader: Sized iterable of batches; item 0 of a batch holds the
            inputs and item 1 the labels.
        loss_function: Callable returning the loss of ``(outputs, labels)``.

    Returns:
        ``(mean_loss, accuracy)``: the per-batch loss averaged over the number
        of batches, and the floored integer percentage of correct predictions.
    """
    model.eval()
    summed_loss = 0
    hits = 0
    seen = 0

    with torch.no_grad():
        for batch in valid_loader:
            inputs = batch[0].to(device)
            labels = batch[1].to(device)
            outputs = model(inputs)

            # Accumulate the scalar loss of this batch
            summed_loss += loss_function(outputs, labels).item()

            # Predicted class = index of the largest output per sample
            predicted = torch.max(outputs.data, 1)[1]
            seen += labels.size(0)
            hits += (predicted == labels).sum().item()

    mean_loss = summed_loss / len(valid_loader)
    accuracy = 100 * hits // seen
    return mean_loss, accuracy

In [None]:
def train(model, optimizer, trainloader, validationloader, epochs=10, validate_every=50):
    """Train ``model`` and keep the weights with the lowest validation loss.

    Args:
        model: Network to train (expected to be on ``device`` already).
        optimizer: Optimizer bound to the parameters of ``model``.
        trainloader: Iterable of training batches (inputs at index 0, labels
            at index 1).
        validationloader: Loader handed to ``validation`` at every checkpoint.
        epochs: Number of passes over ``trainloader``.
        validate_every: Number of training batches between two checkpoints.

    Returns:
        ``(model, best_result, statistics)``. ``best_result`` is
        ``(validation_number, validation_loss)`` of the best checkpoint and
        ``statistics`` is ``(train_loss, valid_loss, accuracy,
        validation_num)``, four parallel lists with one entry per checkpoint.
        ``train_loss`` holds the mean batch loss since the previous checkpoint.

    Raises:
        ValueError: If ``validate_every`` is smaller than 1.
    """
    if validate_every < 1:
        raise ValueError("validate_every must be a positive integer")

    train_loss = []
    valid_loss = []
    accuracy = []
    validation_num = []
    min_loss = float('inf')
    best_model_wts = None
    best_result = None
    val_cntr = 0
    running_loss = 0.0
    window_batches = 0

    for epoch in range(epochs):  # Loop over the dataset multiple times
        model.train()
        running_loss = 0.0
        window_batches = 0
        for i, data in enumerate(trainloader, 0):
            # Get the inputs; data is a list of [inputs, labels]
            inputs, labels = data[0].to(device), data[1].to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            window_batches += 1

            # Validate and save statistics after every `validate_every` batches
            if i % validate_every == validate_every - 1:
                # Average over the window so the value is comparable with the
                # per-batch mean returned by validation(); the accumulator is
                # reset only here, not on every batch.
                mean_train_loss = running_loss / window_batches
                running_loss = 0.0
                window_batches = 0
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {mean_train_loss:.3f}')
                val_cntr = val_cntr + 1
                the_current_loss, the_current_acc = validation(model, device, validationloader, criterion)
                # validation() leaves the model in eval mode; switch back
                model.train()
                train_loss.append(mean_train_loss)
                valid_loss.append(the_current_loss)
                accuracy.append(the_current_acc)
                validation_num.append(val_cntr)
                print("validation loss =", str(the_current_loss))
                # If the current model is the best yet
                if the_current_loss < min_loss:
                    min_loss = the_current_loss
                    best_model_wts = copy.deepcopy(model.state_dict())
                    print("NEW MIN LOSS =", str(min_loss))
                    best_result = (val_cntr, min_loss)

    print('Finished Training')
    if best_model_wts is None:
        # No checkpoint was recorded as best (e.g. fewer than `validate_every`
        # batches per epoch): evaluate the final model once so the caller
        # still gets a valid result, and keep the final weights.
        val_cntr = val_cntr + 1
        the_current_loss, the_current_acc = validation(model, device, validationloader, criterion)
        train_loss.append(running_loss / window_batches if window_batches else float('nan'))
        valid_loss.append(the_current_loss)
        accuracy.append(the_current_acc)
        validation_num.append(val_cntr)
        best_result = (val_cntr, the_current_loss)
    else:
        # Load the best model
        model.load_state_dict(best_model_wts)
    statistics = (train_loss, valid_loss, accuracy, validation_num)
    return model, best_result, statistics

In [None]:
def test(model, testloader):
    """Collect ground-truth and predicted labels for every test sample.

    The model is switched to eval mode and gradients are disabled.

    Args:
        model: Trained network.
        testloader: Iterable of batches (features at index 0, labels at
            index 1).

    Returns:
        ``(truth_array, pred_array)``: two parallel lists of integer labels.
    """
    model.eval()
    truth_array, pred_array = [], []
    # Evaluation only, so no gradients are needed
    with torch.no_grad():
        for batch in testloader:
            features = batch[0].to(device)
            labels = batch[1].to(device)
            # Turn the raw scores into per-class probabilities
            probabilities = torch.nn.functional.softmax(model(features), dim=1)
            # The most probable class is the prediction
            predicted = torch.max(probabilities.data, 1)[1]
            truth_array.extend(labels.tolist())
            pred_array.extend(predicted.tolist())
    return truth_array, pred_array

In [None]:
# Training a model and saving its statistics
def doProcess(modelsize, NULL):
    """Train, evaluate and save one model configuration.

    Runs the full pipeline: prepare data and model, train, save the training
    plots, run the test set, print the accuracy, save the confusion matrix
    and store the weights in ``./<modelName>.pth``.

    Args:
        modelsize: Forwarded to ``prepareDataAndModel``.
        NULL: Forwarded to ``prepareDataAndModel`` and ``saveConfMtx``.
    """
    # Prepare the data and the model
    prepared = prepareDataAndModel(modelsize, NULL)
    model, modelName, optimizer, trainloader, validationloader, testloader = prepared

    # Train and keep the best checkpoint
    print('TRAINING AND SAVING MODEL: ' + modelName)
    model, best_result, statistics = train(
        model, optimizer, trainloader, validationloader)

    # Training curves
    saveTrainPlots(best_result, statistics, modelName)

    # Test-set evaluation: accuracy on stdout, confusion matrix on disk
    truth_array, pred_array = test(model, testloader)
    printAccuracy(truth_array, pred_array)
    saveConfMtx(truth_array, pred_array, modelName, NULL)

    # Persist the weights of the best model
    weights_path = './' + modelName + '.pth'
    torch.save(model.state_dict(), weights_path)
    print('---------------------------------------------------')

In [None]:
# Run the whole process on different configurations
# Without the 'null' class first, then with it; small before large each time
for use_null in (False, True):
    for size in ('small', 'large'):
        doProcess(modelsize=size, NULL=use_null)