In [1]:
from torch.utils.data import Dataset
import cv2
import numpy as np
import torch
import os
import json
import pandas as pd 
from pytube import YouTube

In [2]:
def open_json_file(fname):
    """Load a JSON document from disk.

    Args:
        fname: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for the file).
    """
    # Read as UTF-8 explicitly: without it ``open`` falls back to the platform
    # default encoding, which can mis-decode non-ASCII text in the file.
    with open(fname, 'r', encoding='utf-8') as f:
        return json.load(f)

In [3]:
# Read the three MS-ASL split files into one DataFrame each.
df_train, df_val, df_test = map(pd.read_json, (
    'datasets/MS-ASL/MSASL_train.json',
    'datasets/MS-ASL/MSASL_val.json',
    'datasets/MS-ASL/MSASL_test.json',
))

# Only the keys of hmap_vids.json are kept (a dict keys view); they are
# presumably the sign names of interest -- verify against the file.
top_10_signs = open_json_file('datasets/MS-ASL/hmap_vids.json').keys()
# val_hmap = open_json_file('datasets/MS-ASL/hmap_vids_val.json')
# test_hmap = open_json_file('datasets/MS-ASL/hmap_vids_test.json')

# Row counts of (train, val, test) before any filtering.
tuple(frame.shape[0] for frame in (df_train, df_val, df_test))

(16054, 5287, 4172)

In [4]:
# Keep only the clips whose sign (clean_text) is in the selected vocabulary.
# `.copy()` turns each filtered result into an independent frame, so columns
# can be assigned to it afterwards without pandas emitting
# SettingWithCopyWarning.
df_train = df_train[df_train.clean_text.isin(top_10_signs)].copy()
df_val = df_val[df_val.clean_text.isin(top_10_signs)].copy()
df_test = df_test[df_test.clean_text.isin(top_10_signs)].copy()

In [5]:
def video_id_from_url(url):
    """Return the video id that pytube parses out of a YouTube URL."""
    return YouTube(url).video_id


### merge_paths expects a string with the format 'sign+video_id+end_time'
def merge_paths(x):
    """Turn a 'sign+video_id+end_time' key into the trimmed clip's file path.

    The result is ``datasets/MS-ASL/trimmed_videos/<sign>/<video_id>_<end_time>.mp4``
    joined with the platform's path separator.
    """
    parts = x.split('+')  # split once instead of once per field
    return os.path.join('datasets', 'MS-ASL', 'trimmed_videos', parts[0], parts[1] + '_' + parts[2] + '.mp4')


def add_video_columns(df):
    """Return a new frame equal to ``df`` plus the ``vid`` and ``fpath`` columns.

    ``vid`` is the YouTube video id taken from ``url``; ``fpath`` is the path
    built by ``merge_paths`` from ``clean_text``, ``vid`` and ``end_time``
    truncated to an integer. ``df`` itself is not modified, so this is safe to
    call on a filtered slice.
    """
    vid = df.url.apply(video_id_from_url)
    path_key = df.clean_text + '+' + vid + '+' + df.end_time.astype(int).astype(str)
    return df.assign(vid=vid, fpath=path_key.apply(merge_paths))


# One helper for all three splits instead of three pasted pipelines.
df_train = add_video_columns(df_train)
df_val = add_video_columns(df_val)
df_test = add_video_columns(df_test)

df_train.head()

Unnamed: 0,org_text,clean_text,start_time,signer_id,signer,start,end,file,label,height,fps,end_time,url,text,box,width,review,vid,fpath
15,like,like,0.0,269,53,0,52,SignSchool really like,6,360,29.97,1.735,www.youtube.com/watch?v=7y5Ye-2-ZBs,like,"[0.040461480617523006, 0.335311889648437, 0.99...",640,,7y5Ye-2-ZBs,datasets\MS-ASL\trimmed_videos\like\7y5Ye-2-ZB...
44,Want (mouth “WA“),want,385.765,6,-1,9594,9751,LASL - Ch 6 Vocab,8,358,24.87,392.077,https://www.youtube.com/watch?v=jQb9NL9_S6U,want,"[0.008866041898727, 0.11897420883178701, 0.843...",640,,jQb9NL9_S6U,datasets\MS-ASL\trimmed_videos\want\jQb9NL9_S6...
56,teacher,teacher,13.995,144,-1,419,464,teacher - ASL sign for teacher,2,360,29.94,15.498,https://www.youtube.com/watch?v=_HOx2QkkTsg,teacher,"[0.045998364686965006, 0.037116646766662, 1.0,...",480,1.0,_HOx2QkkTsg,datasets\MS-ASL\trimmed_videos\teacher\_HOx2Qk...
57,teacher,teacher,15.498,144,-1,464,510,teacher - ASL sign for teacher,2,360,29.94,17.034,https://www.youtube.com/watch?v=_HOx2QkkTsg,teacher,"[0.045998364686965006, 0.037116646766662, 1.0,...",480,1.0,_HOx2QkkTsg,datasets\MS-ASL\trimmed_videos\teacher\_HOx2Qk...
72,EAT,eat,12.913,8,20,387,452,Basic ASL Vocabulary for Babies,3,360,29.97,15.082,https://www.youtube.com/watch?v=htsdwxJ-fTo,eat,"[0.09897658228874201, 0.294420778751373, 1.0, ...",640,,htsdwxJ-fTo,datasets\MS-ASL\trimmed_videos\eat\htsdwxJ-fTo...


In [6]:
# Row counts of (train, val, test) at this point in the notebook.
tuple(frame.shape[0] for frame in (df_train, df_val, df_test))

(503, 138, 71)

In [7]:
# Named alias kept so any later use of `verify_path` still works; it is just
# os.path.exists, so no lambda wrapper is needed.
verify_path = os.path.exists

# Drop rows whose clip file is not present on disk. `.copy()` makes each result
# an independent frame so later column assignments do not trigger
# SettingWithCopyWarning.
df_train = df_train[df_train.fpath.apply(verify_path)].copy()
df_val = df_val[df_val.fpath.apply(verify_path)].copy()
df_test = df_test[df_test.fpath.apply(verify_path)].copy()

# Row counts of (train, val, test) that actually have a video file.
tuple(frame.shape[0] for frame in (df_train, df_val, df_test))

(374, 85, 51)

In [8]:
class MSASLVideoDataset(Dataset):
    """Map-style dataset yielding fixed-length clips decoded from video files.

    Each item is ``(frames, label)`` where ``frames`` is a float tensor with
    exactly ``num_frames`` entries along the first axis, scaled to the 0-1
    range, and ``label`` is the matching entry of ``labels``.

    Args:
        video_paths: Indexable collection of video file paths.
        labels: Indexable collection of labels, aligned with ``video_paths``.
        num_frames: Number of frames every returned clip contains.
        img_size: Side length each frame is resized to by ``read_frames``.
        transforms: Optional callable applied to the frame tensor.
    """

    def __init__(self, video_paths, labels, num_frames=32, img_size=224, transforms=None):
        self.video_paths = video_paths
        self.labels = labels
        self.num_frames = num_frames
        self.img_size = img_size
        self.transforms = transforms

    def __len__(self):
        """Number of videos in the dataset."""
        return len(self.video_paths)

    def __getitem__(self, idx):
        """Decode video ``idx`` and return ``(frames, label)``."""
        video_path = self.video_paths[idx]
        label = self.labels[idx]

        frames = self.load_video(video_path)
        if self.transforms:
            frames = self.transforms(frames)

        return frames, label

    def load_video(self, path):
        """Read a video and bring it to exactly ``self.num_frames`` frames.

        Longer videos are sampled uniformly over their whole length; shorter
        ones are padded by repeating their last frame.

        Args:
            path: Video file path handed to ``read_frames``.

        Returns:
            Float tensor of the frames divided by 255. The frame layout is
            whatever ``read_frames`` returns (channels-last; the channel-first
            transpose is intentionally left disabled).

        Raises:
            ValueError: If no frame could be read from ``path``.
        """
        frames = read_frames(path, self.img_size)
        total_frames = frames.shape[0]
        if total_frames == 0:
            # Without this guard an unreadable file produced an empty tensor
            # that only failed later, when batches were stacked.
            raise ValueError(f"No frames could be read from video: {path!r}")

        if total_frames >= self.num_frames:
            # Uniformly sample self.num_frames frames
            idxs = np.linspace(0, total_frames - 1, self.num_frames).astype(int)
            frames = frames[idxs]
        else:
            # pad by repeating last frame
            pad_len = self.num_frames - total_frames
            pad_frames = np.repeat(frames[-1:], pad_len, axis=0)
            frames = np.concatenate((frames, pad_frames), axis=0)

        # frames = frames.transpose(0, 3, 1, 2)  # (Frames, Channels, Height, Width)
        frames = torch.from_numpy(frames).float() / 255.0  # normalize 0-1
        return frames
    
def read_frames(path, img_size):
    """Decode every frame of a video into one RGB array.

    Args:
        path: Video file path handed to ``cv2.VideoCapture``.
        img_size: Side length in pixels; every frame is resized to
            ``(img_size, img_size)`` regardless of its original aspect ratio.

    Returns:
        ``np.ndarray`` with one resized frame per entry along axis 0, colour
        order converted from BGR to RGB. When no frame can be read the result
        is an empty ``uint8`` array of shape ``(0, img_size, img_size, 3)`` so
        callers always see the same number of dimensions.
    """
    cap = cv2.VideoCapture(path)
    frames = []

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(cv2.resize(frame, (img_size, img_size)))
    finally:
        # Release the capture even if colour conversion or resizing raises.
        cap.release()

    if not frames:
        return np.empty((0, img_size, img_size, 3), dtype=np.uint8)
    return np.array(frames)


import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML


def plot_video_gif(video_tensor, fps=5, label=None):
    """
    Displays a video as an inline JavaScript animation.

    Args:
        video_tensor: A torch tensor or numpy array with shape [T, H, W, C] or
            [T, C, H, W] (C = 3 or 4). Channel-first input is recognised from
            its shape and moved to channel-last before plotting.
        fps: Frames per second for playback
        label: Optional title. May be a single value or a sequence of values
            (for example the array returned by an encoder's
            ``inverse_transform``); sequences are joined with ", ".

    Returns:
        An ``IPython.display.HTML`` object wrapping the animation's
        ``to_jshtml()`` output.
    """
    # Work on a plain numpy array so tensors and arrays behave the same.
    if hasattr(video_tensor, 'detach'):
        video_tensor = video_tensor.detach().cpu().numpy()
    frames = np.asarray(video_tensor)

    # imshow expects the colour channel last; accept [T, C, H, W] as documented.
    if frames.ndim == 4 and frames.shape[-1] not in (3, 4) and frames.shape[1] in (3, 4):
        frames = frames.transpose(0, 2, 3, 1)

    fig, ax = plt.subplots(figsize=(6, 6))
    img = ax.imshow(frames[0])
    ax.axis('off')

    # `is not None` (rather than truthiness) keeps label 0 and multi-element
    # arrays working; flattening shows 'like' instead of "['like']".
    if label is not None:
        title = ', '.join(str(item) for item in np.ravel(label))
        if title:
            ax.set_title(title)

    def animate(i):
        img.set_array(frames[i])
        return [img]

    ani = animation.FuncAnimation(fig, animate, frames=len(frames), interval=1000/fps, blit=True)
    # Close the figure so the notebook does not also render a static copy.
    plt.close(fig)
    return HTML(ani.to_jshtml())


In [9]:
from sklearn.preprocessing import LabelEncoder

# Learn the sign -> integer mapping from the training split only, then apply
# that same mapping to every split.
sign_encoder = LabelEncoder()
sign_encoder.fit(df_train['clean_text'])

# NOTE: this overwrites the `label` column that the frames already carry.
df_train['label'] = sign_encoder.transform(df_train['clean_text'])
df_val['label'] = sign_encoder.transform(df_val['clean_text'])
df_test['label'] = sign_encoder.transform(df_test['clean_text'])

In [11]:
from fastai.data.core import DataLoaders

# Train and validation datasets use the same clip length and frame size.
train_ds = MSASLVideoDataset(
    video_paths=df_train['fpath'].to_list(),
    labels=df_train['label'].to_list(),
    num_frames=32,
    img_size=224,
)
valid_ds = MSASLVideoDataset(
    video_paths=df_val['fpath'].to_list(),
    labels=df_val['label'].to_list(),
    num_frames=32,
    img_size=224,
)
# dls = DataLoaders.from_dsets(train_ds, valid_ds, bs=8, shuffle=True, num_workers=4)

In [17]:
ind = 0
# Index the dataset once: every train_ds[ind] access decodes the whole video.
sample_frames, sample_label = train_ds[ind]
# inverse_transform returns a 1-element array; take the element so the plot
# title shows the sign name itself rather than the array's repr.
plot_video_gif(sample_frames, 16, sign_encoder.inverse_transform([sample_label])[0])

In [12]:
import mediapipe as mp

In [52]:
# Short aliases for the MediaPipe solution modules used in this notebook.
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_hands = mp.solutions.hands
mp_face = mp.solutions.face_mesh
mp_pose = mp.solutions.pose

# Start from MediaPipe's default hand drawing styles.
landmark_style = mp_drawing_styles.get_default_hand_landmarks_style()
connection_style = mp_drawing_styles.get_default_hand_connections_style()

# Reduce the marker size and line thickness
for spec in landmark_style.values():
    spec.circle_radius = 1
    spec.thickness = 1

for spec in connection_style.values():
    spec.thickness = 1

In [70]:
ind = 5
# Index the dataset once: every train_ds[ind] access decodes the whole video.
vid, vid_label = train_ds[ind]
sign_name = sign_encoder.inverse_transform([vid_label])[0]

all_hand_keypoints = []  # per frame: a list of hands, each a list of (x, y, z)
all_images = []          # per frame: uint8 image with the landmarks drawn on it

# Only hands are tracked in this cell; mp_face / mp_pose are not used here.
with mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.4, min_tracking_confidence=0.6) as hands:
    for v in vid:
        # Back to 8-bit pixels: round and clamp BEFORE the cast. Clipping after
        # the uint8 cast has no effect, and a bare cast truncates, which may
        # leave some pixels one level too dark.
        image = (v * 255).round().clamp(0, 255).to(torch.uint8).numpy()
        results = hands.process(image)

        # Frame size is the same for every landmark, so read it once per frame.
        h, w, _ = image.shape

        keypoints = []
        # multi_hand_landmarks is falsy when no hand was detected.
        for hand_landmarks in results.multi_hand_landmarks or []:
            # Draw landmarks on image
            mp_drawing.draw_landmarks(
                image, hand_landmarks, mp_hands.HAND_CONNECTIONS,
                landmark_style,
                connection_style
                )

            # Extract landmarks: x and y scaled by frame width/height and
            # truncated to int, z kept as reported.
            keypoints.append([
                (int(lm.x * w), int(lm.y * h), lm.z)
                for lm in hand_landmarks.landmark
            ])

        all_hand_keypoints.append(keypoints)
        all_images.append(image)

all_images = np.array(all_images)
plot_video_gif(torch.from_numpy(all_images), 8, sign_name)