In [2]:
import os
import random
import cv2
import mediapipe as mp
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch
import torch.nn.functional as F

In [1]:
# Paths to datasets (relative paths, resolved against the notebook's working directory)
# Root of the gesture videos; the 'j' and 'z' sub-folders are listed from it further down.
video_path = '../data/ZJ-videos'
# Root of the still images; load_images reads one sub-folder per requested letter from it.
alphabet_path = '../data/mnist_asl_alphabet_train'

In [3]:
# Set the number of frames to pad to
# (pad_sequence also truncates longer sequences to this length)
selected_frame_dim = 180  # Example value
# Filler (x, y, z) triple; pad_sequence repeats it 21 times for every padded frame
padding_value = (0, 0, 0)  # Padding value
# Target size for resizing frames and images
# NOTE(review): the extract_* functions use their own default target_size=(224, 224);
# this variable is not passed to them in the visible cells.
target_size = (224, 224)
# Class name -> integer id written into the saved label tensors.
# NOTE(review): the ids are 0, 1 and 3 -- 2 is unused. A classifier with three outputs
# would reject label 3; confirm whether 'nothing' is meant to be 2.
label_mapping = {'J': 0, 'Z': 1, 'nothing': 3}

In [4]:
# Initialize Mediapipe Hands
# Confidence level allows you to adjust the sensitivity of hand detection
# This single detector instance is used by both extraction functions defined below.
# Per the MediaPipe Hands documentation, static_image_mode=False treats successive inputs
# as one video stream (detect, then track); max_num_hands=1 caps results at one hand.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)

I0000 00:00:1721157682.229822  780138 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1721157682.260647  780207 gl_context.cc:357] GL version: 3.1 (OpenGL ES 3.1 Mesa 23.2.1-1ubuntu3.1~22.04.2), renderer: D3D12 (NVIDIA GeForce RTX 3060)


In [5]:
def augment_image(image, noise_std=0.05):
    """Return a randomly rotated, possibly mirrored, noisy copy of an 8-bit image.

    Augmentation is used to reduce overfitting. Only the hand-landmark
    coordinates extracted afterwards matter, so the output keeps the input's
    width and height.

    Args:
        image: Image array as produced by ``cv2.imread`` / ``cv2.VideoCapture``
            (8-bit values in 0..255).
        noise_std: Standard deviation of the additive Gaussian noise, expressed
            as a fraction of the full 8-bit range (0.05 -> sigma of 12.75 grey
            levels). Use 0 to disable the noise.

    Returns:
        ``np.uint8`` array with the same shape as ``image``.

    Randomness:
        The rotation angle and the flip decision come from the ``random``
        module (in that order); the noise comes from ``np.random``.
    """
    # Random rotation of up to +/-15 degrees about the image centre
    angle = random.uniform(-15, 15)
    height, width = image.shape[:2]
    M = cv2.getRotationMatrix2D((width // 2, height // 2), angle, 1)
    rotated = cv2.warpAffine(image, M, (width, height))

    # Random horizontal flip (50 % of the calls)
    if random.random() > 0.5:
        rotated = cv2.flip(rotated, 1)

    # Additive Gaussian noise. The sigma is scaled to the 0..255 range: a raw
    # sigma of 0.05 grey levels is invisible, and the uint8 cast *truncates*,
    # so it only shifted roughly half of the pixels down by one level.
    noise = np.random.normal(0, noise_std * 255, rotated.shape)
    # Round before casting so the noise stays zero-mean instead of biased downwards.
    noisy_image = np.clip(np.rint(rotated + noise), 0, 255).astype(np.uint8)

    return noisy_image

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [6]:
def load_videos(path, label):
    """List the '.avi' files directly inside `path`, each paired with a class id.

    Args:
        path: Directory to scan (not recursive).
        label: Key into the module-level ``label_mapping``.

    Returns:
        A list of ``(file_path, label_id)`` tuples ordered by file path.
    """
    avi_paths = sorted(
        os.path.join(path, entry)
        for entry in os.listdir(path)
        if entry.endswith('.avi')
    )
    labelled = []
    for avi_path in avi_paths:
        labelled.append((avi_path, label_mapping[label]))
    return labelled

def load_images(path, labels, samples_per_label=12):
    """Randomly pick still images from each letter folder, all labelled 'nothing'.

    Args:
        path: Root directory containing one sub-directory per entry of `labels`.
        labels: Iterable of sub-directory names to sample from.
        samples_per_label: Maximum number of '.jpg' files drawn per
            sub-directory. Folders holding fewer files contribute all of them.

    Returns:
        A list of ``(file_path, label_id)`` tuples where ``label_id`` is always
        ``label_mapping['nothing']``.
    """
    image_files = []
    for label in labels:
        label_dir = os.path.join(path, label)
        # Sort first: os.listdir returns entries in arbitrary order, so sampling
        # from it would not be repeatable even with a seeded `random` module.
        files = sorted(
            os.path.join(label_dir, f)
            for f in os.listdir(label_dir)
            if f.endswith('.jpg')
        )
        # random.sample raises ValueError when asked for more items than exist.
        files = random.sample(files, min(samples_per_label, len(files)))
        image_files.extend([(f, label_mapping['nothing']) for f in files])  # Label all images as 'nothing'
    return image_files


In [7]:
def extract_landmarks_from_video(video_file, target_size=(224, 224)):
    """Turn a video file into a per-frame list of hand landmarks.

    Every decoded frame is resized, augmented with ``augment_image`` and passed
    to the shared MediaPipe ``hands`` detector.

    Args:
        video_file: Path handed to ``cv2.VideoCapture``.
        target_size: ``(width, height)`` each frame is resized to before detection.

    Returns:
        A list with one entry per decoded frame; each entry is a list of 21
        ``[x, y, z]`` triples (all zeros when no hand was found in that frame).
        ``None`` when no frame could be read.

    Side effects:
        Rewinds the global ``random`` state before every frame (see below), so
        after the call that state has advanced by exactly one ``augment_image``
        call regardless of the number of frames.
    """
    cap = cv2.VideoCapture(video_file)
    frames = []
    # The gestures are defined by motion, so the rotation angle and the flip must
    # be the same for every frame of one clip. augment_image draws both from the
    # `random` module; replaying this snapshot before each call makes it draw the
    # same values for the whole video (its np.random pixel noise still varies).
    aug_state = random.getstate()
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.resize(frame, target_size)  # Resize frame
            random.setstate(aug_state)
            frame = augment_image(frame)  # Apply augmentation
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = hands.process(frame_rgb)
            if results.multi_hand_landmarks:
                # Exactly one entry per video frame: use the first detected hand only.
                hand_landmarks = results.multi_hand_landmarks[0]
                frames.append([[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark])
            else:
                # If no hand detected, append zero landmarks (21 independent triples)
                frames.append([[0, 0, 0] for _ in range(21)])
    finally:
        # Release the capture handle even if resizing or detection raised.
        cap.release()
    if len(frames) == 0:
        return None
    return frames

def extract_landmarks_from_image(image_file, target_size=(224, 224)):
    """Extract the landmarks of one hand from a still image.

    Args:
        image_file: Path handed to ``cv2.imread``.
        target_size: ``(width, height)`` the image is resized to before detection.

    Returns:
        A list of 21 ``[x, y, z]`` triples (all zeros when no hand was found),
        or ``None`` when the file could not be read as an image.
    """
    image = cv2.imread(image_file)
    if image is None:
        return None
    image = cv2.resize(image, target_size)  # Resize image
    augmented_image = augment_image(image)  # Apply augmentation
    image_rgb = cv2.cvtColor(augmented_image, cv2.COLOR_BGR2RGB)

    # Unrelated stills must not go through the module-level `hands` instance: it
    # is configured with static_image_mode=False, i.e. it tracks from the
    # previous input as if the images were consecutive video frames. A dedicated
    # static-mode detector is created on first use and cached on the function.
    detector = getattr(extract_landmarks_from_image, '_static_hands', None)
    if detector is None:
        detector = mp_hands.Hands(static_image_mode=True, max_num_hands=1,
                                  min_detection_confidence=0.5)
        extract_landmarks_from_image._static_hands = detector

    results = detector.process(image_rgb)
    if results.multi_hand_landmarks:
        first_hand = results.multi_hand_landmarks[0]
        return [[lm.x, lm.y, lm.z] for lm in first_hand.landmark]
    # If no hand detected, return zero landmarks (21 independent triples)
    return [[0, 0, 0] for _ in range(21)]


In [8]:
def pad_sequence(sequence, target_length, padding_value, num_landmarks=21):
    """Return a copy of `sequence` padded or truncated to `target_length` frames.

    Args:
        sequence: List of frames (each frame being a list of landmark entries).
            It is not modified.
        target_length: Number of frames in the result.
        padding_value: Entry repeated `num_landmarks` times to build each
            padding frame.
        num_landmarks: Number of entries per padding frame (21 by default).

    Returns:
        A new list of exactly `target_length` frames for non-negative
        `target_length`: the leading frames of `sequence` followed by as many
        padding frames as needed.
    """
    missing = target_length - len(sequence)
    # Slice-then-copy so the caller's list is never appended to.
    padded = list(sequence[:target_length])
    # range() of a non-positive count is empty, so long sequences are only truncated.
    padded.extend([padding_value] * num_landmarks for _ in range(missing))
    return padded

In [9]:
def process_videos(video_files):
    """Convert labelled video files into padded landmark tensors.

    Args:
        video_files: Iterable of ``(video_path, label_id)`` pairs.

    Returns:
        ``(data, labels)``: a float32 tensor built from the padded landmark
        sequences and an int64 tensor of the matching label ids. Videos for
        which no landmarks were extracted are left out of both.
    """
    sequences, targets = [], []
    for path, target in tqdm(video_files, desc='Processing videos'):
        frames = extract_landmarks_from_video(path)
        if not frames:
            # Nothing could be extracted from this file; skip it.
            continue
        sequences.append(pad_sequence(frames, selected_frame_dim, padding_value))
        targets.append(target)
    data_tensor = torch.tensor(sequences, dtype=torch.float32)
    label_tensor = torch.tensor(targets, dtype=torch.int64)
    return data_tensor, label_tensor

def process_images(image_files):
    """Convert labelled still images into padded landmark tensors.

    Each image becomes a one-frame sequence that is padded to
    ``selected_frame_dim`` frames so it can be stacked with video samples.

    Args:
        image_files: Iterable of ``(image_path, label_id)`` pairs.

    Returns:
        ``(data, labels)``: a float32 tensor built from the padded sequences and
        an int64 tensor of the matching label ids. Images that could not be read
        are left out of both.
    """
    data = []
    labels = []
    for image_file, label in tqdm(image_files, desc='Processing images'):
        landmarks = extract_landmarks_from_image(image_file)
        if landmarks is None:
            # Unreadable file: extract_landmarks_from_image signals this with None,
            # which would otherwise end up inside the data and break torch.tensor.
            continue
        padded_landmarks = pad_sequence([landmarks], selected_frame_dim, padding_value)
        data.append(padded_landmarks)
        labels.append(label)
    return torch.tensor(data, dtype=torch.float32), torch.tensor(labels, dtype=torch.int64)

In [10]:
# NOTE(review): train_videos is first assigned by the train/validation split cell further
# down, so on a fresh kernel this cell fails with NameError (as in the recorded output).
# Run it only after the split cell, or move it below that cell.
len(train_videos)

NameError: name 'train_videos' is not defined

In [11]:
# Collect the labelled inputs: 'J' clips first, then 'Z' clips, then the sampled stills.
video_files = [
    *load_videos(os.path.join(video_path, 'j'), 'J'),
    *load_videos(os.path.join(video_path, 'z'), 'Z'),
]
image_files = load_images(alphabet_path, list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))

# 80/20 train/validation split, done separately for videos and images with a fixed seed.
train_videos, val_videos = train_test_split(
    video_files, test_size=0.2, random_state=42
)
train_images, val_images = train_test_split(
    image_files, test_size=0.2, random_state=42
)

In [12]:
# Landmark extraction for every split. The call order is kept as-is because the
# augmentation inside draws from the global random number generators.
train_video_data, train_video_labels = process_videos(train_videos)
val_video_data, val_video_labels = process_videos(val_videos)
train_image_data, train_image_labels = process_images(train_images)
val_image_data, val_image_labels = process_images(val_images)

# Stack video samples and image samples along the sample axis (dim 0).
train_data = torch.cat([train_video_data, train_image_data], dim=0)
train_labels = torch.cat([train_video_labels, train_image_labels], dim=0)
val_data = torch.cat([val_video_data, val_image_data], dim=0)
val_labels = torch.cat([val_video_labels, val_image_labels], dim=0)

# Persist each split as a (data, labels) tuple in the working directory.
torch.save((train_data, train_labels), 'train_data.pt')
torch.save((val_data, val_labels), 'val_data.pt')

Processing videos:   1%|▌                                                               | 5/569 [00:09<17:52,  1.90s/it][mjpeg @ 0x87a30c0] overread 8
Processing videos:   1%|▉                                                               | 8/569 [00:13<15:44,  1.68s/it][mjpeg @ 0x89c9800] overread 8
Processing videos:   2%|█▎                                                             | 12/569 [00:19<14:50,  1.60s/it][mjpeg @ 0x87a0500] overread 8
Processing videos:   2%|█▌                                                             | 14/569 [00:22<14:05,  1.52s/it][mjpeg @ 0x87a3300] overread 8
Processing videos:   5%|███▏                                                           | 29/569 [00:50<16:55,  1.88s/it][mjpeg @ 0x87400c0] overread 8
Processing videos:   5%|███▍                                                           | 31/569 [00:52<14:28,  1.61s/it][mjpeg @ 0x895ba00] overread 8
Processing videos:   7%|████                                                           | 37/56

In [None]:
def load_data(filename):
    """Read back the object stored at `filename` and return it unchanged.

    Args:
        filename: Path passed straight to ``torch.load``.

    Returns:
        Whatever ``torch.load`` yields for that file.

    Note:
        ``torch.load`` deserializes file contents; only use it on files from a
        trusted source.
    """
    loaded = torch.load(filename)
    return loaded

# Load data
# Reads back the (data, labels) tuples written with torch.save by the processing cell;
# the file names are relative to the notebook's working directory.
train_data, train_labels = load_data('train_data.pt')
val_data, val_labels = load_data('val_data.pt')
