In [1]:
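# Requires mediapipe (the output below was recorded with mediapipe 0.10.14).
# To install into the kernel's environment, run: %pip install -q mediapipe==0.10.14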

Collecting mediapipe
  Downloading mediapipe-0.10.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.7 kB)
Collecting protobuf<5,>=4.25.3 (from mediapipe)
  Downloading protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.4.7-py3-none-any.whl.metadata (1.4 kB)
Downloading mediapipe-0.10.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.7/35.7 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0mm
[?25hDownloading protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sounddevice-0.4.7-py3-none-any.whl (32 kB)
Installing collected packages: protobuf, sounddevice, mediapipe
  Attempting uninstall: protobuf
    Found existing installat

In [2]:
import os
import pickle
import random

import cv2
import mediapipe as mp
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

2024-07-16 17:21:12.724924: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-16 17:21:12.725048: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-16 17:21:12.825854: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Paths to datasets
# Root folder of the video clips; the 'j' and 'z' subfolders are read from it later.
video_path = '/kaggle/input/asl-dataset-z-and-j/UCF50'
# Root folder of the still images; load_images expects one subfolder per label inside it.
alphabet_path = '/kaggle/input/asl-alphabet/asl_alphabet_train/asl_alphabet_train'

In [4]:
# Set the number of frames to pad to
# Every landmark sequence is padded (or cut) to exactly this many frames by pad_sequence.
selected_frame_dim = 180  # Example value
# Filler used for each landmark slot of a padded frame (same 3-tuple layout as (x, y, z)).
padding_value = (0, 0, 0)  # Padding value

In [5]:
# Initialize Mediapipe Hands
# Confidence level allows you to adjust the sensitivity of hand detection
# (min_detection_confidence=0.5); max_num_hands=1 limits results to a single hand.
# NOTE(review): this one detector is shared by the video extractor and the still-image
# extractor. static_image_mode=False presumably makes MediaPipe treat successive inputs as
# a video stream (tracking between calls), which fits the clips but may not fit unrelated
# still images -- confirm against the MediaPipe docs and consider a separate
# Hands(static_image_mode=True) instance for images.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1721150493.566159     176 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1721150493.590550     176 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [6]:
def load_videos(path, label):
    """Pair every '.avi' file directly inside `path` with `label`.

    Args:
        path: Directory to scan (not recursive).
        label: Label attached to every video found.

    Returns:
        A list of (video_path, label) tuples, ordered by video path.
    """
    avi_paths = sorted(
        os.path.join(path, name)
        for name in os.listdir(path)
        if name.endswith('.avi')
    )
    return [(avi_path, label) for avi_path in avi_paths]

In [7]:
def load_images(path, labels, samples_per_label=5):
    """Randomly pick up to `samples_per_label` '.jpg' images for each label.

    Images are read from `path/<label>/`. Candidates are sorted before sampling
    because os.listdir returns entries in arbitrary order, which would make the
    selection differ between runs even when `random` is seeded.

    Args:
        path: Directory containing one subfolder per label.
        labels: Iterable of label names (also the subfolder names).
        samples_per_label: Maximum number of images drawn per label (default 5).
            Labels with fewer images contribute all of them instead of raising.

    Returns:
        A list of (image_path, label) tuples.
    """
    image_files = []
    for label in labels:
        label_dir = os.path.join(path, label)
        files = sorted(
            os.path.join(label_dir, f)
            for f in os.listdir(label_dir)
            if f.endswith('.jpg')
        )
        # random.sample raises ValueError when asked for more items than exist.
        sample_size = min(samples_per_label, len(files))
        image_files.extend((f, label) for f in random.sample(files, sample_size))
    return image_files

In [18]:
def augment_image(image, noise_std=0.05):
    """Return a randomly rotated, possibly mirrored, noise-perturbed copy of `image`.

    Used to augment frames/images (to reduce overfitting) before landmark extraction.

    Args:
        image: Image array with values on a 0-255 scale; `image.shape[:2]` is read
            as (height, width).
        noise_std: Standard deviation of the additive Gaussian noise, in pixel
            intensity units (0-255 scale). At the default of 0.05 the noise is far
            below one intensity level and rounds away; pass a larger value (for
            example 5-15) if visible noise is wanted.

    Returns:
        A uint8 array with the same shape as `image`.
    """
    # Random rotation of up to +/-15 degrees about the image centre, same canvas size
    angle = random.uniform(-15, 15)
    height, width = image.shape[:2]
    M = cv2.getRotationMatrix2D((width // 2, height // 2), angle, 1)
    rotated = cv2.warpAffine(image, M, (width, height))

    # Random horizontal flip (mirrors the hand left/right) with probability 0.5
    if random.random() > 0.5:
        rotated = cv2.flip(rotated, 1)

    # Adding random noise. Round before the uint8 cast: astype() truncates toward
    # zero, so without rounding every pixel that received negative noise would drop
    # by a full intensity level and the image would be systematically darkened.
    noise = np.random.normal(0, noise_std, rotated.shape)
    noisy_image = np.clip(np.rint(rotated + noise), 0, 255).astype(np.uint8)

    return noisy_image

In [19]:
def extract_landmarks_from_video(video_file):
    """Extract hand landmarks from every frame of a video in which a hand is detected.

    Each frame is augmented with augment_image, converted from BGR to RGB and passed
    to the module-level `hands` detector. Frames without a detection are skipped.

    The augmentation is kept consistent within a clip: augment_image draws its
    rotation angle and flip decision from the `random` module, so `random` is
    re-seeded with one per-clip seed before each frame. Otherwise every frame would
    be rotated and mirrored independently, scrambling the motion of the sequence.
    The global `random` state is restored afterwards, advanced by exactly one draw
    per clip so that different clips still receive different augmentations.

    Args:
        video_file: Path of the video to read.

    Returns:
        A list with one entry per detected hand per frame, each entry being a list
        of (x, y, z) landmark tuples, or None when no hand was detected at all
        (including when the video could not be opened).
    """
    cap = cv2.VideoCapture(video_file)
    frames = []
    clip_seed = random.getrandbits(64)
    outer_rng_state = random.getstate()
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            random.seed(clip_seed)  # same angle/flip for every frame of this clip
            frame = augment_image(frame)  # Apply augmentation
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = hands.process(frame_rgb)
            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    frame_landmarks = [(lm.x, lm.y, lm.z) for lm in hand_landmarks.landmark]
                    frames.append(frame_landmarks)
    finally:
        # Always undo the per-frame re-seeding and free the capture, even on error.
        random.setstate(outer_rng_state)
        cap.release()
    if len(frames) == 0:
        return None
    return frames

In [20]:
def extract_landmarks_from_image(image_file):
    """Extract the landmarks of the first detected hand in a still image.

    The image is augmented with augment_image, converted from BGR to RGB and passed
    to the module-level `hands` detector.

    Args:
        image_file: Path of the image to read.

    Returns:
        A list of (x, y, z) landmark tuples for the first detected hand, or None
        when the image cannot be read or no hand is detected.
    """
    bgr = cv2.imread(image_file)
    if bgr is None:
        return None

    rgb = cv2.cvtColor(augment_image(bgr), cv2.COLOR_BGR2RGB)  # Apply augmentation
    detections = hands.process(rgb).multi_hand_landmarks
    if not detections:
        return None

    first_hand = detections[0]
    return [(point.x, point.y, point.z) for point in first_hand.landmark]

In [21]:
def pad_sequence(sequence, target_length, padding_value, num_landmarks=21):
    """Return a copy of `sequence` padded or truncated to exactly `target_length` frames.

    The input is never modified; the previous implementation appended the padding
    frames to the caller's list in place.

    Args:
        sequence: List of frames, each frame being a list of landmark entries.
        target_length: Number of frames in the result.
        padding_value: Value placed in every landmark slot of a padding frame.
        num_landmarks: Number of landmark slots per padding frame (default 21,
            the value that was previously hard-coded).

    Returns:
        A new list of `target_length` frames: the leading frames of `sequence`,
        followed by as many padding frames as needed.
    """
    padded = list(sequence[:target_length])
    missing = target_length - len(padded)
    if missing > 0:
        # A fresh list per padding frame, so the frames do not alias each other.
        padded.extend([padding_value] * num_landmarks for _ in range(missing))
    return padded

In [22]:
# Collect the labelled inputs: 'J' and 'Z' video clips, plus sampled stills for every letter
j_clips = load_videos(os.path.join(video_path, 'j'), 'J')
z_clips = load_videos(os.path.join(video_path, 'z'), 'Z')
video_files = j_clips + z_clips
image_files = load_images(alphabet_path, list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))

# 80/20 train/validation split of the (path, label) tuples.
# random_state fixes the split only; the image sampling inside load_images is not seeded here.
train_videos, val_videos = train_test_split(video_files, test_size=0.2, random_state=42)
train_images, val_images = train_test_split(image_files, test_size=0.2, random_state=42)

In [23]:
# Sanity check: number of (video_path, label) entries in the training split
len(train_videos)

569

In [24]:
# Extract landmarks and pad sequences
def process_videos(video_files):
    """Turn (video_path, label) pairs into fixed-length landmark sequences.

    Videos for which no landmarks are extracted are dropped. Every kept sequence is
    padded or truncated to `selected_frame_dim` frames using `padding_value`.

    Args:
        video_files: Iterable of (video_path, label) tuples.

    Returns:
        A list of (padded_landmark_sequence, label) tuples.
    """
    samples = []
    progress = tqdm(video_files, desc='Processing videos')
    for clip_path, clip_label in progress:
        clip_landmarks = extract_landmarks_from_video(clip_path)
        if not clip_landmarks:
            continue
        fixed_length = pad_sequence(clip_landmarks, selected_frame_dim, padding_value)
        samples.append((fixed_length, clip_label))
    return samples

def process_images(image_files):
    """Turn (image_path, label) pairs into fixed-length landmark sequences.

    Each image contributes a single landmark frame, which is wrapped in a list and
    padded to `selected_frame_dim` frames with `padding_value` so that image samples
    have the same layout as video samples. Images without landmarks are dropped.

    Args:
        image_files: Iterable of (image_path, label) tuples.

    Returns:
        A list of (padded_landmark_sequence, label) tuples.
    """
    samples = []
    progress = tqdm(image_files, desc='Processing images')
    for still_path, still_label in progress:
        still_landmarks = extract_landmarks_from_image(still_path)
        if not still_landmarks:
            continue
        single_frame = [still_landmarks]
        fixed_length = pad_sequence(single_frame, selected_frame_dim, padding_value)
        samples.append((fixed_length, still_label))
    return samples

In [None]:
# Build the training set: video sequences first, then the single-frame image samples
train_video_samples = process_videos(train_videos)
train_image_samples = process_images(train_images)
train_data = train_video_samples + train_image_samples

# Build the validation set the same way.
# NOTE(review): the extractors call augment_image unconditionally, so the validation
# samples are randomly augmented as well -- confirm whether that is intended.
val_video_samples = process_videos(val_videos)
val_image_samples = process_images(val_images)
val_data = val_video_samples + val_image_samples

# Batching and shuffling
def get_batches(data, batch_size):
    """Yield the items of `data` in random order, `batch_size` items at a time.

    A shuffled copy is batched, so the caller's list keeps its order (the previous
    implementation shuffled the caller's list in place as a side effect). The
    shuffle uses the global `random` state and happens when iteration starts.

    Args:
        data: Sequence of samples.
        batch_size: Positive number of samples per batch.

    Yields:
        Lists of at most `batch_size` samples; only the last batch may be shorter.

    Raises:
        ValueError: If `batch_size` is not positive. Because this is a generator,
            the error surfaces on the first iteration step, not at call time.
    """
    if batch_size <= 0:
        # A negative step would otherwise make range() empty and silently yield nothing.
        raise ValueError("batch_size must be a positive integer")
    shuffled = list(data)
    random.shuffle(shuffled)
    for start in range(0, len(shuffled), batch_size):
        yield shuffled[start:start + batch_size]
        
def save_batches(data, batch_size, filename):
    """Shuffle `data` into batches with get_batches and pickle them to `filename`.

    Relies on the module-level `import pickle` in the notebook's import cell.

    Args:
        data: Sequence of samples to batch.
        batch_size: Number of samples per batch.
        filename: Path of the file to write; an existing file is overwritten.
    """
    # Materialise the generator so the whole batch list is stored as one object.
    batches = list(get_batches(data, batch_size))
    with open(filename, 'wb') as f:
        pickle.dump(batches, f)

def load_batches(filename):
    """Load and return the object pickled in `filename` (as written by save_batches).

    Relies on the module-level `import pickle` in the notebook's import cell.

    SECURITY: pickle.load can execute arbitrary code while unpickling. Only load
    files that this notebook (or another trusted source) produced.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The unpickled object; for files written by save_batches, a list of batches.
    """
    with open(filename, 'rb') as f:
        return pickle.load(f)

Processing videos:   1%|          | 3/569 [00:15<47:44,  5.06s/it]

In [None]:


# Example usage
# Skeleton of a training loop: iterate over freshly shuffled batches once per epoch.
num_epochs = 10
batch_size = 32
for epoch in range(num_epochs):
    # get_batches performs its own shuffle every time it is iterated.
    for batch in get_batches(train_data, batch_size):
        # Train your model on the batch
        pass
    # Reorders train_data in place between epochs.
    # NOTE(review): likely redundant given the shuffle inside get_batches -- confirm.
    random.shuffle(train_data)