# WLASL Preprocessing

The Word-Level American Sign Language (WLASL) dataset consists of approximately 12,000 videos covering around 2,000 common ASL words. Each word is referred to as a *gloss* throughout this notebook.

For preprocessing this dataset, we use MediaPipe for hand landmark extraction from videos.

## Set-Up

### Install Dependencies

In [None]:
import os
import sys
import subprocess
from importlib import metadata as importlib_metadata

# Exact versions are pinned for reproducibility; matplotlib is the only unpinned entry.
packages = [
    "numpy==1.26.4",
    "protobuf==4.25.3",
    "mediapipe==0.10.21",
    "opencv-python-headless==4.8.1.78",
    "scikit-learn==1.3.2",
    "matplotlib"
]


def _requirement_satisfied(requirement: str) -> bool:
    """Return True if `requirement` ("name" or "name==version") is already installed as specified."""
    name, _, pinned_version = requirement.partition("==")
    try:
        installed_version = importlib_metadata.version(name)
    except importlib_metadata.PackageNotFoundError:
        return False
    # An unpinned requirement is satisfied by any installed version.
    return not pinned_version or installed_version == pinned_version


unsatisfied = [req for req in packages if not _requirement_satisfied(req)]

if not unsatisfied:
    # Skipping here is what makes "Run All" work: without the guard this cell
    # would reinstall and kill the kernel on every execution, so the cells
    # below it could never be reached in a single pass.
    print("All dependencies already installed at the pinned versions; skipping install and restart.")
else:
    print("Installing dependencies...")
    print("="*60)

    # Install the full list in one pip call so the pins are resolved together.
    command = [sys.executable, "-m", "pip", "install"] + packages
    subprocess.check_call(command)

    print("\nInstallation complete.")
    print("="*60)

    # forcing runtime restart so the freshly installed versions are the ones imported
    os.kill(os.getpid(), 9)

Installing dependencies...


In [2]:
import sys
import mediapipe as mp
import cv2
import sklearn
import tensorflow as tf
import numpy as np

# Report the interpreter and library versions that this kernel has loaded.
separator = "=" * 60
loaded_versions = {
    "Python version": sys.version.split()[0],
    "TensorFlow": tf.__version__,
    "MediaPipe": mp.__version__,
    "OpenCV": cv2.__version__,
    "Scikit-learn": sklearn.__version__,
    "NumPy": np.__version__,  # Should be 1.26.4
}

print(separator)
print("Verifying installations...")
for component, version in loaded_versions.items():
    print(f"  {component}: {version}")

print(separator)

Verifying installations...
  Python version: 3.12.12
  TensorFlow: 2.19.0
  MediaPipe: 0.10.21
  OpenCV: 4.8.1
  Scikit-learn: 1.3.2
  NumPy: 1.26.4


In [3]:
# Download MediaPipe Hand Landmarker model
import os
import subprocess
import urllib.request

model_url = "https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/1/hand_landmarker.task"
model_path = "hand_landmarker.task"

# An empty file is treated as "not downloaded" so a previously failed attempt
# cannot masquerade as a valid model.
if os.path.exists(model_path) and os.path.getsize(model_path) > 0:
    print(f"Model already exists at {model_path}")
else:
    print("Downloading MediaPipe Hand Landmarker model...")
    # Download to a temporary name and rename only after success, so an
    # interrupted transfer never leaves a truncated file at `model_path`.
    partial_path = model_path + ".part"
    try:
        urllib.request.urlretrieve(model_url, partial_path)
    except Exception as e:
        print(f"Error downloading model: {e}")
        print("Trying alternative download method...")
        # check=True makes a failing wget raise instead of being reported as success.
        subprocess.run(["wget", "-q", model_url, "-O", partial_path], check=True)

    if not os.path.exists(partial_path) or os.path.getsize(partial_path) == 0:
        raise RuntimeError(f"Download of {model_url} produced no data")

    os.replace(partial_path, model_path)
    print(f"Model downloaded to {model_path}")

Downloading MediaPipe Hand Landmarker model...
Model downloaded to hand_landmarker.task


### Mount Google Drive and Extract Dataset

In [14]:
# Mount Google Drive
from google.colab import drive
# Attach Google Drive to the Colab filesystem; the dataset archive is read from there.
mount_point = '/content/drive'
drive.mount(mount_point)

print("Google Drive mounted successfully!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully!


In [5]:
import zipfile
import os

# Path to your dataset in Google Drive
DRIVE_ZIP_PATH = "/content/drive/MyDrive/WLASL Dataset/archive.zip"
EXTRACT_PATH = "/content/wlasl_data"

# Check if the zip file exists
if not os.path.exists(DRIVE_ZIP_PATH):
    raise FileNotFoundError(f"Dataset not found at: {DRIVE_ZIP_PATH}\n"
                          f"Please ensure 'archive.zip' is in 'WLASL Dataset' folder in your Google Drive.")

print(f"Found dataset at: {DRIVE_ZIP_PATH}")

# Extract the dataset
print(f"\nExtracting dataset to {EXTRACT_PATH}...")
os.makedirs(EXTRACT_PATH, exist_ok=True)

with zipfile.ZipFile(DRIVE_ZIP_PATH, 'r') as zip_ref:
    zip_ref.extractall(EXTRACT_PATH)

print("Dataset extracted successfully!")

# List the contents to understand the structure
max_listing_depth = 2  # directories deeper than this are shown but not descended into
print("\nDataset structure:")
for root, dirs, files in os.walk(EXTRACT_PATH):
    level = root.replace(EXTRACT_PATH, '').count(os.sep)
    indent = ' ' * 2 * level
    print(f'{indent}{os.path.basename(root)}/')
    sub_indent = ' ' * 2 * (level + 1)
    for file in files[:5]:  # Show first 5 files in each directory
        print(f'{sub_indent}{file}')
    if len(files) > 5:
        print(f'{sub_indent}... and {len(files) - 5} more files')
    if level > max_listing_depth:
        # Prune in place so os.walk skips this subtree but still visits its
        # siblings; a `break` here would abort the entire listing at the first
        # deep directory and hide everything that follows it.
        dirs[:] = []

Found dataset at: /content/drive/MyDrive/WLASL Dataset/archive.zip

Extracting dataset to /content/wlasl_data...
Dataset extracted successfully!

Dataset structure:
wlasl_data/
  nslt_2000.json
  nslt_1000.json
  WLASL_v0.3.json
  missing.txt
  nslt_300.json
  ... and 2 more files
  videos/
    20235.mp4
    08890.mp4
    65111.mp4
    34832.mp4
    26012.mp4
    ... and 11975 more files


### Configuration

In [24]:

# Input locations: the metadata JSON and the directory of <video_id>.mp4 files,
# both under the directory the archive was extracted to.
WLASL_JSON_PATH = "/content/wlasl_data/WLASL_v0.3.json"
VIDEO_ROOT = "/content/wlasl_data/videos"

# Output paths
OUTPUT_NPZ = "wlasl_landmarks.npz"             # landmark sequences written by the preprocessing cell
# NOTE(review): MODEL_OUTPUT and LABELS_OUTPUT are not referenced by any cell in
# this notebook — presumably consumed by a separate training step; confirm.
MODEL_OUTPUT = "wlasl_sequence_model.keras"
LABELS_OUTPUT = "wlasl_labels.npy"

# Processing parameters
SEQUENCE_LENGTH = 32  # Every video is truncated or zero-padded to this many sampled frames
FRAME_STRIDE = 2      # Sample every Nth frame, counted from the clip's start frame
MIN_FRAMES = 8        # Skip videos with fewer sampled frames than this that contain any non-zero landmarks

# Glosses to train on (words/signs)
# Example of an explicit gloss list. It is commented out, so SELECTED_GLOSSES is
# NOT defined in this notebook; glosses are instead filtered by
# MIN_VIDEOS_PER_GLOSS in the preprocessing cells.
# SELECTED_GLOSSES = [
#     "book", "drink", "computer", "before", "chair", "go", "clothes",
#     "who", "candy", "cousin", "deaf", "fine", "help", "no", "orange",
#     "pizza", "please", "restaurant", "store", "thanksgiving", "thin",
#     "walk", "year", "yes", "all"
# ]

# Training parameters
# Only MIN_VIDEOS_PER_GLOSS and MAX_SAMPLES_PER_GLOSS are read by this notebook's
# cells. NOTE(review): the remaining values below look like settings for a later
# training stage that is not part of this notebook — confirm before relying on them.
MIN_VIDEOS_PER_GLOSS = 5      # A gloss is kept only if at least this many of its video files exist on disk
MAX_GLOSSES = 25              # Number of glosses if SELECTED_GLOSSES is None (not used by the cells in this notebook)
MAX_SAMPLES_PER_GLOSS = None  # Limit samples per gloss (None = no limit)
TEST_SIZE = 0.2
EPOCHS = 80
BATCH_SIZE = 32
LEARNING_RATE = 1e-3
LSTM_UNITS = [128, 64]
DENSE_UNITS = 64
DROPOUT = 0.5
PATIENCE = 10                 # Early stopping patience

print("Configuration loaded")
print(f"\nLooking for:")
print(f"  JSON: {WLASL_JSON_PATH}")
print(f"  Videos: {VIDEO_ROOT}")

Configuration loaded

Looking for:
  JSON: /content/wlasl_data/WLASL_v0.3.json
  Videos: /content/wlasl_data/videos


In [7]:
# Verify that the paths exist
import os

print("Verifying dataset paths...\n")

search_root = "/content/wlasl_data"
video_suffixes = ('.mp4', '.avi', '.mov')
json_ok = os.path.exists(WLASL_JSON_PATH)
videos_ok = os.path.exists(VIDEO_ROOT)

# --- Metadata JSON: report its size, or list candidate JSON files if it is missing ---
if not json_ok:
    print(f"JSON file NOT found at: {WLASL_JSON_PATH}")
    print("\nSearching for JSON files in extracted directory...")
    for folder, _subdirs, names in os.walk(search_root):
        for candidate in (name for name in names if name.endswith('.json')):
            print(f"  Found: {os.path.join(folder, candidate)}")
    print("\nUpdate WLASL_JSON_PATH in the configuration cell above with the correct path")
else:
    print(f"JSON file found: {WLASL_JSON_PATH}")
    size_mb = os.path.getsize(WLASL_JSON_PATH) / (1024 * 1024)
    print(f"  Size: {size_mb:.2f} MB")

# --- Video directory: count the videos, or point at the first folder that has some ---
print()
if not videos_ok:
    print(f"Video directory NOT found at: {VIDEO_ROOT}")
    print("\nSearching for video directories...")
    for folder, _subdirs, names in os.walk(search_root):
        hits = [name for name in names if name.endswith(video_suffixes)]
        if hits:
            print(f"  Found {len(hits)} videos in: {folder}")
            break
    print("\nUpdate VIDEO_ROOT in the configuration cell above with the correct path")
else:
    print(f"Video directory found: {VIDEO_ROOT}")
    found_videos = [name for name in os.listdir(VIDEO_ROOT) if name.endswith(video_suffixes)]
    print(f"  Number of video files: {len(found_videos)}")
    if found_videos:
        print(f"  Sample videos: {found_videos[:3]}")

# --- Overall verdict ---
print("\n" + "="*60)
if json_ok and videos_ok:
    print("All paths verified! You can proceed to the next steps.")
else:
    print("Please update the configuration paths based on the information above.")
print("="*60)

Verifying dataset paths...

JSON file found: /content/wlasl_data/WLASL_v0.3.json
  Size: 11.38 MB

Video directory found: /content/wlasl_data/videos
  Number of video files: 11980
  Sample videos: ['20235.mp4', '08890.mp4', '65111.mp4']

All paths verified! You can proceed to the next steps.


### Import Libraries and Utility Functions

In [8]:
# Import all required libraries
print("Importing libraries...")

# Standard library imports
import json
import os
import sys
from collections import Counter
from typing import List, Sequence

# Third-party imports
try:
    import cv2
    print("OpenCV imported")
except ImportError as e:
    print(f"OpenCV import failed: {e}")
    sys.exit(1)

try:
    import mediapipe as mp
    from mediapipe.tasks import python as mp_python
    from mediapipe.tasks.python import vision
    print("MediaPipe imported")
except ImportError as e:
    print(f"MediaPipe import failed: {e}")
    print("  Run the installation cell again.")
    sys.exit(1)

try:
    import numpy as np
    print("NumPy imported")
except ImportError as e:
    print(f"NumPy import failed: {e}")
    sys.exit(1)

try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers
    print("TensorFlow/Keras imported")
except ImportError as e:
    print(f"TensorFlow import failed: {e}")
    sys.exit(1)

try:
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder
    print("Scikit-learn imported")
except ImportError as e:
    print(f"Scikit-learn import failed: {e}")
    sys.exit(1)

# Constants describing the per-frame landmark feature layout
NUM_HANDS = 2          # hand slots per frame
NUM_LANDMARKS = 21     # landmarks per hand
COORDS = 3             # (x, y, z) per landmark
FEATURE_VECTOR_LEN = NUM_HANDS * NUM_LANDMARKS * COORDS  # flattened length of one frame: 2 * 21 * 3 = 126
WRIST_IDX = 0          # landmark used as the origin when normalize_per_hand translates a hand
MIDDLE_MCP_IDX = 9     # landmark whose xy distance from the wrist is the scale reference in normalize_per_hand

print("\n" + "="*60)
print("All libraries imported successfully!")
print("="*60)

Importing libraries...
OpenCV imported
MediaPipe imported
NumPy imported
TensorFlow/Keras imported
Scikit-learn imported

All libraries imported successfully!


In [10]:
# Normalization utility function (from preprocessing_utils.py)
def normalize_per_hand(X_arr: np.ndarray) -> np.ndarray:
    """
    Return a wrist-centred, size-normalised copy of flattened hand landmarks.

    The input (one frame as a 1-D vector, or several frames as rows) is copied and
    viewed as (frames, NUM_HANDS, NUM_LANDMARKS, COORDS). For every hand that is not
    all zeros, x and y are shifted so the wrist sits at the origin and then divided
    by the wrist -> middle-MCP distance in the xy plane. z is left untouched, and a
    hand whose reference distance is <= 1e-6 is translated but not scaled.

    Returns an array of shape (frames, FEATURE_VECTOR_LEN).
    """
    flat = X_arr.copy()
    if flat.ndim == 1:
        flat = flat.reshape(1, -1)
    per_hand = flat.reshape(-1, NUM_HANDS, NUM_LANDMARKS, COORDS)

    # Iterating an ndarray yields views, so the in-place edits below write
    # straight into `per_hand`.
    for frame in per_hand:
        for hand in frame:
            # An all-zero slot means "no hand detected": leave it as zeros.
            if np.allclose(hand, 0.0):
                continue

            xy = hand[:, :2]

            # Translate: put the wrist at the xy origin (z is kept as-is).
            xy -= xy[WRIST_IDX].copy()

            # Scale: wrist -> middle MCP distance on the xy plane becomes 1.
            span = np.linalg.norm(xy[MIDDLE_MCP_IDX])
            if span > 1e-6:
                xy /= span

    return per_hand.reshape(-1, FEATURE_VECTOR_LEN)

print("Normalization function defined")

Normalization function defined


## Preprocessing

### Define Preprocessing Functions

These functions handle:
- Loading WLASL metadata
- Selecting glosses (words) to train on
- Building MediaPipe hand detector
- Extracting landmarks from video frames
- Processing videos into fixed-length sequences

In [12]:
def load_metadata(meta_path: str) -> Sequence[dict]:
    """Read the UTF-8 encoded WLASL metadata file at `meta_path` and return the parsed JSON."""
    with open(meta_path, "r", encoding="utf-8") as meta_file:
        raw_text = meta_file.read()
    return json.loads(raw_text)


def choose_glosses(metadata: Sequence[dict], selected_glosses: List[str] = None, max_glosses: int = 25) -> List[str]:
    """
    Decide which glosses (words) to include.

    A non-empty `selected_glosses` is returned unchanged. Otherwise the glosses are
    ranked by their total number of metadata instances and the `max_glosses` most
    frequent ones are returned, most frequent first. Entries without a gloss are ignored.
    """
    if selected_glosses:
        return selected_glosses

    # Tally instances per gloss, skipping entries whose gloss is missing or empty.
    tally = Counter()
    named_entries = (record for record in metadata if record.get("gloss"))
    for record in named_entries:
        tally[record["gloss"]] += len(record.get("instances", []))

    ranked = tally.most_common(max_glosses)
    return [name for name, _count in ranked]


def build_detector(task_path: str) -> vision.HandLandmarker:
    """
    Create a MediaPipe hand landmarker from the model file at `task_path`.

    The detector is configured to report up to NUM_HANDS hands, the same constant
    that sizes FEATURE_VECTOR_LEN, so the detector and the feature layout cannot
    drift apart.

    Raises:
        FileNotFoundError: if no file exists at `task_path`.
    """
    if not os.path.exists(task_path):
        raise FileNotFoundError(f"MediaPipe model not found at '{task_path}'")

    base_options = mp_python.BaseOptions(model_asset_path=task_path)
    options = vision.HandLandmarkerOptions(base_options=base_options, num_hands=NUM_HANDS)
    return vision.HandLandmarker.create_from_options(options)


def extract_landmarks(detector: vision.HandLandmarker, frame_rgb: np.ndarray) -> np.ndarray:
    """
    Detect hands in a single frame and return their normalized landmarks.

    Args:
        detector: MediaPipe hand landmarker used to run detection.
        frame_rgb: The frame in RGB channel order (it is wrapped as an SRGB image).

    Returns:
        float32 array of shape (1, FEATURE_VECTOR_LEN): NUM_HANDS hand slots, each
        holding NUM_LANDMARKS (x, y, z) triples, flattened. Slots are filled in the
        order the detector reports hands; slots with no detected hand stay zero.
    """
    import warnings

    mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame_rgb)
    result = detector.detect(mp_image)

    landmarks = []
    # Keep at most NUM_HANDS hands so the vector can never exceed FEATURE_VECTOR_LEN,
    # whatever the detector was configured to return.
    for hand_landmarks in (result.hand_landmarks or [])[:NUM_HANDS]:
        for lm in hand_landmarks:
            landmarks.extend([lm.x, lm.y, lm.z])

    # Pad with zeros if fewer than NUM_HANDS hands were detected
    if len(landmarks) < FEATURE_VECTOR_LEN:
        landmarks.extend([0.0] * (FEATURE_VECTOR_LEN - len(landmarks)))

    arr = np.asarray(landmarks, dtype=np.float32).reshape(1, -1)

    # Apply normalization. This stays best-effort (raw landmarks are used on
    # failure), but the failure is reported: silently mixing normalized and
    # unnormalized frames in one dataset would be very hard to diagnose later.
    try:
        arr = normalize_per_hand(arr)
    except Exception as exc:
        warnings.warn(
            f"Landmark normalization failed; using unnormalized landmarks: {exc}",
            RuntimeWarning,
        )

    return arr.astype(np.float32)


def process_video(
    detector: vision.HandLandmarker,
    video_path: str,
    frame_start: int,
    frame_end: int,
    sequence_length: int,
    frame_stride: int,
) -> np.ndarray:
    """
    Process a video file and extract a fixed-length sequence of landmark frames.

    Args:
        detector: MediaPipe hand landmarker
        video_path: Path to video file
        frame_start: Starting frame index (from metadata); None or negative means 0
        frame_end: Ending frame index, exclusive (from metadata). A falsy value, or
            one that is not greater than the start (e.g. -1), means "to the end of
            the video"
        sequence_length: Target number of frames in output
        frame_stride: Sample every Nth frame, counted from the start frame

    Returns:
        Array of shape (sequence_length, FEATURE_VECTOR_LEN), zero-padded at the end
        when fewer frames were sampled. If no frame could be read at all, an empty
        array of shape (0, FEATURE_VECTOR_LEN) is returned instead.

    Raises:
        ValueError: if frame_stride is smaller than 1.
        RuntimeError: if the video cannot be opened.
    """
    if frame_stride < 1:
        # Fail with a clear message rather than a ZeroDivisionError in the loop.
        raise ValueError(f"frame_stride must be >= 1, got {frame_stride}")

    capture = cv2.VideoCapture(video_path)
    sampled = []
    try:
        if not capture.isOpened():
            raise RuntimeError(f"Could not open video '{video_path}'")

        total_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
        start = max(0, frame_start or 0)

        if total_frames > 0:
            end = frame_end if frame_end else total_frames
            end = min(end, total_frames)
            if end <= start:
                end = total_frames
        else:
            # The backend could not report a frame count. Read until the stream
            # runs out instead of treating the video as empty.
            end = frame_end if frame_end and frame_end > start else float("inf")

        capture.set(cv2.CAP_PROP_POS_FRAMES, start)

        frame_index = start
        while frame_index < end:
            success, frame_bgr = capture.read()
            if not success:
                break

            # Sample every frame_stride frames
            if (frame_index - start) % frame_stride == 0:
                frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
                sampled.append(extract_landmarks(detector, frame_rgb).reshape(-1))

            frame_index += 1

            # Stop if we have enough frames
            if len(sampled) >= sequence_length:
                break
    finally:
        # Release the capture even when detection or decoding raises, so a long
        # batch run does not accumulate open video handles.
        capture.release()

    if not sampled:
        return np.empty((0, FEATURE_VECTOR_LEN), dtype=np.float32)

    sequence = np.stack(sampled).astype(np.float32)

    # Truncate if too long
    if sequence.shape[0] >= sequence_length:
        return sequence[:sequence_length]

    # Pad with zeros if too short
    padding = np.zeros((sequence_length - sequence.shape[0], FEATURE_VECTOR_LEN), dtype=np.float32)
    return np.vstack((sequence, padding))

print("Preprocessing functions defined")

Preprocessing functions defined


### Run Preprocessing

In this section, we:
1. Load the WLASL metadata
2. Keep only the glosses that have at least `MIN_VIDEOS_PER_GLOSS` videos on disk
3. Process each remaining video to extract hand landmarks
4. Save the results to `wlasl_landmarks.npz`

In [25]:
# Load metadata
print("Loading WLASL metadata...")
metadata = load_metadata(WLASL_JSON_PATH)
print(f"Loaded {len(metadata)} entries from metadata")

# Scanning glosses
print(f"\nScanning disk for glosses with >= {MIN_VIDEOS_PER_GLOSS} videos...")

all_glosses = set()            # every gloss named in the metadata, with or without files
gloss_file_counts = Counter()  # gloss -> number of its videos that exist on disk
valid_video_ids = set()        # ids of the videos that exist on disk

# Iterate through metadata to check file existence
for entry in metadata:
    gloss = entry.get("gloss")
    if not gloss:
        continue
    all_glosses.add(gloss)

    for instance in entry.get("instances", []):
        video_id = instance.get("video_id")
        if not video_id:
            continue

        # Check existence
        video_path = os.path.join(VIDEO_ROOT, f"{video_id}.mp4")
        if os.path.exists(video_path):
            gloss_file_counts[gloss] += 1
            valid_video_ids.add(video_id)

# Create the final list of glosses to process
glosses_to_keep = [g for g, c in gloss_file_counts.items() if c >= MIN_VIDEOS_PER_GLOSS]
gloss_set = set(glosses_to_keep)

# The summary is based on `all_glosses`: `gloss_file_counts` only has keys for
# glosses with at least one file on disk, so using its length would under-report
# both the total and the number dropped whenever a gloss has no videos at all.
print("Scan complete.")
print(f"  Total unique glosses in metadata: {len(all_glosses)}")
print(f"  Glosses with >= {MIN_VIDEOS_PER_GLOSS} videos: {len(glosses_to_keep)}")
print(f"  Glosses dropped (too few videos): {len(all_glosses) - len(glosses_to_keep)}")

Loading WLASL metadata...
Loaded 2000 entries from metadata

Scanning disk for glosses with >= 5 videos...
Scan complete.
  Total unique glosses in metadata: 2000
  Glosses with >= 5 videos: 1575
  Glosses dropped (too few videos): 425


In [26]:
# Build detector
print("Initializing MediaPipe hand detector...")
detector = build_detector("hand_landmarker.task")
print("Detector ready")

# Process videos
print("\nProcessing videos...")
sequences = []
labels = []
video_ids = []
per_gloss_counts = Counter()

total_processed = 0
total_skipped = 0

for entry in metadata:
    gloss = entry.get("gloss", "")
    # Compare the gloss exactly as the scan cell stored it. Lower-casing here
    # would silently drop any gloss containing an upper-case character, because
    # `gloss_set` holds the glosses in their original case.
    if gloss not in gloss_set:
        continue

    for instance in entry.get("instances", []):
        # Once the per-gloss limit is reached no later instance can be accepted,
        # so stop scanning this gloss.
        if MAX_SAMPLES_PER_GLOSS and per_gloss_counts[gloss] >= MAX_SAMPLES_PER_GLOSS:
            break

        video_id = instance.get("video_id")
        if not video_id:
            continue

        # Skip videos that were not found on disk during the scan phase. No second
        # existence check is needed: a file that vanished since then makes
        # process_video raise, which is counted as skipped below.
        if video_id not in valid_video_ids:
            continue

        video_path = os.path.join(VIDEO_ROOT, f"{video_id}.mp4")

        try:
            sequence = process_video(
                detector=detector,
                video_path=video_path,
                frame_start=int(instance.get("frame_start", 0) or 0),
                frame_end=int(instance.get("frame_end", 0) or 0),
                sequence_length=SEQUENCE_LENGTH,
                frame_stride=FRAME_STRIDE,
            )
        except Exception as exc:
            print(f"Failed to process {video_path}: {exc}")
            total_skipped += 1
            continue

        # Count frames with any non-zero landmark (a detected hand); zero padding
        # and frames without a detection do not count.
        actual_length = np.count_nonzero(np.linalg.norm(sequence, axis=1))
        if actual_length < MIN_FRAMES:
            total_skipped += 1
            continue

        sequences.append(sequence)
        labels.append(gloss)
        video_ids.append(video_id)
        per_gloss_counts[gloss] += 1
        total_processed += 1

        # Progress update
        if total_processed % 50 == 0:
            print(f"  Processed {total_processed} videos...")

print(f"\nProcessing complete!")
print(f"  Total processed: {total_processed}")
print(f"  Total skipped: {total_skipped}")

# Save to NPZ
if not sequences:
    raise RuntimeError("No sequences were extracted. Check your paths and dataset.")

X = np.stack(sequences).astype(np.float32)
y = np.array(labels)
vids = np.array(video_ids)

# Store the gloss vocabulary in sorted order so the saved array is deterministic.
# This list was previously computed but never written to the file.
final_gloss_list = np.array(sorted(gloss_set))
np.savez(OUTPUT_NPZ, sequences=X, labels=y, video_ids=vids, glosses=final_gloss_list)

print(f"\nSaved {X.shape[0]} samples to {OUTPUT_NPZ}")
print(f"  Sequence shape: {X.shape}")
print("\n  Sample distribution:")
for gloss in sorted(per_gloss_counts):
    print(f"    {gloss}: {per_gloss_counts[gloss]}")

Initializing MediaPipe hand detector...
Detector ready

Processing videos...
  Processed 50 videos...
  Processed 100 videos...
  Processed 150 videos...
  Processed 200 videos...
  Processed 250 videos...
  Processed 300 videos...
  Processed 350 videos...
  Processed 400 videos...
  Processed 450 videos...
  Processed 500 videos...
  Processed 550 videos...
  Processed 600 videos...
  Processed 650 videos...
  Processed 700 videos...
  Processed 750 videos...
  Processed 800 videos...
  Processed 850 videos...
  Processed 900 videos...
  Processed 950 videos...
  Processed 1000 videos...
  Processed 1050 videos...
  Processed 1100 videos...
  Processed 1150 videos...
  Processed 1200 videos...
  Processed 1250 videos...
  Processed 1300 videos...
  Processed 1350 videos...
  Processed 1400 videos...
  Processed 1450 videos...
  Processed 1500 videos...
  Processed 1550 videos...
  Processed 1600 videos...
  Processed 1650 videos...
  Processed 1700 videos...
  Processed 1750 videos..

In [27]:
# Clean up to free memory
import gc

print("\nCleaning up memory...")
# Guarded so the cell can be re-run: a bare `del detector` raises NameError the
# second time. close() shuts the MediaPipe task down explicitly rather than
# relying on garbage collection of the Python wrapper.
if "detector" in globals():
    detector.close()
    del detector  # Remove detector object
gc.collect()  # Force garbage collection

print("Memory cleanup complete")
print("\nYou can now proceed to training!")


Cleaning up memory...
Memory cleanup complete

You can now proceed to training!


### Backup Preprocessed Data

In [28]:
# Copy preprocessed data to Google Drive for safekeeping
import shutil

BACKUP_PATH = "/content/drive/MyDrive/WLASL Dataset/wlasl_landmarks.npz"

# Best-effort backup: a failure is reported, but the notebook carries on with
# the local file.
try:
    shutil.copy(OUTPUT_NPZ, BACKUP_PATH)
except Exception as e:
    print(f"Could not backup to Drive: {e}")
    print("  Continuing with local copy only...")
else:
    restore_command = f"  !cp '{BACKUP_PATH}' '{OUTPUT_NPZ}'"
    print(f"Backup saved to: {BACKUP_PATH}")
    print("\nIf your session disconnects, you can restore by running:")
    print(restore_command)

Backup saved to: /content/drive/MyDrive/WLASL Dataset/wlasl_landmarks.npz

If your session disconnects, you can restore by running:
  !cp '/content/drive/MyDrive/WLASL Dataset/wlasl_landmarks.npz' 'wlasl_landmarks.npz'
