# WLASL Preprocessing

The Word-Level American Sign Language 100 (WLASL100) dataset consists of approximately 2,038 videos covering the 100 most frequent words in the WLASL dataset.

For preprocessing this dataset, we use MediaPipe Holistic, which combines human pose, face landmark, and hand tracking. These signals are all needed for ASL word detection, because a sign is not determined by hand position alone. The extraction code in this notebook keeps the pose and hand landmarks.

## Set-Up

### Install Dependencies

In [1]:
import os
import sys
import subprocess

# Exact versions are pinned for every package except matplotlib.
# NOTE(review): later cells import tensorflow, which is not installed here --
# presumably it is expected to be pre-installed in the runtime; confirm.
packages = [
    "numpy==1.26.4",
    "protobuf==4.25.3",
    "mediapipe==0.10.21",
    "opencv-python-headless==4.8.1.78",
    "scikit-learn==1.3.2",
    "matplotlib",
]

separator = "=" * 60

print("Installing dependencies...")
print(separator)

# Invoke pip through the interpreter that is running this cell.
# check_call raises CalledProcessError if pip exits with a non-zero status.
command = [sys.executable, "-m", "pip", "install", *packages]
subprocess.check_call(command)

print("\nInstallation complete.")
print(separator)

# forcing runtime restart (signal 9 kills the current process)
os.kill(os.getpid(), 9)

Installing dependencies...


: 

In [1]:
import sys
import mediapipe as mp
import cv2
import sklearn
import tensorflow as tf
import numpy as np

# Report the interpreter and library versions that are actually loaded.
rule = "=" * 60
report_lines = [
    "Verifying installations...",
    f"  Python version: {sys.version.split()[0]}",
    f"  TensorFlow: {tf.__version__}",
    f"  MediaPipe: {mp.__version__}",
    f"  OpenCV: {cv2.__version__}",
    f"  Scikit-learn: {sklearn.__version__}",
    f"  NumPy: {np.__version__}",  # Should be 1.26.4
]

print(rule)
print("\n".join(report_lines))

print(rule)

Verifying installations...
  Python version: 3.11.5
  TensorFlow: 2.19.0
  MediaPipe: 0.10.21
  OpenCV: 4.8.1
  Scikit-learn: 1.3.2
  NumPy: 1.26.4


In [3]:
# Download MediaPipe Hand Landmarker model
import urllib.request
import os

model_url = "https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/1/hand_landmarker.task"
model_path = "hand_landmarker.task"

if not os.path.exists(model_path):
    print("Downloading MediaPipe Hand Landmarker model...")
    try:
        urllib.request.urlretrieve(model_url, model_path)
        print(f"Model downloaded to {model_path}")
    except Exception as e:
        print(f"Error downloading model: {e}")
        print("Trying alternative download method...")
        !wget -q {model_url} -O {model_path}
        print(f"Model downloaded to {model_path}")
else:
    print(f"Model already exists at {model_path}")

Model already exists at hand_landmarker.task


### Configuration

In [18]:
# Root folder of the dataset, resolved relative to the working directory.
DATASET_ROOT = 'WLASL100'

# One sub-folder per split.
train_dir, val_dir, test_dir = (
    os.path.join(DATASET_ROOT, split_name)
    for split_name in ('train', 'val', 'test')
)

# verify path
if not os.path.exists(train_dir):
    print(f"Error: Could not find folder at {train_dir}")
    print("   Make sure the 'WLASL100' folder is in the same directory as this script.")
else:
    print(f"Found training data at: {train_dir}")
    # Counts every directory entry (sub-folders included), not only files.
    entry_count = len(os.listdir(train_dir))
    print(f"   Contains {entry_count} files.")

Found training data at: WLASL100/train
   Contains 101 files.


In [17]:
# output paths
# Root folder for everything process_dataset() writes: label_map.json plus
# one <split>/<class>/<video>.npy file per processed video.
OUTPUT_ROOT = 'wlasl100_features'

# processing parameters
SEQUENCE_LENGTH = 32  # Fixed number of frames per sequence
# NOTE(review): MIN_FRAMES is not referenced by any preprocessing function in
# the visible cells, so short videos are currently not discarded -- confirm.
MIN_FRAMES = 8        # Discard sequences shorter than this

# training parameters
# NOTE(review): none of these are read by the preprocessing cells visible
# here; presumably they are consumed by a separate training step.
MAX_SAMPLES_PER_GLOSS = None
TEST_SIZE = 0.2
EPOCHS = 80
BATCH_SIZE = 32
LEARNING_RATE = 1e-3
LSTM_UNITS = [128, 64]
DENSE_UNITS = 64
DROPOUT = 0.5
PATIENCE = 10

# constants
NUM_HANDS = 2
NUM_LANDMARKS = 21
COORDS = 3
# Hands-only feature size: 2 * 21 * 3 = 126.
# NOTE(review): process_dataset() saves 75 * 3 = 225 features per frame
# (pose + both hands), so this value does not describe the saved arrays.
FEATURE_VECTOR_LEN = NUM_HANDS * NUM_LANDMARKS * COORDS
# NOTE(review): presumably MediaPipe hand-landmark indices; not used by the
# visible preprocessing cells.
WRIST_IDX = 0
MIDDLE_MCP_IDX = 9

### Import Libraries

In [11]:
# Import all required libraries
# Each third-party import group is wrapped in try/except ImportError so that a
# missing package prints a readable message and stops the cell via sys.exit(1)
# instead of surfacing a raw traceback.
print("Importing libraries...")

# Standard library imports
import json
import os
import sys
from collections import Counter
from typing import List, Sequence

# Third-party imports
try:
    import cv2
    print("OpenCV imported")
except ImportError as e:
    print(f"OpenCV import failed: {e}")
    sys.exit(1)

# NOTE(review): mp_python and vision (MediaPipe Tasks API) are not used by the
# preprocessing cells visible in this notebook, which rely on mp.solutions.
try:
    import mediapipe as mp
    from mediapipe.tasks import python as mp_python
    from mediapipe.tasks.python import vision
    print("MediaPipe imported")
except ImportError as e:
    print(f"MediaPipe import failed: {e}")
    print("  Run the installation cell again.")
    sys.exit(1)

try:
    import numpy as np
    print("NumPy imported")
except ImportError as e:
    print(f"NumPy import failed: {e}")
    sys.exit(1)

# NOTE(review): TensorFlow/Keras and the scikit-learn helpers below are not
# used by the visible preprocessing cells; presumably needed for training.
try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers
    print("TensorFlow/Keras imported")
except ImportError as e:
    print(f"TensorFlow import failed: {e}")
    sys.exit(1)

try:
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder
    print("Scikit-learn imported")
except ImportError as e:
    print(f"Scikit-learn import failed: {e}")
    sys.exit(1)

# Only reached when every import above succeeded.
print("\n" + "="*60)
print("All libraries imported successfully!")
print("="*60)

Importing libraries...
OpenCV imported
MediaPipe imported
NumPy imported
TensorFlow/Keras imported
Scikit-learn imported

All libraries imported successfully!


## Preprocessing

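The function defined below runs MediaPipe Holistic on every frame of a video and extracts the hand landmarks (21 points per hand) and body pose landmarks (33 points), each as (x, y, z) coordinates. Landmarks that are not detected in a frame are left as zeros.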

In [None]:
mp_holistic = mp.solutions.holistic


def _landmarks_to_array(landmark_list, num_points):
    """Copy a MediaPipe landmark list into a (num_points, 3) array of x, y, z.

    Returns an all-zero array when ``landmark_list`` is falsy, i.e. when the
    corresponding body part was not detected in the frame.
    """
    points = np.zeros((num_points, 3))
    if landmark_list:
        for i, lm in enumerate(landmark_list.landmark):
            points[i] = [lm.x, lm.y, lm.z]
    return points


def extract_landmarks(video_path):
    """Run MediaPipe Holistic on every frame of a video and collect landmarks.

    Args:
        video_path: Path of the video file, passed to ``cv2.VideoCapture``.

    Returns:
        A list with one dict per decoded frame. Each dict holds three float
        arrays of (x, y, z) coordinates: ``'pose'`` with shape (33, 3), and
        ``'lh'`` / ``'rh'`` (left / right hand) with shape (21, 3). A part
        that was not detected in a frame stays all-zero. The list is empty
        when no frame could be read from the video.
    """
    cap = cv2.VideoCapture(video_path)
    frames_data = []

    # try/finally guarantees the capture handle is released even when OpenCV
    # or MediaPipe raises part-way through the video.
    try:
        with mp_holistic.Holistic(
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5,
            model_complexity=1
        ) as holistic:

            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                # convert BGR to RGB
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = holistic.process(image)

                frames_data.append({
                    'pose': _landmarks_to_array(results.pose_landmarks, 33),
                    'lh': _landmarks_to_array(results.left_hand_landmarks, 21),
                    'rh': _landmarks_to_array(results.right_hand_landmarks, 21),
                })
    finally:
        cap.release()

    return frames_data

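The function below stacks the per-frame landmarks into a single array and cleans the data: frames in which a hand was not detected are filled in by linearly interpolating between the frames where it was detected, to improve accuracy.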

In [25]:
def convert_and_clean_data(raw_frames):
    """Stack per-frame landmark dicts into one array and fill hand gaps.

    Args:
        raw_frames: Sequence of per-frame dicts with keys ``'pose'`` (33, 3),
            ``'lh'`` (21, 3) and ``'rh'`` (21, 3).

    Returns:
        A float array of shape (T, 75, 3), ordered pose (0-32), left hand
        (33-53), right hand (54-74), or ``None`` when ``raw_frames`` is empty.
        Hand landmarks missing in some frames are linearly interpolated from
        the frames where they were detected; before the first and after the
        last detection the nearest detected value is held. A hand that is
        never detected stays all-zero. Pose landmarks are not interpolated.
    """
    T = len(raw_frames)
    if T == 0:
        return None

    # convert to numpy; 75 points = 33 (pose) + 21 (left) + 21 (right)
    data = np.zeros((T, 75, 3))
    for t, frame in enumerate(raw_frames):
        data[t, :33] = frame['pose']
        data[t, 33:54] = frame['lh']
        data[t, 54:] = frame['rh']

    # interpolation (hand landmarks only)
    all_idxs = np.arange(T)
    for k in range(33, 75):
        # A landmark counts as detected when ANY coordinate is non-zero.
        # Testing x alone would treat a real detection with x == 0 as missing
        # and overwrite it with interpolated values.
        detected_idxs = np.flatnonzero(np.any(data[:, k] != 0, axis=1))

        # if hand missing in entire video, skip
        if detected_idxs.size == 0:
            continue

        # fill the gaps for x, y and z from the detected frames
        for c in range(3):
            data[:, k, c] = np.interp(all_idxs, detected_idxs, data[detected_idxs, k, c])

    return data

The function below normalizes the extracted landmarks relative to the body: coordinates are centered on the midpoint between the shoulders and scaled by the shoulder width. This is necessary for identifying ASL words, as where the hand is positioned relative to the body can change the word being signed.

In [27]:
# normalizes landmarks relative to the body (shoulders)
def normalize_holistic(data):
    """Center landmarks on the shoulders and scale them by shoulder width.

    Args:
        data: Array of shape (T, N, 3) whose landmarks 11 and 12 along axis 1
            are the left and right shoulder (MediaPipe Pose indexing).

    Returns:
        A new float array of the same shape, ``(data - center) / scale``, or
        ``None`` when no frame has both shoulders detected. ``center`` is the
        per-video median of the shoulder midpoint and ``scale`` the per-video
        median shoulder distance. The input array is left unmodified.
    """
    # find scale and center
    LEFT_SHOULDER_IDX = 11
    RIGHT_SHOULDER_IDX = 12

    left_shoulders = data[:, LEFT_SHOULDER_IDX, :]
    right_shoulders = data[:, RIGHT_SHOULDER_IDX, :]

    # a frame is usable only when both shoulders have a non-zero x coordinate
    valid = (left_shoulders[:, 0] != 0) & (right_shoulders[:, 0] != 0)

    if not np.any(valid):
        return None

    # use median center of the video as the anchor
    center = np.median((left_shoulders[valid] + right_shoulders[valid]) / 2, axis=0)
    scale = np.median(np.linalg.norm(left_shoulders[valid] - right_shoulders[valid], axis=1))

    # guard against division by (near) zero when the shoulders coincide
    if scale < 1e-6:
        scale = 1.0

    # apply normalization on a fresh array: the previous in-place `-=` / `/=`
    # silently modified the caller's data and failed on integer input.
    # NOTE(review): landmarks that were all-zero ("not detected") end up at
    # -center / scale, so zero no longer marks a missing landmark downstream.
    return (data - center) / scale

The function below resamples each video to exactly 32 frames (`SEQUENCE_LENGTH`) by picking evenly spaced frames, making all videos a uniform length for better learning and classification.

In [None]:
# resample video to fit SEQUENCE_LENGTH number of frames
def resample_uniform(sequence, target_len=SEQUENCE_LENGTH):
    """Return ``target_len`` rows of ``sequence`` taken at evenly spaced positions.

    Rows are selected by index (no blending), so a sequence shorter than
    ``target_len`` has rows repeated. An empty sequence yields an all-zero
    array of shape ``(target_len, sequence.shape[1])``.
    """
    n_frames = sequence.shape[0]

    if n_frames > 0:
        # evenly spaced positions from the first to the last frame, produced
        # directly as integer indices via dtype=int
        picks = np.linspace(0, n_frames - 1, num=target_len, dtype=int)
        return sequence[picks]

    # nothing to sample from
    return np.zeros((target_len, sequence.shape[1]))

Finally, the function below builds a label map that assigns a consistent integer ID to each distinct class (word) present in the dataset.

In [None]:
# creates label map for consistent word to label mapping
def build_label_map(root_dir):
    """Map every class folder name found under ``root_dir`` to an integer ID.

    The ``train``, ``val`` and ``test`` sub-folders are scanned (missing ones
    are ignored); each directory inside them counts as one class. IDs are
    assigned in sorted name order, starting at 0.
    """
    class_names = set()

    for split_name in ('train', 'val', 'test'):
        split_dir = os.path.join(root_dir, split_name)
        if os.path.exists(split_dir):
            # keep sub-directories only; plain files are not classes
            class_names.update(
                entry for entry in os.listdir(split_dir)
                if os.path.isdir(os.path.join(split_dir, entry))
            )

    # sort for consistency
    ordered_names = sorted(class_names)
    return dict(zip(ordered_names, range(len(ordered_names))))

The full processing function below goes through all the videos in the train, validation, and test splits, preprocesses each video using the functions defined above, and saves the resulting features as one `.npy` file per video.

In [23]:
def process_dataset():
    """Preprocess every video of every split and save one feature file each.

    Reads ``DATASET_ROOT/<split>/<class>/<video>.mp4`` and writes
    ``OUTPUT_ROOT/<split>/<class>/<video>.npy`` plus
    ``OUTPUT_ROOT/label_map.json``. Videos whose output file already exists
    are skipped, so an interrupted run can be resumed by calling the function
    again. A failure on one video is reported and does not stop the run.

    Each saved file holds a dict with keys ``'data'`` (float32 array of shape
    (SEQUENCE_LENGTH, 225)), ``'label'`` (int class ID) and ``'gloss'`` (class
    name). Because the payload is a dict, it must be loaded with
    ``np.load(path, allow_pickle=True).item()``.

    NOTE(review): MIN_FRAMES is not enforced here -- videos of any length are
    resampled to SEQUENCE_LENGTH frames; confirm whether that is intended.
    """
    print(f"Scanning {DATASET_ROOT} to build Label Map...")
    label_map = build_label_map(DATASET_ROOT)
    print(f"Found {len(label_map)} unique classes.")

    # save label map
    os.makedirs(OUTPUT_ROOT, exist_ok=True)
    with open(os.path.join(OUTPUT_ROOT, 'label_map.json'), 'w') as f:
        json.dump(label_map, f)

    # loop through splits
    for split in ['train', 'val', 'test']:
        split_input_dir = os.path.join(DATASET_ROOT, split)
        split_output_dir = os.path.join(OUTPUT_ROOT, split)

        if not os.path.exists(split_input_dir):
            print(f"Skipping {split} (not found)")
            continue

        print(f"Processing Split: {split}")

        # loop through classes (sorted so runs are reproducible)
        for class_name in sorted(os.listdir(split_input_dir)):
            class_input_dir = os.path.join(split_input_dir, class_name)

            # skip files that aren't folders
            if not os.path.isdir(class_input_dir):
                continue

            # create matching output folder
            class_output_dir = os.path.join(split_output_dir, class_name)
            os.makedirs(class_output_dir, exist_ok=True)

            label_id = label_map[class_name]

            # loop through videos
            for video_file in sorted(os.listdir(class_input_dir)):
                if not video_file.lower().endswith('.mp4'):
                    continue

                input_path = os.path.join(class_input_dir, video_file)
                save_filename = os.path.splitext(video_file)[0] + ".npy"
                output_path = os.path.join(class_output_dir, save_filename)

                # check if already done
                if os.path.exists(output_path):
                    continue

                # Features are written here first and renamed afterwards, so
                # an interrupted run can never leave a truncated .npy that the
                # "already done" check above would skip on the next run.
                tmp_path = output_path + ".tmp"

                try:
                    # extract
                    raw = extract_landmarks(input_path)

                    # clean
                    clean_arr = convert_and_clean_data(raw)
                    if clean_arr is None:
                        print(f"Skipping {video_file} (No frames could be read)")
                        continue

                    # normalize
                    norm = normalize_holistic(clean_arr)
                    if norm is None:
                        print(f"Skipping {video_file} (No body detected)")
                        continue

                    # resample & flatten (T, 75, 3) -> (T, 225)
                    flat = norm.reshape(norm.shape[0], -1)
                    final = resample_uniform(flat, SEQUENCE_LENGTH)

                    # save features
                    with open(tmp_path, 'wb') as out_file:
                        np.save(out_file, {
                            'data': final.astype(np.float32),
                            'label': label_id,
                            'gloss': class_name
                        })
                    os.replace(tmp_path, output_path)

                except Exception as e:
                    # best effort: report the failure and move on
                    print(f"Error processing {video_file}: {e}")
                    if os.path.exists(tmp_path):
                        os.remove(tmp_path)

        print(f"Finished {split} split.")

In [28]:
# run preprocessing for all videos
# Videos whose .npy output already exists are skipped, so this cell can be
# re-run to resume after an interruption.
process_dataset()

Scanning WLASL100 to build Label Map...
Found 100 unique classes.
Processing Split: train


I0000 00:00:1764011981.084664 6636128 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 90.5), renderer: Apple M2
W0000 00:00:1764011981.157137 7734889 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764011981.169053 7734888 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764011981.171102 7734889 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764011981.171462 7734891 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764011981.173509 7734892 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support 

Finished train split.
Processing Split: val


I0000 00:00:1764014152.889343 6636128 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 90.5), renderer: Apple M2
W0000 00:00:1764014152.956772 7799100 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764014152.970718 7799107 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764014152.972082 7799107 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764014152.972083 7799101 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764014152.972112 7799106 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support 

Finished val split.
Processing Split: test


I0000 00:00:1764014695.494492 6636128 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 90.5), renderer: Apple M2
W0000 00:00:1764014695.562153 7815403 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764014695.574462 7815407 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764014695.575833 7815407 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764014695.575838 7815403 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764014695.575895 7815405 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support 

Finished test split.
