In [1]:
# ==========================================
# 1. SETUP & INSTALLATION
# ==========================================
!pip install -q transformers torch opencv-python-headless tqdm scikit-learn

import os
import sys
import cv2
import zipfile
import torch
import shutil
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from transformers import CLIPProcessor, CLIPVisionModel
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from google.colab import drive

# Mount Google Drive, skipping the call when the mount point path already exists.
# NOTE(review): this only checks that the path exists, not that Drive is actually
# mounted there - confirm this is sufficient after a failed or partial mount.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# ==========================================
# 2. CONFIGURATION
# ==========================================
# Source archive on the mounted Drive
DRIVE_ZIP_PATH = "/content/drive/MyDrive/WLASL100 Dataset/archive.zip"

# Folder the archive is expected to produce once unpacked
# (/content/preprocessing/train, /content/preprocessing/val, etc.)
EXTRACT_ROOT = "/content/preprocessing"

# Feature file written locally, plus the location of its copy on Drive
OUTPUT_FILENAME = "wlasl100_augmented_features.npz"
BACKUP_PATH = "/content/drive/MyDrive/WLASL100 Dataset/wlasl100_augmented_features.npz"

# Processing settings
SEQUENCE_LENGTH = 32   # frames sampled per video
BATCH_SIZE = 32        # batch size of the tf.data pipelines
AUGMENT_COPIES = 3     # extra augmented passes over the training split

# Pick the torch device: GPU when one is visible, CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using Device: {device}")
if device != "cuda":
    print("⚠️ WARNING: Switch to T4 GPU Runtime for speed!")

# ==========================================
# 3. EXTRACT DATASET
# ==========================================
# The archive is expected to contain a top-level folder named like the last
# component of EXTRACT_ROOT, so it is unpacked into EXTRACT_ROOT's parent.
if not os.path.exists(EXTRACT_ROOT):
    print(f"Extracting {DRIVE_ZIP_PATH}...")
    try:
        with zipfile.ZipFile(DRIVE_ZIP_PATH, 'r') as zip_ref:
            zip_ref.extractall(os.path.dirname(EXTRACT_ROOT) or ".")
    except Exception as e:
        # Best effort: report the problem and let the notebook continue.
        print(f"❌ Error extracting: {e}")
    else:
        # Only claim success when the expected folder really appeared;
        # otherwise the failure would surface later as "Split not found".
        if os.path.isdir(EXTRACT_ROOT):
            print("✅ Extraction Complete.")
        else:
            print(f"❌ Error extracting: archive did not create {EXTRACT_ROOT}")
else:
    print("Dataset already extracted.")

# ==========================================
# 4. AUGMENTATION & CLIP MODEL
# ==========================================
print("\nLoading CLIP Model...")
model_name = "openai/clip-vit-base-patch32"

# Image preprocessor paired with the checkpoint
processor = CLIPProcessor.from_pretrained(model_name)

# Vision tower only, moved to the selected device and switched to eval mode
vision_model = CLIPVisionModel.from_pretrained(model_name)
vision_model = vision_model.to(device).eval()

def augment_frames(frames):
    """Apply one randomly drawn crop, rotation and brightness change to all frames.

    The random parameters are drawn once per call, so every frame of the
    sequence receives the same transform.

    Args:
        frames: Sequence of 3-D image arrays (height, width, channels). The
            height and width of the first frame define the output size.

    Returns:
        A list of uint8 arrays, one per input frame, each resized back to the
        first frame's (height, width). An empty input yields an empty list.
    """
    # Nothing to augment; avoids indexing frames[0] on an empty sequence.
    if len(frames) == 0:
        return []

    augmented = []

    # Random Params (Consistent across the whole video)
    h, w, _ = frames[0].shape

    # 1. Random Crop (Zoom in 85% - 100%)
    scale = random.uniform(0.85, 1.0)
    new_h, new_w = int(h * scale), int(w * scale)
    y_start = random.randint(0, h - new_h)
    x_start = random.randint(0, w - new_w)

    # 2. Random Rotation (-10 to +10 degrees) around the frame centre
    angle = random.uniform(-10, 10)
    center = (w // 2, h // 2)
    rot_mat = cv2.getRotationMatrix2D(center, angle, 1.0)

    # 3. Brightness multiplier
    brightness = random.uniform(0.7, 1.3)

    for frame in frames:
        # Crop, then scale the crop back up to the original frame size
        crop = frame[y_start:y_start + new_h, x_start:x_start + new_w]
        crop = cv2.resize(crop, (w, h))

        # Rotate
        rotated = cv2.warpAffine(crop, rot_mat, (w, h))

        # Adjust brightness in float, then clamp back into the uint8 range
        img_float = rotated.astype(np.float32) * brightness
        img_final = np.clip(img_float, 0, 255).astype(np.uint8)

        augmented.append(img_final)

    return augmented

def process_video_frames(frame_files, augment=False):
    """Read a video's frames, optionally augment them, and extract CLIP features.

    Args:
        frame_files: Paths of the frame images of one video. The list is not
            modified; a sorted copy is used to establish frame order.
        augment: When True, the loaded frames go through augment_frames
            before feature extraction.

    Returns:
        A float32 array built from the vision model's pooler_output for the
        SEQUENCE_LENGTH frames passed to it (one row per frame), or None when
        there are no frame files, none of the sampled frames can be read, or
        feature extraction fails.
    """
    if not frame_files:
        return None

    # Sort a copy so the caller's list is left untouched.
    ordered_files = sorted(frame_files)

    # Uniform Sampling (indices repeat when the video has fewer frames)
    indices = np.linspace(0, len(ordered_files) - 1, SEQUENCE_LENGTH, dtype=int)
    selected_files = [ordered_files[i] for i in indices]

    images = []
    for f in selected_files:
        img = cv2.imread(f)
        # cv2.imread returns None for unreadable files; those frames are skipped
        if img is not None:
            images.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    if not images:
        return None

    # === AUGMENTATION ===
    if augment:
        images = augment_frames(images)
    # ====================

    # Pad if short (some frames failed to load) by repeating the last frame
    images.extend([images[-1]] * (SEQUENCE_LENGTH - len(images)))

    try:
        inputs = processor(images=images, return_tensors="pt", padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = vision_model(**inputs)
            embeddings = outputs.pooler_output.cpu().numpy()
        return embeddings.astype(np.float32)
    except Exception as exc:
        # Keep the best-effort behaviour (skip this video) but say why, and
        # let KeyboardInterrupt/SystemExit propagate.
        print(f"⚠️ Feature extraction failed, skipping video: {exc}")
        return None

def extract_split(split_name, augment_count=0):
    """Extract per-video features and labels for one dataset split.

    Frames are looked up under EXTRACT_ROOT/<split_name>/frames/<class>/<video>/*.jpg.
    The split is processed once without augmentation and then augment_count
    more times with augmentation, so each video contributes up to
    1 + augment_count samples.

    Args:
        split_name: Name of the split folder (e.g. "train", "val", "test").
        augment_count: Number of additional augmented passes over the split.

    Returns:
        A tuple (X, y) of NumPy arrays: the stacked feature arrays and the
        matching class-name labels. Both are empty arrays when the split
        folder does not exist or no video yields features.
    """
    # Path: /content/preprocessing/train/frames
    frames_root = os.path.join(EXTRACT_ROOT, split_name, "frames")
    if not os.path.exists(frames_root):
        print(f"❌ Split not found: {frames_root}")
        # Same types as the normal return so callers can rely on .shape
        return np.array([]), np.array([])

    classes = sorted(os.listdir(frames_root))
    X_list = []
    y_list = []

    print(f"Processing {split_name} set...")

    total_passes = 1 + augment_count

    for i in range(total_passes):
        is_aug = i > 0  # First pass is original, others are augmented
        print(f"  > Pass {i+1}/{total_passes} (Augment={is_aug})")

        for class_name in tqdm(classes):
            class_path = os.path.join(frames_root, class_name)
            if not os.path.isdir(class_path):
                continue

            # Typically: class_path/video_id/*.jpg
            # Sorted so the sample order does not depend on os.listdir order.
            for item in sorted(os.listdir(class_path)):
                video_folder = os.path.join(class_path, item)
                if not os.path.isdir(video_folder):
                    continue

                images = [
                    os.path.join(video_folder, f)
                    for f in os.listdir(video_folder)
                    if f.endswith('.jpg')
                ]

                features = process_video_frames(images, augment=is_aug)

                if features is not None:
                    X_list.append(features)
                    y_list.append(class_name)

    return np.array(X_list), np.array(y_list)

# ==========================================
# 5. EXECUTE EXTRACTION
# ==========================================
# Train: 1 original pass + AUGMENT_COPIES augmented passes
X_train, y_train_raw = extract_split("train", augment_count=AUGMENT_COPIES)

# Val/Test: No augmentation (evaluate on real data)
X_val, y_val_raw = extract_split("val", augment_count=0)
X_test, y_test_raw = extract_split("test", augment_count=0)

# np.shape also works if a split came back as a plain (empty) list
print("\nFinal Dataset Sizes:")
print(f"Train: {np.shape(X_train)} (Augmented x{AUGMENT_COPIES+1})")
print(f"Val:   {np.shape(X_val)}")
print(f"Test:  {np.shape(X_test)}")

# Save the features locally, then copy the file to Drive as a backup
np.savez(OUTPUT_FILENAME, X_train=X_train, y_train=y_train_raw,
         X_val=X_val, y_val=y_val_raw, X_test=X_test, y_test=y_test_raw)
try:
    shutil.copy(OUTPUT_FILENAME, BACKUP_PATH)
    print("✅ Backup Saved to Drive.")
except OSError as e:
    # The backup stays optional, but a failure is reported instead of hidden
    print(f"⚠️ Backup to Drive failed: {e}")

# ==========================================
# 6. TRAINING (Regularized Model)
# ==========================================
print("\nPreparing Training...")

# Integer-encode labels with the classes seen in the training split,
# then expand every split's labels to one-hot vectors.
le = LabelEncoder().fit(y_train_raw)
num_classes = len(le.classes_)

y_train, y_val, y_test = (
    to_categorical(le.transform(raw_labels), num_classes)
    for raw_labels in (y_train_raw, y_val_raw, y_test_raw)
)

# Datasets
def create_ds(X, y, shuffle=False):
    """Wrap features and labels in a batched, prefetching tf.data pipeline.

    When shuffle is True the samples are shuffled with a buffer of len(X)
    before batching. Batches hold BATCH_SIZE samples.
    """
    dataset = tf.data.Dataset.from_tensor_slices((X, y))
    if shuffle:
        dataset = dataset.shuffle(len(X))
    batched = dataset.batch(BATCH_SIZE)
    return batched.prefetch(tf.data.AUTOTUNE)

# Only the training pipeline is shuffled
train_ds = create_ds(X_train, y_train, shuffle=True)
val_ds, test_ds = create_ds(X_val, y_val), create_ds(X_test, y_test)

# Model Architecture (High Regularization)
def build_regularized_model(num_classes):
    """Build the transformer classifier over per-frame feature sequences.

    Pipeline: input of shape (SEQUENCE_LENGTH, 768) -> Dense(256) + dropout ->
    learned positional embedding -> two pre-norm transformer blocks ->
    global average pooling -> LayerNorm -> dropout -> softmax.

    Args:
        num_classes: Size of the softmax output layer.

    Returns:
        An uncompiled keras.Model named "UniSign_Regularized".
    """

    class _PositionalEmbedding(layers.Layer):
        """Adds a trainable per-position vector to every sequence in the batch.

        Calling layers.Embedding directly on a concrete tf.range tensor keeps
        the embedding outside the functional graph, so its weights are not
        tracked by the model and never trained. Holding the Embedding as a
        sublayer makes its weights part of the model.
        """

        def __init__(self, sequence_length, output_dim, **kwargs):
            super().__init__(**kwargs)
            self.sequence_length = sequence_length
            self.position_embedding = layers.Embedding(
                input_dim=sequence_length, output_dim=output_dim
            )

        def build(self, input_shape):
            self.position_embedding.build(input_shape)

        def call(self, inputs):
            positions = tf.range(start=0, limit=self.sequence_length, delta=1)
            # (sequence_length, output_dim) broadcasts over the batch axis
            return inputs + self.position_embedding(positions)

    inputs = layers.Input(shape=(SEQUENCE_LENGTH, 768))

    # 1. Smaller Projection with High Dropout
    x = layers.Dense(256, activation="linear")(inputs)
    x = layers.Dropout(0.5)(x)

    # Positional Encoding (trainable)
    x = _PositionalEmbedding(SEQUENCE_LENGTH, 256)(x)

    # 2. Shallower Transformer (2 Layers)
    for _ in range(2):
        x1 = layers.LayerNormalization(epsilon=1e-6)(x)
        # Self-attention with a residual connection
        att = layers.MultiHeadAttention(num_heads=4, key_dim=64, dropout=0.5)(x1, x1)
        x2 = layers.Add()([x, att])

        # Feed Forward with a residual connection
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        x3 = layers.Dense(512, activation="gelu")(x3)
        x3 = layers.Dropout(0.5)(x3)
        x3 = layers.Dense(256)(x3)
        x = layers.Add()([x2, x3])

    # 3. Head
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(num_classes, activation="softmax")(x)

    return keras.Model(inputs, outputs, name="UniSign_Regularized")

model = build_regularized_model(num_classes)

# Optimizer, loss and metrics are built up front, then handed to compile()
optimizer = keras.optimizers.AdamW(learning_rate=1e-4, weight_decay=1e-4)
loss_fn = keras.losses.CategoricalCrossentropy(label_smoothing=0.1)
top5_metric = tf.keras.metrics.TopKCategoricalAccuracy(k=5, name='top5')
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy', top5_metric])

print("Starting Training...")

# Both callbacks watch validation accuracy: one stops training and restores
# the best weights, the other halves the learning rate on a plateau.
early_stopping = keras.callbacks.EarlyStopping(
    patience=15, restore_best_weights=True, monitor='val_accuracy'
)
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    patience=5, factor=0.5, monitor='val_accuracy', min_lr=1e-6
)

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=60,
    callbacks=[early_stopping, reduce_lr],
)

print("\nFinal Evaluation:")
model.evaluate(test_ds)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Using Device: cuda
Dataset already extracted.

Loading CLIP Model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Processing train set...
  > Pass 1/4 (Augment=False)


100%|██████████| 100/100 [03:30<00:00,  2.11s/it]


  > Pass 2/4 (Augment=True)


100%|██████████| 100/100 [04:22<00:00,  2.62s/it]


  > Pass 3/4 (Augment=True)


100%|██████████| 100/100 [04:21<00:00,  2.62s/it]


  > Pass 4/4 (Augment=True)


100%|██████████| 100/100 [04:20<00:00,  2.61s/it]


Processing val set...
  > Pass 1/1 (Augment=False)


100%|██████████| 100/100 [00:44<00:00,  2.26it/s]


Processing test set...
  > Pass 1/1 (Augment=False)


100%|██████████| 100/100 [00:33<00:00,  2.96it/s]



Final Dataset Sizes:
Train: (5760, 32, 768) (Augmented x4)
Val:   (337, 32, 768)
Test:  (258, 32, 768)
✅ Backup Saved to Drive.

Preparing Training...
Starting Training...
Epoch 1/60
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 47ms/step - accuracy: 0.0107 - loss: 5.5555 - top5: 0.0571 - val_accuracy: 0.0208 - val_loss: 4.6565 - val_top5: 0.0712 - learning_rate: 1.0000e-04
Epoch 2/60
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.0265 - loss: 5.0717 - top5: 0.0942 - val_accuracy: 0.0326 - val_loss: 4.6255 - val_top5: 0.1187 - learning_rate: 1.0000e-04
Epoch 3/60
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.0459 - loss: 4.7731 - top5: 0.1561 - val_accuracy: 0.0534 - val_loss: 4.6853 - val_top5: 0.1484 - learning_rate: 1.0000e-04
Epoch 4/60
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.1023 - loss: 4.2839 - top5: 0.2805 - val_accuracy: 0.

[3.389517307281494, 0.3139534890651703, 0.604651153087616]