### **Notebook Objectives**

1. **Subset Data:** Get only 1% of the original dataset for efficiency.
2. **Create Splits:** Deterministically split the subset into Training (80%), Validation (10%), and Test (10%) sets.
3. **Index Splits:** Save `train_split.json`, `val_split.json`, and `test_split.json` so the training notebook can load them instantly.

### **Data Preprocessing**

In [1]:
# Standard Library Imports
import os
import json
import random

In [2]:
def preprocess_data(raw_root, processed_root, sample_ratio,
                    train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """
    Scans raw data, creates a random subset, and splits into train/val/test.

    A directory counts as a sequence when it directly contains both
    "im1.png" and "im7.png". Sequences are recorded as paths relative to
    `raw_root` (using the path separator of the machine running this cell),
    sorted, shuffled with a fixed seed, truncated to the requested subset,
    and then sliced into three consecutive, non-overlapping splits.

    Args:
        raw_root (str): Directory that is walked recursively for sequences.
        processed_root (str): Output directory (created if missing) that
            receives train_split.json, val_split.json and test_split.json.
        sample_ratio (float): Fraction of total data to keep (0.01 keeps 1%).
        train_ratio (float): Fraction of the subset assigned to training.
        val_ratio (float): Fraction of the subset assigned to validation.
        test_ratio (float): Expected fraction for testing. It is not used for
            slicing: the test split always receives whatever remains after
            train and val. A warning is printed if the three ratios do not
            sum to 1.

    Returns:
        None. Results are written to disk and summarised on stdout. If
        `raw_root` does not exist, a message is printed and nothing is written.
    """
    print(f"[INFO] Scanning {raw_root} ...")

    valid_sequences = []

    if not os.path.exists(raw_root):
        print("Raw data not found.")
        return

    # Scan directories
    for root, dirs, files in os.walk(raw_root):
        if "im1.png" in files and "im7.png" in files:
            rel_path = os.path.relpath(root, raw_root)
            valid_sequences.append(rel_path)

    total_found = len(valid_sequences)
    print(f"[INFO] Total sequences found: {total_found}")

    # os.walk yields directories in whatever order the filesystem lists them,
    # so the same seed would otherwise select a different subset on another
    # machine. Sorting by path components gives the shuffle a stable starting
    # order that does not depend on enumeration order or the path separator.
    valid_sequences.sort(key=lambda path: path.split(os.sep))

    # Global shuffle (fixed seed so the subset and splits are reproducible)
    random.seed(42)
    random.shuffle(valid_sequences)

    # Apply subsetting
    subset_size = int(total_found * sample_ratio)
    subset_sequences = valid_sequences[:subset_size]
    print("\nSUBSETTING APPLIED")
    print(f"[INFO] Keeping {subset_size} sequences ({sample_ratio*100}%)")

    # test_ratio is never used for slicing, so make a mismatch visible instead
    # of silently producing a test split of a different size than requested.
    ratio_sum = train_ratio + val_ratio + test_ratio
    if abs(ratio_sum - 1.0) > 1e-9:
        print(f"[WARN] Split ratios sum to {ratio_sum:g}, not 1; "
              "the test split receives the remainder after train/val.")

    # Compute split indices
    n = len(subset_sequences)
    train_end = int(n * train_ratio)
    val_end = train_end + int(n * val_ratio)

    # Split (test takes the remainder, so rounding never drops a sequence)
    train_seqs = subset_sequences[:train_end]
    val_seqs = subset_sequences[train_end:val_end]
    test_seqs = subset_sequences[val_end:]

    # Save lists
    os.makedirs(processed_root, exist_ok=True)

    train_path = os.path.join(processed_root, "train_split.json")
    val_path = os.path.join(processed_root, "val_split.json")
    test_path = os.path.join(processed_root, "test_split.json")

    # Explicit encoding so the files do not depend on the platform default.
    for split_path, split_seqs in ((train_path, train_seqs),
                                   (val_path, val_seqs),
                                   (test_path, test_seqs)):
        with open(split_path, 'w', encoding="utf-8") as f:
            json.dump(split_seqs, f)

    print("\nSPLITTING COMPLETE")
    print(f"[INFO] Train: {len(train_seqs)} items (Saved to {train_path})")
    print(f"[INFO] Val:   {len(val_seqs)} items (Saved to {val_path})")
    print(f"[INFO] Test:  {len(test_seqs)} items (Saved to {test_path})")

In [3]:
# Keep 1% of the raw sequences and write the split index files
# into the processed-data directory.
preprocess_data(
    raw_root="../data/raw/vimeo/sequences",
    processed_root="../data/processed",
    sample_ratio=0.01,
)

[INFO] Scanning ../data/raw/vimeo/sequences ...
[INFO] Total sequences found: 91701

SUBSETTING APPLIED
[INFO] Keeping 917 sequences (1.0%)

SPLITTING COMPLETE
[INFO] Train: 733 items (Saved to ../data/processed\train_split.json)
[INFO] Val:   91 items (Saved to ../data/processed\val_split.json)
[INFO] Test:  93 items (Saved to ../data/processed\test_split.json)
