# Multi-Process Video Preprocessing

This notebook demonstrates how to run the optimized multi-process preprocessing script with optimal settings for your hardware configuration.

## Hardware Configuration:
- **GPU**: RTX 3060, 12 GB VRAM
- **CPU**: Xeon E5-2683 v4 (16 cores / 32 threads)
- **RAM**: 32 GB
- **Disk**: NVMe (~1.3 GB/s)

## Expected Performance:
- **30-50x speedup** compared to sequential processing
- **Processing time**: Days → 2-4 hours for ~2000 videos
- **GPU utilization**: Batched InceptionV3 keeps GPU busy
- **CPU utilization**: MediaPipe distributed across cores


## 1. Setup and Configuration

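**Note**: This notebook suppresses verbose logging from MediaPipe and TensorFlow to keep the output clean, so you will see only the essential progress information. The preprocessing subprocess is also run with its stderr discarded, which means error messages written to stderr will not appear either; stop discarding stderr in the run cell when you need to debug a failed run.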


In [None]:
import os
import sys
import subprocess
import time
from pathlib import Path
import pandas as pd
import warnings

# Quiet the native logging of TensorFlow and MediaPipe via environment variables
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '3',  # hide TensorFlow INFO, WARNING and ERROR logs
    'MP_VERBOSE': '0',  # turn off MediaPipe verbose output
    'GLOG_minloglevel': '3',  # raise the Google logging threshold (used by MediaPipe)
})

# Silence the noisiest Python warning categories
for _category in (UserWarning, FutureWarning, DeprecationWarning):
    warnings.filterwarnings('ignore', category=_category)

# The project root is the parent of the working directory; put it first on
# sys.path so that it takes precedence when resolving imports
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

print(f"Project root: {project_root}")
print(f"Current working directory: {os.getcwd()}")
print("✅ Verbose logging suppressed for cleaner output")


## 2. Configuration Parameters

Configure the preprocessing parameters for optimal performance on your hardware.


In [None]:
# -----------------------------------------------------------------------------
# CONFIGURATION - edit the paths and parameters below to suit your setup
# -----------------------------------------------------------------------------

# --- Input and output paths ---
# Folder holding the input videos
VIDEO_DIR = "../data/raw"
# Folder that receives the processed output
OUTPUT_DIR = "../data/processed"
# Location of the labels CSV
LABELS_CSV = "../data/processed/labels.csv"

# --- Performance settings (tuned for the hardware listed at the top) ---
# How many parallel workers to run (chosen for a 16-core CPU)
WORKERS = 10
# Batch size used for InceptionV3 inference on the GPU
BATCH_SIZE = 64
# Frames per second to target (recommended value for speed)
TARGET_FPS = 15

# --- Processing options ---
# Extract MediaPipe keypoints
WRITE_KEYPOINTS = True
# Extract InceptionV3 features
WRITE_IV3_FEATURES = True
# Skip parquet output for faster I/O (recommended for batch processing)
DISABLE_PARQUET = True

# --- Labeling options ---
# Gloss ID used for labeling
GLOSS_ID = 1
# Category ID used for labeling
CAT_ID = 1
# Append to an existing labels CSV
APPEND_LABELS = False

# --- Occlusion detection ---
# Turn occlusion detection on
ENABLE_OCCLUSION = True
# Threshold on the visible fraction of a frame
OCC_VIS_THRESH = 0.6
# A clip is occluded when the proportion is >= this value
OCC_FRAME_PROP = 0.4
# A clip is occluded when the run length is >= this value
OCC_MIN_RUN = 15

# --- MediaPipe settings ---
# Output image size for keypoint extraction
OUT_SIZE = 256
# Confidence threshold for keypoints
CONF_THRESH = 0.5
# Largest gap that is interpolated
MAX_GAP = 5

# Echo the most important settings so the run is self-documenting
print("Configuration loaded successfully!")
print(f"Video directory: {VIDEO_DIR}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Workers: {WORKERS}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Target FPS: {TARGET_FPS}")


## 3. Validate Input Directory

Check that the video directory exists and count the number of video files.


In [None]:
def count_videos(directory):
    """Recursively collect the video files found under ``directory``.

    A file counts as a video when its extension, compared
    case-insensitively, is one of .mp4, .mov, .avi or .mkv.

    Returns:
        A ``(count, paths)`` tuple. ``paths`` lists every match joined with
        the folder it was found in, in ``os.walk`` order, and ``count`` is
        the number of matches.
    """
    recognised = ('.mp4', '.mov', '.avi', '.mkv')
    matches = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if Path(name).suffix.lower() in recognised
    ]
    return len(matches), matches

# Report on the input folder: list what was found, or explain that it is missing
if os.path.exists(VIDEO_DIR):
    video_count, video_files = count_videos(VIDEO_DIR)
    print(f"✅ Video directory found: {VIDEO_DIR}")
    print(f"📹 Found {video_count} video files")

    if video_count > 0:
        # Preview at most five file names, numbered from 1
        print("\nFirst 5 video files:")
        for position, video_path in enumerate(video_files[:5], start=1):
            print(f"  {position}. {os.path.basename(video_path)}")
        if video_count > 5:
            print(f"  ... and {video_count - 5} more")
else:
    print(f"❌ ERROR: Video directory '{VIDEO_DIR}' does not exist!")
    print("Please update the VIDEO_DIR variable with the correct path.")


## 4. Create Output Directory

Ensure the output directory exists and is ready for processing.


In [None]:
# Create the output directory (no error if it already exists)
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"✅ Output directory ready: {OUTPUT_DIR}")

# Check available disk space (approximate)
import shutil
free = shutil.disk_usage(OUTPUT_DIR).free
free_gb = free // (1024**3)  # whole gigabytes, used for display only
print(f"💾 Available disk space: {free_gb} GB")

# Estimate required space (rough calculation)
# The lookup tolerates video_count being undefined, e.g. when the validation
# cell was skipped or the video directory was missing.
if 'video_count' in locals() and video_count > 0:
    # Rough estimate: ~50MB per video for keypoints + IV3 features
    estimated_space_mb = video_count * 50
    estimated_space_gb = estimated_space_mb / 1024
    print(f"📊 Estimated space needed: {estimated_space_gb:.1f} GB")

    # Compare against the exact free space: the floored whole-GB figure can
    # understate it by almost 1 GB and trigger a false warning.
    if free / (1024**3) < estimated_space_gb:
        print("⚠️  WARNING: May not have enough disk space!")
    else:
        print("✅ Sufficient disk space available")


## 5. Build Command and Validate Parameters

Build the multi-process preprocessing command with all required parameters.


In [None]:
def build_preprocessing_command():
    """Assemble the argument list that launches ``multi_preprocess.py``.

    The list starts with the current Python interpreter, the script path and
    the input/output directories, followed by options derived from the
    notebook-level configuration constants.

    Returns:
        list: The command in the form expected by ``subprocess.run``.
    """
    argv = [
        sys.executable, "../preprocessing/multi_preprocess.py",
        VIDEO_DIR,
        OUTPUT_DIR,
    ]

    # Options that are always passed together with a value
    always_valued = (
        ("--workers", WORKERS),
        ("--batch-size", BATCH_SIZE),
        ("--target-fps", TARGET_FPS),
        ("--out-size", OUT_SIZE),
        ("--conf-thresh", CONF_THRESH),
        ("--max-gap", MAX_GAP),
    )
    for option, value in always_valued:
        argv += [option, str(value)]

    # Processing switches: present only when the matching constant is truthy
    processing_switches = (
        (WRITE_KEYPOINTS, "--write-keypoints"),
        (WRITE_IV3_FEATURES, "--write-iv3-features"),
        (DISABLE_PARQUET, "--disable-parquet"),
    )
    argv += [switch for enabled, switch in processing_switches if enabled]

    # Labeling options: a value of None leaves the option out entirely
    if GLOSS_ID is not None:
        argv += ["--gloss-id", str(GLOSS_ID)]
    if CAT_ID is not None:
        argv += ["--cat-id", str(CAT_ID)]
    if LABELS_CSV is not None:
        argv += ["--labels-csv", LABELS_CSV]
    if APPEND_LABELS:
        argv.append("--append")

    # Occlusion detection: the thresholds are passed only when it is enabled
    if ENABLE_OCCLUSION:
        argv.append("--occ-enable")
        occlusion_params = (
            ("--occ-vis-thresh", OCC_VIS_THRESH),
            ("--occ-frame-prop", OCC_FRAME_PROP),
            ("--occ-min-run", OCC_MIN_RUN),
        )
        for option, value in occlusion_params:
            argv += [option, str(value)]

    return argv

# Assemble the command once and keep it in the notebook-level `command` variable
command = build_preprocessing_command()

# Show the command with one argument per line
print("🔧 Multi-process preprocessing command:")
print("\n".join(f"  {part}" for part in command))

# Recap the settings that went into the command
print("\n📋 Parameter summary:")
print(f"  • Workers: {WORKERS}")
print(f"  • Batch size: {BATCH_SIZE}")
print(f"  • Target FPS: {TARGET_FPS}")
print(f"  • Write keypoints: {WRITE_KEYPOINTS}")
print(f"  • Write IV3 features: {WRITE_IV3_FEATURES}")
print(f"  • Disable parquet: {DISABLE_PARQUET}")
print(f"  • Enable occlusion: {ENABLE_OCCLUSION}")
print(f"  • Gloss ID: {GLOSS_ID}")
print(f"  • Category ID: {CAT_ID}")


## 6. Run Multi-Process Preprocessing

Execute the preprocessing with progress monitoring and error handling.


In [None]:
def run_preprocessing():
    """Run the multi-process preprocessing command and report how long it took.

    Launches the notebook-level ``command`` list with ``subprocess.run`` and
    waits for it to finish. The child's stdout is not captured, while its
    stderr is sent to ``subprocess.DEVNULL``, so anything the script writes
    to stderr (including error details) is discarded.

    Returns:
        bool: True when the command exits with status 0; False when it exits
        with a non-zero status or the run is interrupted from the keyboard.
    """
    print("🚀 Starting multi-process preprocessing...")
    print(f"⏰ Start time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
    print("\n" + "="*60)

    start_time = time.time()

    try:
        # check=True turns a non-zero exit status into CalledProcessError
        subprocess.run(
            command,
            check=True,
            capture_output=False,  # Show output in real-time
            text=True,
            stderr=subprocess.DEVNULL  # Suppress stderr warnings
        )

        total_time = time.time() - start_time

        print("\n" + "="*60)
        print("✅ Preprocessing completed successfully!")
        print(f"⏰ End time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"⏱️  Total time: {total_time:.2f} seconds ({total_time/60:.2f} minutes)")

        # video_count is a notebook-level variable, so it has to be looked up
        # in globals(): inside a function, locals() holds only this function's
        # own variables and would never contain it. It may also be undefined
        # if the validation cell did not run, hence the default of 0.
        processed = globals().get('video_count', 0)
        if processed > 0 and total_time > 0:
            print(f"📊 Average time per video: {total_time/processed:.2f} seconds")
            print(f"📈 Videos per hour: {processed * 3600 / total_time:.1f}")

        return True

    except subprocess.CalledProcessError as e:
        total_time = time.time() - start_time

        print("\n" + "="*60)
        print(f"❌ Preprocessing failed with error code: {e.returncode}")
        print(f"⏱️  Time before failure: {total_time:.2f} seconds")
        return False

    except KeyboardInterrupt:
        total_time = time.time() - start_time

        print("\n" + "="*60)
        print("⚠️  Preprocessing interrupted by user")
        print(f"⏱️  Time before interruption: {total_time:.2f} seconds")
        return False

# Launch the preprocessing run. `success` holds the boolean returned by
# run_preprocessing() and is read by the verification and summary cells.
success = run_preprocessing()


## 7. Verify Results

Check the output directory and verify that files were processed correctly.


In [None]:
def verify_results():
    """Inspect the output directory and report what preprocessing produced.

    Counts the ``*.npz`` and ``*.parquet`` files directly inside
    ``OUTPUT_DIR``, summarises the labels CSV when it exists, and lists up
    to five NPZ files with their sizes.

    Returns:
        bool: False when no NPZ file is present, True otherwise.
    """
    print("🔍 Verifying preprocessing results...")

    # Gather the generated files (top level of the output directory only)
    out_dir = Path(OUTPUT_DIR)
    npz_files = list(out_dir.glob("*.npz"))
    parquet_files = list(out_dir.glob("*.parquet"))

    print(f"\n📁 Output directory: {OUTPUT_DIR}")
    print(f"📄 NPZ files: {len(npz_files)}")
    print(f"📄 Parquet files: {len(parquet_files)}")

    # Summarise the labels CSV; a read failure is reported, not raised
    if LABELS_CSV and os.path.exists(LABELS_CSV):
        try:
            labels_df = pd.read_csv(LABELS_CSV)
            print(f"📋 Labels CSV: {len(labels_df)} entries")
            print(f"   Columns: {list(labels_df.columns)}")

            if 'occluded' in labels_df.columns:
                occluded_count = labels_df['occluded'].sum()
                print(f"   Occluded videos: {occluded_count}")
        except Exception as e:
            print(f"⚠️  Could not read labels CSV: {e}")

    # Without any NPZ output there is nothing to list
    if not npz_files:
        print("❌ No NPZ files found! Check for errors above.")
        return False

    # Preview the first few outputs with their size in MB
    print(f"\n📄 Sample processed files:")
    for position, npz_path in enumerate(npz_files[:5], start=1):
        size_mb = npz_path.stat().st_size / (1024 * 1024)
        print(f"   {position}. {npz_path.name} ({size_mb:.1f} MB)")
    if len(npz_files) > 5:
        print(f"   ... and {len(npz_files) - 5} more")

    print("\n✅ Verification complete!")
    return True

# Verify the output only when the preprocessing run reported success
if not success:
    print("❌ Skipping verification due to preprocessing failure.")
else:
    verify_results()


## 8. Performance Summary

Display the final status of the run, the configuration that was used, and suggested next steps (or troubleshooting hints if the run failed).


In [None]:
def print_performance_summary():
    """Print the closing report for the run.

    Reads the notebook-level ``success`` flag. On failure it prints a
    troubleshooting checklist; on success it prints the configuration that
    was used, suggested next steps and, when the labels CSV exists, where
    to find it.
    """
    banner = "="*60
    print("\n" + banner)
    print("📊 PERFORMANCE SUMMARY")
    print(banner)

    if not success:
        print("❌ Status: FAILED")
        print("\n🔧 Troubleshooting:")
        print("1. Check error messages above")
        print("2. Verify video directory path")
        print("3. Ensure sufficient disk space")
        print("4. Check GPU memory usage")
        print("5. Try reducing batch size or workers")
    else:
        print("✅ Status: SUCCESS")
        print(f"📁 Output directory: {OUTPUT_DIR}")
        print(f"⚙️  Configuration used:")
        print(f"   • Workers: {WORKERS}")
        print(f"   • Batch size: {BATCH_SIZE}")
        print(f"   • Target FPS: {TARGET_FPS}")
        print(f"   • Keypoints: {WRITE_KEYPOINTS}")
        print(f"   • IV3 features: {WRITE_IV3_FEATURES}")
        print(f"   • Parquet disabled: {DISABLE_PARQUET}")

        print("\n🚀 Next steps:")
        print("1. Use data_split.py to organize files into train/val splits")
        print("2. Start training your models")
        print("3. Use the Streamlit app to visualize results")

        # Mention the labels CSV only if it is actually on disk
        if LABELS_CSV and os.path.exists(LABELS_CSV):
            print(f"\n📋 Labels CSV created: {LABELS_CSV}")
            print("   Use this for data splitting and training")

    print("\n" + banner)

# Print the final status report for this run
print_performance_summary()
