# In [None]:
import os
import subprocess
import cv2
from collections import defaultdict

# Dataset locations: the raw and shortened folders are scanned for input
# videos, the augmented folder receives the generated files.
base_path = "/home/haggenmueller/asl_detection/machine_learning/datasets"
raw_videos_path = base_path + "/wlasl/raw_videos"
shortened_videos_path = base_path + "/own_dataset/shortened_videos"
augmented_videos_path = base_path + "/own_dataset/videos_augmented"

# Create the output folder up front; an already existing folder is fine.
os.makedirs(augmented_videos_path, exist_ok=True)

# Augmentations: maps a short name (used as the output file suffix) to an
# FFmpeg video-filter option written as "<option flag> <filter expression>".
# Dict order is the order in which augmentations are tried for each video.
augmentations = {
    "flip": "-vf hflip",  # Horizontal flip
    "bright": "-vf eq=brightness=0.2",  # Increased brightness
    "contrast": "-vf eq=contrast=1.5",  # Increased contrast
    # Timestamps scaled to 75% of their original value -> about 1.33x speed
    "speedup": "-vf setpts=0.75*PTS",
    # Timestamps stretched to 125% so playback really is slower. The previous
    # `fps=20` filter only resampled the frame rate (dropping/duplicating
    # frames) and left the clip duration unchanged.
    "slowdown": "-vf setpts=1.25*PTS",
    "noise": "-vf noise=c0s=5:allf=t",  # Add noise
}

# Running tally of videos per label, and a lookup from file name to full path
label_counts = defaultdict(int)
video_sources = {}

def process_videos(folder):
    """Register every well-formed video file found directly in *folder*.

    A file is accepted when its name ends in ".mp4" and has the shape
    <video_id>_<label>_<frames>.mp4, where <frames> consists only of digits.
    Accepted files are stored in ``video_sources`` (file name -> full path)
    and increment ``label_counts`` for their label. Everything else is
    ignored silently.
    """
    for entry in os.listdir(folder):
        if not entry.endswith(".mp4"):
            continue

        # Split from the right so underscores inside <video_id> are preserved
        pieces = entry.rsplit("_", 2)
        if len(pieces) != 3:
            continue

        frame_field = pieces[2].replace(".mp4", "")
        if not frame_field.isdigit():
            continue

        label = pieces[1]
        video_sources[entry] = os.path.join(folder, entry)
        label_counts[label] += 1

# Scan both source folders so each label's tally covers all available videos
for source_dir in (shortened_videos_path, raw_videos_path):
    process_videos(source_dir)

# Desired number of videos for every label
target_videos_per_label = 36

# Report the shortfall per label and accumulate the overall total
total_missing_videos = 0
for label in label_counts:
    count = label_counts[label]
    missing_videos = target_videos_per_label - count
    if missing_videos < 0:
        # Labels that already exceed the target need nothing
        missing_videos = 0
    total_missing_videos += missing_videos
    print(f"{label}: {count} videos, needs {missing_videos} more")

print(f"🔢 Total videos to be created: {total_missing_videos}")

# Generate augmented videos
def _build_ffmpeg_command(input_video, filter_cmd, output_video, max_frames=90):
    """Return the FFmpeg argument list that writes one augmented video.

    Args:
        input_video: Path of the source video.
        filter_cmd: Augmentation in the form "<option flag> <filter expression>",
            e.g. "-vf hflip". Only the filter expression is used.
        output_video: Path of the file to create.
        max_frames: Number of leading frames to keep.

    Returns:
        A list of command-line arguments suitable for ``subprocess.run``.
    """
    # FFmpeg honours only the LAST -vf/-filter:v option per output stream, so
    # the frame limit and the augmentation must share one filter chain;
    # passing them as two options silently discards the augmentation.
    filter_parts = [tok for tok in filter_cmd.split() if not tok.startswith("-")]
    filter_chain = ",".join([f"select='lt(n,{max_frames})'"] + filter_parts)
    return [
        "ffmpeg", "-y",
        "-i", input_video,
        "-vf", filter_chain,  # First max_frames frames, then the augmentation
        "-vsync", "vfr",
        "-c:v", "libx264", "-preset", "fast", "-crf", "18", "-pix_fmt", "yuv420p", output_video,
    ]


# Augmented files left by an earlier run already count toward the target.
# Without this, every re-run would skip them and create further variants,
# overshooting the target.
for video_file in video_sources:
    parts = video_file.rsplit("_", 2)
    if len(parts) < 3:
        continue
    label = parts[1]
    base_name = os.path.splitext(video_file)[0]
    for aug_name in augmentations:
        existing_output = os.path.join(augmented_videos_path, f"{base_name}_{aug_name}.mp4")
        if os.path.exists(existing_output):
            label_counts[label] += 1

for video_file, input_video in video_sources.items():
    parts = video_file.rsplit("_", 2)
    if len(parts) < 3:
        continue

    label = parts[1]
    base_name = os.path.splitext(video_file)[0]

    # Calculate how many additional videos are needed
    needed_augmentations = max(0, target_videos_per_label - label_counts[label])

    if needed_augmentations == 0:
        print(f"⏩ Skipping {video_file}, already {target_videos_per_label} videos for label {label}")
        continue

    applied_augmentations = 0
    for aug_name, filter_cmd in augmentations.items():
        if applied_augmentations >= needed_augmentations:
            break

        output_video = os.path.join(augmented_videos_path, f"{base_name}_{aug_name}.mp4")

        if os.path.exists(output_video):
            # Already included in label_counts by the pre-pass above
            print(f"⏩ {output_video} already exists. Skipping...")
            continue

        ffmpeg_command = _build_ffmpeg_command(input_video, filter_cmd, output_video)

        print(f"📌 Creating {output_video} with FFmpeg ({aug_name})")
        process = subprocess.run(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        if process.returncode == 0:
            print(f"✅ Augmented video saved: {output_video}")
            label_counts[label] += 1
            applied_augmentations += 1
        else:
            print(f"❌ Error processing {output_video} with FFmpeg")
            # FFmpeg output may contain bytes that are not valid UTF-8
            print(process.stderr.decode(errors="replace"))
            # Remove a partial file so a later run does not mistake it for a
            # finished augmentation
            if os.path.exists(output_video):
                os.remove(output_video)

print("✅ Done! Missing augmented videos have been created.")