In [None]:
import os
import glob
import random
import subprocess
from collections import defaultdict

# Source and destination directories (absolute paths on the author's machine).
# VIDEO_DIR holds the original videos; AUGMENTED_DIR receives the augmented copies.
VIDEO_DIR = "/home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/full_dataset"
AUGMENTED_DIR = "/home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/augmented_dataset"

# Create the output directory up front; exist_ok=True makes re-running a no-op
# when the directory already exists.
os.makedirs(AUGMENTED_DIR, exist_ok=True)

def get_videos(directory):
    """Find all MP4 videos in a directory and group them by label.

    Filenames are expected to look like ``<video_id>_<label>_<frames>.mp4``.
    The name is split from the right, so ``video_id`` may itself contain
    underscores. Files that do not split into three parts, or whose frame
    count is not an integer, are skipped.

    Args:
        directory: Directory to scan (non-recursively) for ``*.mp4`` files.

    Returns:
        A ``defaultdict(list)`` mapping each label to a list of
        ``(file_path, video_id, frames)`` tuples, ordered by file path.
    """
    videos = defaultdict(list)
    # Sort so the result does not depend on filesystem listing order.
    for file in sorted(glob.glob(os.path.join(directory, "*.mp4"))):
        parts = os.path.basename(file).rsplit("_", 2)
        if len(parts) != 3:
            continue  # Not in <video_id>_<label>_<frames>.mp4 form
        video_id, label, frames_ext = parts
        try:
            # frames_ext looks like "30.mp4"; keep only the part before the dot.
            frames = int(frames_ext.split(".")[0])
        except ValueError:
            continue  # Skip files whose frame count is not an integer
        videos[label].append((file, video_id, frames))
    return videos

def apply_ffmpeg_augmentations(input_file, output_file, augmentation):
    """Apply a single named FFmpeg video filter to a video file.

    Args:
        input_file: Path of the video to read.
        output_file: Path to write the result to; overwritten if it exists
            (FFmpeg is invoked with ``-y``).
        augmentation: Name of the transformation; one of ``"flip"``,
            ``"rotate"``, ``"fast"``, ``"slow"``, ``"brightness"``,
            ``"contrast"``, ``"blur"`` or ``"noise"``.

    Returns:
        True if FFmpeg exited with status 0; False if the augmentation name
        is unknown or FFmpeg reported a failure.

    Raises:
        OSError: If the ``ffmpeg`` executable cannot be started.
    """
    # Video-filter arguments for each supported augmentation.
    commands = {
        "flip": ["-vf", "hflip"],
        "rotate": ["-vf", "rotate=PI/12"],
        "fast": ["-filter:v", "setpts=0.5*PTS"],
        "slow": ["-filter:v", "setpts=2.0*PTS"],
        "brightness": ["-vf", "eq=brightness=0.1"],
        "contrast": ["-vf", "eq=contrast=1.5"],
        "blur": ["-vf", "gblur=sigma=2"],
        "noise": ["-vf", "noise=alls=10:allf=t"],
    }

    if augmentation not in commands:
        print(f"Warning: unknown augmentation '{augmentation}', skipping {input_file}")
        return False

    cmd = [
        "ffmpeg", "-hide_banner", "-loglevel", "error",
        "-i", input_file,
        *commands[augmentation],
        "-y", output_file,
    ]
    # stdin is detached so FFmpeg never waits for interactive input; stderr is
    # captured (errors only, see -loglevel) so a failure can be reported
    # instead of being silently discarded.
    result = subprocess.run(
        cmd,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        text=True,
        errors="replace",
    )
    if result.returncode != 0:
        detail = result.stderr.strip().splitlines()
        reason = detail[-1] if detail else f"exit code {result.returncode}"
        print(f"Warning: FFmpeg failed for {input_file} ({augmentation}): {reason}")
        return False
    return True

def augment_videos(source_directory, destination_directory, target_count=40):
    """Generate augmented videos until each label has ``target_count`` videos.

    For every label found in ``source_directory`` the originals plus the
    already-present files matching ``*_<label>_*.mp4`` in
    ``destination_directory`` are counted. While the label is short of
    ``target_count``, randomly chosen (video, augmentation) pairs are rendered
    to ``<video_id>_<label>_<frames>_<augmentation>.mp4``.

    The set of possible output names is finite (videos x augmentations), so
    when it is exhausted before the target is reached a warning is printed
    and processing moves on to the next label.

    Args:
        source_directory: Directory containing the original videos.
        destination_directory: Directory receiving the augmented videos.
        target_count: Desired number of videos per label (originals plus
            augmented). Defaults to 40.
    """
    videos = get_videos(source_directory)
    augmentations = ["flip", "rotate", "fast", "slow", "brightness", "contrast", "blur", "noise"]

    for label, files in videos.items():
        existing_videos = set(glob.glob(os.path.join(destination_directory, f"*_{label}_*.mp4")))
        count = len(files) + len(existing_videos)

        if count >= target_count:
            continue  # Already enough videos

        # Every (video, augmentation) pair is a candidate; shuffling keeps the
        # selection random while guaranteeing the loop below terminates.
        candidates = [(entry, augmentation) for entry in files for augmentation in augmentations]
        random.shuffle(candidates)

        for entry, augmentation in candidates:
            if count >= target_count:
                break

            # Only the first three fields are needed; slicing tolerates
            # entries that carry extra trailing fields.
            file, video_id, frames = entry[:3]
            # NOTE(review): this caps only the frame count written into the
            # filename; the video itself is not trimmed here - confirm intent.
            new_frames = min(frames, 102)
            output_file = os.path.join(
                destination_directory, f"{video_id}_{label}_{new_frames}_{augmentation}.mp4"
            )

            if output_file in existing_videos:
                continue  # Already present, or another source maps to the same name

            apply_ffmpeg_augmentations(file, output_file, augmentation)

            # Verify on disk rather than trusting the call: a failed FFmpeg
            # run must not be counted as a created video.
            if not (os.path.isfile(output_file) and os.path.getsize(output_file) > 0):
                print(f"Warning: augmentation produced no output: {output_file}")
                continue

            existing_videos.add(output_file)
            count += 1
            print(f"Augmented video saved: {output_file}")

        if count < target_count:
            print(
                f"Warning: label {label} has only {count} of {target_count} videos; "
                "no unused augmentations left."
            )


# Run the augmentation: read the originals from VIDEO_DIR and write the
# augmented copies to AUGMENTED_DIR. This executes as soon as the cell runs.
augment_videos(VIDEO_DIR, AUGMENTED_DIR)

In [None]:
import os
import glob
import random
from collections import defaultdict

def get_videos(directory):
    """Find all MP4 videos in a directory and group them by label.

    Two filename layouts are recognised:

    * augmented: ``<video_id>_<label>_<frames>_<changes>.mp4``
    * original:  ``<video_id>_<label>_<frames>.mp4``

    The augmented layout is tried first. If the would-be frame count is not
    an integer, the name is re-read as an original, which lets originals
    whose ``video_id`` contains underscores be parsed instead of dropped.
    Names matching neither layout are skipped.

    Args:
        directory: Directory to scan (non-recursively) for ``*.mp4`` files.

    Returns:
        A ``defaultdict(list)`` mapping each label to a list of
        ``(file_path, video_id, frames, changes)`` tuples ordered by file
        path. ``changes`` is ``None`` for originals and the trailing filename
        part (including the ``.mp4`` extension) for augmented videos.
    """
    videos = defaultdict(list)
    # Sort so the result does not depend on filesystem listing order.
    for file in sorted(glob.glob(os.path.join(directory, "*.mp4"))):
        filename = os.path.basename(file)

        # Augmented layout: the third of four parts must be the frame count.
        parts = filename.rsplit("_", 3)
        frames = _parse_frame_count(parts[2]) if len(parts) == 4 else None
        if frames is not None:
            video_id, label, _, changes = parts
        else:
            # Original layout: the last of three parts is "<frames>.mp4".
            parts = filename.rsplit("_", 2)
            if len(parts) != 3:
                continue  # Skip malformed filenames
            video_id, label, frames_ext = parts
            frames = _parse_frame_count(frames_ext)
            if frames is None:
                continue  # Skip files where frames cannot be converted to int
            changes = None

        videos[label].append((file, video_id, frames, changes))

    return videos


def _parse_frame_count(text):
    """Return the integer before the first "." in *text*, or None if it is not an integer."""
    try:
        return int(text.split(".")[0])
    except ValueError:
        return None

def remove_excess_videos(source_directory, destination_directory, target_count=40):
    """Delete surplus augmented videos so each label has ``target_count`` videos.

    For every label found in ``source_directory`` the originals and the
    videos in ``destination_directory`` are counted. When the total exceeds
    ``target_count``, the surplus is deleted from the destination directory,
    taking augmented files in ascending path order. Original videos are never
    deleted. If a label does not have enough augmented files to cover the
    surplus, a warning is printed and nothing is removed for that label.

    Args:
        source_directory: Directory containing the original videos.
        destination_directory: Directory containing the augmented videos.
        target_count: Desired number of videos per label (originals plus
            augmented). Defaults to 40.
    """
    originals = get_videos(source_directory)
    augmented_videos = get_videos(destination_directory)

    for label, original_files in originals.items():
        label_augmented = augmented_videos.get(label, [])
        excess_count = len(original_files) + len(label_augmented) - target_count
        if excess_count <= 0:
            continue  # Nothing to trim for this label

        # Only files carrying a <changes> part are augmented copies and
        # therefore eligible for deletion.
        augmented_files = [file for file, _, _, changes in label_augmented if changes is not None]

        if len(augmented_files) < excess_count:
            print(f"Warning: Not enough augmented videos to remove for label {label}!")
            continue

        # Sorting makes the choice of which files get deleted deterministic.
        for file in sorted(augmented_files)[:excess_count]:
            os.remove(file)
            print(f"Removed excess augmented video: {file}")

# Trim surplus augmented videos. VIDEO_DIR and AUGMENTED_DIR are not defined
# in this cell; they come from the first cell, which must have been run.
remove_excess_videos(VIDEO_DIR, AUGMENTED_DIR)