# Get all frames per video

In [None]:
import os
import cv2
import numpy as np

# Define paths
base_path = "/home/haggenmueller/asl_detection/machine_learning/datasets"
raw_videos_path = f"{base_path}/wlasl/raw_videos"

# Function to count frames
def count_frames(video_path):
    cap = cv2.VideoCapture(video_path)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.release()
    return frame_count

# Store results
frame_counts = []

# Process the folder
for filename in os.listdir(raw_videos_path):
    filepath = os.path.join(raw_videos_path, filename)
    if filename.endswith(".mp4"):  # If videos are in MP4 format
        parts = filename.rsplit("_", 2)  # Expected format: <video_id>_<label>_<frames>.mp4
        if len(parts) == 3 and parts[2].replace(".mp4", "").isdigit():
            video_id, label, stored_frames = parts[0], parts[1], int(parts[2].replace(".mp4", ""))
            actual_frames = count_frames(filepath)
            
            # Rename file if the stored frame count is incorrect
            new_filename = f"{video_id}_{label}_{actual_frames}.mp4"
            new_filepath = os.path.join(raw_videos_path, new_filename)
            
            if filename != new_filename:
                os.rename(filepath, new_filepath)
                print(f"Renamed: {filename} -> {new_filename}")
            
            frame_counts.append((f"{video_id}_{label}", actual_frames))

# Sort by frame count (descending)
frame_counts.sort(key=lambda x: x[1], reverse=True)

# Output results
for video, frames in frame_counts:
    print(f"{video} : {frames}")

63232_who : 155
34738_man : 155
00629_accident : 149
55361_study : 148
01383_africa : 146
59204_trade : 143
53872_spin : 139
53696_speech : 138
62923_wet : 135
01552_ago : 134
25375_grammar : 134
61983_vomit : 134
05088_bar : 132
45261_pull : 131
13636_cousin : 131
27920_hope : 129
25241_government : 129
04509_baby : 128
23946_future : 128
03003_apple : 128
24720_glasses : 127
02583_animal : 127
10895_city : 127
63788_work : 127
60350_ugly : 127
00965_add : 124
42243_perspective : 123
41446_past : 123
03060_appointment : 123
01388_africa : 123
19406_environment : 123
62499_water : 122
19408_environment : 121
63280_why : 121
32325_later : 121
63664_woman : 121
03274_argue : 120
33477_list : 120
24721_glasses : 120
09182_careful : 119
05233_basketball : 119
50037_secretary : 119
57943_thin : 119
44320_presentation : 119
69439_pull : 119
02106_alone : 118
60948_upset : 118
04190_australia : 117
05472_beard : 117
58649_toast : 116
33472_list : 116
32160_language : 116
09179_careful : 116
5

# Check video availabiliy

In [None]:
import os
import cv2
import numpy as np

# Define paths
base_path = "/home/haggenmueller/asl_detection/machine_learning/datasets"
raw_videos_path = f"{base_path}/wlasl/raw_videos"

# List for unreadable videos
bad_videos = []

# Process the folder
for filename in os.listdir(raw_videos_path):
    filepath = os.path.join(raw_videos_path, filename)
    if filename.endswith(".mp4"):  # If videos are in MP4 format
        
        # Check if the file is too small
        if os.path.getsize(filepath) < 1000:  # Files under 1 KB are likely corrupted
            bad_videos.append(filename)
            continue
        
        cap = cv2.VideoCapture(filepath)
        if not cap.isOpened():
            bad_videos.append(filename)
        else:
            valid = False
            for _ in range(5):  # Try reading multiple frames
                ret, frame = cap.read()
                if ret and frame is not None:
                    valid = True
                    break
            if not valid:
                bad_videos.append(filename)
        cap.release()

# Output unreadable videos
if bad_videos:
    print("\nVideos that cannot be opened:")
    for bad_video in bad_videos:
        print(bad_video)
else:
    print("All videos are readable.")

Alle Videos sind lesbar.


# Count labels and videos per label

In [None]:
import os
from collections import defaultdict

# Define paths
base_path = "/home/haggenmueller/asl_detection/machine_learning/datasets"
raw_videos_path = f"{base_path}/wlasl/raw_videos"

# Dictionary to store the number of videos per label
label_counts = defaultdict(int)

# Process the folder
for filename in os.listdir(raw_videos_path):
    if filename.endswith(".mp4"):  # If videos are in MP4 format
        parts = filename.split("_")  # Expected format: <video_id>_<label>_...
        if len(parts) > 1:
            label = parts[1]  # Assumption: The label is in the second position
            label_counts[label] += 1

# Output the number of labels and videos per label
print(f"Number of labels: {len(label_counts)}")
print("\nNumber of videos per label:")
for label, count in sorted(label_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"{label}: {count}")

Anzahl der Labels: 168

Anzahl der Videos pro Label:
thin: 15
go: 14
help: 13
bowling: 13
before: 13
computer: 13
cool: 13
drink: 12
tall: 12
trade: 12
accident: 12
bed: 12
short: 11
candy: 11
pizza: 11
shirt: 11
dark: 11
change: 11
corn: 11
cold: 11
who: 11
thanksgiving: 11
man: 11
cousin: 11
call: 10
delay: 10
champion: 10
laugh: 10
later: 10
family: 10
deaf: 10
leave: 10
basketball: 10
thursday: 10
far: 10
dog: 10
yes: 10
woman: 10
apple: 10
last: 9
toast: 9
bad: 9
no: 9
check: 9
full: 9
cheat: 9
move: 9
bar: 9
score: 9
son: 9
play: 9
hot: 9
many: 9
study: 9
black: 9
language: 9
fat: 9
write: 9
give: 9
take: 9
what: 9
balance: 9
chat: 9
white: 9
graduate: 9
secretary: 9
why: 9
yesterday: 9
good: 9
daughter: 9
tell: 9
sandwich: 9
convince: 9
carrot: 9
fish: 9
ready: 9
letter: 9
wait: 9
year: 9
delicious: 9
mother: 9
brother: 9
bird: 9
work: 9
argue: 9
environment: 8
how: 8
future: 8
week: 8
pull: 8
same: 8
wow: 8
meet: 8
barely: 8
speech: 8
tiger: 8
spin: 8
past: 8
orange: 8
decide: 

# Get videos with more or 90 frames

In [None]:
import os

# Define paths
base_path = "/home/haggenmueller/asl_detection/machine_learning/datasets"
raw_videos_path = f"{base_path}/wlasl/raw_videos"

# List for videos with more than 90 frames
videos_above_90 = []

# Process the folder
for filename in os.listdir(raw_videos_path):
    if filename.endswith(".mp4"):  # If videos are in MP4 format
        parts = filename.rsplit("_", 1)  # Expected format: <video_id>_<label>_<frames>.mp4
        if len(parts) == 2 and parts[1].replace(".mp4", "").isdigit():
            frame_count = int(parts[1].replace(".mp4", ""))
            if frame_count > 90:
                videos_above_90.append(filename)

# Output videos with more than 90 frames
if videos_above_90:
    print(f"Number of videos with more than 90 frames: {len(videos_above_90)}")
    print("\nVideos with more than 90 frames:")
    for video in videos_above_90:
        print(video)
else:
    print("No videos with more than 90 frames found.")

Anzahl der Videos mit mehr als 90 Frames: 216

Videos mit mehr als 90 Frames:
19408_environment_121.mp4
08691_call_101.mp4
45261_pull_131.mp4
17713_drink_91.mp4
63948_wow_91.mp4
56838_tall_96.mp4
35509_meet_94.mp4
58649_toast_116.mp4
10187_check_95.mp4
42243_perspective_123.mp4
02106_alone_118.mp4
43904_postpone_91.mp4
51056_shirt_99.mp4
01554_ago_98.mp4
43166_play_92.mp4
01552_ago_134.mp4
55361_study_148.mp4
51704_silly_99.mp4
24722_glasses_94.mp4
49694_score_96.mp4
26835_headache_99.mp4
58655_toast_113.mp4
25621_great_91.mp4
24636_give_92.mp4
00624_accident_109.mp4
26837_headache_93.mp4
63285_why_96.mp4
53696_speech_138.mp4
32326_later_108.mp4
65129_balance_94.mp4
26971_hearing_101.mp4
15093_decorate_100.mp4
57276_tell_94.mp4
25372_grammar_105.mp4
49245_sandwich_98.mp4
03056_appointment_114.mp4
69259_carrot_94.mp4
28204_how_92.mp4
17087_dog_96.mp4
59204_trade_143.mp4
03057_appointment_96.mp4
37852_necklace_100.mp4
24720_glasses_127.mp4
41446_past_123.mp4
51058_shirt_95.mp4
63280_why_

# Get labels after deleting all videos with less than 90 frames

In [None]:
import os
from collections import defaultdict

# Define paths
base_path = "/home/haggenmueller/asl_detection/machine_learning/datasets"
raw_videos_path = f"{base_path}/wlasl/raw_videos"

# Dictionary to store the number of videos per label
label_counts = defaultdict(int)

# List of videos with more than 90 frames
videos_above_90 = set()
for filename in os.listdir(raw_videos_path):
    if filename.endswith(".mp4"):  # If videos are in MP4 format
        parts = filename.rsplit("_", 1)  # Expected format: <video_id>_<label>_<frames>.mp4
        if len(parts) == 2 and parts[1].replace(".mp4", "").isdigit():
            frame_count = int(parts[1].replace(".mp4", ""))
            if frame_count > 90:
                videos_above_90.add(filename)

# Process the folder, excluding videos with more than 90 frames
for filename in os.listdir(raw_videos_path):
    if filename.endswith(".mp4") and filename not in videos_above_90:
        parts = filename.split("_")  # Expected format: <video_id>_<label>_...
        if len(parts) > 1:
            label = parts[1]  # Assumption: The label is in the second position
            label_counts[label] += 1

# Count the number of labels with fewer than 7 videos
labels_less_than_7 = sum(1 for count in label_counts.values() if count < 7)

# Output the number of these labels
print(labels_less_than_7)

37


# Shorten videos with more than 90 frames (without end)

In [None]:
import os
import subprocess
import cv2
import mediapipe as mp

# Directories
base_path = "/home/haggenmueller/asl_detection/machine_learning/datasets"
raw_videos_path = f"{base_path}/wlasl/raw_videos"
shortened_videos_path = f"{base_path}/own_dataset/shortened_videos"
os.makedirs(shortened_videos_path, exist_ok=True)

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=2, min_detection_confidence=0.5)

# Function to extract frame count from filename
def get_frame_count_from_filename(filename):
    parts = filename.rsplit("_", 1)  # Expected format: <video_id>_<label>_<frames>.mp4
    if len(parts) == 2 and parts[1].replace(".mp4", "").isdigit():
        return int(parts[1].replace(".mp4", ""))
    return 0

# Process all videos with more than 90 frames
for filename in os.listdir(raw_videos_path):
    if not filename.endswith(".mp4"):
        continue
    
    frame_count = get_frame_count_from_filename(filename)
    if frame_count <= 90:
        print(f"Skipped: {filename} (only {frame_count} frames)")
        continue
    
    input_video = os.path.join(raw_videos_path, filename)
    output_video = os.path.join(shortened_videos_path, filename)
    
    cap = cv2.VideoCapture(input_video)
    if not cap.isOpened():
        print(f"Error: Could not open video: {input_video}")
        continue
    
    hand_detected = False
    start_time = None
    fps = cap.get(cv2.CAP_PROP_FPS)
    
    # Detect first frame with hands
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb_frame)
        
        if results.multi_hand_landmarks:
            hand_detected = True
            start_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000  # Convert to seconds
            break
    
    cap.release()
    
    if not hand_detected:
        print(f"❌ No hands detected in {filename}, skipping...")
        continue
    
    # Use FFmpeg to trim video from first detected hand frame
    ffmpeg_command = [
        "ffmpeg", "-y",
        "-i", input_video,
        "-ss", str(start_time),  # Start from detected hand frame
        "-c:v", "libx264", "-preset", "fast", "-crf", "18",
        "-pix_fmt", "yuv420p",
        output_video
    ]
    
    print(f"✂️ Trimming {filename} from {start_time} seconds...")
    process = subprocess.run(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    
    if process.returncode == 0:
        print(f"✅ Trimmed and saved: {output_video}")
    else:
        print(f"❌ Error processing {output_video}")
        print(process.stderr.decode())

print("Done! Videos have been trimmed based on first hand detection frame.")

I0000 00:00:1741269955.872683  431047 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1741269955.905226  671398 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 535.216.01), renderer: NVIDIA RTX A6000/PCIe/SSE2
W0000 00:00:1741269955.948504  671370 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741269955.971108  671393 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


✂️ Trimming 19408_environment_121.mp4 from 0.88 seconds...
✅ Trimmed and saved: /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/shortened_videos/19408_environment_121.mp4
Skipped: 28201_how_72.mp4 (only 72 frames)
Skipped: 65798_future_64.mp4 (only 64 frames)
✂️ Trimming 08691_call_101.mp4 from 0.7000000000000001 seconds...
✅ Trimmed and saved: /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/shortened_videos/08691_call_101.mp4
Skipped: 66759_week_47.mp4 (only 47 frames)
✂️ Trimming 45261_pull_131.mp4 from 0.32 seconds...
✅ Trimmed and saved: /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/shortened_videos/45261_pull_131.mp4
✂️ Trimming 17713_drink_91.mp4 from 0.633 seconds...
✅ Trimmed and saved: /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/shortened_videos/17713_drink_91.mp4
Skipped: 32257_last_50.mp4 (only 50 frames)
Skipped: 27214_help_53.mp4 (only 53 frames)
Skipped: 49174_same_36.mp4

# Shorten videos with more than 90 frames (with end)

In [17]:
import os
import subprocess
import cv2
import mediapipe as mp

# Directories
base_path = "/home/haggenmueller/asl_detection/machine_learning/datasets"
raw_videos_path = f"{base_path}/wlasl/raw_videos"
shortened_videos_path = f"{base_path}/own_dataset/shortened_videos"
os.makedirs(shortened_videos_path, exist_ok=True)

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=2, min_detection_confidence=0.5)

# Function to extract frame count from filename
def get_frame_count_from_filename(filename):
    parts = filename.rsplit("_", 1)  # Expected format: <video_id>_<label>_<frames>.mp4
    if len(parts) == 2 and parts[1].replace(".mp4", "").isdigit():
        return int(parts[1].replace(".mp4", ""))
    return 0

# Process all videos with more than 90 frames
for filename in os.listdir(raw_videos_path):
    if not filename.endswith(".mp4"):
        continue
    
    frame_count = get_frame_count_from_filename(filename)
    if frame_count <= 90:
        print(f"Skipped: {filename} (only {frame_count} frames)")
        continue
    
    input_video = os.path.join(raw_videos_path, filename)
    output_video = os.path.join(shortened_videos_path, filename)
    
    cap = cv2.VideoCapture(input_video)
    if not cap.isOpened():
        print(f"Error: Could not open video: {input_video}")
        continue
    
    hand_detected = False
    start_time = None
    end_time = None
    fps = cap.get(cv2.CAP_PROP_FPS)
    
    # Detect first and last frame with hands
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb_frame)
        current_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000  # Convert to seconds
        
        if results.multi_hand_landmarks:
            if not hand_detected:
                start_time = current_time  # First frame where hands appear
                hand_detected = True
            end_time = current_time  # Last frame where hands were detected
    
    cap.release()
    
    if not hand_detected:
        print(f"❌ No hands detected in {filename}, skipping...")
        continue
    
    if end_time is None:
        end_time = start_time + (90 / fps)  # Default to 90 frames if end_time isn't found
    
    # Use FFmpeg to trim video from first detected hand frame to last detected hand frame
    ffmpeg_trim_command = [
        "ffmpeg", "-y",
        "-i", input_video,
        "-ss", str(start_time),  # Start from detected hand frame
        "-to", str(end_time),  # Stop when hands disappear
        "-c:v", "libx264", "-preset", "fast", "-crf", "18",
        "-pix_fmt", "yuv420p",
        output_video
    ]
    
    print(f"✂️ Trimming {filename} from {start_time} to {end_time} seconds...")
    process = subprocess.run(ffmpeg_trim_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    
    if process.returncode == 0:
        print(f"✅ Trimmed and saved: {output_video} (from {start_time} to {end_time})")
    else:
        print(f"❌ Error processing {output_video}")
        print(process.stderr.decode())

print("Done! Videos have been trimmed from first to last detected hand frame.")

I0000 00:00:1741270975.487804  431047 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1741270975.511215  696137 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 535.216.01), renderer: NVIDIA RTX A6000/PCIe/SSE2
W0000 00:00:1741270975.546572  696106 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741270975.572175  696113 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


✂️ Trimming 19408_environment_121.mp4 from 0.88 to 3.48 seconds...
✅ Trimmed and saved: /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/shortened_videos/19408_environment_121.mp4 (from 0.88 to 3.48)
Skipped: 28201_how_72.mp4 (only 72 frames)
Skipped: 65798_future_64.mp4 (only 64 frames)
✂️ Trimming 08691_call_101.mp4 from 0.7000000000000001 to 2.367 seconds...
✅ Trimmed and saved: /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/shortened_videos/08691_call_101.mp4 (from 0.7000000000000001 to 2.367)
Skipped: 66759_week_47.mp4 (only 47 frames)
✂️ Trimming 45261_pull_131.mp4 from 0.32 to 1.8 seconds...
✅ Trimmed and saved: /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/shortened_videos/45261_pull_131.mp4 (from 0.32 to 1.8)
✂️ Trimming 17713_drink_91.mp4 from 0.633 to 2.1670000000000003 seconds...
✅ Trimmed and saved: /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/shortened_videos/17713_drink_9

# Update frames in naming for every shorten video

In [None]:
import os
import cv2
import numpy as np

# Define paths
base_path = "/home/haggenmueller/asl_detection/machine_learning/datasets"
raw_videos_path = f"{base_path}/own_dataset/shortened_videos"

# Function to count frames
def count_frames(video_path):
    cap = cv2.VideoCapture(video_path)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.release()
    return frame_count

# Store results
frame_counts = []

# Process the folder
for filename in os.listdir(raw_videos_path):
    filepath = os.path.join(raw_videos_path, filename)
    if filename.endswith(".mp4"):  # If videos are in MP4 format
        parts = filename.rsplit("_", 2)  # Expected format: <video_id>_<label>_<frames>.mp4
        if len(parts) == 3 and parts[2].replace(".mp4", "").isdigit():
            video_id, label, stored_frames = parts[0], parts[1], int(parts[2].replace(".mp4", ""))
            actual_frames = count_frames(filepath)
            
            # Rename file if the stored frame count is incorrect
            new_filename = f"{video_id}_{label}_{actual_frames}.mp4"
            new_filepath = os.path.join(raw_videos_path, new_filename)
            
            if filename != new_filename:
                os.rename(filepath, new_filepath)
                print(f"Renamed: {filename} -> {new_filename}")
            
            frame_counts.append((f"{video_id}_{label}", actual_frames))

# Sort by frame count (descending)
frame_counts.sort(key=lambda x: x[1], reverse=True)

# Output results
for video, frames in frame_counts:
    print(f"{video} : {frames}")

Renamed: 19408_environment_121.mp4 -> 19408_environment_65.mp4
Renamed: 08691_call_101.mp4 -> 08691_call_50.mp4
Renamed: 45261_pull_131.mp4 -> 45261_pull_74.mp4
Renamed: 17713_drink_91.mp4 -> 17713_drink_46.mp4
Renamed: 63948_wow_91.mp4 -> 63948_wow_45.mp4
Renamed: 56838_tall_96.mp4 -> 56838_tall_60.mp4
Renamed: 35509_meet_94.mp4 -> 35509_meet_34.mp4
Renamed: 58649_toast_116.mp4 -> 58649_toast_59.mp4
Renamed: 10187_check_95.mp4 -> 10187_check_43.mp4
Renamed: 42243_perspective_123.mp4 -> 42243_perspective_82.mp4
Renamed: 02106_alone_118.mp4 -> 02106_alone_74.mp4
Renamed: 43904_postpone_91.mp4 -> 43904_postpone_70.mp4
Renamed: 51056_shirt_99.mp4 -> 51056_shirt_44.mp4
Renamed: 01554_ago_98.mp4 -> 01554_ago_62.mp4
Renamed: 43166_play_92.mp4 -> 43166_play_58.mp4
Renamed: 01552_ago_134.mp4 -> 01552_ago_75.mp4
Renamed: 55361_study_148.mp4 -> 55361_study_77.mp4
Renamed: 51704_silly_99.mp4 -> 51704_silly_49.mp4
Renamed: 24722_glasses_94.mp4 -> 24722_glasses_58.mp4
Renamed: 49694_score_96.mp4 ->

# Get all duplicate videos

In [3]:
import os

# Directories
base_path = "/home/haggenmueller/asl_detection/machine_learning/datasets"
raw_videos_path = f"{base_path}/wlasl/raw_videos"
shortened_videos_path = f"{base_path}/own_dataset/shortened_videos"

# Get video IDs from shortened_videos
shortened_video_ids = {file.split("_")[0] for file in os.listdir(shortened_videos_path) if file.endswith(".mp4")}

# Count duplicates in raw_videos
duplicate_count = sum(1 for file in os.listdir(raw_videos_path) if file.endswith(".mp4") and file.split("_")[0] in shortened_video_ids)

# Print result
print(f"📊 Number of duplicate videos in raw_videos: {duplicate_count}")

📊 Number of duplicate videos in raw_videos: 0


# Remove duplicates from raw_videos

In [2]:
import os
import shutil

# Directories
base_path = "/home/haggenmueller/asl_detection/machine_learning/datasets"
raw_videos_path = f"{base_path}/wlasl/raw_videos"
shortened_videos_path = f"{base_path}/own_dataset/shortened_videos"
unused_videos_path = f"{base_path}/wlasl/raw_videos_unused"
os.makedirs(unused_videos_path, exist_ok=True)

# Get video IDs from shortened_videos
shortened_video_ids = {file.split("_")[0] for file in os.listdir(shortened_videos_path) if file.endswith(".mp4")}

# Move duplicate videos to raw_videos_unused
duplicate_count = 0
for file in os.listdir(raw_videos_path):
    if file.endswith(".mp4") and file.split("_")[0] in shortened_video_ids:
        shutil.move(os.path.join(raw_videos_path, file), os.path.join(unused_videos_path, file))
        duplicate_count += 1

# Print result
print(f"📊 Moved {duplicate_count} duplicate videos from raw_videos to raw_videos_unused.")

📊 Moved 216 duplicate videos from raw_videos to raw_videos_unused.


--> start here

# Get video count per label and max frames

In [None]:
import os
import cv2
from collections import defaultdict

# Directories
base_path = "/home/haggenmueller/asl_detection/machine_learning/datasets"
raw_videos_path = f"{base_path}/wlasl/raw_videos"
shortened_videos_path = f"{base_path}/own_dataset/shortened_videos"
augmented_videos_path = f"{base_path}/own_dataset/videos_augmented"

# Count videos per label and determine maximum frames
label_counts = defaultdict(int)
label_max_frames = defaultdict(int)

def process_videos(folder, is_augmented=False):
    for video_file in os.listdir(folder):
        if video_file.endswith(".mp4"):
            parts = video_file.rsplit("_", 3) if is_augmented else video_file.rsplit("_", 2)
            if len(parts) >= 3:
                label = parts[1]  # The label is the second element
                video_path = os.path.join(folder, video_file)
                
                # Open video and count frames
                cap = cv2.VideoCapture(video_path)
                frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                cap.release()
                
                label_counts[label] += 1
                label_max_frames[label] = max(label_max_frames[label], frame_count)

# Count videos and find maximum frames in all three folders
process_videos(raw_videos_path)
process_videos(shortened_videos_path)
process_videos(augmented_videos_path, is_augmented=True)

# Display results
for label in sorted(label_counts.keys()):
    print(f"Label: {label}, Videos: {label_counts[label]}, Maximum Frames: {label_max_frames[label]}")

Label: accident, Videos: 36, Maximale Frames: 102
Label: add, Videos: 36, Maximale Frames: 83
Label: africa, Videos: 36, Maximale Frames: 77
Label: ago, Videos: 36, Maximale Frames: 75
Label: alone, Videos: 36, Maximale Frames: 76
Label: animal, Videos: 36, Maximale Frames: 90
Label: apple, Videos: 36, Maximale Frames: 88
Label: appointment, Videos: 36, Maximale Frames: 72
Label: argue, Videos: 36, Maximale Frames: 87
Label: australia, Videos: 36, Maximale Frames: 72
Label: baby, Videos: 36, Maximale Frames: 80
Label: bad, Videos: 36, Maximale Frames: 79
Label: balance, Videos: 36, Maximale Frames: 69
Label: banana, Videos: 36, Maximale Frames: 80
Label: bar, Videos: 36, Maximale Frames: 84
Label: barely, Videos: 36, Maximale Frames: 73
Label: basketball, Videos: 36, Maximale Frames: 72
Label: beard, Videos: 36, Maximale Frames: 76
Label: bed, Videos: 36, Maximale Frames: 87
Label: before, Videos: 36, Maximale Frames: 87
Label: bird, Videos: 36, Maximale Frames: 87
Label: black, Videos

# Rename augmented vdieos

In [None]:
import os
import cv2
from collections import defaultdict

# Directories
base_path = "/home/haggenmueller/asl_detection/machine_learning/datasets"
raw_videos_path = f"{base_path}/wlasl/raw_videos"
shortened_videos_path = f"{base_path}/own_dataset/shortened_videos"
augmented_videos_path = f"{base_path}/own_dataset/videos_augmented"

# Count videos per label and determine maximum frames
label_counts = defaultdict(int)
label_max_frames = defaultdict(int)

def process_videos(folder, is_augmented=False):
    for video_file in os.listdir(folder):
        if video_file.endswith(".mp4"):
            parts = video_file.rsplit("_", 3) if is_augmented else video_file.rsplit("_", 2)
            if len(parts) >= 3:
                label = parts[1]  # The label is the second element
                video_path = os.path.join(folder, video_file)
                
                # Open video and count frames
                cap = cv2.VideoCapture(video_path)
                frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                cap.release()
                
                label_counts[label] += 1
                label_max_frames[label] = max(label_max_frames[label], frame_count)

# Count videos and find maximum frames in all three folders
process_videos(raw_videos_path)
process_videos(shortened_videos_path)
process_videos(augmented_videos_path, is_augmented=True)

# Verify and update filenames in augmented videos folder
for video_file in os.listdir(augmented_videos_path):
    if video_file.endswith(".mp4"):
        parts = video_file.rsplit("_", 3)  # <video_id>_<label>_<frames>_<change>.mp4
        if len(parts) == 4:
            video_id, label, old_frames, change = parts
            old_frames = old_frames.replace(".mp4", "")
            
            video_path = os.path.join(augmented_videos_path, video_file)
            cap = cv2.VideoCapture(video_path)
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            cap.release()
            
            # Rename file if frame count is incorrect
            if str(frame_count) != old_frames:
                new_filename = f"{video_id}_{label}_{frame_count}_{change}.mp4"
                new_filepath = os.path.join(augmented_videos_path, new_filename)
                os.rename(video_path, new_filepath)
                print(f"Renamed: {video_file} -> {new_filename}")

# Display results
for label in sorted(label_counts.keys()):
    print(f"Label: {label}, Videos: {label_counts[label]}, Maximum Frames: {label_max_frames[label]}")

print("✅ All augmented videos have been verified and renamed if necessary.")

Umbenannt: 15325_delay_49_bright.mp4 -> 15325_delay_48_bright.mp4.mp4
Umbenannt: 32257_last_50_noise.mp4 -> 32257_last_47_noise.mp4.mp4
Umbenannt: 01391_africa_48_bright.mp4 -> 01391_africa_47_bright.mp4.mp4
Umbenannt: 15325_delay_49_flip.mp4 -> 15325_delay_48_flip.mp4.mp4
Umbenannt: 64311_yesterday_50_bright.mp4 -> 64311_yesterday_47_bright.mp4.mp4


Umbenannt: 14893_deaf_45_flip.mp4 -> 14893_deaf_43_flip.mp4.mp4
Umbenannt: 32953_letter_36_slowdown.mp4 -> 32953_letter_33_slowdown.mp4.mp4
Umbenannt: 06476_black_33_speedup.mp4 -> 06476_black_32_speedup.mp4.mp4
Umbenannt: 20986_family_61_speedup.mp4 -> 20986_family_60_speedup.mp4.mp4
Umbenannt: 25330_graduate_60_bright.mp4 -> 25330_graduate_59_bright.mp4.mp4
Umbenannt: 63795_work_40_flip.mp4 -> 63795_work_39_flip.mp4.mp4
Umbenannt: 15330_delay_47_contrast.mp4 -> 15330_delay_44_contrast.mp4.mp4
Umbenannt: 01391_africa_48_contrast.mp4 -> 01391_africa_47_contrast.mp4.mp4
Umbenannt: 06476_black_33_slowdown.mp4 -> 06476_black_32_slowdown.mp4.mp4
Umbenannt: 33479_list_67_flip.mp4 -> 33479_list_66_flip.mp4.mp4
Umbenannt: 13333_corn_34_contrast.mp4 -> 13333_corn_32_contrast.mp4.mp4
Umbenannt: 55369_study_100_flip.mp4 -> 55369_study_90_flip.mp4.mp4
Umbenannt: 58683_today_37_contrast.mp4 -> 58683_today_34_contrast.mp4.mp4
Umbenannt: 23953_future_45_noise.mp4 -> 23953_future_44_noise.mp4.mp4
Umb

# Normalize frames

In [None]:
import os
import cv2
import torch
import numpy as np
from collections import defaultdict

# Directories
base_path = "/home/haggenmueller/asl_detection/machine_learning/datasets"
raw_videos_path = f"{base_path}/wlasl/raw_videos"
shortened_videos_path = f"{base_path}/own_dataset/shortened_videos"
augmented_videos_path = f"{base_path}/own_dataset/videos_augmented"
processed_folder = f"{base_path}/own_dataset/videos_processed"
os.makedirs(processed_folder, exist_ok=True)

# Count videos per label and determine maximum frames
label_counts = defaultdict(int)
label_max_frames = defaultdict(int)
max_frames = 0

def process_videos(folder, is_augmented=False):
    global max_frames
    for video_file in os.listdir(folder):
        if video_file.endswith(".mp4"):
            parts = video_file.rsplit("_", 3) if is_augmented else video_file.rsplit("_", 2)
            if len(parts) >= 3:
                label = parts[1]  # The label is the second element
                video_path = os.path.join(folder, video_file)
                
                # Open video and count frames
                cap = cv2.VideoCapture(video_path)
                frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                cap.release()
                
                label_counts[label] += 1
                label_max_frames[label] = max(label_max_frames[label], frame_count)
                max_frames = max(max_frames, frame_count)

# Count videos and find maximum frames in all three folders
process_videos(raw_videos_path)
process_videos(shortened_videos_path)
process_videos(augmented_videos_path, is_augmented=True)

print(f"📏 Maximum number of frames: {max_frames}")

# Function to extract frames as Torch tensors
def extract_frames(video_path, device="cuda"):
    cap = cv2.VideoCapture(video_path)
    frames = []

    if not cap.isOpened():
        print(f"⚠️ Warning: Could not open video: {video_path}")
        return torch.zeros((1, 3, 224, 224), dtype=torch.float32, device=device)
    
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.resize(frame, (224, 224))
        frame = torch.tensor(frame, dtype=torch.float32, device=device).permute(2, 0, 1)
        frames.append(frame)
    
    cap.release()
    
    if not frames:
        print(f"⚠️ Warning: No frames extracted for {video_path}")
        return torch.zeros((1, 3, 224, 224), dtype=torch.float32, device=device)
    
    return torch.stack(frames, dim=0)

# Function to pad frames
def pad_frames(frames, target_length, device="cuda"):
    num_frames = frames.shape[0]
    
    if num_frames < target_length:
        padding = torch.zeros((target_length - num_frames, 3, 224, 224), dtype=torch.float32, device=device)
        return torch.cat((frames, padding), dim=0)
    else:
        return frames[:target_length]

# Process videos
device = "cuda" if torch.cuda.is_available() else "cpu"

for folder in [shortened_videos_path, raw_videos_path, augmented_videos_path]:
    for video_file in os.listdir(folder):
        if not video_file.endswith(".mp4"):
            continue

        video_path = os.path.join(folder, video_file)
        
        try:
            # Extract frames
            frames = extract_frames(video_path, device=device)

            # Pad/Trim to `max_frames`
            padded_frames = pad_frames(frames, max_frames, device=device)

            # Save as `.npy` file
            npy_path = os.path.join(processed_folder, video_file.replace(".mp4", ".npy"))
            np.save(npy_path, padded_frames.cpu().numpy())

            print(f"✅ {video_file} processed and saved as {npy_path}")
        
        except Exception as e:
            print(f"❌ Error processing {video_file}: {e}")

print("🚀 Normalization completed!")

📏 Maximale Anzahl an Frames: 102
✅ 53696_speech_88.mp4 verarbeitet und gespeichert als /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/videos_processed/53696_speech_88.npy
✅ 05233_basketball_71.mp4 verarbeitet und gespeichert als /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/videos_processed/05233_basketball_71.npy
✅ 20978_family_62.mp4 verarbeitet und gespeichert als /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/videos_processed/20978_family_62.npy
✅ 62499_water_67.mp4 verarbeitet und gespeichert als /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/videos_processed/62499_water_67.npy
✅ 56306_sweet_48.mp4 verarbeitet und gespeichert als /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/videos_processed/56306_sweet_48.npy
✅ 69325_fish_45.mp4 verarbeitet und gespeichert als /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/videos_processed/69325_f

[h264 @ 0x557de4d18780] Invalid NAL unit size (745 > 472).
[h264 @ 0x557de4d18780] Error splitting the input into NAL units.
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x557de261bf80] stream 1, offset 0x3b468: partial file
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x557de261bf80] stream 1, offset 0x3b7d3: partial file


✅ 04709_bad_72.mp4 verarbeitet und gespeichert als /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/videos_processed/04709_bad_72.npy
✅ 62746_week_31.mp4 verarbeitet und gespeichert als /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/videos_processed/62746_week_31.npy
✅ 60348_ugly_33.mp4 verarbeitet und gespeichert als /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/videos_processed/60348_ugly_33.npy
✅ 14893_deaf_45.mp4 verarbeitet und gespeichert als /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/videos_processed/14893_deaf_45.npy
✅ 62168_walk_72.mp4 verarbeitet und gespeichert als /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/videos_processed/62168_walk_72.npy
✅ 05727_before_87.mp4 verarbeitet und gespeichert als /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/videos_processed/05727_before_87.npy
✅ 64219_year_28.mp4 verarbeitet und gespeich

[h264 @ 0x557de1e39800] Invalid NAL unit size (71678 > 10776).
[h264 @ 0x557de1e39800] missing picture in access unit with size 10780
[h264 @ 0x557de1e33bc0] Invalid NAL unit size (71678 > 10776).
[h264 @ 0x557de1e33bc0] Error splitting the input into NAL units.
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x557de261bf80] stream 1, offset 0x2a27a7: partial file


✅ 69211_animal_90.mp4 verarbeitet und gespeichert als /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/videos_processed/69211_animal_90.npy
✅ 23766_full_76.mp4 verarbeitet und gespeichert als /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/videos_processed/23766_full_76.npy
✅ 14750_daughter_84.mp4 verarbeitet und gespeichert als /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/videos_processed/14750_daughter_84.npy
✅ 25245_government_44.mp4 verarbeitet und gespeichert als /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/videos_processed/25245_government_44.npy
✅ 15322_delay_26.mp4 verarbeitet und gespeichert als /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/videos_processed/15322_delay_26.npy
✅ 04186_australia_69.mp4 verarbeitet und gespeichert als /home/haggenmueller/asl_detection/machine_learning/datasets/own_dataset/videos_processed/04186_australia_69.npy
✅ 09180_ca