In [None]:
import os
import cv2
import pandas as pd
import glob
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
# Paths
# NOTE(review): these are absolute, machine-specific paths; adjust them (or
# derive them from a single configurable base directory) before running elsewhere.
# Root directory that is searched for input .mp4 videos.
dataset_dir = "/mnt/d/AI Challenge/Data/video"
# Root directory under which extracted keyframe JPEGs are written.
output_dir = "/mnt/d/AI Challenge/Data/keyframes"
# Root directory holding the per-video scene CSVs (<batch>/<video name>.csv).
scene_dir = "/mnt/d/AI Challenge/Data/scene"
# Create the keyframe root up front; exist_ok makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Recursively find all videos.
# recursive=True is required for '**' to match nested directories; without it
# glob treats '**' like a single '*' and only looks exactly one level deep.
# Sorting gives a deterministic processing order across runs.
videos = sorted(
    glob.glob(os.path.join(dataset_dir, '**', '*.mp4'), recursive=True)
)
if not videos:
    # A bare `return` is a SyntaxError at cell (module) level. Raising instead
    # stops "Run All" here so later cells do not run on an empty list.
    raise FileNotFoundError(f"[ERROR] No videos found under {dataset_dir!r}.")

print(f"[INFO] Found {len(videos)} videos to process.")

In [None]:
def process_video(video_path):
    """Extract one keyframe per scene from a video and save each as a JPEG.

    The video's base name (file name without extension) selects the scene CSV
    at ``<scene_dir>/<batch>/<base_name>.csv``, where ``<batch>`` is the part of
    the base name before the first underscore. For every row of that CSV, the
    frame at index ``median_frame`` is read from the video and written to
    ``<output_dir>/<batch>/<base_name>/<base_name>_<median_frame>.jpg``.

    Frames that cannot be read or written are reported with a warning and
    skipped; a video that cannot be opened is reported and skipped entirely.

    Args:
        video_path: Path to the video file to process.

    Returns:
        None.

    Raises:
        FileNotFoundError: If the scene CSV for this video does not exist
            (raised by ``pd.read_csv``).
        KeyError: If the scene CSV has no ``median_frame`` column.
    """
    base_name = os.path.splitext(os.path.basename(video_path))[0]
    batchh = base_name.split("_")[0]
    scene_path = os.path.join(scene_dir, batchh, f"{base_name}.csv")
    scene_list = pd.read_csv(scene_path)

    keyframe_dir = os.path.join(output_dir, batchh, base_name)
    os.makedirs(keyframe_dir, exist_ok=True)

    # Extract keyframes
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"[ERROR] Could not open video {video_path}")
            return

        for median_frame in scene_list['median_frame']:
            median_frame = int(median_frame)

            # Seek directly to the requested frame, then decode it.
            cap.set(cv2.CAP_PROP_POS_FRAMES, median_frame)
            ret, frame = cap.read()
            if not ret:
                print(f"[WARNING] Could not read frame {median_frame} in {base_name}")
                continue

            keyframe_path = os.path.join(keyframe_dir, f"{base_name}_{median_frame}.jpg")
            # imwrite reports failure through its return value, not an exception.
            if not cv2.imwrite(keyframe_path, frame):
                print(f"[WARNING] Could not write {keyframe_path}")
    finally:
        # Always free the decoder, even if a row fails midway.
        cap.release()

    print(f"[DONE] {base_name} processed.")


# Process in parallel
# tqdm was used here without ever being imported. Import it locally and fall
# back to a plain iterator when the package is not installed, so the cell
# still runs (just without a progress bar).
try:
    from tqdm.auto import tqdm
except ImportError:
    def tqdm(iterable, **_kwargs):
        """Fallback used when tqdm is unavailable: iterate without a progress bar."""
        return iterable

max_workers = 4
failed_videos = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Map each future back to its video so failures can be attributed.
    future_to_video = {executor.submit(process_video, v): v for v in videos}
    for future in tqdm(
        as_completed(future_to_video),
        total=len(future_to_video),
        desc="Processing videos",
    ):
        # Named `future` rather than `_`, which IPython rebinds to the last output.
        try:
            future.result()
        except Exception as exc:
            # Record the failure and keep going: one bad video (e.g. a missing
            # scene CSV) should not discard the results of the whole batch.
            failed_videos.append((future_to_video[future], exc))
            print(f"[ERROR] {future_to_video[future]}: {exc!r}")

if failed_videos:
    print(f"[WARNING] {len(failed_videos)} of {len(videos)} videos failed.")

[INFO] Found 169 videos to process.


KeyboardInterrupt: 