In [1]:
import cv2
import os
import numpy as np
from glob import glob
from tqdm import tqdm

def extract_16_frames_uniformly(video_path, num_frames=16, resize=(224, 224)):
    """Sample ``num_frames`` evenly spaced frames from a video as RGB images.

    Frame indices are chosen with ``np.linspace`` over the frame count that
    OpenCV reports. Every selected frame is resized to ``resize`` and converted
    from BGR to RGB. When fewer than ``num_frames`` frames are collected (a
    clip shorter than ``num_frames``, or decoding that stops early), the clip
    is padded at the end with black frames.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.
        num_frames: Number of frames in the returned clip (must be >= 1).
        resize: Target size passed to ``cv2.resize`` (OpenCV reads it as
            ``(width, height)``).

    Returns:
        ``np.ndarray`` of ``num_frames`` stacked frames, i.e.
        ``(num_frames, height, width, 3)`` for ordinary 3-channel video.

    Raises:
        ValueError: If ``num_frames`` is smaller than 1, or if not a single
            frame could be decoded from ``video_path`` (missing, unreadable or
            empty file).
    """
    if num_frames < 1:
        raise ValueError(f"num_frames must be >= 1, got {num_frames}")

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if frame_count > 0:
            # A set gives O(1) membership tests; duplicates (which occur when the
            # clip has fewer frames than requested) collapse to one entry each.
            wanted = set(np.linspace(0, frame_count - 1, num_frames, dtype=np.int32).tolist())
            # Frames after the last wanted index are never used, so stop there.
            for i in range(max(wanted) + 1):
                success, frame = cap.read()
                if not success:
                    break
                if i in wanted:
                    frame = cv2.resize(frame, resize)
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frames.append(frame)
    finally:
        # Release the capture even if resizing/conversion raised.
        cap.release()

    if not frames:
        raise ValueError(f"Could not decode any frame from {video_path!r}")

    # Padding if not enough frames: append black frames of the same shape/dtype.
    black = np.zeros_like(frames[0])
    while len(frames) < num_frames:
        frames.append(black.copy())

    return np.stack(frames)

def save_16frame_videos(input_dir, output_dir, video_exts=('.mp4', '.avi'), num_frames=16, resize=(224, 224)):
    """Convert every matching video in ``input_dir`` to a ``.npy`` frame stack.

    Each video is passed to ``extract_16_frames_uniformly`` and the returned
    array is saved as ``<video stem>.npy`` inside ``output_dir``. A video that
    fails is reported with a ``[ERROR]`` line and skipped, so one bad file does
    not abort the whole batch.

    Note that two videos sharing a stem (e.g. ``a.mp4`` and ``a.avi``) map to
    the same output file; the one processed later overwrites the earlier one.

    Args:
        input_dir: Directory searched (non-recursively) for videos.
        output_dir: Destination directory; created if it does not exist.
        video_exts: Extension (``'.mp4'``) or iterable of extensions to match.
        num_frames: Forwarded to ``extract_16_frames_uniformly``.
        resize: Forwarded to ``extract_16_frames_uniformly``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # A bare string would otherwise be iterated character by character.
    if isinstance(video_exts, str):
        video_exts = (video_exts,)

    video_paths = []
    for ext in video_exts:
        video_paths.extend(glob(os.path.join(input_dir, f'*{ext}')))
    # Overlapping patterns can list a file twice; keep the first occurrence only.
    video_paths = list(dict.fromkeys(video_paths))

    for vid_path in tqdm(video_paths, desc="Processing videos"):
        try:
            frames = extract_16_frames_uniformly(vid_path, num_frames=num_frames, resize=resize)
            filename = os.path.splitext(os.path.basename(vid_path))[0]
            np.save(os.path.join(output_dir, f"{filename}.npy"), frames)
        except Exception as e:
            # Best effort: report and continue with the remaining videos.
            print(f"[ERROR] Failed on {vid_path}: {e}")


In [2]:
# Folder holding the raw top-100 ASL-Citizen clips, and the sibling folder
# (same path plus a "_16frames" suffix) that receives the sampled .npy clips.
input_dir = os.path.join('datasets', 'ASL-Citizen', 'top100_videos')
output_dir = f'{input_dir}_16frames'
os.makedirs(output_dir, exist_ok=True)

In [3]:
# Convert every video found in input_dir into a fixed-length .npy frame stack in output_dir
# (default settings: .mp4/.avi files, 16 frames, 224x224).
save_16frame_videos(input_dir, output_dir)

Processing videos: 100%|██████████| 3544/3544 [03:07<00:00, 18.94it/s]


In [10]:
from data_loader import plot_video_gif

In [14]:
# Sanity check: load one saved clip and display its array shape.
# Despite the name, `random_vid` is simply the first path glob returns, not a random pick.
fname = glob(os.path.join(output_dir, '*.npy'))[0]
random_vid = np.load(fname)
random_vid.shape

(16, 224, 224, 3)

In [15]:
# Preview the loaded clip with the project's plot_video_gif helper, labelled with its file name.
plot_video_gif(random_vid, fps=8, label= os.path.basename(fname))

Next, extract the hand and upper-body pose keypoints from the 16-frame clips and store them in a new folder.

In [None]:
import mediapipe as mp
import torch

# Short aliases for the MediaPipe solution modules used below.
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_holistic = mp.solutions.holistic

# Start from MediaPipe's default drawing styles (face styles are currently not used).
# face_contour_style = mp_drawing_styles.get_default_face_mesh_contours_style()
# face_tesselation_style = mp_drawing_styles.get_default_face_mesh_tesselation_style()
hand_landmark_style = mp_drawing_styles.get_default_hand_landmarks_style()
hand_connection_style = mp_drawing_styles.get_default_hand_connections_style()
pose_landmark_style = mp_drawing_styles.get_default_pose_landmarks_style()

# Shrink hand markers and connection lines to the thinnest setting so the
# skeleton stays legible on small frames.
for landmark_spec in hand_landmark_style.values():
    landmark_spec.circle_radius = 1
    landmark_spec.thickness = 1

for connection_spec in hand_connection_style.values():
    connection_spec.thickness = 1


# Pose landmarks that preprocess_video keeps for every frame, in this output order:
# shoulders, elbows, wrists (left before right within each pair).
UPPER_BODY_LANDMARKS = [
    mp_holistic.PoseLandmark.LEFT_SHOULDER,
    mp_holistic.PoseLandmark.RIGHT_SHOULDER,
    mp_holistic.PoseLandmark.LEFT_ELBOW,
    mp_holistic.PoseLandmark.RIGHT_ELBOW,
    mp_holistic.PoseLandmark.LEFT_WRIST,
    mp_holistic.PoseLandmark.RIGHT_WRIST,
]

# Landmark pairs joined by a line when preprocess_video draws the pose (upper body only):
# wrist-elbow and elbow-shoulder on each side, plus the shoulder-to-shoulder line.
POSE_CONNECTIONS_UPPER_BODY = [
    (mp_holistic.PoseLandmark.LEFT_WRIST, mp_holistic.PoseLandmark.LEFT_ELBOW),
    (mp_holistic.PoseLandmark.LEFT_ELBOW, mp_holistic.PoseLandmark.LEFT_SHOULDER),
    (mp_holistic.PoseLandmark.RIGHT_WRIST, mp_holistic.PoseLandmark.RIGHT_ELBOW),
    (mp_holistic.PoseLandmark.RIGHT_ELBOW, mp_holistic.PoseLandmark.RIGHT_SHOULDER),
    (mp_holistic.PoseLandmark.LEFT_SHOULDER, mp_holistic.PoseLandmark.RIGHT_SHOULDER),
]

In [17]:
def _landmarks_to_pixels(landmarks, width, height):
    """Scale normalized landmarks to ``(x_px, y_px, z)`` tuples; ``z`` is passed through unchanged."""
    return [(int(lm.x * width), int(lm.y * height), lm.z) for lm in landmarks]


def preprocess_video(input_dir, draw_kpts = False):
    """Run MediaPipe Holistic over every ``.npy`` clip in ``input_dir``.

    Clips are visited in the order returned by ``glob``; callers that pair the
    results with file names must list the directory the same way.

    For each frame the function collects:

    * hand keypoints: left-hand landmarks first, then right-hand landmarks, in
      one flat list. The list length therefore varies with the number of
      detected hands, and a lone right hand cannot be told apart from a lone
      left hand by its position in the list.
    * pose keypoints: one tuple per entry of ``UPPER_BODY_LANDMARKS`` (empty
      list when no pose was detected).

    Every keypoint is ``(x, y, z)`` with ``x``/``y`` scaled to integer pixel
    coordinates of the frame and ``z`` left as reported by MediaPipe.

    NOTE(review): a single Holistic instance with ``static_image_mode=False``
    is reused for all clips, so tracking/smoothing state may carry over from
    the end of one clip into the start of the next -- confirm this is acceptable.

    Args:
        input_dir: Directory containing clips saved as ``.npy`` arrays whose
            first axis indexes frames (assumed RGB ``uint8`` -- TODO confirm).
        draw_kpts: When true, additionally render the detected hands and the
            upper-body pose lines onto a black canvas for every frame.

    Returns:
        ``(all_hand_keypoints, all_pose_keypoints)`` when ``draw_kpts`` is
        false, otherwise ``(all_images, all_hand_keypoints, all_pose_keypoints)``.
        Both keypoint results are lists indexed ``[clip][frame][keypoint]``.
        ``all_images`` is one array holding the rendered canvases of all
        frames of all clips, in processing order (this requires every frame
        to have the same shape).
    """
    with mp_holistic.Holistic(
        static_image_mode=False,
        model_complexity=2,
        smooth_landmarks=True,
        enable_segmentation=False,
        refine_face_landmarks=False,
        min_detection_confidence=0.4,
        min_tracking_confidence=0.6,
    ) as holistic:
        all_hand_keypoints = []
        all_pose_keypoints = []
        all_images = []  # stays a plain list until the very end so appending keeps working
        for clip_path in tqdm(glob(os.path.join(input_dir, '*.npy')), desc="Processing videos"):
            clip = np.load(clip_path)
            clip_hands = []
            clip_pose = []

            for frame in clip:
                # Detection always runs on a copy of the real frame; drawing (if
                # requested) goes onto a separate black canvas of the same size.
                results = holistic.process(frame.copy())
                h, w = frame.shape[:2]
                canvas = np.zeros(frame.shape, dtype=np.uint8) if draw_kpts else None

                frame_hands = []
                for hand in (results.left_hand_landmarks, results.right_hand_landmarks):
                    if not hand:
                        continue
                    if draw_kpts:
                        mp_drawing.draw_landmarks(
                            canvas, hand, mp_holistic.HAND_CONNECTIONS,
                            landmark_drawing_spec=hand_landmark_style,
                            connection_drawing_spec=hand_connection_style)
                    frame_hands.extend(_landmarks_to_pixels(hand.landmark, w, h))

                # Get upper-body pose keypoints
                frame_pose = []
                if results.pose_landmarks:
                    pose_landmarks = results.pose_landmarks.landmark
                    frame_pose = _landmarks_to_pixels(
                        [pose_landmarks[landmark_id] for landmark_id in UPPER_BODY_LANDMARKS], w, h)

                    if draw_kpts:
                        # Draw selected upper-body pose connections
                        for start_idx, end_idx in POSE_CONNECTIONS_UPPER_BODY:
                            start = pose_landmarks[start_idx]
                            end = pose_landmarks[end_idx]
                            cv2.line(
                                canvas,
                                (int(start.x * w), int(start.y * h)),
                                (int(end.x * w), int(end.y * h)),
                                (255, 255, 0), 2)

                clip_hands.append(frame_hands)
                clip_pose.append(frame_pose)
                if draw_kpts:
                    all_images.append(canvas)

            all_hand_keypoints.append(clip_hands)
            all_pose_keypoints.append(clip_pose)

        if not draw_kpts:
            return all_hand_keypoints, all_pose_keypoints
        return np.array(all_images), all_hand_keypoints, all_pose_keypoints

In [38]:
import json

def write_json(data, path):
    """Write ``data`` to ``path`` as JSON indented with four spaces.

    The document is encoded in memory before the file is opened, so data that
    ``json`` cannot serialize raises ``TypeError`` without creating or
    truncating ``path``.

    Args:
        data: JSON-serializable object.
        path: Destination file path; an existing file is overwritten.
    """
    text = json.dumps(data, indent=4)
    # json escapes non-ASCII by default; the explicit encoding just makes the
    # result independent of the platform's default codec.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)

def save_preprocessed_outputs(kpts, pose, fnames, out_dir):
    """Write one ``<fname>.json`` file per clip with its hand and pose keypoints.

    The three sequences are paired by position, so they must describe the same
    clips in the same order.

    Args:
        kpts: Per-clip hand keypoints (stored under ``'hand_keypoints'``).
        pose: Per-clip pose keypoints (stored under ``'pose_keypoints'``).
        fnames: Per-clip output file names, without extension.
        out_dir: Destination directory; created if it does not exist.

    Raises:
        ValueError: If the three sequences differ in length, which would
            otherwise pair keypoints with the wrong file name.
    """
    n = len(kpts)
    if len(pose) != n or len(fnames) != n:
        raise ValueError(
            "kpts, pose and fnames must have the same length, "
            f"got {n}, {len(pose)} and {len(fnames)}"
        )

    os.makedirs(out_dir, exist_ok=True)
    for hand_keypoints, pose_keypoints, fname in tqdm(zip(kpts, pose, fnames), total=n):
        fpath = os.path.join(out_dir, f"{fname}.json")
        metadata = {
            'hand_keypoints': hand_keypoints,
            'pose_keypoints': pose_keypoints,
        }
        write_json(metadata, fpath)

In [18]:
# Show which folder the keypoint extraction will read its .npy clips from.
output_dir

'datasets\\ASL-Citizen\\top100_videos_16frames'

In [19]:
# Extract hand and pose keypoints for every saved clip (no drawing).
# Slow: the recorded run took about 65 minutes for 3544 clips.
hand_kpts, pose_kpts = preprocess_video(output_dir)

Processing videos: 100%|██████████| 3544/3544 [1:04:46<00:00,  1.10s/it]


In [44]:
# Clip names (file name minus the .npy suffix) used to name the keypoint JSON files.
# The listing must come back in the same order as the glob inside preprocess_video,
# because keypoints and names are paired purely by position.
fnames = glob(os.path.join(output_dir, '*.npy'))
# splitext removes only the final extension, so names that contain dots stay intact
# (this matches how the .npy files were named when they were written).
fnames = [os.path.splitext(os.path.basename(f))[0] for f in fnames]
fnames[0]

'0014822228991275832-BORROW'

In [45]:
# Write one JSON file per clip (hand + pose keypoints) into a dedicated keypoints folder.
kpts_output_dir = os.path.join('datasets', 'ASL-Citizen', 'top100_videos_16frames_kpts')
os.makedirs(kpts_output_dir, exist_ok=True)
save_preprocessed_outputs(hand_kpts, pose_kpts, fnames, kpts_output_dir)

100%|██████████| 3544/3544 [00:07<00:00, 475.91it/s]
