In [1]:
import cv2
import whisper
import os
import tempfile
import av  # PyAV as a modern alternative
import numpy as np
from transformers import MarianMTModel, MarianTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 2: Translate to ASL gloss using a pre-trained translation model (MarianMT)
asl_model_name = "Helsinki-NLP/opus-mt-en-ROMANCE"  # Using generic EN translator for demo
asl_tokenizer = MarianTokenizer.from_pretrained(asl_model_name)
asl_model = MarianMTModel.from_pretrained(asl_model_name)

In [3]:
# Step 1: Extract audio and generate transcript from video using Whisper
def extract_audio_and_transcribe(video_path):
    print("Extracting audio and transcribing...")
    temp_audio_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name

    # Extract audio using FFmpeg (PyAV can't easily write raw WAV directly)
    os.system(f"ffmpeg -i \"{video_path}\" -vn -acodec pcm_s16le -ar 16000 -ac 1 \"{temp_audio_path}\" -y")

    model = whisper.load_model("base")
    result = model.transcribe(temp_audio_path)
    os.remove(temp_audio_path)

    # Return as list of tuples (start, end, text)
    transcript = [(seg['start'], seg['end'], seg['text']) for seg in result['segments']]
    return transcript


def asl_translate(text):
    batch = asl_tokenizer.prepare_seq2seq_batch([text], return_tensors="pt")
    translated = asl_model.generate(**batch)
    return asl_tokenizer.decode(translated[0], skip_special_tokens=True).upper()

# Step 3: Overlay ASL gloss on video

def overlay_asl_on_video(input_video_path, output_video_path, transcript):
    cap = cv2.VideoCapture(input_video_path)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

    frame_count = 0
    subtitle_index = 0
    asl_text = ""

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        current_time = frame_count / fps

        if subtitle_index < len(transcript):
            start, end, text = transcript[subtitle_index]
            if start <= current_time <= end:
                asl_text = asl_translate(text)
            elif current_time > end:
                subtitle_index += 1
                asl_text = ""

        if asl_text:
            cv2.rectangle(frame, (50, height - 100), (width - 50, height - 30), (0, 0, 0), -1)
            cv2.putText(frame, asl_text, (60, height - 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

        out.write(frame)
        frame_count += 1

    cap.release()
    out.release()
    print("ASL subtitle video saved as:", output_video_path)

# Main pipeline
def process_video_to_asl_subtitles(input_video_path, output_video_path):
    transcript = extract_audio_and_transcribe(input_video_path)
    overlay_asl_on_video(input_video_path, output_video_path, transcript)


In [4]:
process_video_to_asl_subtitles("input_video.mp4", "asl_subtitled_output.mp4")

Extracting audio and transcribing...


ffmpeg version 6.1.1-3ubuntu5 Copyright (c) 2000-2023 the FFmpeg developers
  built with gcc 13 (Ubuntu 13.2.0-23ubuntu3)
  configuration: --prefix=/usr --extra-version=3ubuntu5 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --disable-omx --enable-gnutls --enable-libaom --enable-libass --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libharfbuzz --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --ena

ASL subtitle video saved as: asl_subtitled_output.mp4
