In [12]:
import json
import os
import re
import subprocess
import tempfile

import cv2
import numpy as np
import whisper

from ASL_model import TexttoMPPoints


In [9]:
# Instantiate the text-to-landmark model once; load_pred_landmarks_for_words()
# reads this module-level `model` when it generates predictions.
model = TexttoMPPoints()

In [4]:
# Step 1: Extract audio and generate transcript from video using Whisper
def extract_audio_and_transcribe(video_path):
    """Transcribe the speech of a video file with Whisper.

    The audio track is extracted with the ``ffmpeg`` command-line tool into a
    temporary 16 kHz, mono, 16-bit PCM WAV file, which is then fed to the
    Whisper ``base`` model. The temporary file is always removed, including
    when extraction or transcription fails.

    Args:
        video_path: Path of the input video file.

    Returns:
        A list of ``(start, end, text)`` tuples, one per Whisper segment,
        taken from the ``'segments'`` entry of the transcription result.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
        subprocess.CalledProcessError: If ``ffmpeg`` exits with a non-zero
            status (e.g. unreadable input or no audio stream).
    """
    print("Extracting audio and transcribing...")

    # Only the path is needed: close our own handle immediately so that
    # ffmpeg can overwrite the file on platforms that lock open files.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        temp_audio_path = tmp.name

    try:
        # Argument list instead of a shell string: no quoting problems and no
        # shell injection through unusual file names. `check=True` surfaces
        # ffmpeg failures instead of silently transcribing an empty file.
        subprocess.run(
            [
                "ffmpeg", "-y",
                "-i", str(video_path),
                "-vn",
                "-acodec", "pcm_s16le",
                "-ar", "16000",
                "-ac", "1",
                temp_audio_path,
            ],
            check=True,
        )

        # Named `asr_model` so it does not shadow the notebook-level `model`.
        asr_model = whisper.load_model("base")
        result = asr_model.transcribe(temp_audio_path)
    finally:
        if os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)

    # Return as list of tuples (start, end, text)
    return [(seg['start'], seg['end'], seg['text']) for seg in result['segments']]

In [14]:
# Normalise transcript words so they match the JSON filenames
def sanitize_word(w):
    """Return ``w`` lower-cased with everything except ``a-z`` and ``0-9`` removed.

    The result may be an empty string (e.g. for pure punctuation).
    """
    return re.sub(r"[^a-z0-9]", "", w.lower())

# Approximate per-word timings from segment-level Whisper output
def build_word_timings(transcript):
    """Spread each segment's duration evenly over its whitespace-separated words.

    Args:
        transcript: Iterable of ``(start, end, text)`` tuples.

    Returns:
        A list of ``{"word", "start", "end"}`` dicts in transcript order.
        Every raw token gets an equal share of the segment duration; tokens
        that sanitize to an empty string are dropped from the result but
        still occupy their time slot.
    """
    timings = []
    for seg_start, seg_end, seg_text in transcript:
        tokens = seg_text.strip().split()
        n_tokens = len(tokens)
        span = seg_end - seg_start
        for pos, token in enumerate(tokens):
            cleaned = sanitize_word(token)
            if cleaned:
                timings.append({
                    "word": cleaned,
                    "start": seg_start + (pos / n_tokens) * span,
                    "end": seg_start + ((pos + 1) / n_tokens) * span,
                })
    return timings


def load_gt_landmarks(folder="landmark_data"):
    """Load every ``*.json`` file in ``folder`` into a dict keyed by file stem.

    Args:
        folder: Directory to scan (non-recursive). Files whose name does not
            end in ``.json`` are ignored.

    Returns:
        Dict mapping the filename without its ``.json`` suffix to the parsed
        JSON content of that file.

    Raises:
        FileNotFoundError: If ``folder`` does not exist.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
    """
    gt = {}
    for fname in os.listdir(folder):
        if not fname.endswith(".json"):
            continue
        word = fname[:-5]  # "about.json" -> "about"
        # Context manager closes each file promptly; explicit UTF-8 avoids
        # depending on the platform's default text encoding.
        with open(os.path.join(folder, fname), encoding="utf-8") as fh:
            gt[word] = json.load(fh)
    return gt

def load_pred_landmarks_for_words(words):
    """Predict landmarks for each distinct word using the notebook-level ``model``.

    Args:
        words: Iterable of (hashable) word strings; duplicates are collapsed.

    Returns:
        Dict mapping each sanitized, non-empty word to whatever
        ``model.predict_landmarks`` returns for it.
    """
    cleaned_words = (sanitize_word(raw) for raw in set(words))
    return {w: model.predict_landmarks(w) for w in cleaned_words if w}

def create_stick_figure_image(frame_landmarks, width=300, height=300):
    """Render one frame of landmarks as coloured dots on a black canvas.

    Args:
        frame_landmarks: Mapping that may contain the keys ``"face"``,
            ``"pose"``, ``"left_hand"`` and ``"right_hand"``; other keys are
            ignored. Each value is an iterable of points, or ``None`` for a
            part that is absent in this frame. A point is either a dict with
            ``"x"`` and ``"y"`` entries or a sequence whose first two items
            are x and y; any further components (e.g. z) are ignored.
            Coordinates are multiplied by ``width`` / ``height``, so they are
            presumably normalised to [0, 1] — confirm against the data source.
        width: Canvas width in pixels.
        height: Canvas height in pixels.

    Returns:
        ``numpy.ndarray`` of shape ``(height, width, 3)`` and dtype ``uint8``.
    """
    img = np.zeros((height, width, 3), np.uint8)

    # Drawing order and colour per part; later parts paint over earlier ones.
    # Tuples are in the image's channel order (BGR for OpenCV frames).
    part_colors = (
        ("face",       (0, 255, 0)),
        ("pose",       (255, 255, 255)),
        ("left_hand",  (255, 0, 255)),
        ("right_hand", (0, 255, 255)),
    )

    def draw(pts, color):
        for p in pts:
            # Only x and y are used, so do not require a z component or an
            # exact length of three (some landmark formats carry extra fields).
            if isinstance(p, dict):
                x, y = p["x"], p["y"]
            else:
                x, y = p[0], p[1]
            px, py = int(x * width), int(y * height)
            cv2.circle(img, (px, py), 3, color, -1)

    for part, color in part_colors:
        if part not in frame_landmarks:
            continue
        pts = frame_landmarks[part]
        if pts is None:
            # Part key present but nothing detected for this frame.
            continue
        draw(pts, color)
    return img

In [15]:
# Transcribe the input video, load ground-truth and predicted landmarks,
# then open the video reader and the writer for the overlay output.
video_path = "sample_text_video.mp4"
if not os.path.isfile(video_path):
    # Fail before the (slow) transcription step rather than after it.
    raise FileNotFoundError(f"Input video not found: {video_path}")

transcript = extract_audio_and_transcribe(video_path)
word_times = build_word_timings(transcript)

gt_landmarks   = load_gt_landmarks("landmark_data")
pred_landmarks = load_pred_landmarks_for_words([wt["word"] for wt in word_times])

cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise RuntimeError(f"OpenCV could not open video: {video_path}")

fps  = cap.get(cv2.CAP_PROP_FPS)
W, H = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# The writer reuses the input's frame rate and size so frames can be copied 1:1.
out = cv2.VideoWriter("comparison_overlay.mp4",
                      cv2.VideoWriter_fourcc(*"mp4v"), fps, (W, H))
if not out.isOpened():
    cap.release()
    raise RuntimeError("OpenCV could not open comparison_overlay.mp4 for writing")

Extracting audio and transcribing...


In [16]:
# Render the comparison video: on every frame that falls inside a word's time
# span, paste the ground-truth figure bottom-left and the predicted figure
# bottom-right, then write the frame to the output.

# Side length of each overlaid figure; shrunk if the frame itself is smaller
# so the slice assignment below always matches the figure's shape.
fig_size = min(300, H, W)

frame_idx = 0
try:
    if not fps or fps <= 0:
        raise RuntimeError("Input video reports no valid FPS; cannot map frames to timestamps")

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Timestamp of this frame, derived from its index and the frame rate.
        t = frame_idx / fps

        # Find which word covers this timestamp (half-open interval [start, end)).
        wt = next((cand for cand in word_times if cand["start"] <= t < cand["end"]), None)
        if wt:
            w = wt["word"]
            # Progress through the word in [0, 1); end > start is guaranteed
            # by the interval test above, so the division is safe.
            rel = (t - wt["start"]) / (wt["end"] - wt["start"])

            # lookup ground-truth & prediction sequences
            gt_seq   = gt_landmarks.get(w, [])
            pred_seq = pred_landmarks.get(w, [])

            # A word without landmarks yields an empty (or missing) sequence:
            # skip that overlay instead of indexing into it.
            if gt_seq is not None and len(gt_seq) > 0:
                idx_gt = min(int(rel * len(gt_seq)), len(gt_seq) - 1)
                fig_gt = create_stick_figure_image(gt_seq[idx_gt], fig_size, fig_size)
                frame[H - fig_size:H, 0:fig_size] = fig_gt          # bottom-left

            if pred_seq is not None and len(pred_seq) > 0:
                idx_pred = min(int(rel * len(pred_seq)), len(pred_seq) - 1)
                fig_pred = create_stick_figure_image(pred_seq[idx_pred], fig_size, fig_size)
                frame[H - fig_size:H, W - fig_size:W] = fig_pred    # bottom-right

        out.write(frame)
        frame_idx += 1
finally:
    # Always release the reader/writer so the output file is finalised and
    # file handles are freed even if a frame fails to render.
    cap.release()
    out.release()

print("Done! Comparison video → comparison_overlay.mp4")

Done! Comparison video → comparison_overlay.mp4


In [22]:
from moviepy import VideoFileClip, CompositeAudioClip

In [23]:


# Copy the original audio track onto the overlay video, which has no audio
# stream of its own, and write the result to a new file.

# 1) load the overlaid silent video
video = VideoFileClip("comparison_overlay.mp4")
try:
    # 2) grab the audio from the original
    source = VideoFileClip("sample_text_video.mp4")
    try:
        audio = source.audio
        if audio is not None:
            video.audio = CompositeAudioClip([audio])
        else:
            # A source without an audio track would otherwise crash in
            # CompositeAudioClip; write the video without sound instead.
            print("Warning: sample_text_video.mp4 has no audio track; writing video without audio.")

        # 3) write the combined clip
        video.write_videofile("comparison_overlay_final.mp4")
    finally:
        source.close()
finally:
    # Closing the clips releases their file handles and reader resources.
    video.close()

print("New video file created!")

{'video_found': True, 'audio_found': False, 'metadata': {'major_brand': 'isom', 'minor_version': '512', 'compatible_brands': 'isomiso2mp41', 'encoder': 'Lavf58.76.100'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [1920, 1080], 'bitrate': 18058, 'fps': 29.91, 'codec_name': 'mpeg4', 'profile': '(Simple Profile)', 'metadata': {'Metadata': '', 'handler_name': 'VideoHandler', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 6.38, 'bitrate': 18060, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream_number': 0, 'video_codec_name': 'mpeg4', 'video_profile': '(Simple Profile)', 'video_size': [1920, 1080], 'video_bitrate': 18058, 'video_fps': 29.91, 'video_duration': 6.38, 'video_n_frames': 190}
c:\Users\PC\anaconda3\envs\text_to_asl\lib\site-packages\imageio_ffmpeg\binaries\ffmpeg-win-x86_64-v7.1.exe -i comparison_overlay.mp4 -loglevel error -f image2pipe -vf scale=1920:1080

                                                                   

MoviePy - Done.
MoviePy - Writing video comparison_overlay_final.mp4



                                                                        

MoviePy - Done !
MoviePy - video ready comparison_overlay_final.mp4
New video file created!
