In [8]:
import os
import re
import json
import numpy as np
import cv2
import tempfile
import whisper
import mediapipe as mp

from ASL_model import TexttoMPPoints


In [2]:
# Notebook-level text-to-landmark model (class imported from ASL_model).
# load_pred_landmarks() reads this global and calls model.predict_landmarks(word).
model = TexttoMPPoints()

In [3]:
# Step 1: Extract audio and generate transcript from video using Whisper
def extract_audio_and_transcribe(video_path):
    """Extract the audio track of a video with ffmpeg and transcribe it with Whisper.

    Args:
        video_path: Path of the input video file.

    Returns:
        A list of ``(start, end, text)`` tuples, one per entry of the
        ``'segments'`` list returned by Whisper's ``transcribe``.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable is not on PATH.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Imported locally so this cell stays runnable without touching the import cell.
    import subprocess

    print("Extracting audio and transcribing...")

    # delete=False because ffmpeg reopens the path itself; close our handle
    # right away so it is not leaked (and so Windows lets ffmpeg overwrite it).
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    temp_audio_path = tmp.name
    tmp.close()

    try:
        # Argument list instead of a shell string: no quoting problems and no
        # shell injection through `video_path`. `-y` comes first so ffmpeg
        # overwrites the (already existing, empty) temp file without prompting.
        subprocess.run(
            [
                "ffmpeg", "-y",
                "-i", str(video_path),
                "-vn",
                "-acodec", "pcm_s16le",
                "-ar", "16000",
                "-ac", "1",
                temp_audio_path,
            ],
            check=True,  # fail loudly instead of transcribing an empty file
        )

        # Named `whisper_model` so it does not shadow the notebook-level `model`.
        whisper_model = whisper.load_model("base")
        result = whisper_model.transcribe(temp_audio_path)
    finally:
        # Always clean up the temp file, even when ffmpeg or Whisper fails.
        if os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)

    # Return as list of tuples (start, end, text)
    return [(seg['start'], seg['end'], seg['text']) for seg in result['segments']]

In [9]:
# 1) sanitize transcript words
def sanitize_word(w):
    """Return ``w`` lower-cased with every character outside a-z / 0-9 removed.

    The result may be an empty string (e.g. for pure punctuation).
    """
    non_alnum = r"[^a-z0-9]"
    return re.sub(non_alnum, "", w.lower().strip())

# 2) build per-word timings
def build_word_timings(transcript):
    """Split each transcript segment into words with evenly spaced time slots.

    Args:
        transcript: Iterable of ``(start, end, text)`` tuples.

    Returns:
        A list of ``{"word", "start", "end"}`` dicts. Each whitespace-separated
        token gets an equal share of its segment's duration; tokens that
        sanitize to an empty string are dropped but still occupy their slot.
    """
    word_times = []
    for seg_start, seg_end, seg_text in transcript:
        tokens = seg_text.strip().split()
        n_tokens = len(tokens)
        seg_len = seg_end - seg_start
        for idx, token in enumerate(tokens):
            clean = sanitize_word(token)
            if clean:
                word_times.append({
                    "word": clean,
                    "start": seg_start + (idx/n_tokens)*seg_len,
                    "end": seg_start + ((idx+1)/n_tokens)*seg_len,
                })
    return word_times

# 3) load GT JSONs
def load_gt_landmarks(folder="landmark_data"):
    """Load every ``*.json`` file in ``folder`` into a dict keyed by file stem.

    Args:
        folder: Directory containing one JSON file per word.

    Returns:
        ``{name: parsed_json}`` where ``name`` is the file name without the
        ``.json`` suffix. Files with any other extension are ignored.

    Raises:
        FileNotFoundError: If ``folder`` does not exist (from ``os.listdir``).
        json.JSONDecodeError: If a file is not valid JSON.
    """
    gt = {}
    # sorted() makes the load order (and thus dict order) reproducible.
    for fn in sorted(os.listdir(folder)):
        if not fn.endswith(".json"):
            continue
        w = fn[:-5]
        # Context manager closes the handle promptly; explicit UTF-8 avoids
        # depending on the platform's default locale encoding.
        with open(os.path.join(folder, fn), encoding="utf-8") as fh:
            gt[w] = json.load(fh)
    return gt

# 4) precompute predictions
def load_pred_landmarks(words):
    """Predict a landmark sequence once per distinct sanitized word.

    Args:
        words: Iterable of (raw or already sanitized) word strings.

    Returns:
        ``{sanitized_word: model.predict_landmarks(sanitized_word)}``. Words
        that sanitize to an empty string are skipped.
    """
    # Sanitize *before* de-duplicating: different raw spellings ("Hello",
    # "hello!") collapse to one key and must not trigger repeated inference.
    unique_words = {sanitize_word(raw) for raw in words}
    unique_words.discard("")

    # sorted() gives a reproducible prediction order across runs.
    return {w: model.predict_landmarks(w) for w in sorted(unique_words)}

# Short aliases for the MediaPipe solution modules used below.
mp_pose      = mp.solutions.pose
mp_hands     = mp.solutions.hands
mp_face_mesh = mp.solutions.face_mesh

# Connection sets consumed by create_avatar_figure_image(): each is iterated
# as (a, b) pairs of landmark indices, and a line is drawn between the two points.
POSE_CONNECTIONS     = mp_pose.POSE_CONNECTIONS
HAND_CONNECTIONS     = mp_hands.HAND_CONNECTIONS
FACE_CONNECTIONS     = mp_face_mesh.FACEMESH_TESSELATION

def create_avatar_figure_image(flm, width=300, height=300):
    """
    Build a ``height`` x ``width`` (300×300 by default) RGBA avatar:
     - Pose skeleton in white
     - Facial mesh in green
     - Left hand in magenta
     - Right hand in cyan

    Args:
        flm: Landmarks for one frame. A mapping with optional keys ``"pose"``,
            ``"face"``, ``"left_hand"`` and ``"right_hand"``; each value is a
            sequence of points with ``"x"`` and ``"y"`` entries. Coordinates
            are multiplied by ``width`` / ``height`` (presumably normalized to
            [0, 1] — confirm against the data). A part that is missing,
            ``None`` or empty is simply not drawn.
        width: Output image width in pixels.
        height: Output image height in pixels.

    Returns:
        ``np.uint8`` array of shape ``(height, width, 4)``; pixels that were
        not drawn on stay fully transparent (all zeros).

    NOTE(review): colours are given in RGBA order, while overlay_image_alpha
    copies channels by index onto OpenCV frames (BGR). On the final video the
    right hand would therefore appear yellow rather than cyan — confirm intent.
    """
    im = np.zeros((height, width, 4), np.uint8)

    def to_px(pt):
        return int(pt["x"]*width), int(pt["y"]*height)

    def part(name):
        # Missing key, None and empty list all mean "nothing to draw".
        if name in flm and flm[name]:
            return flm[name]
        return None

    # draw a connection set with given color & list of points
    def draw_connections(points, connections, color, thickness=2):
        n_points = len(points)
        for a, b in connections:
            # Skip edges that reference landmarks this frame does not have.
            if a < n_points and b < n_points:
                cv2.line(im, to_px(points[a]), to_px(points[b]),
                         color, thickness, cv2.LINE_AA)

    def draw_joints(points, color, radius=2):
        for p in points:
            cv2.circle(im, to_px(p), radius, color, -1)

    # 1) Pose
    pose = part("pose")
    if pose:
        draw_connections(pose, POSE_CONNECTIONS,
                         color=(255,255,255,200), thickness=3)

    # 2) Face
    face = part("face")
    if face:
        draw_connections(face, FACE_CONNECTIONS,
                         color=(0,255,0,100), thickness=1)
        # Emphasize the four eye landmarks; guard the indices so a
        # shorter-than-expected face list cannot raise IndexError.
        for idx in (33, 133, 362, 263):
            if idx < len(face):
                cv2.circle(im, to_px(face[idx]), 2, (0,200,0,255), -1)

    # 3) Left hand (magenta) and 4) right hand (cyan): skeleton plus joint dots
    hand_styles = (
        ("left_hand",  (255,0,255,200), (255,0,255,255)),
        ("right_hand", (0,255,255,200), (0,255,255,255)),
    )
    for key, line_color, joint_color in hand_styles:
        hand = part(key)
        if hand:
            draw_connections(hand, HAND_CONNECTIONS,
                             color=line_color, thickness=2)
            draw_joints(hand, joint_color)

    return im

# 6) alpha-blend a small RGBA overlay into `img` at (x,y)
def overlay_image_alpha(img, overlay, x, y):
    """Alpha-blend a 4-channel ``overlay`` onto ``img`` in place.

    Args:
        img: Destination image, shape ``(H, W, >=3)``; its first three
            channels are modified in place.
        overlay: Source image, shape ``(h, w, 4)``; channel 3 is the alpha
            (0-255). Channels 0-2 are blended onto channels 0-2 of ``img``
            by index, with no colour-order conversion.
        x, y: Column / row in ``img`` of the overlay's top-left corner. May be
            negative or place the overlay partly (or fully) outside ``img``;
            only the overlapping region is blended.

    Returns:
        None.
    """
    ov_h, ov_w = overlay.shape[:2]
    img_h, img_w = img.shape[:2]

    # Intersection of the overlay rectangle with the image, in image coords.
    # Without this clipping a negative origin or an overhang past the border
    # produces mismatched slice shapes and a broadcast ValueError.
    x0, y0 = max(x, 0), max(y, 0)
    x1, y1 = min(x + ov_w, img_w), min(y + ov_h, img_h)
    if x0 >= x1 or y0 >= y1:
        return  # no overlap: nothing to draw

    # Matching sub-rectangle of the overlay.
    patch = overlay[y0 - y:y1 - y, x0 - x:x1 - x]

    alpha = patch[:, :, 3:4] / 255.0  # keep a trailing axis to broadcast over channels
    inv = 1.0 - alpha
    region = img[y0:y1, x0:x1, :3]
    img[y0:y1, x0:x1, :3] = (alpha * patch[:, :, :3] + inv * region).astype(np.uint8)

In [10]:
# paths & prep
video_path = "sample_text_video.mp4"
# Fail before the (slow) transcription step if the input is missing.
if not os.path.isfile(video_path):
    raise FileNotFoundError(f"Input video not found: {video_path}")

transcript = extract_audio_and_transcribe(video_path)
word_times = build_word_timings(transcript)

gt_land   = load_gt_landmarks("landmark_data")
pred_land = load_pred_landmarks([wt["word"] for wt in word_times])

cap = cv2.VideoCapture(video_path)
# VideoCapture does not raise on failure; it just reports isOpened() == False.
if not cap.isOpened():
    raise IOError(f"OpenCV could not open video: {video_path}")

fps  = cap.get(cv2.CAP_PROP_FPS)
W, H = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# The frame loop computes timestamps as frame_i / fps, so fps must be positive
# (`not fps > 0` also rejects NaN).
if not fps > 0:
    cap.release()
    raise ValueError(f"Video reports an invalid frame rate: {fps!r}")

out = cv2.VideoWriter("comparison_with_avatar.mp4",
                      cv2.VideoWriter_fourcc(*"mp4v"), fps, (W, H))
# A writer that failed to open silently discards every frame written to it.
if not out.isOpened():
    cap.release()
    raise IOError("OpenCV could not open comparison_with_avatar.mp4 for writing")

Extracting audio and transcribing...


In [11]:
# Render loop: for every frame, find the word being spoken at that timestamp
# and draw the ground-truth avatar (bottom-left) and the predicted avatar
# (bottom-right) on top of the frame.

# Square avatar panel; shrink it when the video is smaller than 300 px so the
# overlay offsets below never become negative.
avatar_size = min(300, W, H)

frame_i = 0
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        t = frame_i / fps  # timestamp of this frame in seconds
        wt = next((w for w in word_times if w["start"] <= t < w["end"]), None)
        if wt:
            # Progress through the word in [0, 1); the search condition above
            # guarantees end > start, so the division is safe.
            rel = (t - wt["start"]) / (wt["end"] - wt["start"])
            w   = wt["word"]

            gt_seq = gt_land.get(w, [])
            pd_seq = pred_land.get(w, [])
            # Only draw when both a ground-truth and a predicted sequence exist.
            if gt_seq and pd_seq:
                # Map progress to a frame index in each sequence, clamped to
                # the last element.
                i_gt = min(int(rel * len(gt_seq)), len(gt_seq)-1)
                i_pd = min(int(rel * len(pd_seq)), len(pd_seq)-1)

                ov_gt = create_avatar_figure_image(
                    gt_seq[i_gt], width=avatar_size, height=avatar_size)
                ov_pd = create_avatar_figure_image(
                    pd_seq[i_pd], width=avatar_size, height=avatar_size)

                overlay_image_alpha(frame, ov_gt, 0,               H-avatar_size)
                overlay_image_alpha(frame, ov_pd, W-avatar_size,   H-avatar_size)

        out.write(frame)
        frame_i += 1
finally:
    # Release even if a frame fails, so the output container is finalized and
    # the capture handle is not leaked in the kernel.
    cap.release()
    out.release()

print("Done — comparison_with_avatar.mp4 generated.")

Done — comparison_with_avatar.mp4 generated.


In [None]:
from moviepy import VideoFileClip, CompositeAudioClip

In [None]:


# 1) load the overlaid silent video
# (the frame-loop cell writes it as "comparison_with_avatar.mp4"; the previous
#  name "comparison_overlay.mp4" is not produced anywhere in this notebook)
video = VideoFileClip("comparison_with_avatar.mp4")
try:
    # 2) grab the audio from the original
    source = VideoFileClip("sample_text_video.mp4")
    try:
        audio = source.audio

        # `audio` is None when the source has no audio track; in that case the
        # video is written out unchanged instead of crashing.
        if audio is not None:
            new_audioclip = CompositeAudioClip([audio])
            video.audio = new_audioclip

        video.write_videofile("comparison_overlay_final.mp4")
    finally:
        source.close()
finally:
    # Close the clips so their reader subprocesses / file handles are released.
    video.close()

print("New video file created!")