In [5]:
import whisperx
from difflib import SequenceMatcher
import datetime

# 1) Load audio & models
device     = "cpu"
audio_path = "data/shapeofyou.mp3"
model      = whisperx.load_model(
                "base",
                device=device,
                compute_type="float32"   # ← force float32 on CPU
             )

audio      = whisperx.load_audio(audio_path)

# 2) Transcribe & get word timestamps
result       = model.transcribe(audio_path)
segments     = result["segments"]
align_model, metadata = whisperx.load_align_model(
    language_code=result["language"],
    device=device
)
result_aligned = whisperx.align(
    segments, align_model, metadata, audio, device
)
words        = result_aligned["word_segments"]
words     = result_aligned["word_segments"]       # list of dicts: {word, start, end}

# 3) Load your lyrics in exact order
with open("data/shapeofyou.mp3", encoding="utf-8") as f:
    lyrics = [L.strip() for L in f if L.strip()]

# 4) Match each lyric line to the best window of words
def match_lyrics_to_words(lyrics, words):
    aligned = []
    used_idxs = set()
    for line in lyrics:
        # find the contiguous run of words whose joined text best matches the lyric
        best = {"score":0, "start":None, "end":None}
        for i in range(len(words)):
            if i in used_idxs: continue
            text_accum = ""
            for j in range(i, len(words)):
                if j in used_idxs: break
                text_accum += words[j]["word"] + " "
                score = SequenceMatcher(None,
                                        line.lower(),
                                        text_accum.strip().lower()
                                       ).ratio()
                if score > best["score"]:
                    best = {
                        "score": score,
                        "start": words[i]["start"],
                        "end":   words[j]["end"],
                        "indexes": list(range(i, j+1))
                    }
        if best["start"] is not None:
            aligned.append({
                "lyric": line,
                "start": best["start"],
                "end":   best["end"]
            })
            used_idxs.update(best["indexes"])
        else:
            aligned.append({"lyric": line, "start": None, "end": None})
    return aligned

aligned = match_lyrics_to_words(lyrics, words)

# 5) Enforce strictly sequential, non‑overlapping timing
def enforce_sequence(aligned, min_gap=0.01):
    last_end = 0.0
    for e in aligned:
        s = e["start"] if e["start"] is not None else last_end
        t = e["end"]   if e["end"]   is not None else s + min_gap
        if s < last_end:
            s = last_end
        if t <= s:
            t = s + min_gap
        e["start"], e["end"] = s, t
        last_end = t
    return aligned

aligned = enforce_sequence(aligned)

# 6) Write SRT
def fmt(ts):
    td = datetime.timedelta(seconds=ts)
    tot = int(td.total_seconds())
    hh = tot//3600; mm = (tot%3600)//60; ss = tot%60
    ms = int((td.total_seconds()-tot)*1000)
    return f"{hh:02d}:{mm:02d}:{ss:02d},{ms:03d}"

with open("output1.srt", "w", encoding="utf-8") as f:
    for i, e in enumerate(aligned, 1):
        f.write(f"{i}\n{fmt(e['start'])} --> {fmt(e['end'])}\n{e['lyric']}\n\n")

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint e:\LPU\Build-a-thon\.venv\Lib\site-packages\whisperx\assets\pytorch_model.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.6.0+cpu. Bad things might happen unless you revert torch to 1.x.
Detected language: en (0.74) in first 30s of audio...


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 138: invalid start byte

In [None]:
from moviepy.video.io.VideoFileClip import VideoFileClip
from moviepy.video.VideoClip import ImageClip, TextClip
from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip
from moviepy.audio.io.AudioFileClip import AudioFileClip
from PIL import Image
import numpy as np
import pysrt

# --- STEP 1: Load audio ---
audio = AudioFileClip("data/shapeofyou.mp3")
audio_duration = audio.duration

# --- STEP 2: Load and resize background image using Pillow ---
img = Image.open("data/eagle.jpg")
img_resized = img.resize((int(720 * img.width / img.height), 720), Image.Resampling.LANCZOS)
img_array = np.array(img_resized)
background = ImageClip(img_array, duration=audio_duration)

# --- STEP 3: Load SRT file ---
subs = pysrt.open("output1.srt")

# --- STEP 4: Create text overlays for each subtitle line ---
text_clips = []
last_end = 0.0  # time in seconds when the last subtitle finished

for sub in subs:
    # get precise start/end in seconds (including milliseconds)
    start = sub.start.ordinal / 1000.0
    end   = sub.end.ordinal   / 1000.0

    # if this subtitle would overlap the previous one, push its start forward
    if start < last_end:
        start = last_end

    # recalc duration; skip if non‑positive
    duration = end - start
    if duration <= 0:
        continue

    txt = TextClip(
        text=sub.text,
        font_size=48,
        font=r'C:\WINDOWS\FONTS\COPRGTL.TTF',
        color="white",
        bg_color="black",
        method='label',
        transparent=False
    )
    txt = txt.with_start(start).with_duration(duration).with_position('center')

    text_clips.append(txt)
    last_end = start + duration

# --- STEP 5: Combine background + text overlays ---
final_video = CompositeVideoClip([background] + text_clips)
final_video = final_video.with_audio(audio)

# --- STEP 6: Export final video ---
final_video.write_videofile("lyrics_video2.mp4", fps=40)

frame_index:  14%|█▎        | 1180/8663 [01:49<08:41, 14.34it/s, now=None]

MoviePy - Building video lyrics_video2.mp4.
MoviePy - Writing audio in lyrics_video2TEMP_MPY_wvf_snd.mp3


frame_index:  14%|█▎        | 1180/8663 [01:49<08:41, 14.34it/s, now=None]

MoviePy - Done.
MoviePy - Writing video lyrics_video2.mp4



frame_index:  14%|█▎        | 1180/8663 [02:31<08:41, 14.34it/s, now=None]

MoviePy - Done !
MoviePy - video ready lyrics_video2.mp4
