🎬 1. Extract Audio from Video

In [1]:
%pip install moviepy speechrecognition pydub

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os

from moviepy.editor import VideoFileClip

# Open the source video; `video` is reused later for its size, fps and frames.
video = VideoFileClip("Subtitle-Video/video/Demo-Original.mp4")

# A clip without an audio track exposes `audio` as None; fail with a clear
# message instead of an AttributeError on the write call below.
if video.audio is None:
    raise ValueError("Demo-Original.mp4 has no audio track to transcribe")

# Make sure the scratch directory exists before writing the WAV file into it.
os.makedirs("Subtitle-Video/temp_audio", exist_ok=True)
video.audio.write_audiofile("Subtitle-Video/temp_audio/temp_audio.wav")

MoviePy - Writing audio in Subtitle-Video/temp_audio/temp_audio.wav


                                                        

MoviePy - Done.




🧠 2. Transcribe Audio to Text (Speech Recognition)

In [3]:
import speech_recognition as sr
from pydub import AudioSegment
from pydub.silence import split_on_silence

In [4]:
# Load the extracted WAV track as a pydub segment (sliced by milliseconds later on).
sound = AudioSegment.from_wav("Subtitle-Video/temp_audio/temp_audio.wav")

# Single recognizer instance reused for every transcription request.
recognizer = sr.Recognizer()

In [5]:
# Split the track into audio pieces at silences: at least 700 ms long and
# 14 dB below the track's overall level (`sound.dBFS`).
chunks = split_on_silence(
    sound,
    min_silence_len=700,
    silence_thresh=sound.dBFS - 14,
)

In [6]:
# `subtitles` collects (start_seconds, end_seconds, text) tuples;
# `start_time` is the running offset used by the cumulative-timing loop.
subtitles, start_time = [], 0

In [7]:
# Earlier approach, kept for reference (disabled): pads each `split_on_silence`
# chunk with 500 ms of silence and derives timestamps by accumulating padded
# chunk lengths from `start_time`, rather than using positions in the original track.
# for i, chunk in enumerate(chunks):
#     chunk_silent = AudioSegment.silent(duration=500)
#     audio_chunk = chunk_silent + chunk + chunk_silent
#     chunk_filename = f"Subtitle-Video/temp_audio/chunk{i}.wav"
#     audio_chunk.export(chunk_filename, format="wav")

#     with sr.AudioFile(chunk_filename) as source:
#         audio = recognizer.record(source)
#         try:
#             text = recognizer.recognize_google(audio)
#             end_time = start_time + len(audio_chunk) / 1000.0
#             subtitles.append((start_time, end_time, text))
#             start_time = end_time
#         except sr.UnknownValueError:
#             continue

In [8]:
from pydub.silence import detect_nonsilent
import os

# Same silence parameters as the split above, but this yields (start_ms, end_ms)
# positions within `sound` instead of audio pieces.
nonsilent_ranges = detect_nonsilent(
    sound,
    min_silence_len=700,
    silence_thresh=sound.dBFS - 14,
)

In [9]:
# Transcribe every non-silent range and record it with its true position in
# the original track. Each range is written to a temporary WAV file because
# `sr.AudioFile` reads from a file; the file is always deleted afterwards.
for i, (start_ms, end_ms) in enumerate(nonsilent_ranges):
    chunk = sound[start_ms:end_ms]
    chunk_filename = f"Subtitle-Video/temp_audio/chunk{i}.wav"

    # export() hands back the output file object; close it explicitly so the
    # file is not held open when it is read and later deleted.
    chunk.export(chunk_filename, format="wav").close()

    try:
        with sr.AudioFile(chunk_filename) as source:
            audio = recognizer.record(source)

        try:
            text = recognizer.recognize_google(audio)
        except sr.UnknownValueError:
            # Speech could not be understood: skip this range. The `finally`
            # below still runs, so the temporary file is removed.
            continue

        # pydub positions are in milliseconds; subtitles are stored in seconds.
        start_time = start_ms / 1000.0
        end_time = end_ms / 1000.0
        subtitles.append((start_time, end_time, text))
    finally:
        # Clean up on success, on skipped ranges, and when any other error
        # (e.g. a failed API request) propagates out of the loop.
        os.remove(chunk_filename)

💬 3. Burn Subtitles into Video

In [10]:
from moviepy.editor import TextClip, CompositeVideoClip

In [11]:
# Disabled earlier attempt that pointed MoviePy at ImageMagick's `convert.exe`;
# the active configuration cell uses `magick.exe` instead.
# import moviepy.config as mpy_config
# mpy_config.change_settings({"IMAGEMAGICK_BINARY": r"C:\Program Files\ImageMagick-7.1.1-Q16-HDRI\convert.exe"})

In [12]:
import os

from moviepy.config import change_settings

# Tell MoviePy where the ImageMagick executable lives. An IMAGEMAGICK_BINARY
# environment variable takes precedence when set, so the notebook is not tied
# to one machine; otherwise fall back to the original Windows install location.
change_settings({
    "IMAGEMAGICK_BINARY": os.environ.get(
        "IMAGEMAGICK_BINARY",
        r"C:\\Program Files\\ImageMagick-7.1.1-Q16-HDRI\\magick.exe",
    )
})

In [13]:
from moviepy.editor import TextClip

# Smoke test: render a single text frame to confirm the ImageMagick setup works.
# Fix for the "can't detect ImageMagick binary" error on Windows:
# https://stackoverflow.com/questions/51928807/moviepy-cant-detect-imagemagick-binary-on-windows
clip = TextClip(
    "Hello, world!",
    fontsize=70,
    color='white',
    bg_color='black',
)
clip.save_frame("test_output.png")

In [14]:
import textwrap

In [15]:
import re

def split_text(text: str, max_words: int = 8) -> list:
    """Split a sentence into smaller chunks with up to `max_words` words each.

    Words are separated on any whitespace and re-joined with single spaces.

    Args:
        text: The sentence to split.
        max_words: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in original order. Empty or whitespace-only
        input yields an empty list.

    Raises:
        ValueError: If `max_words` is less than 1.
    """
    # A non-positive size would either break range() (0) or silently drop
    # every word (negative), so reject it up front.
    if max_words < 1:
        raise ValueError("max_words must be a positive integer")

    words = text.split()
    return [
        ' '.join(words[offset:offset + max_words])
        for offset in range(0, len(words), max_words)
    ]

In [16]:
# Build one timed TextClip per subtitle line. Long transcripts are split into
# parts of at most 8 words, and each part gets an equal share of the time span.
subtitle_clips = []
for start, end, text in subtitles:
    # Distinct names on purpose: `chunks` / `chunk` hold audio data in earlier cells.
    text_parts = split_text(text, max_words=8)
    if not text_parts:
        # Empty or whitespace-only transcript: nothing to display, and dividing
        # by len(text_parts) below would raise ZeroDivisionError.
        continue

    duration = end - start
    part_duration = duration / len(text_parts)

    for i, part in enumerate(text_parts):
        wrapped_text = textwrap.fill(part, width=50)
        txt_clip = TextClip(
            wrapped_text,
            fontsize=28,
            color='white',
            method='caption',
            size=(int(video.w * 0.9), None)  # 90% of the video width, height left unset
        )

        # Put the text on a black backdrop of the same size at 0.6 opacity.
        txt_clip = txt_clip.on_color(
            size=txt_clip.size,
            color=(0, 0, 0),
            col_opacity=0.6
        )

        txt_clip = txt_clip.set_position(("center", "bottom"))
        # Parts of one subtitle are shown back to back within [start, end].
        txt_clip = txt_clip.set_start(start + i * part_duration).set_duration(part_duration)

        subtitle_clips.append(txt_clip)

In [17]:
# Stack the subtitle clips on top of the original video and encode the result
# with H.264 at the source frame rate.
layers = [video, *subtitle_clips]
final_video = CompositeVideoClip(layers, size=video.size)
final_video.write_videofile(
    "Subtitle-Video/output/Output-Demo.mp4",
    codec="libx264",
    fps=video.fps,
)

Moviepy - Building video Subtitle-Video/output/Output-Demo.mp4.
MoviePy - Writing audio in Output-DemoTEMP_MPY_wvf_snd.mp3


                                                                    

MoviePy - Done.
Moviepy - Writing video Subtitle-Video/output/Output-Demo.mp4



                                                               

Moviepy - Done !
Moviepy - video ready Subtitle-Video/output/Output-Demo.mp4
