In [1]:
import ffmpeg
import whisper
import torch
import config

# Select the torch device once: GPU when CUDA is usable, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("CUDA is available. Using GPU.")
else:
    print("CUDA is not available. Using CPU.")

CUDA is available. Using GPU.


In [2]:
# Step 1: Extract audio from video
def extract_audio_from_video(video_path, output_audio_path):
    """Run ffmpeg to convert ``video_path`` into ``output_audio_path``.

    Args:
        video_path: Path of the input media file handed to ``ffmpeg.input``.
        output_audio_path: Path of the file ffmpeg should produce; an
            existing file at this path is overwritten.
    """
    source_stream = ffmpeg.input(video_path)
    audio_sink = source_stream.output(output_audio_path)
    # overwrite_output=True lets ffmpeg replace an existing output file.
    audio_sink.run(overwrite_output=True)

In [3]:
# Step 2: Transcribe audio with Whisper
def transcribe_audio(audio_path):
    """Run Whisper on ``audio_path`` and return the resulting segments.

    The model named by ``config.MODEL`` is loaded on every call and moved to
    the module-level ``device``. Whisper is invoked with ``task="translate"``
    and ``verbose=True``, so progress is printed while it runs.

    Args:
        audio_path: Path of the audio file to process.

    Returns:
        The ``'segments'`` entry of Whisper's result; downstream code reads
        ``'start'``, ``'end'`` and ``'text'`` from each segment.
    """
    whisper_model = whisper.load_model(config.MODEL)
    whisper_model = whisper_model.to(device)
    transcription = whisper_model.transcribe(
        audio_path,
        task="translate",
        verbose=True,
    )
    return transcription['segments']

In [4]:
# Step 3: Format time into SRT format
def format_time(seconds):
    """Convert an offset in seconds to an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Offset from the start of the media, in seconds (int or
            float). Negative values are clamped to zero.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"`` for ``3661.5``.
    """
    # Round ONCE to whole milliseconds and do the rest in integer arithmetic.
    # Truncating ``(seconds % 1) * 1000`` loses a millisecond whenever the
    # float product lands just below an integer (e.g. 1.001 -> ",000").
    total_millis = max(0, int(round(float(seconds) * 1000)))
    total_secs, millis = divmod(total_millis, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hrs, mins = divmod(total_mins, 60)
    return f"{hrs:02}:{mins:02}:{secs:02},{millis:03}"

In [5]:
# Step 4: Generate SRT file from transcription segments
def write_srt_file(segments, output_srt_path):
    """Write ``segments`` to ``output_srt_path`` as numbered SRT cues.

    Args:
        segments: Iterable of mappings providing ``'start'``, ``'end'`` and
            ``'text'``; start/end are passed to ``format_time``.
        output_srt_path: Destination file, opened for writing as UTF-8
            (existing content is replaced).
    """
    with open(output_srt_path, "w", encoding="utf-8") as srt_file:
        for cue_number, cue in enumerate(segments, start=1):
            timing_line = f"{format_time(cue['start'])} --> {format_time(cue['end'])}"
            caption = cue['text'].strip()
            # Each cue is: index, timing line, text, then a blank separator line.
            cue_lines = (str(cue_number), timing_line, caption)
            srt_file.write("\n".join(cue_lines) + "\n\n")

In [6]:
# Main Code
# Runs the full pipeline: video -> audio -> Whisper segments -> SRT file.

# Paths (read from the `config` module already imported in the first cell;
# it is not re-imported here so all imports stay in one place).
video_path = config.video_path  # Input video file
audio_path = config.audio_path  # Extracted audio file
srt_path = config.srt_path  # Output SRT file

# Process
print("Extracting audio...")
extract_audio_from_video(video_path, audio_path)

print("Transcribing audio...")
transcription_segments = transcribe_audio(audio_path)

print("Generating SRT file...")
write_srt_file(transcription_segments, srt_path)

print(f"Subtitle file created: {srt_path}")

Extracting audio...


ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

Transcribing audio...


  checkpoint = torch.load(fp, map_location=device)


Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: Tagalog
[00:00.000 --> 00:18.000]  I love you, I thought your love was real
[00:18.000 --> 00:21.000]  But it didn't take long for the real color to come out
[00:21.000 --> 00:25.000]  Your eyebrows are always up and always slim
[00:25.000 --> 00:28.000]  But I'm the one who paid for your lack of gas
[00:28.000 --> 00:31.000]  You're like Spiggs, your face is like a stick
[00:31.000 --> 00:35.000]  If you're shy, you'll really get shrieked
[00:35.000 --> 00:38.000]  Girlie, pretty, bye bye, don't tell a lie
[00:38.000 --> 00:41.000]  Why do you always deny me?
[00:41.000 --> 00:44.000]  All the good's are done, there's no recognition
[00:44.000 --> 00:48.000]  You like to get elections, you lost my attention
[00:48.000 --> 00:51.000]  Yo anyway, everyday, why do you always get text?
[00:51.000 --> 00:54.000]  And then one time I caught you having sex
[00:54.000 --> 00:57.000