In [1]:
import ffmpeg
import whisper
import torch
import pathConfig
from modelConfig import ModelConfig

In [2]:
# Select the compute device: the GPU when PyTorch reports CUDA support, the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(
    "CUDA is available. Using GPU."
    if device.type == "cuda"
    else "CUDA is not available. Using CPU."
)

CUDA is available. Using GPU.


In [3]:
# Step 1: Extract audio from video
def extract_audio_from_video(video_path, output_audio_path):
    """Run ffmpeg on ``video_path`` and write the result to ``output_audio_path``.

    No explicit codec or stream options are passed. An existing file at
    ``output_audio_path`` is overwritten.

    Args:
        video_path: Path of the input media file handed to ffmpeg.
        output_audio_path: Destination path for ffmpeg's output.
    """
    source_stream = ffmpeg.input(video_path)
    audio_job = source_stream.output(output_audio_path)
    audio_job.run(overwrite_output=True)

In [4]:
# Step 2: Transcribe audio with Whisper (task="translate", so the returned text is an English translation)
def transcribe_audio(audio_path):
    """Run Whisper on an audio file and return its timed segments.

    The model named by ``ModelConfig.MEDIUM`` is loaded on every call and
    moved to the notebook-level ``device``. Whisper is invoked with
    ``task="translate"`` and ``verbose=True``, so progress text is printed
    while it runs.

    Args:
        audio_path: Path of the audio file to process.

    Returns:
        The ``'segments'`` entry of Whisper's result; each segment is later
        read for its ``'start'``, ``'end'`` and ``'text'`` keys.
    """
    whisper_model = whisper.load_model(ModelConfig.MEDIUM)
    whisper_model = whisper_model.to(device)
    output = model_output = whisper_model.transcribe(
        audio_path,
        verbose=True,
        task="translate",
    )
    return output['segments']

In [5]:
# Step 3: Format time into SRT format
def format_time(seconds):
    """Convert a time offset in seconds to an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded once to the nearest whole millisecond and then split
    with integer arithmetic. Truncating ``(seconds % 1) * 1000`` instead loses
    a millisecond whenever the float sits just below the intended value
    (e.g. ``1.001`` would render as ``00:00:01,000``), and rounding only the
    fractional part could not carry into the seconds field.

    Args:
        seconds: Offset from the start of the media, in seconds (int or float).
            Negative values are clamped to zero.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"`` for ``3661.5``.
    """
    # Single rounding step; everything after this is exact integer math.
    total_millis = max(0, int(round(seconds * 1000)))
    hrs, remainder = divmod(total_millis, 3_600_000)
    mins, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hrs:02}:{mins:02}:{secs:02},{millis:03}"

In [6]:
# Step 4: Generate SRT file from transcription segments
def write_srt_file(segments, output_srt_path):
    """Write transcription segments to ``output_srt_path`` as numbered SRT cues.

    Each cue consists of a 1-based index line, a ``start --> end`` timing
    line built with ``format_time``, the segment text with surrounding
    whitespace stripped, and a blank separator line. The file is written as
    UTF-8 and replaces any existing file.

    Args:
        segments: Iterable of mappings with ``'start'``, ``'end'`` and
            ``'text'`` keys.
        output_srt_path: Path of the subtitle file to create.
    """
    with open(output_srt_path, "w", encoding="utf-8") as srt_file:
        cue_number = 0
        for cue in segments:
            cue_number += 1
            begin = format_time(cue['start'])
            finish = format_time(cue['end'])
            caption = cue['text']
            timing_line = f"{begin} --> {finish}"
            # The two trailing empty items produce the blank line that ends a cue.
            srt_file.write("\n".join((str(cue_number), timing_line, caption.strip(), "", "")))

In [7]:
# Main pipeline: video -> audio -> Whisper segments -> SRT file

# All locations come from the project's pathConfig module.
video_path, audio_path, srt_path = (
    pathConfig.video_path,  # Input video file
    pathConfig.audio_path,  # Extracted audio file
    pathConfig.srt_path,    # Output SRT file
)

# Stage 1: pull the audio out of the video.
print("Extracting audio...")
extract_audio_from_video(video_path, audio_path)

# Stage 2: run Whisper on the extracted audio.
print("Transcribing audio...")
transcription_segments = transcribe_audio(audio_path)

# Stage 3: serialise the timed segments as subtitles.
print("Generating SRT file...")
write_srt_file(transcription_segments, srt_path)

print(f"Subtitle file created: {srt_path}")

Extracting audio...


ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

Transcribing audio...


  checkpoint = torch.load(fp, map_location=device)


Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: English
[00:00.000 --> 00:26.160]  Finally, I get to upload this vlog stall from months and amazing trip to the Gosai
[00:26.160 --> 00:31.520]  Kunder lake with these amazing talents. You've met Manish Moharjan already and Abin Bhau.
[00:31.520 --> 00:37.800]  Introducing Concentrest to doing the hair and makeup and I'm sure you know this one very well
[00:37.800 --> 00:42.040]  by now. But just in case you don't she's got a five minute long introduction video which you can
[00:42.040 --> 01:05.200]  find on YouTube. Link in description. This helicopter ride was a last minute thing pulled off by our
[01:05.200 --> 01:11.000]  Miss Nepal. With time being constrained, we absolutely needed this. Cruising past the cities,
[01:11.040 --> 01:19.640]  over the hills and above the clouds, it just happens too fast. Close call with the mountains,
[01:19.640 --> 01:29.400]  no problem