In [None]:
from datetime import timedelta
import os
import ffmpeg
import torch 
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline



def _format_srt_timestamp(seconds):
    """Render an offset in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    # Work in whole milliseconds so rounding can never spill into the next
    # field (e.g. 59.9996 s must become 00:01:00,000, not 00:00:59,1000).
    total_ms = max(0, round(float(seconds) * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    whole_seconds, millis = divmod(remainder, 1_000)
    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{millis:03d}"


def _write_srt(srt_filename, segments, texts):
    """Write one numbered SRT cue per segment, overwriting ``srt_filename``."""
    # Mode "w" (not "a") so re-running the notebook does not duplicate cues.
    with open(srt_filename, "w", encoding="utf-8") as srt_file:
        for cue_id, (segment, text) in enumerate(zip(segments, texts), start=1):
            start, end = segment["timestamp"]
            # Guard against a missing end time, which would otherwise crash
            # the numeric conversion; fall back to a zero-length cue.
            if end is None:
                end = start
            cue_text = text[1:] if text.startswith(" ") else text
            srt_file.write(
                f"{cue_id}\n"
                f"{_format_srt_timestamp(start)} --> {_format_srt_timestamp(end)}\n"
                f"{cue_text}\n\n"
            )


def transcribe_audio(path):
    """Transcribe an audio file with Whisper and write original + translated SRT files.

    The transcript is written to ``SrtFiles/<basename>.srt`` and its
    translation to ``SrtFiles/<basename>_translated.srt``. Both files are
    overwritten on every call.

    Reads three notebook-level variables that must be defined before calling:
    ``language`` (spoken language passed to Whisper) and ``src_lang`` /
    ``tgt_lang`` (language codes passed to the translation pipeline).

    Args:
        path: Path of the audio file to transcribe.

    Returns:
        Path of the translated SRT file.
    """
    # Fall back to the CPU so the function also runs on machines without CUDA.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    model_id = "openai/whisper-large-v3-turbo"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_id)
    # Chunked decoding: the original author notes this works, whereas
    # sequential decoding has a bug that resets the timestamps.
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        chunk_length_s=30,
        batch_size=32,
        torch_dtype=torch_dtype,
        device=device,
    )
    print("Whisper model loaded.")

    # The language is passed at generation time instead of through
    # model.config.forced_decoder_ids, whose default prompt disables timestamps.
    transcription = pipe(
        path,
        return_timestamps=True,
        generate_kwargs={"language": language},
    )
    segments = transcription["chunks"]

    free_vram(model)

    os.makedirs("SrtFiles", exist_ok=True)
    # Only the file name is used, so inputs with a directory component (or an
    # absolute path) still land inside SrtFiles/.
    base_name = os.path.basename(path)

    transcript_filename = os.path.join("SrtFiles", f"{base_name}.srt")
    _write_srt(transcript_filename, segments, [seg["text"] for seg in segments])

    translator = pipeline("translation", model="facebook/nllb-200-distilled-1.3B")
    translated_texts = [
        translator(seg["text"].strip(), src_lang=src_lang, tgt_lang=tgt_lang)[0]["translation_text"]
        for seg in segments
    ]
    translated_filename = os.path.join("SrtFiles", f"{base_name}_translated.srt")
    _write_srt(translated_filename, segments, translated_texts)

    return translated_filename

def free_vram(model):
    """Move ``model`` to the CPU and ask PyTorch to release its cached CUDA memory.

    Args:
        model: Object exposing a ``.to(device)`` method.
    """
    model.to("cpu")
    # This only unbinds the local name; references held by the caller remain.
    del model
    torch.cuda.empty_cache()

def convert_float(number):
    """Format ``number`` with three decimals and a decimal comma.

    The value is zero-padded to a minimum width of six characters, so
    ``5.1234`` becomes ``"05,123"``.
    """
    fixed_point = f"{number:06.3f}"
    return fixed_point.replace(".", ",")



In [None]:
# Language spoken in the audio, passed to Whisper.
language = "es"
# NLLB-200 identifies languages by FLORES-200 codes; "auto" is not one of
# them, so the source language is named explicitly to match `language`.
src_lang = "spa_Latn"
tgt_lang = "eng_Latn"

# Path of the media file to subtitle; fill this in before running the cell.
file_name = ""
if not file_name:
    raise ValueError("Set file_name to the path of the media file to transcribe.")

# Extract 16 kHz mono 16-bit PCM audio once; later runs reuse the .wav file.
output_audio = file_name + ".wav"
if not os.path.exists(output_audio):
    ffmpeg.input(file_name).output(output_audio, format='wav', acodec='pcm_s16le', ar=16000, ac=1).run()

transcribe_audio(output_audio)