## SETUP

In [1]:
import torch, torchaudio
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
print(device , torch_dtype)

model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
    generate_kwargs={"language": "english"}
)

# Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

cpu torch.float32


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
def chunks_to_vtt(chunks):
    vtt_content = "WEBVTT\n\n"
    for idx, chunk in enumerate(chunks):
        start_time = chunk["timestamp"][0]
        end_time = chunk["timestamp"][1]
        text = chunk["text"].strip()
        vtt_content += f"{idx+1}\n"  # optional
        vtt_content += f"{format_time(start_time)} --> {format_time(end_time)}\n"
        vtt_content += f"{text}\n\n"
    return vtt_content

def format_time(seconds):
    hours = int(seconds / 3600)
    seconds %= 3600
    minutes = int(seconds / 60)
    seconds %= 60
    milliseconds = int((seconds - int(seconds)) * 1000)
    return f"{hours:02}:{minutes:02}:{int(seconds):02}.{milliseconds:03}"


# ------------------------------------------------------------------------------------------------------------------------------------------------------------------


def convert_to_srt(chunks):
    srt_content = ''
    for i, chunk in enumerate(chunks, start=1):
        start_time = chunk['timestamp'][0]
        end_time = chunk['timestamp'][1]
        text = chunk['text'].strip()
        srt_content += f"{i}\n"
        srt_content += f"{format_srt_time(start_time)} --> {format_srt_time(end_time)}\n"
        srt_content += f"{text}\n\n"
    return srt_content

def format_srt_time(seconds):
    milliseconds = int(seconds * 1000)
    hours, milliseconds = divmod(milliseconds, 3600000)
    minutes, milliseconds = divmod(milliseconds, 60000)
    seconds = milliseconds / 1000
    return "{:02d}:{:02d}:{:06.3f}".format(hours, minutes, seconds)

## AUDIO 1

In [87]:
audio_file_path = "/Users/sohanm/EMEA/OCP/240201-080649-570711.mp3"
waveform, sample_rate = torchaudio.load(audio_file_path)

print("Sampling Rate :", sample_rate)
print("Waveform Shape :", waveform.shape)

Sampling Rate : 48000
Waveform Shape : torch.Size([1, 9872160])


In [88]:
transcription = pipe(audio_file_path,
                    chunk_length_s=30,
                    stride_length_s=5,
                    batch_size=8,
                    generate_kwargs={"language": "french"})
transcription

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


{'text': " Bonjour. Oui, bonjour dame. Pharmacie d'Elysse à Montagnac. Oui. Référence 814-566. Alors, 814. Oui. Alors, je vous appelle parce qu'on a été livré cette nuit, après l'ouverture de la pharmacie. Et il nous manque un bac. Ah, on va vérifier ensemble. Alors, on va regarder ça de suite. En tout cas, il n'y a pas beaucoup de commandes, mais là, pour le coup... J'ai des produits urgents, ça m'embête un peu. Ouf. On va vérifier, je me mets juste sur... Ouh là là, j'ai sié les rames un peu. On va faire ça. Ça. Toc, toc, hop, voilà. Toc. 814, 566, hop. Hop, hop, hop. Voilà. Montpellier, Montpellier, Montpellier. Du coup, je suppose qu'on n'avait pas eu le BL. Alors, si vous voulez, j'ai tout eu. J'ai bien reçu les factures par mail. J'ai tout reçu, sauf les brûlures. Mais du coup, s'il vous plaît, je vais vous demander le numéro d'ordre sur le B.L. en haut à droite. Alors, attendez, parce que j'ai la facture. Vous avez le numéro du document sur la facture. Alors, c'est numéro de com

In [89]:
vtt_content = chunks_to_vtt(transcription["chunks"])

# Convert chunks to VTT format
file_name = audio_file_path.split(".mp3")[0]
with open(file_name+".txt", "w") as f:
    f.write(vtt_content)

TypeError: unsupported operand type(s) for /: 'NoneType' and 'int'

In [66]:
srt_content = convert_to_srt(transcription["chunks"])

# Convert chunks to SRT format
file_name = audio_file_path.split(".mp4")[0]
with open(file_name+"_output.srt", "w") as f:
    f.write(srt_content)

TypeError: unsupported operand type(s) for *: 'NoneType' and 'int'

## AUDIO 2

In [None]:
audio_file_path = "/Users/gautam/Code/WatsonX_Foundations/project_gse/results/IBM Tech Now_ IBM watsonx.ai demo, AI and sustainability, and the AI Bundle for IBM Z and LinuxONE.mp4"
waveform, sample_rate = torchaudio.load(audio_file_path)

print("Sampling Rate :", sample_rate)
print("Waveform Shape :", waveform.shape)

In [None]:
transcription = pipe(audio_file_path,
                    chunk_length_s=30,
                    stride_length_s=5,
                    batch_size=8,
                    generate_kwargs={"language": "english"})
transcription

In [None]:
vtt_content = chunks_to_vtt(transcription["chunks"])

file_name = audio_file_path.split(".mp4")[0]
with open(file_name+"_output.vtt", "w") as f:
    f.write(vtt_content)                             

## AUDIO 3

In [None]:
audio_file_path = "/Users/gautam/Code/WatsonX_Foundations/project_gse/results/Customer Care L1 Intro Video.MP4"
waveform, sample_rate = torchaudio.load(audio_file_path)

print("Sampling Rate :", sample_rate)
print("Waveform Shape :", waveform.shape)

In [None]:
transcription = pipe(audio_file_path,
                    chunk_length_s=30,
                    stride_length_s=5,
                    batch_size=8,
                    generate_kwargs={"language": "english"})
transcription

In [None]:
vtt_content = chunks_to_vtt(transcription["chunks"])

file_name = audio_file_path.casefold().split(".mp4")[0]
with open(file_name+"_output.vtt", "w") as f:
    f.write(vtt_content)

## AUDIO 4

In [None]:
audio_file_path = "/Users/gautam/Code/WatsonX_Foundations/project_gse/results/Facilitation Introduction to the course.MP4"
waveform, sample_rate = torchaudio.load(audio_file_path)

print("Sampling Rate :", sample_rate)
print("Waveform Shape :", waveform.shape)

In [None]:
transcription = pipe(audio_file_path,
                    chunk_length_s=30,
                    stride_length_s=5,
                    batch_size=8,
                    generate_kwargs={"language": "english"})
transcription

In [None]:
vtt_content = chunks_to_vtt(transcription["chunks"])

file_name = audio_file_path.casefold().split(".mp4")[0]
with open(file_name+"_output.vtt", "w") as f:
    f.write(vtt_content)