# Whisper benchmarking


In [None]:
%pip install --upgrade pip
# %pip install --upgrade evaluate jiwer
# Let's install torch here.  Pick out your options from https://pytorch.org/get-started/locally/
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu #CPU Only Version

import time

# import evaluate

# wer = evaluate.load("wer")
# cer = evaluate.load("cer")

## Transformers


In [None]:
# Install or upgrade the Hugging Face packages used by the Transformers benchmark below.
%pip install --upgrade transformers accelerate

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Benchmark the Hugging Face Transformers Whisper pipeline on a single audio file.
# The timer starts before the model is loaded, so the reported figure covers
# model/processor loading as well as the transcription itself.
# perf_counter() is a monotonic, high-resolution clock, so the measurement is not
# affected by system clock adjustments (`time` is imported in the first cell).
start_time = time.perf_counter()
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# CPU / float32 is pinned here; swap in the two commented lines above to use a
# GPU with float16 when one is available.
device = "cpu"
torch_dtype = torch.float32

model_id = "openai/whisper-tiny.en"

print(f"We are using {torch_dtype} on {device} with {model_id}")

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# The processor supplies both the tokenizer and the feature extractor that the
# pipeline is built from.
processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

# Audio file to transcribe; the faster-whisper cell further down reuses this name.
sample = "hp0.wav"


result = pipe(sample, generate_kwargs={"return_timestamps": True})
elapsed_time = time.perf_counter() - start_time
# The elapsed seconds are divided by 60, so the printed value is in minutes.
print(f"Elapsed time:{elapsed_time / 60}")

print(result["text"])


# with open("transformers.txt", "w") as f:
#    f.write(str(result["text"].encode("utf-8")))

## Faster Whisper


In [None]:
# Install the faster-whisper package used by the benchmark cell below.
%pip install faster-whisper

In [None]:
from faster_whisper import WhisperModel

# Benchmark faster-whisper on the same audio file as the Transformers run.
# Depends on earlier cells: `sample` is set in the Transformers cell and `time`
# is imported in the first cell.
# As in the Transformers cell, the timer starts before the model is loaded, and
# perf_counter() is used because it is monotonic and high-resolution.
start_time = time.perf_counter()
model_size = "tiny.en"
# model_size = "deepdml/faster-whisper-large-v3-turbo-ct2"

# Run on GPU with FP16
# model = WhisperModel(model_size, device="cuda", compute_type="float16")

# or run on GPU with INT8
# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
# or run on CPU; compute_type="auto" leaves the precision choice to the library
model = WhisperModel(model_size, device="cpu", compute_type="auto")

segments, info = model.transcribe(sample, beam_size=5)

print(
    f"Detected language '{info.language}' "
    f"with probability {info.language_probability:f}"
)

# NOTE(review): the faster-whisper README describes `segments` as a lazy
# generator, i.e. the decoding work happens while it is consumed here. Keep the
# timer stop below this join so the transcription is actually measured.
calltext = "".join(segment.text for segment in segments)

print(calltext)

elapsed_time = time.perf_counter() - start_time
# The elapsed seconds are divided by 60, so the printed value is in minutes.
print(f"Elapsed time:{elapsed_time / 60}")