# **Test distil-whisper**

**distil-small**

In [None]:
!pip install -q datasets bitsandbytes accelerate

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import time

# Model setup code
model_id = "distil-whisper/distil-small.en"

# Use the first CUDA device with half precision when a GPU is available;
# otherwise fall back to the CPU with full precision.
if torch.cuda.is_available():
    device = "cuda:0"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

# Load the speech-to-text model weights and move them to the chosen device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor supplies both the tokenizer and the feature extractor
# that the pipeline is built from.
processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    torch_dtype=torch_dtype,
    device=device,
)

# Open the training split in streaming mode and keep a single iterator;
# the transcription cells pull one record at a time from it with next().
dataset_repo = "johnlohjy/imda_nsc_p3_same_closemic_train"
dataset = load_dataset(
    dataset_repo,
    split='train',
    streaming=True,
    trust_remote_code=True,
)
dataset_iter = iter(dataset)

Device set to use cpu


In [None]:
# Pull the next streamed record and keep only its audio entry, which is
# what gets handed to the speech-recognition pipeline.
sample = next(dataset_iter)["audio"]

# Time only the pipeline call. perf_counter() is a monotonic,
# high-resolution clock intended for measuring intervals, unlike the
# wall-clock time.time(), which can jump if the system clock is adjusted.
start_time = time.perf_counter()
result = pipe(sample)
end_time = time.perf_counter()

print(f'The transcription is {result} \n')
print(f'It took {end_time-start_time} seconds')



The transcripton is {'text': " You can go first? You guys are gonna stand here? They're like, well, this is a weird topic. Singapore and Malaysia are like, you know, brothers, but not really brothers, brothers on a tricky relationship. You know what, let's keep this topic. Next, do I go next? Do I go next? Okay, Hingsui, what's the best worst thing, best or worst thing that can happen to you in Singapore?"} 

It took 19.30585479736328


In [None]:
# Pull the next streamed record and keep only its audio entry, which is
# what gets handed to the speech-recognition pipeline.
sample = next(dataset_iter)["audio"]

# Time only the pipeline call. perf_counter() is a monotonic,
# high-resolution clock intended for measuring intervals, unlike the
# wall-clock time.time(), which can jump if the system clock is adjusted.
start_time = time.perf_counter()
result = pipe(sample)
end_time = time.perf_counter()

print(f'The transcription is {result} \n')
print(f'It took {end_time-start_time} seconds')

In [None]:
# Pull the next streamed record and keep only its audio entry, which is
# what gets handed to the speech-recognition pipeline.
sample = next(dataset_iter)["audio"]

# Time only the pipeline call. perf_counter() is a monotonic,
# high-resolution clock intended for measuring intervals, unlike the
# wall-clock time.time(), which can jump if the system clock is adjusted.
start_time = time.perf_counter()
result = pipe(sample)
end_time = time.perf_counter()

print(f'The transcription is {result} \n')
print(f'It took {end_time-start_time} seconds')

In [None]:
# Pull the next streamed record and keep only its audio entry, which is
# what gets handed to the speech-recognition pipeline.
sample = next(dataset_iter)["audio"]

# Time only the pipeline call. perf_counter() is a monotonic,
# high-resolution clock intended for measuring intervals, unlike the
# wall-clock time.time(), which can jump if the system clock is adjusted.
start_time = time.perf_counter()
result = pipe(sample)
end_time = time.perf_counter()

print(f'The transcription is {result} \n')
print(f'It took {end_time-start_time} seconds')