In [1]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_from_disk
from evaluate import load

In [14]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so the notebook still runs (slowly) on machines without a GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model_id = "openai/whisper-large-v3"

# Load the pretrained speech-to-text model and move it to the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, low_cpu_mem_usage=True,
)
model.to(device)

# The processor bundles the tokenizer and feature extractor that match the model.
processor = AutoProcessor.from_pretrained(model_id)

# Speech-recognition pipeline used by every later cell.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,
    batch_size=1,  # batch size for inference - set based on your device
    device=device,
)

Device set to use cuda:0


In [4]:
# Evaluation datasets keyed by a short name; each one is read from a directory
# previously saved to disk.
dss = dict(cslu=load_from_disk("../data/cslu_kids.ds"))

In [16]:
# Smoke test: transcribe the first recording and print the reference
# transcript followed by the raw pipeline output for a side-by-side look.
first_row = dss["cslu"][0]
sample = first_row["audio"]

result = pipe(sample)
print(first_row["sentence"])
print(result)

<bn> a b c d e f g <br> h i j k<ln> l m n o p<ln> <br> q r s t u v w x y and z <bn> <pau> my<bn> family<bn> <bn> she<bn> went<bn> to<bn> go<bn> pick<bn> up<bn> my<bn> little<bn> sister<bn> and<bn> she's gonna<bn> <br> come<bn> tomorrow she's gonna come at eleven <pau> yeah <pau> <bn> okay <bn> clean my room <bn> and<bn> then<bn> when<bn> i'm<bn> done<bn> i<bn> get<bn> to<bn> play<bn> with<bn> my<bn> friend<bn> <pau> brittney we go over to her house and we play barbies <pau> and <br> we uhm <pau> we ride our bikes after we're done and then we eat some ice cream <pau> i have four sisters <pau> <bs> one's fifteen <bn> th* four* thirteen <br> and ten and one's five <pau> yeah <pau> <bn> they're nice and they let me <br> uhm watch tv<ln> in their room <bs> and uhm <br> <pau> and<bn> <br> she<bn> when sometimes<ln> when i <br> do a little bit of chores <br> she gives me a dollar
{'text': " A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V, W, X, Y, and Z. My family. She went t

# Run on all data

In [None]:
# Transcribe every recording. For a list of inputs the pipeline returns one
# result dict per input, so the text has to be read from each item
# (indexing the list itself with "text" raises TypeError).
results = pipe(dss["cslu"]["audio"])

# Print only a short preview; dumping every transcript would flood the output.
for item in results[:5]:
    print(item["text"])

In [28]:
import re
from text2digits import text2digits
import string

# Number-word converter, built once and reused by normalize_transcript.
t2d = text2digits.Text2Digits()
# Translation table that maps every character in string.punctuation to None,
# i.e. deletes it when passed to str.translate.
punctuation_remover = str.maketrans(dict.fromkeys(string.punctuation))

def normalize_transcript(text):
    """Normalize a transcript so references and ASR output can be compared.

    Steps, in order: strip angle-bracket annotation tags, drop false-start
    tokens ending in ``*``, delete punctuation, collapse whitespace, convert
    number words to digits, then strip and lowercase.

    Args:
        text: Raw transcript string (human reference or ASR output).

    Returns:
        The normalized, lowercased transcript. If number conversion fails,
        the offending text is printed and the unconverted (but otherwise
        cleaned) text is used instead.
    """
    # The original transcript has annotations, for example a pause is <pau>
    # Remove tags in angle brackets
    text = re.sub(r'<[^>]*>', '', text)

    # These are "false starts" in the original transcript, for example th*
    # These are ignored by ASR
    # Remove words that end with asterisks (e.g., th*)
    text = re.sub(r'\S*\*', '', text)

    # Remove all punctuation
    text = text.translate(punctuation_remover)

    # Clean up excess spaces in the original transcript or resulting from above operations
    text = re.sub(r'\s+', ' ', text)

    # Convert number representations, e.g., "thirteen" to "13"
    # This is imperfect (does not know when "one" is a pronoun vs. a number)
    # But we apply the same normalization to both samples, so works fine for Word Error Rate
    try:
        normalized_text = t2d.convert(text)
    except Exception:
        # Previously a failure here left `normalized_text` unbound and the
        # return below raised UnboundLocalError. Report the offending text and
        # fall back to the cleaned text without number conversion.
        print(text)
        normalized_text = text
    return normalized_text.strip().lower()

In [30]:
# Score the ASR output against the reference transcripts with Word Error Rate.
# Both sides go through the same normalization so that formatting differences
# are not counted as errors.
wer = load("wer")
predictions = [normalize_transcript(item["text"]) for item in results]
references = list(map(normalize_transcript, dss["cslu"]["sentence"]))
wer_score = wer.compute(
    predictions=predictions,
    references=references,
)
print(wer_score)

0.20958586984480837
