In [12]:
import torch
from transformers import WhisperForConditionalGeneration

In [13]:
from datasets import load_dataset, DatasetDict

# Load the "train" and "test" splits of the TIMIT ASR dataset from the local
# ./timit directory and collect them in a single DatasetDict keyed by split name.
timit = DatasetDict(
    {
        split: load_dataset("timit_asr", data_dir="./timit", split=split)
        for split in ("train", "test")
    }
)

# Bare last expression so the notebook displays the splits, columns and row counts.
timit

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'text', 'phonetic_detail', 'word_detail', 'dialect_region', 'sentence_type', 'speaker_id', 'id'],
        num_rows: 4620
    })
    test: Dataset({
        features: ['file', 'audio', 'text', 'phonetic_detail', 'word_detail', 'dialect_region', 'sentence_type', 'speaker_id', 'id'],
        num_rows: 1680
    })
})

In [14]:
# Identifier of the pretrained Whisper checkpoint; the feature extractor is
# loaded from this name via `from_pretrained`.
model_name = "openai/whisper-tiny.en"

In [15]:
from transformers import WhisperFeatureExtractor

# Feature extractor loaded from the `model_name` checkpoint; `prepare_dataset`
# calls it on each audio array to compute the model's input features.
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)

In [16]:
from transformers import WhisperTokenizerFast

# The tokenizer is built from the local tokenizer/tokenizer.json file (not from
# `model_name`); `prepare_dataset` uses it to encode phone strings into label ids
# and the inference cell uses it to decode generated ids back to text.
tokenizer = WhisperTokenizerFast(tokenizer_file="tokenizer/tokenizer.json")

In [17]:
def prepare_dataset(example):
    """Add model inputs and training labels to a single dataset row.

    Reads ``example["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries)
    and ``example["phonetic_detail"]["utterance"]``, then writes two new keys
    onto the same mapping:

    * ``"input_features"`` - the first item of the ``input_features`` produced
      by the module-level ``feature_extractor`` for the audio array.
    * ``"labels"`` - the ``input_ids`` produced by the module-level ``tokenizer``
      for the utterance's phone symbols joined with single spaces.

    The row is modified in place and the same object is returned.
    """
    clip = example["audio"]

    # The sampling rate stored with the clip is forwarded unchanged; no
    # resampling is performed here.
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    example["input_features"] = extracted.input_features[0]

    # The label text is the sequence of phone symbols, space-separated.
    phone_text = " ".join(example["phonetic_detail"]["utterance"])
    example["labels"] = tokenizer(phone_text).input_ids

    return example

In [18]:
# Apply prepare_dataset to every row of both splits. All original columns (taken
# from the train split's column list) are removed, so only the keys added by
# prepare_dataset ("input_features" and "labels") remain.
timit_clean = timit.map(
    prepare_dataset,
    num_proc=1,
    remove_columns=timit.column_names["train"],
)

In [19]:
# Features of the first test example, with a leading batch dimension added.
input_features = torch.tensor(timit_clean["test"][0]["input_features"])[None]

# Load the model from the local checkpoint-1000 directory.
model_trained = WhisperForConditionalGeneration.from_pretrained(
    "./openai/whisper-tiny.en-timit/checkpoint-1000"
)

generated_ids = model_trained.generate(input_features=input_features)

# NOTE(review): the recorded output keeps emitting "h#" long after the reference
# utterance ends - presumably generation is not stopping at end-of-sequence.
# Verify that the checkpoint's generation config (eos/suppressed token ids)
# matches the custom tokenizer's vocabulary.
transcription = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
transcription

'sh iy tcl ae dcl d axr dcl d aa r kcl k s ux tcl t ih n gcl r iy s iy epi epi epi ao sh epi epi ao dx er q ao dh iy axr h# h# h# h# h# h# h# h# h# h# h# sh iy epi aa r kcl k epi ax h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# dh ax tcl h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# dh ax tcl s epi epi dh ax tcl s epi epi dh ax tcl h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# dh ax tcl s epi dh ax tcl s epi epi dh ax tcl h# h# h# h# h# h# h# h# h# tcl s epi dh ax tcl s epi epi dh ax tcl s epi epi dh ax tcl s epi epi epi dh ax tcl s epi epi epi dh ax tcl h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# h# t

In [20]:
# Reference phone sequence for test example 0 from the raw dataset, joined with
# spaces exactly as prepare_dataset builds its label text, for comparison with
# the model's transcription.
" ".join(timit["test"][0]["phonetic_detail"]["utterance"])

'h# sh iy hv ae dcl d y er dcl d aa r kcl k s uw dx ih ng gcl g r iy s iy w aa sh epi w aa dx er q ao l y iy axr h#'