In [1]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

https://huggingface.co/blog/fine-tune-whisper

### Download datasts

In [2]:
from datasets import load_dataset, DatasetDict, concatenate_datasets

common_voice_de = DatasetDict()

common_voice_de["train"] = load_dataset("mozilla-foundation/common_voice_13_0", "de", split="train+validation", use_auth_token=True, cache_dir="../datasets")
common_voice_de["test"] = load_dataset("mozilla-foundation/common_voice_13_0", "de", split="test", use_auth_token=True, cache_dir="../datasets")

print(common_voice_de)

Found cached dataset common_voice_13_0 (/home/ruzickal/Code/Privat/Whisper/notebooks/../datasets/mozilla-foundation___common_voice_13_0/de/13.0.0/22809012aac1fc9803eaffc44122e4149043748e93933935d5ea19898587e4d7)
Found cached dataset common_voice_13_0 (/home/ruzickal/Code/Privat/Whisper/notebooks/../datasets/mozilla-foundation___common_voice_13_0/de/13.0.0/22809012aac1fc9803eaffc44122e4149043748e93933935d5ea19898587e4d7)


DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 556580
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 16143
    })
})


In [6]:
#common_voice_en = DatasetDict()

#common_voice_en["train"] = load_dataset("mozilla-foundation/common_voice_13_0", "en", split="train+validation", use_auth_token=True, cache_dir="../datasets")
#common_voice_en["test"] = load_dataset("mozilla-foundation/common_voice_13_0", "en", split="test", use_auth_token=True, cache_dir="../datasets")

In [None]:
#print(common_voice_en)

### Setup Datasets

In [3]:
from datasets import Audio

common_voice_de = common_voice_de.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"])
#common_voice_en = common_voice_en.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"])

#common_voice_de = concatenate_datasets([common_voice_de["train"], common_voice_de["test"]])
#common_voice_en = concatenate_datasets([common_voice_en["train"], common_voice_en["test"]])
common_voice_de = common_voice_de.cast_column("audio", Audio(sampling_rate=16000))
#common_voice_en = common_voice_en.cast_column("audio", Audio(sampling_rate=16000))


### Setup input pipeline

In [6]:
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-medium")


In [7]:
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-medium", language="german", task="transcribe")
# tokenizer.set_prefix_tokens(language="enlish")

In [2]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2", task="transcribe", language="german")
processor.tokenizer.save_pretrained("../models/whisper-large-v2")
processor.feature_extractor.save_pretrained("../models/whisper-large-v2")

['../models/whisper-large-v2/preprocessor_config.json']

In [8]:
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array 
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids 
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

In [8]:

common_voice_de_train = common_voice_de["train"].select(range(1000)).map(prepare_dataset,  num_proc=4)
common_voice_de_test = common_voice_de["test"].select(range(1000)).map(prepare_dataset,  num_proc=4)
#common_voice_de = common_voice_de.map(prepare_dataset, remove_columns=common_voice_de.column_names["train"], num_proc=4)
#common_voice_en = common_voice_en.map(prepare_dataset, remove_columns=common_voice_en.column_names["train"], num_proc=4)


Loading cached processed dataset at /home/ruzickal/Code/Privat/Whisper/datasets/mozilla-foundation___common_voice_13_0/de/13.0.0/22809012aac1fc9803eaffc44122e4149043748e93933935d5ea19898587e4d7/cache-86709cdfcc3a071b_*_of_00004.arrow
Loading cached processed dataset at /home/ruzickal/Code/Privat/Whisper/datasets/mozilla-foundation___common_voice_13_0/de/13.0.0/22809012aac1fc9803eaffc44122e4149043748e93933935d5ea19898587e4d7/cache-512bf8d5376d8080_*_of_00004.arrow


In [9]:
common_voice_de_train

Dataset({
    features: ['audio', 'sentence', 'variant', 'input_features', 'labels'],
    num_rows: 1000
})

In [10]:
common_voice_de

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence', 'variant'],
        num_rows: 556580
    })
    test: Dataset({
        features: ['audio', 'sentence', 'variant'],
        num_rows: 16143
    })
})

### Train & Evaluate

In [6]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previ
        # ous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [7]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [9]:
import evaluate

metric = evaluate.load("wer")


In [10]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [3]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")

model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [4]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="../models/whisper-large-v2",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=True,
    fp16=True,
    no_cuda=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
)

In [11]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    #train_dataset=common_voice_de_train,
    #eval_dataset=common_voice_de_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 3.94 GiB total capacity; 3.51 GiB already allocated; 4.69 MiB free; 3.59 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
trainer.train()

### Upload to the hub

In [None]:
kwargs = {
    "dataset_tags": "mozilla-foundation/common_voice_13_0",
    "dataset": "Common Voice 13.0",  # a 'pretty' name for the training dataset
    "dataset_args": "config: de, split: test",
    "language": "de",
    "model_name": "Whisper v2 Large German - Laurenz Ruzicka",  # a 'pretty' name for your model
    "finetuned_from": "bofenghuang/whisper-large-v2-cv11-german",
    "tasks": "automatic-speech-recognition",
    "tags": "hf-asr-leaderboard",
}


In [None]:
trainer.push_to_hub(**kwargs)

In [None]:
trainer.save_model("../models/whisper-large-v2")

### Convert to ONNX

https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model

Leave out the optimize argument if the model should run on more general inference hardware than the onnx runtime

In [None]:
!optimum-cli export onnx --model ../models/whisper-large-v2-cv13-ge --output ../onnx_models/whisper-large-v2-cv13-ge --task automatic-speech-recognition --device cuda --optimize 04

In [None]:
!optimum-cli export onnx --model openai/whisper-medium --output ../onnx_models/whisper-medium --task automatic-speech-recognition --device cuda --optimize 04

### Load from the hub

In [3]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor

#model = WhisperForConditionalGeneration.from_pretrained("Znerual/whisper-large-v2-cv13-ge")
#processor = WhisperProcessor.from_pretrained("Znerual/whisper-large-v2-cv13-ge")

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")
processor = WhisperProcessor.from_pretrained("openai/whisper-medium")

#model.save_pretrained("../models/whisper-medium")


### Build Interface to test it

In [5]:
from transformers import pipeline
import gradio as gr

#pipe = pipeline(model="Znerual/whisper-large-v2-cv13-ge")  # change to "your-username/the-name-you-picked"
pipe = pipeline(model="openai/whisper-medium", generate_kwargs = {"language":"<|de|>","task": "transcribe"})  # change to "your-username/the-name-you-picked"

def transcribe(audio):
    text = pipe(audio)["text"]
    return text

iface = gr.Interface(
    fn=transcribe, 
    inputs=gr.Audio(source="microphone", type="filepath"), 
    outputs="text",
    title="Whisper German",
    description="Realtime demo for German speech recognition using a fine-tuned Whisper large v2 model.",
)

iface.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.






### Convert to faster-whisper Transformer

https://github.com/guillaumekln/faster-whisper
https://opennmt.net/CTranslate2/guides/transformers.html

In [6]:
from torch import nn
from torch.utils.data.dataloader import DataLoader
import functools
from tqdm import tqdm

def get_act_scales(model, dataset, num_samples,):
    model.eval()
    device = next(model.parameters()).device
    act_scales = {}
    
    def stat_tensor(name, tensor):
        hidden_dim = tensor.shape[-1]
        tensor = tensor.view(-1, hidden_dim).abs().detach()
        comming_max = torch.max(tensor, dim=0)[0].float().cpu()
        if name in act_scales:
            act_scales[name] = torch.max(act_scales[name], comming_max)
        else:
            act_scales[name] = comming_max

    def stat_input_hook(m, x, y, name):
        if isinstance(x, tuple):
            x = x[0]
        stat_tensor(name, x)

    hooks = []
    for name, m in model.named_modules():
        if isinstance(m, nn.Linear):
            hooks.append(
                m.register_forward_hook(
                    functools.partial(stat_input_hook, name=name))
            )
            
    test_dataloader = DataLoader(dataset, batch_size=1, collate_fn=data_collator)
    for step, batch in tqdm(enumerate(test_dataloader)):
        if step > num_samples:
            break
        
        batch = batch.to(device)
        model(batch)

    for h in hooks:
        h.remove()

    return act_scales
    
act_scales = get_act_scales(model, common_voice_de["test"], 128)
torch.save(act_scales, "act_scales.pt")

NameError: name 'common_voice_de' is not defined

In [14]:
import ctranslate2
converter = ctranslate2.converters.TransformersConverter("openai/whisper-large-v2",  load_as_float16=True) # activation_scales="act_scales.pt",
converter.convert("../quantized_models/whisper-large-v2", quantization="int8", force=True) # or use "int8, int16, float16"

'../quantized_models/whisper-large-v2'

### Run faster-whisper

In [17]:
from faster_whisper import WhisperModel
import logging
ctranslate2.set_log_level(logging.INFO)
model = WhisperModel("../quantized_models/whisper-large-v2", device="cpu", compute_type="int8")

segments, _ = model.transcribe("../whisper.cpp/samples/jfk.wav", word_timestamps=True, vad_filter=True, vad_parameters=dict(min_silence_duration_ms=500))

for segment in segments:
    for word in segment.words:
        print("[%.2fs -> %.2fs, %d] %s" % (word.start, word.end, segment.avg_logprob, word.word))

[2023-05-21 11:21:18.323] [ctranslate2] [thread 34050] [info] Loaded model ../quantized_models/whisper-large-v2 on device cpu:0
[2023-05-21 11:21:18.323] [ctranslate2] [thread 34050] [info]  - Binary version: 6
[2023-05-21 11:21:18.324] [ctranslate2] [thread 34050] [info]  - Model specification revision: 3
[2023-05-21 11:21:18.324] [ctranslate2] [thread 34050] [info]  - Selected compute type: int8


[0.00s -> 0.56s, 0]  And
[0.56s -> 0.88s, 0]  so
[0.88s -> 1.22s, 0]  my
[1.22s -> 1.56s, 0]  fellow
[1.56s -> 2.22s, 0]  Americans,
[3.32s -> 3.78s, 0]  ask
[3.78s -> 4.36s, 0]  not
[4.52s -> 5.58s, 0]  what
[5.58s -> 5.84s, 0]  your
[5.84s -> 6.30s, 0]  country
[6.30s -> 6.64s, 0]  can
[6.64s -> 6.84s, 0]  do
[6.84s -> 7.08s, 0]  for
[7.08s -> 7.50s, 0]  you,
[8.16s -> 8.54s, 0]  ask
[8.54s -> 8.80s, 0]  what
[8.80s -> 9.08s, 0]  you
[9.08s -> 9.36s, 0]  can
[9.36s -> 9.58s, 0]  do
[9.58s -> 9.80s, 0]  for
[9.80s -> 10.02s, 0]  your
[10.02s -> 10.34s, 0]  country.


In [None]:
import gradio as gr


def transcribe(audio):
    segments, _ = model.transcribe(audio)
    output_text = []
    for segment in segments:
        for word in segment.words:
            output_text.append("[%.2fs -> %.2fs] %s" % (word.start, word.end, word.word))
    return " ".join(output_text)

iface = gr.Interface(
    fn=transcribe, 
    inputs=gr.Audio(source="microphone", type="filepath"), 
    outputs="text",
    title="Whisper German",
    description="Realtime demo for German speech recognition using a fine-tuned Whisper large v2 model.",
)

iface.launch()

### Alternative approach

https://github.com/ggerganov/whisper.cpp

In order to obtain timestamps, use the -ml 1 argument (at least for the main code where the input is an audio file)

In [None]:
#!python ../whisper.cpp/models/convert-pt-to-ggml.py ../models/whisper-large-v2-cv13-ge.pt ../whisper ../ggml_models/whisper-large-v2-cv13-ge

In [None]:
!python ../whisper.cpp/models/convert-h5-to-ggml.py ../models/whisper-large-v2-cv13-ge ../whisper ../ggml_models/whisper-large-v2-cv13-ge

In [None]:
%cd ../whisper.cpp

In [None]:
!WHISPER_CUBLAS=1 make -j

In [None]:
!make stream

In [None]:
%cd ..

In [None]:
!./whisper.cpp/stream -m ./ggml_models/whisper-large-v2-cv13-ge.bin -t 8 --step 500 --length 5000 --print-colors

In [None]:
!./whisper.cpp/main -m ./ggml_models/whisper-large-v2-cv13-ge.bin -f ./whisper.cpp/samples/jfk.wav --print-colors