## **Wav2Vec2 Fine-tune with Word-level tokenizer**

In [56]:
import warnings
warnings.filterwarnings("ignore")

In [57]:
# General
import random
import pandas as pd
import numpy as np
import torch
import re
import json

# Dataloader and preprocessing
from datasets import load_dataset, ClassLabel, DatasetDict, Audio
from IPython.display import display, HTML
import IPython.display as ipd
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

# Model
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2ForCTC

# Training
from datasets import load_metric
from transformers import Trainer, AutoConfig, TrainingArguments

## **Data Loading**

In [58]:
# Load the splits of the local "data_gtts" dataset into memory
print("Loading data...")

# The former `dataset = DatasetDict()` placeholder was dropped: it was overwritten
# by this assignment before ever being read.
dataset = load_dataset("data_gtts", data_dir="./data_gtts")

print(f"Train: {len(dataset['train'])} samples")
print(f"Test: {len(dataset['test'])} samples")

# Resample to 16kHz
dataset = dataset.cast_column('audio', Audio(sampling_rate=16000))
# Last expression of the cell: show the first training example
dataset['train'][0]

Using custom data configuration default-data_dir=.%2Fdata_gtts
Found cached dataset data_gtts (I:/Repos/HFdatasets/data_gtts/default-data_dir=.%2Fdata_gtts/0.1.0/99611922a2fe30672e990db44b070dc747a16dd2cb691d0d2c33dc670a2e3b68)


Loading data...


  0%|          | 0/2 [00:00<?, ?it/s]

Train: 1000 samples
Test: 250 samples


{'audio': {'path': 'data_gtts/train/BAW4QM_Knots_260.mp3',
  'array': array([ 2.9964204e-14,  5.3524551e-14, -1.3652175e-13, ...,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00], dtype=float32),
  'sampling_rate': 16000},
 'transcription': 'Speedbird Four Quebec Mike make your speed Two Six Zero knots'}

In [59]:
# Play an example data file
# random.randint includes its upper bound, so randint(0, len(...)) could return an
# index one past the end; randrange excludes the bound and is always valid.
rand_int = random.randrange(len(dataset["train"]))
# Fetch the example once and reuse it for both the text and the audio
sample = dataset["train"][rand_int]

print(sample["transcription"])
ipd.Audio(data=np.asarray(sample["audio"]["array"]), autoplay=True, rate=16000)

Astraeus Oscar Three India contact London


## **Data Preprocessing**

### Data Cleaning

In [60]:
# Clean data of special characters and normalise to lowercase
import re

# Raw string: the backslash escapes are meant for the regex engine, and in a normal
# string literal sequences such as "\," are invalid Python escapes. The pattern value
# is unchanged from the previous non-raw literal.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:"]'

def clean_data(batch):
    """Strip punctuation from a transcription, lowercase it and append one space.

    Args:
        batch: Mapping with a ``"transcription"`` string; it is updated in place.

    Returns:
        The same mapping with the cleaned ``"transcription"``.
    """
    batch["transcription"] = re.sub(chars_to_ignore_regex, '', batch["transcription"]).lower() + " "
    return batch

# Apply clean_data to every example of every split.
# NOTE(review): `dataset` is rebound to its own cleaned copy, so re-running this cell
# cleans already-cleaned text again (clean_data appends another trailing space each time).
dataset = dataset.map(clean_data)

Loading cached processed dataset at I:/Repos/HFdatasets/data_gtts/default-data_dir=.%2Fdata_gtts/0.1.0/99611922a2fe30672e990db44b070dc747a16dd2cb691d0d2c33dc670a2e3b68\cache-19f2c25dd0c5eac1.arrow
Loading cached processed dataset at I:/Repos/HFdatasets/data_gtts/default-data_dir=.%2Fdata_gtts/0.1.0/99611922a2fe30672e990db44b070dc747a16dd2cb691d0d2c33dc670a2e3b68\cache-7a36417cb0c46b29.arrow


In [61]:
# Display random data elements for demonstration
def show_random_elements(dataset, num_examples=10):
    """Render a random selection of distinct rows of ``dataset`` as an HTML table.

    Args:
        dataset: Collection that supports ``len()`` and indexing with a list of
            integer positions; the selection is passed to ``pd.DataFrame``.
        num_examples: How many distinct rows to show.

    Raises:
        AssertionError: If ``num_examples`` exceeds ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, which replaces the manual
    # "draw again while the index was already picked" loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

# Preview random training rows; the audio column is removed first so only the remaining columns are rendered
show_random_elements(dataset['train'].remove_columns(["audio"]))

Unnamed: 0,transcription
0,speedbird four four oscar resume own navigation to romeo uniform delta mike oscar
1,ryanair seven oscar two turn right three fife degrees
2,ryanair eight foxtrot two resume own navigation to hotel oscar golf bravo alfa
3,astraeus zero four niner make your speed two three zero knots
4,astraeus whiskey four six contact london
5,astraeus niner november two contact london
6,astraeus kilo papa one climb flight level four six zero
7,speedbird six victor tango route oscar kilo echo sierra india
8,ryanair juliett niner niner descend flight level four fife zero
9,astraeus four bravo echo resume own navigation direct to bravo oscar golf november alfa


### Prepare Vocabulary

In [62]:
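# First we need to define our vocabulary. For normal spoken speech, the common approach is to define a letter- or phoneme-level tokenizer.
# The code below builds a letter-level vocabulary, but it is commented out: the tokenizer is instead initialised from the existing ./model_wav2vec2/nats_vocab.json file.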
# def extract_all_chars(batch):
#   all_text = " ".join(batch["transcription"])
#   vocab = list(set(all_text))
#   return {"vocab": [vocab], "all_text": [all_text]}

# # Set up vocabulary
# vocabs = dataset.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=dataset.column_names["train"])
# vocab_list = list(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]))
# vocab_dict = {v: k for k, v in enumerate(vocab_list)}

# # Add blank tokens
# vocab_dict["|"] = vocab_dict[" "]
# del vocab_dict[" "]
# vocab_dict["[UNK]"] = len(vocab_dict)
# vocab_dict["[PAD]"] = len(vocab_dict)

In [63]:
# print(f"Vocabulary Length: {len(vocab_dict)}")
# print(vocab_dict)

# with open('./model_wav2vec2/nats_vocab.json', 'w') as vocab_file:
#     json.dump(vocab_dict, vocab_file)

### Prepare Feature Extractor and Tokenizer

In [64]:
# Build the CTC tokenizer from the vocabulary file on disk
tokenizer = Wav2Vec2CTCTokenizer(
    "./model_wav2vec2/nats_vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

# Feature extractor for the raw waveform, then bundle it with the tokenizer into one processor
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [65]:
# Prepare input and output
def prepare_dataset(batch):
    """Convert one example into model inputs and label ids.

    Args:
        batch: A single example with an ``"audio"`` entry (providing ``"array"``
            and ``"sampling_rate"``) and a ``"transcription"`` string.

    Returns:
        The same mapping with ``"input_values"`` and ``"labels"`` added.
    """
    audio = batch["audio"]
    # The processor output is batched; index 0 "un-batches" it so the stored value
    # belongs to this single example.
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]

    # Tokenize the transcript with the tokenizer directly instead of routing the call
    # through the deprecated `as_target_processor()` context manager.
    batch["labels"] = processor.tokenizer(batch["transcription"]).input_ids
    return batch

# Convert every example to model features. The columns present before the map are
# removed, leaving only the keys added by prepare_dataset.
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names["train"])
# Last expression of the cell: show the resulting splits and their features
dataset


Loading cached processed dataset at I:/Repos/HFdatasets/data_gtts/default-data_dir=.%2Fdata_gtts/0.1.0/99611922a2fe30672e990db44b070dc747a16dd2cb691d0d2c33dc670a2e3b68\cache-dfa5d554f31de666.arrow
Loading cached processed dataset at I:/Repos/HFdatasets/data_gtts/default-data_dir=.%2Fdata_gtts/0.1.0/99611922a2fe30672e990db44b070dc747a16dd2cb691d0d2c33dc670a2e3b68\cache-acdfcffdebf27a9a.arrow


DatasetDict({
    train: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 250
    })
})

In [66]:
# Prepare a data collator, which will apply separate padding modalities to input and output
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union


class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the inputs and labels it receives.

    Audio inputs and label sequences are padded separately, and padded label
    positions are replaced with -100.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the ``input_values`` to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            If set will pad the ``labels`` to a multiple of the provided value.
    """
    def __init__(self, 
        processor: Wav2Vec2Processor,
        padding: Union[bool, str] = True,
        max_length: Optional[int] = None,
        max_length_labels: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        pad_to_multiple_of_labels: Optional[int] = None,
    ) -> None:
        self.processor = processor
        self.padding = padding
        self.max_length = max_length
        self.max_length_labels = max_length_labels
        self.pad_to_multiple_of = pad_to_multiple_of
        self.pad_to_multiple_of_labels = pad_to_multiple_of_labels

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples into one batch of tensors.

        Args:
            features: Examples, each holding ``"input_values"`` and ``"labels"``.

        Returns:
            The padded input batch with an added ``"labels"`` tensor in which padded
            positions are -100.
        """
        # Split inputs and labels since they have different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Pad the labels with the tokenizer directly instead of switching the processor
        # through the deprecated `as_target_processor()` context manager.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # Replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

# Collator instance later passed to the Trainer; padding=True pads to the longest sequence in each batch
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [67]:
# Prepare the evaluation metric: word error rate (WER). This is a metric, not the training loss.
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets` releases in favour of
# the separate `evaluate` package — confirm against the installed version before upgrading.
wer_metric = load_metric("wer")

def compute_metrics(pred):
    """Compute the word error rate for a set of evaluation predictions.

    Args:
        pred: Object exposing ``predictions`` (logits) and ``label_ids``.

    Returns:
        A dict with a single ``"wer"`` entry.
    """
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    # Positions holding -100 are swapped for the pad token id so the labels can be
    # decoded. np.where builds a new array, leaving the caller's `pred.label_ids`
    # unmodified (the previous version overwrote it in place).
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

## **Training**

In [68]:
model_checkpoint = "facebook/wav2vec2-base"

model = Wav2Vec2ForCTC.from_pretrained(
    model_checkpoint,
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    # Size the CTC output layer to the custom tokenizer; without this the head keeps
    # the checkpoint config's default size regardless of how many tokens the
    # vocabulary file defines.
    vocab_size=len(processor.tokenizer),
)
# `freeze_feature_extractor()` is the deprecated name of this method.
model.freeze_feature_encoder()

loading configuration file config.json from cache at C:\Users\huyle/.cache\huggingface\hub\models--facebook--wav2vec2-base\snapshots\0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8\config.json
Model config Wav2Vec2Config {
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForPreTraining"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": false,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extrac

In [69]:
repo_name = "checkpoints"

training_args = TrainingArguments(
    # Checkpoint location and retention
    output_dir=repo_name,
    save_total_limit=2,
    push_to_hub=False,
    # Batching and run length
    per_device_train_batch_size=16,
    group_by_length=True,
    num_train_epochs=40,
    # Mixed precision and gradient checkpointing
    fp16=True,
    gradient_checkpointing=True,
    # Optimiser settings
    learning_rate=1e-4,
    weight_decay=0.005,
    warmup_steps=250,
    # Evaluate, save and log on the same 250-step cadence
    evaluation_strategy="steps",
    eval_steps=250,
    save_steps=250,
    logging_steps=250,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [70]:
trainer = Trainer(
    model=model,
    args=training_args,
    # Data: train/eval splits plus the collator that pads each batch
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    # Evaluation metric callback
    compute_metrics=compute_metrics,
    # The feature extractor is what gets passed in the `tokenizer` argument here
    tokenizer=processor.feature_extractor,
)

Using cuda_amp half precision backend


In [71]:
# Start fine-tuning with the Trainer built in the previous cell
trainer.train()

In [None]:
# Persist the fine-tuned model and the tokenizer side by side in one directory
save_dir = "./model_wav2vec2/"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

Saving model checkpoint to ./model_wav2vec2/
Configuration saved in ./model_wav2vec2/config.json
Model weights saved in ./model_wav2vec2/pytorch_model.bin
Feature extractor saved in ./model_wav2vec2/preprocessor_config.json
tokenizer config file saved in ./model_wav2vec2/tokenizer_config.json
Special tokens file saved in ./model_wav2vec2/special_tokens_map.json


('./model_wav2vec2/tokenizer_config.json',
 './model_wav2vec2/special_tokens_map.json',
 './model_wav2vec2/vocab.json',
 './model_wav2vec2/added_tokens.json')

In [None]:
# Try to load model
processor_tmp = Wav2Vec2Processor.from_pretrained("./model_wav2vec2/")
# Use the GPU when one is available; otherwise stay on the CPU instead of failing in .cuda()
device_tmp = "cuda" if torch.cuda.is_available() else "cpu"
model_tmp = Wav2Vec2ForCTC.from_pretrained("./model_wav2vec2/").to(device_tmp)

# Reload the dataset
# The `DatasetDict()` placeholder was dropped (it was overwritten immediately), and the
# machine-specific absolute path was replaced by the relative location used for the
# initial load. NOTE(review): assumes both paths refer to the same data — confirm.
dataset_tmp = load_dataset("data_gtts", data_dir="./data_gtts")
# Resample to 16kHz
dataset_tmp = dataset_tmp.cast_column('audio', Audio(sampling_rate=16000))

loading configuration file ./checkpoints/checkpoint-1250/preprocessor_config.json
Feature extractor Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file ./checkpoints/checkpoint-1250/config.json
Model config Wav2Vec2Config {
  "_name_or_path": "./checkpoints/checkpoint-1250/",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    

OSError: Can't load tokenizer for './checkpoints/checkpoint-1250/'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure './checkpoints/checkpoint-1250/' is the correct path to a directory containing all relevant files for a Wav2Vec2CTCTokenizer tokenizer.

In [None]:
# Load random sample
idx = np.random.randint(0, len(dataset_tmp['test']))
# Fetch the example once and reuse it for playback, label and prediction
sample_tmp = dataset_tmp["test"][idx]
display(ipd.Audio(data=np.asarray(sample_tmp["audio"]["array"]), autoplay=False, rate=16000))

# Print label
print(f"Label: {sample_tmp['transcription']}")

# Print prediction
input_values = processor_tmp(sample_tmp["audio"]["array"], return_tensors="pt", padding="longest", sampling_rate=16000).input_values
# The processor returns CPU tensors, so move them to the model's device before the
# forward pass; no_grad skips gradient tracking, which inference does not need.
with torch.no_grad():
    logits = model_tmp(input_values.to(model_tmp.device)).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor_tmp.batch_decode(predicted_ids)
print(f"Prediction: {transcription}")

Label: Astraeus Seven Eight Xray contact London
Prediction: ['']
