# Pretraining for ASR

In [1]:
# installing libs
# !pip3 install torch torchvision torchaudio datasets transformers soundfile jiwer --index-url https://download.pytorch.org/whl/cu118
# !pip3 install librosa --index-url https://pypi.org/simple

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import re
import torch
import torch.nn as nn
import numpy as np

from datasets import load_dataset, disable_caching
from evaluate import load
from transformers import Wav2Vec2ForPreTraining, Wav2Vec2FeatureExtractor, Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Encoder


## Finetuning Wav2Vec2 model on CTC loss (5 points)


In this task you have to create pipeline for finetuning pretrained multilingual Wav2Vec2 model on belarusian audio from [Fleurs](https://huggingface.co/datasets/google/fleurs) dataset.

#### Prepare data

In [3]:
fleurs = load_dataset("google/fleurs", "be_by", split=["train", "validation", "test"], trust_remote_code=True)

In [4]:
fleurs[0]["transcription"][9]

'вышыня двух пілонаў складае 83 метры даўжыня моста - 378 метраў праезная частка складаецца з дзвюх палос шырыня кожнай - 3,50 м'

In [5]:
fleurs[0][0]

{'id': 396,
 'num_samples': 250560,
 'path': 'C:\\Users\\andre\\.cache\\huggingface\\datasets\\downloads\\extracted\\4a7cb41bec2f9e3bb08125197d8a953f6e2e9fecf18e75e5746ee8f65b3da558\\10009414287632395082.wav',
 'audio': {'path': 'train/10009414287632395082.wav',
  'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00031281,
         -0.00038069, -0.00132966]),
  'sampling_rate': 16000},
 'transcription': 'у той жа час паблізу ад верагодных маршрутаў уварвання базіравалася вельмі мала караблёў каралеўскага флоту таму што адміралы асцерагаліся іх патаплення нямецкімі паветранымі сіламі',
 'raw_transcription': 'У той жа час паблізу ад верагодных маршрутаў уварвання базіравалася вельмі мала караблёў каралеўскага флоту, таму што адміралы асцерагаліся іх патаплення нямецкімі паветранымі сіламі.',
 'gender': 1,
 'lang_id': 6,
 'language': 'Belarusian',
 'lang_group_id': 1}

In this task, you should:

* filter all samples, where `transcription` includes digits. Hint: take care of specific belarussian symbols "і", "ў";
* remove punctuation from `transcription`.

In [6]:
import re

has_digit = re.compile(r"\d")

def filter_f(x):
    print(x)
    return x is not None


preprocessed_train = fleurs[0].filter(lambda x: has_digit.search(x['transcription']) is None)
preprocessed_val = fleurs[1].filter(lambda x: has_digit.search(x['transcription']) is None)

In [7]:
len(fleurs[0]), len(preprocessed_train), len(fleurs[1]), len(preprocessed_val)

(2433, 1927, 408, 355)

#### Train tokenizer

There you should train your own BPE tokenizer based on texts from Fleurs dataset using [HuggingFace tokenizer](https://huggingface.co/docs/tokenizers/en/training_from_memory).

In [8]:
from tokenizers import models, trainers, tokenizers, normalizers, pre_tokenizers, decoders

PAD_TOKEN = "[PAD]"
BOS_TOKEN = "[BOS]"
EOS_TOKEN = "[EOS]"
UNK_TOKEN = "[UNK]"
VOCAB_SIZE = 1000

tokenizer = tokenizers.Tokenizer(models.BPE())
tokenizer.normalizer = normalizers.NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()
trainer = trainers.BpeTrainer(special_tokens=[PAD_TOKEN, BOS_TOKEN, EOS_TOKEN, UNK_TOKEN], vocab_size=VOCAB_SIZE, show_progress=True)
tokenizer.train_from_iterator(preprocessed_train['transcription'], trainer)


In [9]:
tokenizer

Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[{"id":0, "content":"[PAD]", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":1, "content":"[BOS]", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":2, "content":"[EOS]", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":3, "content":"[UNK]", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}], normalizer=NFKC(), pre_tokenizer=ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True), post_processor=None, decoder=ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True), model=BPE(dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={"[PAD]":0, "[BOS]":1, "[EOS]":2, "[UNK]":3, "!":4, "'":5, ",":6, "-":7, ".":8, "/":9, ":":10, "

#### Loading model and preprocessor

In [10]:
from transformers import Wav2Vec2FeatureExtractor
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
   "facebook/wav2vec2-xls-r-300m"
)
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-xls-r-300m", 
    ctc_loss_reduction="mean", 
    pad_token_id=tokenizer.token_to_id(PAD_TOKEN),
    vocab_size=tokenizer.get_vocab_size(),
)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Data processor and data collator 

In [11]:
class CtcDataProcessor:
    def __init__(self, tokenizer, feature_extractor):
        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor

    def __call__(self, row):
        """
            Function applies tokenizer on row['transcription'] and applies feature extractor on audio column in row.
            Input: dict with transcription and audio fields
            Output: original dict includes `labels` column with tokenized sequence and `input_values` column with computed spectrogram.
        """
        row['labels'] = torch.tensor(self.tokenizer.encode(row['transcription'], add_special_tokens=True).ids)
        row['input_values'] = torch.tensor(self.feature_extractor(row['audio']['array'], sampling_rate=row['audio']['sampling_rate']).input_values[0])
        return row

In [12]:
data_processor = CtcDataProcessor(tokenizer, feature_extractor)
data_processor(preprocessed_train[0])

{'id': 396,
 'num_samples': 250560,
 'path': 'C:\\Users\\andre\\.cache\\huggingface\\datasets\\downloads\\extracted\\4a7cb41bec2f9e3bb08125197d8a953f6e2e9fecf18e75e5746ee8f65b3da558\\10009414287632395082.wav',
 'audio': {'path': 'train/10009414287632395082.wav',
  'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00031281,
         -0.00038069, -0.00132966]),
  'sampling_rate': 16000},
 'transcription': 'у той жа час паблізу ад верагодных маршрутаў уварвання базіравалася вельмі мала караблёў каралеўскага флоту таму што адміралы асцерагаліся іх патаплення нямецкімі паветранымі сіламі',
 'raw_transcription': 'У той жа час паблізу ад верагодных маршрутаў уварвання базіравалася вельмі мала караблёў каралеўскага флоту, таму што адміралы асцерагаліся іх патаплення нямецкімі паветранымі сіламі.',
 'gender': 1,
 'lang_id': 6,
 'language': 'Belarusian',
 'lang_group_id': 1,
 'labels': tensor([141, 631, 232,  78, 304,  95, 805, 834, 153, 581, 132, 155, 210, 748,
         139, 229, 4

In [13]:
data_processor = CtcDataProcessor(tokenizer, feature_extractor)
train = preprocessed_train.map(data_processor, keep_in_memory=True, remove_columns=preprocessed_train.column_names)
val = preprocessed_val.map(data_processor, keep_in_memory=True, remove_columns=preprocessed_val.column_names)

Map:   0%|          | 0/1927 [00:00<?, ? examples/s]

Map:   0%|          | 0/355 [00:00<?, ? examples/s]

In [28]:
class CTCDataCollator:
    # HuggingFace requires pad transcript tokens with this value
    LABELS_PAD_IDX = -100

    @staticmethod
    def collate_tokens(tokens_batch, type, pad_value=0.0):
        """
            Function collates list of tokens
        """
        pass
        
    def __call__(self, batch):
        """
            Function collates `input_values` and `labels` into one tensor respectively
            Input: list with dicts, output of CTCDataProcessor
            Output row includes `labels` column with tokenized sequence, `input_values` column with computed spectrogram and 
            `attention_mask` (0 for not-attending position, 1 for attending)
        """
        input_values = torch.nn.utils.rnn.pad_sequence([torch.tensor(row['input_values']) for row in batch], batch_first=True, padding_value=0.0)
        labels = torch.nn.utils.rnn.pad_sequence([torch.tensor(row['labels']) for row in batch], batch_first=True, padding_value=self.LABELS_PAD_IDX)
        attention_mask = (input_values != 0).float()
        return {"input_values": input_values, "labels": labels, "attention_mask": attention_mask}

In [29]:
batch = [train[0], train[1], train[2]]
collator = CTCDataCollator()
collated_batch = collator(batch)
collated_batch

{'input_values': tensor([[0.0002, 0.0002, 0.0002,  ..., 0.0000, 0.0000, 0.0000],
         [0.0003, 0.0003, 0.0003,  ..., 0.0000, 0.0000, 0.0000],
         [0.0002, 0.0002, 0.0002,  ..., 0.0031, 0.0041, 0.0092]]),
 'labels': tensor([[ 141,  631,  232,   78,  304,   95,  805,  834,  153,  581,  132,  155,
           210,  748,  139,  229,  460,  141,  585,  100,  302,  127,  156,   85,
           300,  835,  572,  111,  338,  340,  135,   93,  173,  123,  340,  243,
           123,  782,  221,  298,  246,  419,  176,  153,  208,   88,  110,   83,
           297,   91,  149,  132,  642,  318,  371,  163,  667,  532,  283,   91,
           136,  208,  255,  775,  319,  537,  399,   93,  220],
         [ 264,  676,  365,  993,  658,  157,  140,   92,  165,  292,   92,   88,
           175,  407,  624,  207,  321,  126,  600,  601,  111,  175,   94,  159,
           100,   94,  730,  361,  750,  408,  125,  145,  118,  785,  229,  423,
           273,  785,   88,  117,  646,  105,   98, -100

#### Inference and metrics computing

There you should use simple greedy straregy for CTC output decoding. 

Hint: Don't forget about padding value -100 in reference.

Hint: Don't forget about CTC output format.

In [30]:
wer_metric = load("wer")

class MetricsComputer:
    def __call__(self, pred):
        """
            Input: object with fields `predictions` for CTC model output and `label_ids` for tokenized reference;
            Output: dict with key `wer` and computed wer
        """
        print(pred)
        # model prediction tensor, tensor batch_size x max_seq_len x vocab_size
        preds_logits = pred.predictions
        # reference, tensor batch_size x max_seq_len
        label_ids = pred.label_ids
        print(preds_logits, label_ids)
        
        pad_token_id = tokenizer.token_to_id(PAD_TOKEN)
        preds = torch.argmax(preds_logits, dim=-1)
        label_str = [tokenizer.decode(ids[torch.where(ids != -100)].tolist()) for ids in label_ids]
        pred_str = []
        
        for seq in preds.tolist():
            pred = []
            previous = -100
            for id in seq:
                if id == previous:
                    continue
                if id != pad_token_id:
                    pred.append(id)
                previous = id
            pred_str.append(tokenizer.decode(pred))
    
        print(f"Prediction: {pred_str[0]}")
        print(f"Reference: {label_str[0]}")
        
        wer = wer_metric.compute(predictions=pred_str, references=label_str)
        return {"wer": wer}

In [31]:
predictions = torch.randn(1, 100, VOCAB_SIZE)
label_ids = torch.randint(0, VOCAB_SIZE, (1, 30))
class preds:
    predictions = predictions
    label_ids = label_ids
    
pred = preds()
metrics_computer = MetricsComputer()
metrics_computer(pred)

<__main__.preds object at 0x0000023ACDB3FD30>
tensor([[[ 0.3056,  0.2909,  0.4085,  ...,  0.5964,  0.5362,  1.2350],
         [ 0.2825, -0.8119, -0.6384,  ..., -1.0859, -0.2322,  0.8948],
         [ 1.0967, -0.9883,  1.9996,  ...,  0.0386,  1.2138, -0.0866],
         ...,
         [-0.4730,  0.4409,  2.1444,  ..., -1.2985, -0.8560,  0.7133],
         [ 0.6120,  1.2808, -0.3932,  ...,  0.0959, -0.8767, -1.3836],
         [-0.2902, -0.9350,  0.8850,  ...,  0.8541,  0.7561,  1.0171]]]) tensor([[152, 967, 915, 105, 964, 858, 507, 452, 399, 750, 858,  37, 373, 705,
         342,  60, 960, 470, 386,  84, 295, 228, 714, 748, 220, 994, 367, 102,
         729, 685]])
Prediction: енду ч ф тамуніц былодомятганізрыі здст звычай маюцьіцца� ўсёзяцца менд арыокоеаныя паведён часанняаіед навём нас бытам прад�нікую�меx згоднаух дляэноўв аптцуччкаўзеарадскія-наль ск верлуяўолкага прост прав зшасам алестыў нововаез быўценігуолькалё сказынлтоўяг дзію ціана ве� атрымаць рэгімат каліярэ павінаныязі
Referenc

{'wer': 2.0588235294117645}

#### Overfitting on train batch

In this task you should check pipeline correctness by overfitting on you need to finetune Wav2Vec2 model and achieve 50 WER or lower accuracy on val set.

In [32]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="test",
    per_device_train_batch_size=2, # you could increase batch size
    gradient_accumulation_steps=8, 
    eval_strategy="steps",
    max_steps=3000,
    fp16=True,
    save_steps=50,
    eval_steps=10,
    logging_steps=10,
    learning_rate=3e-4, 
    weight_decay=1e-5,
    warmup_steps=100,
    gradient_checkpointing=True,
)

In [33]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=CTCDataCollator(),
    args=training_args,
    compute_metrics=MetricsComputer(),
    train_dataset=train,
    eval_dataset=val,
)

max_steps is given, it will override any value given in num_train_epochs


In [None]:
trainer.train()

  0%|          | 0/3000 [00:00<?, ?it/s]