# Finetuning

In [None]:
import json
import random
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Union

import numpy as np
import torch
from datasets import Dataset, Audio
from datasets import load_metric
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2ForCTC, TrainingArguments, Trainer


### Loading the dataset

In [None]:
# Every location below is resolved relative to the parent of the current
# working directory.
base_directory = Path.cwd().parent
data_dir = base_directory.joinpath('data')

dataset_name = "yale/econ251"
predictions_dir = data_dir.joinpath('predictions', dataset_name)

# The "-tiny" folders are the ones used in this notebook; switch to
# 'lectures' / 'transcripts' to point at the non-tiny folders instead.
audio_dir = data_dir.joinpath('inputs', dataset_name, 'lectures-tiny')
transcripts_dir = data_dir.joinpath('inputs', dataset_name, 'transcripts-tiny')

In [None]:
# List transcripts and recordings as plain string paths, keep only paths that
# contain 'tiny', and cap each listing at the first seven entries in sorted
# order.
# NOTE(review): row i pairs the i-th recording with the i-th transcript, so
# both listings must sort into matching order — confirm the file names line up.
txt_files = sorted(
    str(text_file)
    for text_file in transcripts_dir.glob('*.txt')
    if 'tiny' in str(text_file)
)[:7]
mp3_files = sorted(
    str(audio_file)
    for audio_file in audio_dir.glob('*.mp3')
    if 'tiny' in str(audio_file)
)[:7]

data_dict = dict(mp3=mp3_files, txt=txt_files)

# Build the dataset, hold out 20% of the rows for testing, and declare the
# "mp3" column as audio sampled at 16 kHz.
dataset = (
    Dataset.from_dict(data_dict, split="all")
    .train_test_split(test_size=0.2)
    .cast_column("mp3", Audio(sampling_rate=16_000))
)

In [None]:
mp3_files

In [None]:
dataset

In [None]:
dataset['train']['mp3']

### Creating the tokenizer

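First, we strip special characters and normalize the transcript text: accented letters and typographic quotes are mapped to plain ASCII, punctuation, digits and symbols are replaced by spaces, and everything is lower-cased.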

In [None]:

# chars_to_ignore_regex = '[\,\?\.\!\-\;\:"]'
# chars_to_ignore_regex = '[\,\?\.\!\-\;\:\½"]'

# ignore_list = ['½', 'à', 'â', 'é', 'ï', '–', '—', '‘', '’', '“', '”', '…<', '=', '>',
#                '$', '%', '&', '(', ')', '+', '/', '0', '1', '2', '3', '4', '5', '6',
#                '7', '8', '9']
# '%': 'percent',
# '$': 'dollar',
# '+': 'plus',
# '-': 'minus',
# '½': 'half',

# Characters that are replaced by a single space: punctuation, digits, math
# and currency symbols, dashes, the ellipsis and newlines. Written as a raw
# string with every member spelled out; the previous non-raw pattern relied on
# invalid escape sequences and on an accidental "+-0" range to cover "/".
chars_to_ignore_regex = r'[,?.!\-;:"½+/0-9&%$()=><…—–\n]'
_chars_to_ignore_pattern = re.compile(chars_to_ignore_regex)

# Accented letters and typographic quotes mapped to their plain ASCII form.
replace_dict = {
    'à': 'a',
    'â': 'a',
    'é': 'e',
    'ï': 'i',
    '”': '"',
    '“': '"',
    '‘': "'",
    '’': "'",
}
# All keys are single characters, so one translate() pass is equivalent to
# applying the replacements one after another.
_replace_table = str.maketrans(replace_dict)


def retrieve_text(batch):
    """Replace the transcript path in ``batch["txt"]`` with normalized text.

    The file is read as UTF-8, characters listed in ``replace_dict`` are
    mapped to ASCII, every character matched by ``chars_to_ignore_regex`` is
    replaced by a space (so the text length is preserved), and the result is
    lower-cased.

    Args:
        batch: A single example whose ``"txt"`` entry is the path of a
            transcript file.

    Returns:
        The same ``batch`` object with ``"txt"`` overwritten by the
        normalized transcript.

    Raises:
        OSError: If the transcript file cannot be opened.
        UnicodeDecodeError: If the transcript is not valid UTF-8.
    """
    # The transcripts contain non-ASCII characters, so do not depend on the
    # platform's default encoding.
    with open(batch["txt"], 'r', encoding='utf-8') as f:
        text = f.read()

    text = text.translate(_replace_table)
    batch["txt"] = _chars_to_ignore_pattern.sub(' ', text).lower()
    return batch


# Swap each row's transcript path for the normalized transcript text.
dataset = dataset.map(retrieve_text)

# Preview the first 50 characters of the first normalized training transcript.
dataset["train"][0]["txt"][:50]


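We then extract all the distinct characters that occur in the transcripts; together they form the model's output vocabulary. (The cell below is commented out because the vocabulary file has already been written.)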

In [None]:
# def extract_all_chars(batch):
#     all_text = " ".join(batch["txt"])
#     vocab = list(set(all_text))
#     return {"vocab": [vocab], "all_text": [all_text]}
#
#
# vocabs = dataset.map(extract_all_chars,
#                      batched=True, batch_size=-1,
#                      keep_in_memory=True,
#                      remove_columns=dataset.column_names["train"])
#
# vocab_list = list(
#     set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]))
#
# vocab_dict = {v: k for k, v in enumerate(vocab_list)}
#
# print(sorted(vocab_dict, key=lambda x: x[0]))
#
# vocab_dict["|"] = vocab_dict[" "]
# del vocab_dict[" "]
#
#
# # TEMP FIX
# # vocab_dict["x"] = len(vocab_dict)
#
#
# vocab_dict["[UNK]"] = len(vocab_dict)
# vocab_dict["[PAD]"] = len(vocab_dict)
# print(len(vocab_dict))
#
#
# with open('../data/inputs/yale/econ251/vocab.json', 'w') as vocab_file:
#     json.dump(vocab_dict, vocab_file)


In [None]:
# Build the CTC tokenizer from the character vocabulary of the dataset.
# The path is derived from data_dir / dataset_name so that it follows the
# dataset selection made at the top of the notebook instead of repeating the
# location as a second hard-coded string. Use 'vocab_alt.json' here to try the
# alternative vocabulary.
vocab_path = data_dir / 'inputs' / dataset_name / 'vocab.json'
tokenizer = Wav2Vec2CTCTokenizer(str(vocab_path),
                                 unk_token="[UNK]", pad_token="[PAD]",
                                 word_delimiter_token="|")

### Creating the feature extractor and combining it with the tokenizer into a processor

In [None]:
# Feature extractor for single-channel 16 kHz input: values are normalized and
# padded with 0.0.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    # NOTE(review): switch to True for checkpoints that expect an attention
    # mask — confirm against the model card of the checkpoint in use.
    return_attention_mask=False,
)

In [None]:
# Bundle the audio feature extractor and the text tokenizer in one object.
processor = Wav2Vec2Processor(
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
)

### Preprocess data

In [None]:
import IPython.display as ipd
import numpy as np
import random

# Pick one random training example (randint is inclusive on both ends).
rand_int = random.randint(0, len(dataset["train"]) - 1)

# Fetch the row once: reading the "mp3" column loads and resamples the audio,
# which is the slow part, so avoid repeating the lookup for every field.
preview_row = dataset["train"][rand_int]

print(preview_row["txt"])
ipd.Audio(data=np.asarray(preview_row["mp3"]["array"]),
          autoplay=False, rate=16000)


In [None]:
# Reuses rand_int from the previous cell. To draw a fresh example instead, use
# random.randint(0, len(dataset["train"]) - 1); randint includes its upper
# bound, so the "- 1" is required.
# The row is fetched once because every lookup loads the audio again.
preview_row = dataset["train"][rand_int]
print("Target text [:100]:", preview_row["txt"][:100])
print("Input array shape:", np.asarray(preview_row["mp3"]["array"]).shape)
print("Sampling rate:", preview_row["mp3"]["sampling_rate"])

In [None]:
def prepare_dataset(batch):
    """Turn one example into model inputs and CTC label ids.

    Reads the decoded audio from ``batch["mp3"]`` and the normalized
    transcript from ``batch["txt"]``; stores the processed waveform under
    ``"input_values"`` and the tokenized transcript under ``"labels"``.

    Args:
        batch: A single dataset row with ``"mp3"`` and ``"txt"`` entries.

    Returns:
        The same ``batch`` object with ``"input_values"`` and ``"labels"``
        added.
    """
    # Loading and resampling the audio is the slowest part of this function.
    waveform = batch["mp3"]

    features = processor(waveform["array"],
                         sampling_rate=waveform["sampling_rate"])
    # The processor returns a batch of one; unwrap it so that each row stores
    # a single sequence.
    batch["input_values"] = features.input_values[0]

    # Inside this context the processor call is routed to the tokenizer.
    with processor.as_target_processor():
        encoded_text = processor(batch["txt"])
    batch["labels"] = encoded_text.input_ids

    return batch

In [None]:
# Preprocess every example with two worker processes and drop the columns that
# existed before preprocessing, leaving only what prepare_dataset adds.
original_columns = dataset.column_names["train"]
dataset = dataset.map(prepare_dataset,
                      remove_columns=original_columns,
                      num_proc=2)

In [None]:
dataset

### Training & evaluation

In [None]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Collate pre-processed examples into one dynamically padded batch.

    Audio inputs and label ids have different lengths and need different
    padding, so they are padded separately; padded label positions are then
    set to -100 so that the loss ignores them.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Padding strategy applied to both inputs and labels:
            * :obj:`True` or :obj:`'longest'`: pad to the longest sequence in the batch (no padding if the batch
              holds a single sequence).
            * :obj:`'max_length'`: pad to the length given by :obj:`max_length` / :obj:`max_length_labels`, or to
              the maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: no padding (the batch may contain sequences of different
              lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` and, optionally, their padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` and, optionally, their padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set, pad the ``input_values`` to a multiple of this value.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            If set, pad the ``labels`` to a multiple of this value.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Separate the two modalities; each gets its own padding call.
        audio_items = []
        label_items = []
        for example in features:
            audio_items.append({"input_values": example["input_values"]})
            label_items.append({"input_ids": example["labels"]})

        batch = self.processor.pad(
            audio_items,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Inside this context the pad call is routed to the tokenizer.
        with self.processor.as_target_processor():
            padded_labels = self.processor.pad(
                label_items,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # Positions whose attention mask is not 1 are padding: mark them with
        # -100 so they are ignored by the loss.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)

        return batch

In [None]:
# Collator shared by training and evaluation; padding=True pads each batch to
# its longest member.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
# Word error rate metric, used by compute_metrics and for the final test score.
# NOTE(review): `load_metric` is reported as deprecated in recent `datasets`
# releases in favor of the separate `evaluate` package — confirm against the
# installed version.
wer_metric = load_metric("wer")

In [None]:
def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: Prediction object exposing ``predictions`` (logits) and
            ``label_ids`` (label ids in which ignored positions hold -100).

    Returns:
        A dict with a single ``"wer"`` entry.
    """
    # Greedy decoding: take the highest-scoring token at every position.
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # -100 marks positions that were masked out for the loss; map them back to
    # the pad token so the tokenizer can decode the labels. np.where builds a
    # new array, so the caller's label_ids are left untouched (the previous
    # version overwrote them in place).
    label_ids = np.where(pred.label_ids == -100,
                         processor.tokenizer.pad_token_id,
                         pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [None]:
# Load the pretrained encoder and attach a CTC head for fine-tuning.
# Other checkpoints tried: "facebook/wav2vec2-base-960h",
# "facebook/wav2vec2-large-robust".
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    # Size the output layer from the tokenizer (len() also counts added
    # special tokens) instead of relying on the checkpoint's default
    # vocabulary size, which only works while the custom vocabulary happens
    # to fit inside it.
    vocab_size=len(processor.tokenizer),
)
# Zero out infinite CTC losses instead of letting them propagate.
model.config.ctc_zero_infinity = True
# Keep the feature encoder frozen during fine-tuning (this method was called
# freeze_feature_extractor in the earlier version of this cell).
model.freeze_feature_encoder()

In [None]:
training_args = TrainingArguments(
    output_dir="../output/tiny-model",
    push_to_hub=False,
    # Schedule (earlier runs used 30, 50 and 250 epochs, and 1000 warmup steps).
    num_train_epochs=500,
    warmup_steps=200,
    # Optimisation: learning_rate was 1e-4 before; both values are still
    # candidates for tuning.
    learning_rate=3e-4,
    weight_decay=0.005,
    optim="adamw_torch",  # use the pytorch adam implementation
    # Evaluate, log and checkpoint every 100 steps (previously 500; "epoch"
    # is the alternative strategy). save_total_limit=2 can be added to cap
    # the number of checkpoints kept.
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    save_steps=100,
    # Memory: a batch size of 1 is the most that fits (2 worked on the
    # 5-file tiny set); accumulation is used for fitting in memory.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    eval_accumulation_steps=2,
    gradient_checkpointing=True,
    fp16=True,
    group_by_length=True,
)

In [None]:
import gc
# Collect unreachable Python objects first, then ask PyTorch to release its
# cached CUDA memory, before the trainer is created.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# NOTE(review): only the feature extractor (not the full processor) is handed
# to the Trainer through its `tokenizer` argument.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

In [None]:
# Run fine-tuning with the settings in training_args.
trainer.train()

In [None]:
# Store the final model and the processor side by side in one directory named
# after this run.
save_name = '17-05_1835'
save_dir = f'../output/tiny-model/{save_name}'
trainer.save_model(save_dir)
processor.save_pretrained(save_dir)

In [None]:
trainer.state.log_history

In [None]:
# CUDA out of memory. Tried to allocate 13.23 GiB (GPU 0; 7.93 GiB total capacity; 757.87 MiB already allocated; 6.16 GiB free; 800.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

### Evaluation

In [None]:
from transformers import AutoModelForCTC, Wav2Vec2Processor

# Training and saving both write below "../output/tiny-model", so the
# checkpoint is loaded from that same tree (the previous "./output/..." path
# pointed at a different directory).
# NOTE(review): the processor is only saved explicitly into the final
# "../output/tiny-model/17-05_1835" directory — confirm that the checkpoint
# folder also contains the tokenizer files, or load from that directory.
model_dir = "../output/tiny-model/checkpoint-200"

model2 = AutoModelForCTC.from_pretrained(model_dir)
processor2 = Wav2Vec2Processor.from_pretrained(model_dir)

# Use the first GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model2.to(device)


In [None]:
dataset

In [None]:
def map_to_result(batch):
    """Transcribe one pre-processed example with greedy CTC decoding.

    Args:
        batch: A single row holding ``"input_values"`` and ``"labels"``.

    Returns:
        The same ``batch`` object with ``"pred_str"`` (the predicted
        transcript) and ``"text"`` (the reference decoded from the labels)
        added.
    """
    with torch.no_grad():
        # Place the input on whatever device the model lives on instead of
        # assuming CUDA, so the cell also runs on CPU-only machines.
        input_values = torch.tensor(batch["input_values"],
                                    device=model.device).unsqueeze(0)
        logits = model(input_values).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_str"] = processor.batch_decode(pred_ids)[0]
    # Labels are decoded without grouping repeated tokens.
    batch["text"] = processor.decode(batch["labels"], group_tokens=False)

    return batch


# Make sure train-time behavior (e.g. dropout) is switched off for inference;
# the mode the model is left in after training is not guaranteed.
# NOTE(review): model2 / processor2 loaded in the cell above are not used
# here — this evaluates the in-memory `model`; confirm which one is intended.
model.eval()
results = dataset["test"].map(map_to_result,
                              remove_columns=dataset["test"].column_names)

In [None]:
# Word error rate over the whole test split, rounded to three decimals.
test_wer = wer_metric.compute(predictions=results["pred_str"],
                              references=results["text"])
print(f"Test WER: {test_wer:.3f}")

In [None]:
print('fill')

In [None]:
results['pred_str']

In [None]:
dataset

In [None]:
results['text']

In [None]:
# Raw greedy prediction for the first test example, shown token by token
# (before repeated tokens are collapsed).
with torch.no_grad():
    # Follow the model's device instead of hard-coding "cuda" so the cell
    # also works on CPU-only machines.
    first_input = torch.tensor(dataset["test"][:1]["input_values"],
                               device=model.device)
    logits = model(first_input).logits

pred_ids = torch.argmax(logits, dim=-1)

# convert ids to tokens
" ".join(processor.tokenizer.convert_ids_to_tokens(pred_ids[0].tolist()))

In [None]:
dataset

In [None]:
results['text']

In [None]:
# Raw greedy prediction for the first test example, shown token by token
# (before repeated tokens are collapsed).
with torch.no_grad():
    # Follow the model's device instead of hard-coding "cuda" so the cell
    # also works on CPU-only machines.
    first_input = torch.tensor(dataset["test"][:1]["input_values"],
                               device=model.device)
    logits = model(first_input).logits

pred_ids = torch.argmax(logits, dim=-1)

# convert ids to tokens
" ".join(processor.tokenizer.convert_ids_to_tokens(pred_ids[0].tolist()))