## Install libraries

In [1]:
#!pip3 install torchmetrics==0.4.1 - obsolete
#!pip3 install transformers==4.8.2
#!pip3 install pytorch_lightning==1.3.8 - obsolete

In [2]:
import pandas as pd
from transformers import T5ForConditionalGeneration, AutoTokenizer, Trainer, TrainingArguments
import sys
from torch.utils.data import DataLoader
from pytorch_lightning.utilities.apply_func import move_data_to_device
import torch
import matplotlib.pyplot as plt


## Preview data and fine-tune model

In [4]:
# DanSpeech keeps the full data set private, so only the test split is provided.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
df = pd.read_pickle('clean_data_test.pkl')

# X is the 'transcription' column and y the 'reference_text' column, as plain lists.
X, y = (df[column].tolist() for column in ('transcription', 'reference_text'))

We use the small pre-trained ByT5 model. The model and its UTF-8 (byte-level) tokenizer are downloaded from the Hugging Face Hub.

In [7]:
# Fetch the tokenizer and the pre-trained weights of the 'google/byt5-small' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('google/byt5-small')
model = T5ForConditionalGeneration.from_pretrained('google/byt5-small')

In [8]:
# Tokenize the source sentences into one padded batch, truncated to at most
# 256 tokens per sentence.
train_inputs = tokenizer(X, return_tensors="pt", padding=True, truncation=True, max_length=256)

# Tokenize the target sentences the same way.
train_tgt = tokenizer(y, return_tensors="pt", padding=True, truncation=True, max_length=256)

# Use the target ids as labels, but replace every padding position with -100:
# that is the index the loss ignores, so the model is not trained (and the loss
# not diluted) on padding tokens. masked_fill returns a new tensor, leaving
# train_tgt untouched.
train_inputs['labels'] = train_tgt['input_ids'].masked_fill(
    train_tgt['input_ids'] == tokenizer.pad_token_id, -100
)

In [15]:
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of field name -> indexable batch.

    Each sample is a dict holding, for every field in ``encodings``, the entry
    at the requested index.

    Arguments:
        encodings: mapping with an ``items()`` method whose values can be
            indexed by sample position; must contain the key 'input_ids',
            which determines the dataset length.
        y_encodings: accepted for backward compatibility but not used.
    """

    def __init__(self, encodings, y_encodings=None):
        self.encodings = encodings

    def __len__(self):
        # The number of samples is the length of the 'input_ids' field.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        return sample

# Wrap the tokenized training batch (inputs plus the 'labels' entry) as a dataset.
train_dataset = TextDataset(train_inputs)

In [None]:
# Fine-tuning configuration.
training_args = TrainingArguments(
    # --- output / bookkeeping ---
    output_dir='./results',             # output directory
    overwrite_output_dir=True,
    save_strategy='epoch',
    logging_dir='./logs',               # directory for storing logs
    logging_steps=50,
    # --- optimisation ---
    num_train_epochs=1,                 # total number of training epochs
    per_device_train_batch_size=4,      # batch size per device during training
    learning_rate=1e-4,                 # library default is 5e-5
    warmup_steps=3000,                  # warmup steps for the learning rate scheduler (used to be 500)
    weight_decay=0.01,                  # strength of weight decay
)

# The Trainer ties together the model, the arguments above and the training data.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)

# Training is left disabled here: it takes very long if no GPU is used.
#trainer.train()


The model was trained with the training data set and the fine-tuned model has been uploaded to Huggingface. The best ByT5 model can be found at https://huggingface.co/jenspt/byt5_ft_all_clean_data_lr_1e4

## Inferring on the fine-tuned model

The same data set is used in this dummy example. A DataLoader is used to batch the input sentences for inference.
The fine-tuned model that was uploaded to Hugging Face is downloaded.

In [None]:
# Tokenize the sentences to correct and wrap the result as a dataset.
test_inputs = tokenizer(
    X,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=256,
)
test_dataset = TextDataset(test_inputs)

def get_train_dataloader(dataset, batch_size=8, drop_last=False):
    """Wrap ``dataset`` in a sequential, single-process ``DataLoader``.

    Samples are served in their original order (no shuffling) so that the
    i-th prediction corresponds to the i-th input sample.

    Arguments:
        dataset: any map-style dataset (supports ``len()`` and indexing).
        batch_size (int): number of samples per batch. Defaults to 8.
        drop_last (bool): whether to discard a trailing batch smaller than
            ``batch_size``. Defaults to False so that every sample is
            processed; dropping it would leave the last samples without a
            prediction.

    Returns:
        DataLoader iterating over ``dataset`` in order.
    """
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        drop_last=drop_last,
        num_workers=0,
    )

# Download fine-tuned model:
model_ft = T5ForConditionalGeneration.from_pretrained('jenspt/byt5_ft_all_clean_data_lr_1e4')

# Batched view of the tokenized sentences, consumed by the inference loop.
dataloader = get_train_dataloader(test_dataset)

In [None]:
# Run on the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Running on: {}".format(device))
model_ft = model_ft.to(device)

# Put the model in evaluation mode before generating.
model_ft.eval()

# One entry per batch; each entry is the list of decoded sentences of that batch.
out = []

for i, batch in enumerate(dataloader):
    # Progress report: fraction of batches processed, printed every 50 batches.
    if i % 50 == 0:
        print(i/len(dataloader))
    batch = move_data_to_device(batch, device)
    # Greedy decoding (a single beam, a single returned sequence per input).
    output = model_ft.generate(
        input_ids=batch["input_ids"], attention_mask=batch["attention_mask"],
        repetition_penalty=1.0, length_penalty=1.0, max_length=256,
        num_beams=1, num_return_sequences=1,
        output_scores=True, return_dict_in_generate=True
    )

    # Turn the generated token ids back into text, dropping special tokens.
    outputs = tokenizer.batch_decode(output.sequences, skip_special_tokens=True)
    out.append(outputs)

# Flat list with one corrected sentence per processed input, in input order.
# `out` is nested per batch, which cannot be stored directly as a dataframe
# column; this flattened view can.
corrected_sentences = [sentence for batch_sentences in out for sentence in batch_sentences]

The corrected sentences were uploaded and saved in the original dataframe. 

## Evaluation

In [18]:
import Levenshtein as Lev
def wer_rasmus(s1, s2):
    """
    Word-level edit distance between two sentences.

    Both sentences are split on whitespace, and the Levenshtein distance is
    computed over the resulting word sequences. The value returned is the raw
    distance; it is not normalised by the sentence length.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    words1 = s1.split()
    words2 = s2.split()

    # The Levenshtein package only works on strings, so every distinct word
    # is given its own single character.
    vocabulary = set(words1 + words2)
    word2char = {word: chr(index) for index, word in enumerate(vocabulary)}

    encoded1 = ''.join(word2char[word] for word in words1)
    encoded2 = ''.join(word2char[word] for word in words2)

    return Lev.distance(encoded1, encoded2)

def wer(s1, s2):
    """
    Case-insensitive word error rate of hypothesis ``s1`` against reference ``s2``.

    The word-level edit distance from ``wer_rasmus`` is divided by the number
    of words in the reference.
    Arguments:
        s1 (string): hypothesis sentence
        s2 (string): reference sentence
    Returns:
        float: edit distance divided by the reference word count.
    """
    # Count reference words with the same whitespace tokenization that
    # wer_rasmus uses; split(" ") would also count the empty strings produced
    # by leading, trailing or repeated spaces and so deflate the error rate.
    reference_length = len(s2.split())
    # A reference without words would divide by zero; use 1 instead, so the
    # result is then simply the number of hypothesis words.
    return wer_rasmus(s1.lower(), s2.lower()) / max(reference_length, 1)
def calculate_wer(df):
    """
    Word error rate of the 'corrected' entry against the 'reference_text'
    entry of ``df``.

    ``df`` is anything indexable by those two keys, e.g. a dataframe row as
    passed by ``DataFrame.apply(..., axis=1)``.
    """
    hypothesis = df['corrected']
    reference = df['reference_text']
    return wer(hypothesis, reference)

In [20]:
# Load the dataframe that holds the corrected sentences.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
df = pd.read_pickle("ByT5_lr_1e4.pkl")

# Score every row, then report the mean of each WER column as a percentage.
df['corrected_wer'] = df.apply(calculate_wer, axis=1)
for label, column in (("Baseline WER: ", 'test_wer'), ("Best ByT5 model: ", 'corrected_wer')):
    print(label, df[column].mean()*100)


Baseline WER:  8.977595922015709
Best ByT5 model:  6.465655469246048
