### Install dependencies and load packages

In [None]:
!pip3 install torchmetrics==0.4.1
!pip3 install transformers==4.8.2
!pip3 install pytorch_lightning==1.3.8
!pip3 install nltk
!pip3 install Levenshtein

In [2]:
from transformers import T5ForConditionalGeneration, AutoTokenizer
import torch
from torch.utils.data import Dataset
from MLN_individual_files.helper_classes import *
import numpy as np
import pickle
from transformers import get_scheduler
from transformers import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

## Preview data and fine-tune MLN model.

In [3]:
# Model and tokenizer are both loaded from the same Hugging Face checkpoint id.
base_checkpoint = 'ufal/byt5-small-multilexnorm2021-da'
model = T5ForConditionalGeneration.from_pretrained(base_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

Downloading: 100%|██████████| 706/706 [00:00<00:00, 227kB/s]
Downloading: 100%|██████████| 1.20G/1.20G [01:47<00:00, 11.1MB/s] 
Downloading: 100%|██████████| 2.59k/2.59k [00:00<00:00, 1.07MB/s]
Downloading: 100%|██████████| 2.50k/2.50k [00:00<00:00, 801kB/s]


In [5]:
# Read the pickled inputs and expected outputs from disk.
# pickle.load can execute arbitrary code, so only load files you trust.
with open('data/mln_data_test_inputs.pkl', 'rb') as inputs_file:
    X_train = pickle.load(inputs_file)
with open('data/mln_data_test_outputs.pkl', 'rb') as targets_file:
    y_train = pickle.load(targets_file)

# Build the fine-tuning dataset with only_include_corrections switched on.
data = MultiPlexDataset(
    X_train,
    y_train,
    only_include_corrections=True,
)

Dataset initialized...


In this demo the test set is loaded to keep the example small. For the actual fine-tuning we of course used the training set. 
Only data points that contain an error (i.e. where a correction is needed) are included.

In [7]:
data[0]  # Preview the first sample; its input differs from the expected output, so it contains an error.

{'input_sample': 'gyset der har siddet sammenkrøbet i <extra_id_0>nakke<extra_id_1> regionen udløses',
 'expected_output': 'nakkeregionen'}

In [11]:
# Serve the dataset in batches of 8. Batches are assembled by CollateFunctor_Train,
# which receives the tokenizer (presumably to tokenize and pad each batch — verify in helper_classes).
dataloader = DataLoader(data, batch_size=8, collate_fn=CollateFunctor_Train(tokenizer))

In [12]:
# Optimizer and learning-rate schedule for the fine-tuning run.
optimizer = AdamW(model.parameters(), lr=0.3e-3)
num_epochs = 1
# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
num_training_steps = len(dataloader) * num_epochs
# NOTE(review): with the 2624 steps of this demo, the 4000 warm-up steps are never
# completed, so the learning rate presumably only ramps up and never decays.
# Confirm that this warm-up length is only meant for the full training set.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=4000,
)
print(num_training_steps)

# Train on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)

2624


In [None]:
# Fine-tune the model: one optimizer and scheduler step per batch, repeated for
# `num_epochs` passes over the dataloader (num_training_steps was computed as
# num_epochs * len(dataloader), so the loop has to honour num_epochs as well).
LOG_EVERY = 50  # number of training steps between two loss reports

progress_bar = tqdm(range(num_training_steps))
running_loss = 0.0
step = 0
model.train()
for epoch in range(num_epochs):
    for batch in dataloader:
        # The id entries are not passed to the model; every other entry is moved
        # to the training device and forwarded as a keyword argument.
        batch = {k: v.to(device) for k, v in batch.items() if k not in ('sentence_ids', 'word_ids')}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

        # Report the mean loss of the last LOG_EVERY steps. Counting completed
        # steps ensures every report averages over exactly LOG_EVERY losses.
        running_loss += loss.item()
        step += 1
        if step % LOG_EVERY == 0:
            print('[%5d] loss: %.3f' %
                  (step, running_loss / LOG_EVERY))
            running_loss = 0.0
progress_bar.close()

The MLN model was fine-tuned and the model was uploaded to huggingface.
https://huggingface.co/jenspt/mln_ft 

## Inferring

We run inference with the fine-tuned model, which we download from the Hugging Face Hub. To get a baseline using the original MLN model instead, we could simply download the checkpoint that the MLN team has made available.

In [2]:
# The weights come from the fine-tuned checkpoint, while the tokenizer is
# loaded from the original ufal checkpoint.
finetuned_checkpoint = 'jenspt/mln_ft'
tokenizer_checkpoint = 'ufal/byt5-small-multilexnorm2021-da'
model = T5ForConditionalGeneration.from_pretrained(finetuned_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [3]:
# Read the pickled test inputs and reference outputs.
# pickle.load can execute arbitrary code, so only load files you trust.
with open('data/mln_data_test_inputs.pkl', 'rb') as inputs_file:
    X_test = pickle.load(inputs_file)
with open('data/mln_data_test_outputs.pkl', 'rb') as targets_file:
    y_test = pickle.load(targets_file)

# Wrap the test data in a dataset and build the loader that is iterated during inference.
# NOTE(review): the helper is called get_train_dataloader but is used for inference here —
# confirm in helper_classes that it does not shuffle or otherwise alter the test data.
data = MultilexnormDataset(X_test, y_test)
data_loader = get_train_dataloader(data, tokenizer)

In [None]:
from pytorch_lightning.utilities.apply_func import move_data_to_device

# Generate one prediction per dataset item and pass every batch of predictions
# to the OutputAssembler, which is created with the output directory and the dataset.
output_dir = "drive/My Drive/projekt/"
assembler = OutputAssembler(output_dir, data)
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
model = model.to(device)
model.eval()

# Generation settings shared by all batches: a single beam and a single
# returned sequence of at most 32 tokens per input.
generation_kwargs = dict(
    repetition_penalty=1.0, length_penalty=1.0, max_length=32,
    num_beams=1, num_return_sequences=1,
    output_scores=True, return_dict_in_generate=True,
)

for batch_idx, batch in enumerate(data_loader):
    batch = move_data_to_device(batch, device)
    sentence_ids = batch["sentence_ids"]
    word_ids = batch["word_ids"]
    output = model.generate(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        **generation_kwargs,
    )

    n_items = len(sentence_ids)
    decoded = tokenizer.batch_decode(output.sequences, skip_special_tokens=True)
    out_dict = {
        # Each item gets a one-element list holding its decoded prediction.
        "predictions": [decoded[k:k + 1] for k in range(n_items)],
        # The scores are constant placeholders (0.0), not model scores.
        "scores": [[0.0] for _ in range(n_items)],
        "sentence_ids": sentence_ids,
        "word_ids": word_ids,
    }
    assembler.step(out_dict)

    # Progress report; the total is a ceiling division that assumes batches of 8 items.
    n_batches = (len(data) + 8 - 1) // 8
    print(f"{batch_idx} / {n_batches}", flush=True)
assembler.flush()

## Evaluation

The test set has been corrected by the model and the results have been saved as a .txt file. In this section we open that file and calculate the WER, BLEU and GLEU scores.

In [4]:
### READ FILES

# open_dataset yields two collections of sentences (inputs and outputs);
# every sentence is joined into a single space-separated string.
inputs, outputs = open_dataset('data/outputs_mln_ft.txt')
transcribed = list(map(' '.join, inputs))
corrected = list(map(' '.join, outputs))

# The reference sentences are read from the pickled outputs and joined the same way.
with open('data/mln_data_test_outputs.pkl', 'rb') as reference_file:
    reference = list(map(" ".join, pickle.load(reference_file)))

In [5]:
import pandas as pd

# One row per sentence: reference text, transcription and corrected text side by side.
# zip stops at the shortest list, so the three lists are expected to have equal length.
rows = list(zip(reference, transcribed, corrected))
column_names = ['reference_text', 'transcription', 'corrected']
mln_df = pd.DataFrame(rows, columns=column_names)
mln_df.head()

Unnamed: 0,reference_text,transcription,corrected
0,gyset der har siddet sammenkrøbet i nakkeregio...,gyset der har siddet sammenkrøbet i nakke regi...,gylfi dér er sidde sammenkrøbene inden nakkere...
1,det er et enormt befolkningstal sammenlignet m...,det er et enormt befolkningstal sammenlignet m...,der er enormt enorm befolkningstal sammenligne...
2,de seks balletter er ikke alle avantgardestykk...,de seks balletter er ikke alle avangard stykke...,det seksballetter balletter har alle al avanga...
3,stakkels davedarling,stakkels dave darling,stakels davedarling darling
4,det får han osse,det får han også,dét for hr osse


In [7]:
# calculate_wer is applied to every row and the result is stored in a new column.
wer_per_sentence = mln_df.apply(calculate_wer, axis=1)
mln_df['corrected_wer'] = wer_per_sentence
mln_df.head()

Unnamed: 0,reference_text,transcription,corrected,corrected_wer
0,gyset der har siddet sammenkrøbet i nakkeregio...,gyset der har siddet sammenkrøbet i nakke regi...,gylfi dér er sidde sammenkrøbene inden nakkere...,0.666667
1,det er et enormt befolkningstal sammenlignet m...,det er et enormt befolkningstal sammenlignet m...,der er enormt enorm befolkningstal sammenligne...,0.642857
2,de seks balletter er ikke alle avantgardestykk...,de seks balletter er ikke alle avangard stykke...,det seksballetter balletter har alle al avanga...,0.7
3,stakkels davedarling,stakkels dave darling,stakels davedarling darling,0.666667
4,det får han osse,det får han også,dét for hr osse,0.75


In [8]:
# Average of the per-sentence WER values, expressed as a percentage.
mln_df['corrected_wer'].mean() * 100

75.16467803146487

In [10]:
# Print the BLEU score followed by the GLEU score computed from the DataFrame.
# NOTE(review): the helpers' definitions are not visible here; going by the warnings printed
# below, the BLEU helper presumably scores without a smoothing function — confirm in helper_classes.
print(calculate_bleu_normalized(mln_df))
print(calculate_gleu_normalized(mln_df))

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


0.006133337391333845
0.1129387766622926
