In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Dec  6 10:59:09 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!


To-do:
* Train framework
  * Load data
  * Ensure enough memory when training
  * Upload trained model directly to hugging face
  * Download trained model and check WER and CER (Ask Rasmus and Martin for metric_function)

### Data load

In [1]:
import sys
from google.colab import drive
drive.mount('/content/drive')
JENS = False
if JENS:
  drive_path = 'drive/My Drive/projekt/'
else:
  drive_path = 'drive/My Drive/Colab Notebooks/Deep learning/projekt/'


sys.path.append(drive_path)

Mounted at /content/drive


### Transformer load

In [2]:
!git clone https://github.com/ufal/multilexnorm2021
#%cd multilexnorm2021

!pip3 install torchmetrics==0.4.1
!pip3 install transformers==4.8.2
!pip3 install pytorch_lightning==1.3.8

Cloning into 'multilexnorm2021'...
remote: Enumerating objects: 153, done.[K
remote: Counting objects: 100% (153/153), done.[K
remote: Compressing objects: 100% (114/114), done.[K
remote: Total 153 (delta 65), reused 108 (delta 33), pack-reused 0[K
Receiving objects: 100% (153/153), 240.55 KiB | 3.39 MiB/s, done.
Resolving deltas: 100% (65/65), done.
Collecting torchmetrics==0.4.1
  Downloading torchmetrics-0.4.1-py3-none-any.whl (234 kB)
[K     |████████████████████████████████| 234 kB 5.3 MB/s 
Installing collected packages: torchmetrics
Successfully installed torchmetrics-0.4.1
Collecting transformers==4.8.2
  Downloading transformers-4.8.2-py3-none-any.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 5.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 33.7 MB/s 
[?25hCollecting huggi

In [None]:
from transformers import T5ForConditionalGeneration, AutoTokenizer

model = T5ForConditionalGeneration.from_pretrained('ufal/byt5-small-multilexnorm2021-da')
tokenizer = AutoTokenizer.from_pretrained('ufal/byt5-small-multilexnorm2021-da')

#### Load functions and classes from MultiLexNorm github

In [3]:
import torch

In [4]:
from torch.utils.data import Dataset


class AbstractDataset(Dataset):
    def __init__(self, inputs, outputs, filter='none'):
        if filter == "none":
            self.filter = lambda x: True
        elif filter == "alnum":
            def replace(word): return word.replace("'", "").replace("-", "").replace(" ", "")
            self.filter = lambda x: len(replace(x)) == 0 or replace(x).isalnum()

In [5]:
class Indexer:
    def __init__(self, valid_indices):
        self.valid_indices = valid_indices
        lengths = [len(sentence_indices) for sentence_indices in self.valid_indices]
        self.cumsum = torch.LongTensor([0] + lengths).cumsum(dim=0)

    def get_indices(self, index):
        sentence_index = torch.searchsorted(self.cumsum, index, right=True).item() - 1
        word_index = index - self.cumsum[sentence_index]
        word_index = self.valid_indices[sentence_index][word_index]

        return sentence_index, word_index

    def __len__(self):
        return self.cumsum[-1].item()

#### Training dataset class

In [6]:
class MultilexnormDataset(AbstractDataset):
    def __init__(self, inputs, outputs):
        super().__init__(inputs, outputs)
        self.inputs = inputs
        self.outputs = outputs

        valid_indices = [[i for i, word in enumerate(sentence) if self.filter(word)] for sentence in inputs]
        self.indexer = Indexer(valid_indices)

    def __getitem__(self, index):
        sentence_index, word_index = self.indexer.get_indices(index)

        out = self.outputs[sentence_index][word_index]
        raw = self.inputs[sentence_index]

        raw = raw[:word_index] + ["<extra_id_0>", raw[word_index], "<extra_id_1>"] + raw[word_index+1:]
        raw = ' '.join(raw)

        return raw, out, sentence_index, word_index

    def __len__(self):
        return len(self.indexer)

In [7]:
from typing import List
class MultiPlexDataset(Dataset):

    def __init__(self,
                 X,
                 y,
                 only_include_corrections: bool = True):
        """

        :param path_to_files: List of paths to the files with data
        :param only_include_corrections: Whether to only include samples where there are corrections
        """

        self.only_include_corrections = only_include_corrections
        self.dataset_counter = 0
        self.data = {}


        for norms, refs in zip(X,y):
          self.create_samples(norms,refs)

        print("Dataset initialized...")

    def create_samples(self, norms, refs):
        if norms and refs:
            for i, word in enumerate(norms):

                if self.only_include_corrections and word == refs[i]:
                    continue

                if i == 0:
                    sample_input = "<extra_id_0>" + word + "<extra_id_1> " + " ".join(norms[i + 1:])
                elif i == len(norms) - 1:
                    sample_input = " ".join(norms[:i]) + " <extra_id_0>" + word + "<extra_id_1>"
                else:
                    sample_input = " ".join(norms[:i]) + " <extra_id_0>" + word + "<extra_id_1> " + " ".join(
                        norms[i + 1:])

                self.data[self.dataset_counter] = {"input_sample": sample_input, "expected_output": refs[i]}
                self.dataset_counter += 1

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data.keys())

In [8]:
def open_dataset(path, load_outputs=True):
    with open(path) as f:
        sentences = f.read().split("\n\n")[:-1]
    sentences = [s.split('\n') for s in sentences]
    inputs = [[w.split('\t')[0] for w in s] for s in sentences]

    if not load_outputs:
        return inputs

    outputs = [[w.split('\t')[1] for w in s] for s in sentences]
    return inputs, outputs

In [9]:
import numpy as np
import pickle
Write = False

if Write:

  from sklearn.model_selection import train_test_split
  inputs, outputs = open_dataset(drive_path+'dataframes/mln_data.txt')
  X_train, X_test, y_train, y_test = train_test_split(inputs, outputs, test_size=0.1, random_state=42)


  with open(drive_path + 'dataframes/mln_data_test_inputs.pkl', 'wb') as f:
    pickle.dump(X_test, f)
  with open(drive_path + 'dataframes/mln_data_test_outputs.pkl', 'wb') as f:
    pickle.dump(y_test, f)

  with open(drive_path + 'dataframes/mln_data_train_inputs.pkl', 'wb') as f:
    pickle.dump(X_train, f)
  with open(drive_path + 'dataframes/mln_data_train_outputs.pkl', 'wb') as f:
    pickle.dump(y_train, f)

else:
  with open(drive_path + 'dataframes/mln_data_train_inputs.pkl', 'rb') as f:
    X_train = pickle.load(f)
  with open(drive_path + 'dataframes/mln_data_train_outputs.pkl', 'rb') as f:
    y_train = pickle.load(f)


In [10]:
#
data = MultiPlexDataset(X_train, y_train, only_include_corrections=True)
#data.data

Dataset initialized...


In [11]:
print(len(data))

187612


Function that converts the dataclass to tensors

In [None]:

class CollateFunctor_Train:
    def __init__(self, tokenizer, encoder_max_length=320, decoder_max_length=32):
        self.tokenizer = tokenizer
        self.encoder_max_length = encoder_max_length
        self.decoder_max_length = decoder_max_length

    def __call__(self, samples):
        inputs = list(map(lambda x: x["input_sample"], samples))

        inputs = self.tokenizer(
            inputs, padding=True, truncation=True, pad_to_multiple_of=8,
            max_length=self.encoder_max_length, return_attention_mask=True, return_tensors='pt'
        )

        outputs = list(map(lambda x: x["expected_output"], samples))

        outputs = self.tokenizer(
            outputs, padding=True, truncation=True, pad_to_multiple_of=8,
            max_length=self.decoder_max_length, return_attention_mask=True, return_tensors='pt'
        )

        batch = {
            "input_ids": inputs.input_ids,
            "attention_mask": inputs.attention_mask,
            "labels": outputs.input_ids,
            "decoder_attention_mask": outputs.attention_mask
        }
        batch["labels"][batch["labels"] == self.tokenizer.pad_token_id] = -100  # used to mask the loss in T5
        return batch


In [None]:
#data_loader = get_train_dataloader(data, tokenizer)

In [None]:
#
GIT_TING = True

if GIT_TING:
  !sudo apt-get install git-lfs
  !git lfs install
  #find auth_token by logging in to huggingface with
  !huggingface-cli login
  #create repo
  #!huggingface-cli repo create mln_ft

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.
Error: Failed to call git rev-parse --git-dir --show-toplevel: "fatal: not a git repository (or any of the parent directories): .git\n"
Git LFS initialized.

        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        
Username: jenspt
Password: 
Logi

# Attempt at training loop

In [None]:
# Use with a datalodaer
tokenizer = AutoTokenizer.from_pretrained("ufal/byt5-small-multilexnorm2021-da")
dataloader = DataLoader(data, batch_size=8, collate_fn=CollateFunctor_Train(tokenizer))

from transformers import get_scheduler
from transformers import AdamW


optimizer = AdamW(model.parameters(), lr=0.3e-3)
num_epochs = 1
num_training_steps = num_epochs * len(dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=4000,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

23452


In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))
running_loss=0
model.train()
for i, batch in enumerate(dataloader):
    batch = {k: v.to(device) for k, v in batch.items() if k != 'sentence_ids' and k != 'word_ids'}
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)
    # print statistics
    running_loss += loss.item()
    if i % 50 == 0:    # print every 2000 mini-batches
        print('[%d, %5d] loss: %.3f' %
              (epoch + 1, i + 1, running_loss / 50))
        running_loss = 0.0

  0%|          | 0/23452 [00:00<?, ?it/s]

[1,     1] loss: 0.013
[1,    51] loss: 1.058
[1,   101] loss: 1.014
[1,   151] loss: 0.856
[1,   201] loss: 0.870
[1,   251] loss: 0.887
[1,   301] loss: 0.833
[1,   351] loss: 0.786
[1,   401] loss: 0.769
[1,   451] loss: 0.678
[1,   501] loss: 0.751
[1,   551] loss: 0.641
[1,   601] loss: 0.654
[1,   651] loss: 0.636
[1,   701] loss: 0.573
[1,   751] loss: 0.522
[1,   801] loss: 0.499
[1,   851] loss: 0.502
[1,   901] loss: 0.544
[1,   951] loss: 0.573
[1,  1001] loss: 0.451
[1,  1051] loss: 0.483
[1,  1101] loss: 0.502
[1,  1151] loss: 0.434
[1,  1201] loss: 0.438
[1,  1251] loss: 0.462
[1,  1301] loss: 0.430
[1,  1351] loss: 0.419
[1,  1401] loss: 0.412
[1,  1451] loss: 0.488
[1,  1501] loss: 0.706
[1,  1551] loss: 0.765
[1,  1601] loss: 0.727
[1,  1651] loss: 0.768
[1,  1701] loss: 0.638
[1,  1751] loss: 0.659
[1,  1801] loss: 0.708
[1,  1851] loss: 0.699
[1,  1901] loss: 0.662
[1,  1951] loss: 0.649
[1,  2001] loss: 0.681
[1,  2051] loss: 0.626
[1,  2101] loss: 0.655
[1,  2151] 

In [None]:
model.push_to_hub("jenspt/mln_ft", use_auth_token= 'xLBCMTdwnVYcIXUMjmmLNkHwXYGZDlVmhQNcCOMgmTNPkswRbiDGcfuhhVGAdbDxyHiMxpATszDjiPslvNQXJQuMLpfWTXupWkEvDvKRkGwnCRKPfGbcJGuvOzmKKLwP')

'https://huggingface.co/jenspt/mln_ft/commit/ba9c7030c9414c2cd733e027ddbbcf3ef0b82918'

In [None]:
!git config --global user.email "jens.perregaard.thorsen@gmail.com"
!git config --global user.name "jenspt"


# Baseline using MultiLexNorm as it is

In [None]:
import pickle
with open(drive_path + 'dataframes/mln_data_test_inputs.pkl', 'rb') as f:
  X_test = pickle.load(f)
with open(drive_path + 'dataframes/mln_data_test_outputs.pkl', 'rb') as f:
  y_test = pickle.load(f)
data = MultilexnormDataset(X_test, y_test)

In [None]:
class CollateFunctor:
    def __init__(self, tokenizer, encoder_max_length, decoder_max_length):
        self.tokenizer = tokenizer
        self.encoder_max_length = encoder_max_length
        self.decoder_max_length = decoder_max_length

    def __call__(self, samples):
        inputs, outputs, sentence_indices, word_indices = map(list, zip(*samples))
        
        inputs = self.tokenizer(
            inputs, padding=True, truncation=True, pad_to_multiple_of=8,
            max_length=self.encoder_max_length, return_attention_mask=True, return_tensors='pt'
        )
        outputs = self.tokenizer(
            outputs, padding=True, truncation=True, pad_to_multiple_of=8,
            max_length=self.decoder_max_length, return_attention_mask=True, return_tensors='pt'
        )
        

        batch = {
            "input_ids": inputs.input_ids,
            "attention_mask": inputs.attention_mask,
            "labels": outputs.input_ids,
            "decoder_attention_mask": outputs.attention_mask,
            "word_ids": word_indices,
            "sentence_ids": sentence_indices
        }
        batch["labels"][batch["labels"] == self.tokenizer.pad_token_id] = -100  # used to mask the loss in T5
        return batch

from torch.utils.data import DataLoader
from torch.optim import AdamW
def get_train_dataloader(dataset,tokenizer):
    collate_fn = CollateFunctor(tokenizer, 320, 32)

    return DataLoader(
        dataset, batch_size=8, shuffle=False, drop_last=True,
        num_workers=0, collate_fn=collate_fn
    )

In [None]:
import os
import os.path

class OutputAssembler:
    def __init__(self, directory, dataset):
        self.directory = directory
        self.dataset = dataset
        self.postprocessing = {
            "none": NonePostprocessor,
            "alnum": AlnumPostprocessor,
        }['alnum'](1.0)

        self.cache = {}

    def step(self, output_dict):
        output_dict = (output_dict["predictions"], output_dict["scores"], output_dict["sentence_ids"], output_dict["word_ids"])
        for word_preds, scores, sent_id, word_id in zip(*output_dict):
            word_preds = [w.replace('\n', '').replace('\t', ' ') for w in word_preds]
            pairs = list(zip(word_preds, scores))

            self.cache.setdefault(sent_id, {})[word_id] = pairs

    def flush(self):
        predictions = self.assemble(self.cache)
        inputs = self.dataset.inputs

        raw_path = f"{self.directory}/raw_outputs_mln_base.txt"
        postprocessed_path = f"{self.directory}/outputs_mln_base.txt"

        if not os.path.isdir(self.directory):
            os.mkdir(self.directory)

        with open(raw_path, "w") as f:
            for i, input_sentence in enumerate(inputs):
                for j, input_word in enumerate(input_sentence):
                    try:
                        prediction_string = '\t'.join([f"{w}\t{s}" for w, s in predictions[i][j]])
                    except:
                        print(i, j, len(predictions[i]))
                        for k, p in enumerate(predictions[i]):
                            print(k, p)
                        print(flush=True)
                        exit()
                    line = f"{input_word}\t{prediction_string}"
                    f.write(f"{line}\n")
                f.write("\n")

        self.postprocessing.process_file(raw_path, postprocessed_path)

    def assemble(self, prediction_dict):
        prediction_list = []
        for sent_id, raw_sentence in enumerate(self.dataset.inputs):
            prediction_list.append(
                [prediction_dict.get(sent_id, {}).get(word_id, [(raw_word, 0.0)]) for word_id, raw_word in enumerate(raw_sentence)]
            )

        return prediction_list

class AbstractPostprocessor:
    def __init__(self, bias=1.0):
        self.bias = bias

    def __call__(self, raw, predictions):
        pass

    def process_file(self, input_path, output_path):
        with open(input_path, "r") as f:
            sentences = f.read().split("\n\n")[:-1]
            sentences = [s.split('\n') for s in sentences]

        with open(output_path, "w") as f:
            for sentence in sentences:
                for word in sentence:
                    raw, *predictions = word.split('\t')
                    predictions = [(word, float(score)) for word, score in zip(predictions[::2], predictions[1::2])]
                    prediction = self(raw, predictions)
                    f.write(f"{raw}\t{prediction}\n")
                f.write("\n")

    def rebalance(self, raw, predictions):
        predictions = [(w, s) if w != raw else (w, s*self.bias) for w, s in predictions]
        predictions = sorted(predictions, key=lambda item: item[1], reverse=True)
        return predictions


class NonePostprocessor(AbstractPostprocessor):
    def __call__(self, raw, predictions):
        predictions = self.rebalance(raw, predictions)
        return predictions[0][0]


class AlnumPostprocessor(AbstractPostprocessor):
    def __call__(self, raw, predictions):
        if raw.isdigit() and len(raw) > 1:
            return raw
        if not raw.replace("'", "").isalnum():
            return raw
        predictions = self.rebalance(raw, predictions)
        return predictions[0][0]

In [None]:
from transformers import T5ForConditionalGeneration, AutoTokenizer

model = T5ForConditionalGeneration.from_pretrained('ufal/byt5-small-multilexnorm2021-da')
tokenizer = AutoTokenizer.from_pretrained('ufal/byt5-small-multilexnorm2021-da')

Downloading:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

In [None]:
data_loader = get_train_dataloader(data, tokenizer)

In [None]:
from pytorch_lightning.utilities.apply_func import move_data_to_device
output_dir = "drive/My Drive/projekt/"
assembler = OutputAssembler(output_dir, data)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

for i, batch in enumerate(data_loader):
    batch = move_data_to_device(batch, device)
    sentence_ids, word_ids = batch["sentence_ids"], batch["word_ids"]
    output = model.generate(
            input_ids=batch["input_ids"], attention_mask=batch["attention_mask"],
            repetition_penalty=1.0, length_penalty=1.0, max_length=32,
            num_beams=1, num_return_sequences=1,
            output_scores=True, return_dict_in_generate=True
        )

    scores = [[0.0] for i in range(len(sentence_ids))]
    outputs = tokenizer.batch_decode(output.sequences, skip_special_tokens=True)
    outputs = [outputs[i:(i+1)] for i in range(len(sentence_ids))]

    out_dict = {
        "predictions": outputs,
        "scores": scores,
        "sentence_ids": sentence_ids,
        "word_ids": word_ids,
    }
    assembler.step(out_dict)
    print(f"{i} / {(len(data) + 8 - 1) // 8}", flush=True)
assembler.flush()

[1;30;43mStreaming af output blev afkortet til de sidste 5000 linjer.[0m
15583 / 20584
15584 / 20584
15585 / 20584
15586 / 20584
15587 / 20584
15588 / 20584
15589 / 20584
15590 / 20584
15591 / 20584
15592 / 20584
15593 / 20584
15594 / 20584
15595 / 20584
15596 / 20584
15597 / 20584
15598 / 20584
15599 / 20584
15600 / 20584
15601 / 20584
15602 / 20584
15603 / 20584
15604 / 20584
15605 / 20584
15606 / 20584
15607 / 20584
15608 / 20584
15609 / 20584
15610 / 20584
15611 / 20584
15612 / 20584
15613 / 20584
15614 / 20584
15615 / 20584
15616 / 20584
15617 / 20584
15618 / 20584
15619 / 20584
15620 / 20584
15621 / 20584
15622 / 20584
15623 / 20584
15624 / 20584
15625 / 20584
15626 / 20584
15627 / 20584
15628 / 20584
15629 / 20584
15630 / 20584
15631 / 20584
15632 / 20584
15633 / 20584
15634 / 20584
15635 / 20584
15636 / 20584
15637 / 20584
15638 / 20584
15639 / 20584
15640 / 20584
15641 / 20584
15642 / 20584
15643 / 20584
15644 / 20584
15645 / 20584
15646 / 20584
15647 / 20584
15648 / 20584
1