# Preparing dataset

## Module importing

In [2]:
import os

import numpy as np
import pandas as pd
import yaml

In [3]:
from pathlib import Path
from typing import Dict

from tqdm import tqdm

## Settings and useful functions

In [4]:
def get_config(filename: Path) -> Dict:
    """Load a YAML configuration file.

    Args:
        filename: Path of the YAML file to read.

    Returns:
        The parsed document exactly as produced by ``yaml.load`` with
        ``yaml.FullLoader``.
    """
    # Be explicit about the encoding so parsing does not depend on the
    # platform's locale default (e.g. cp1251/cp1252 on Windows).
    with open(filename, 'r', encoding='utf-8') as file:
        return yaml.load(file, Loader=yaml.FullLoader)

In [5]:
# Project root as seen from this notebook: one directory up.
PATH2ROOT = Path('..')
# Folder from which the configuration file is read.
PATH2CONFIG = PATH2ROOT / 'configs'

In [6]:
# Parse `config.yml` once; later cells read data paths and training settings from it.
CONFIG = get_config(PATH2CONFIG / 'config.yml')

In [7]:
# Corpus folder from the config, resolved against the project root.
# NOTE(review): "COURPUS" looks like a typo for "CORPUS"; the name is kept
# because code outside this cell may reference it.
PATH2COURPUS = Path(PATH2ROOT / CONFIG['data']['path_to_corpus_folder'])

In [8]:
# Load the preprocessed data from the pickle named in the config.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
data = pd.read_pickle(PATH2ROOT / CONFIG['data']['path_to_preproc_data'])

In [9]:
# Pretrained model identifier, passed below both as the tokenizer name and as the encoder name.
MODEL_NAME = 'youscan/ukr-roberta-base'

## Create Dataset

In [10]:
from typing import Dict, Iterable, Mapping, Union

import torch
import transformers
from torch.utils.data import Dataset
from tqdm.auto import tqdm
from transformers import AutoTokenizer


class NamedEntityRecognitionDataset(Dataset):
    """
    Dataset for NER task.

    Each item is one sentence. Every word is encoded into word pieces, the
    word's tag is repeated for each of its pieces, the sequence is wrapped in
    the tokenizer's start/end special tokens, then truncated and padded to
    exactly ``max_seq_len`` positions.

    Args:
        texts: Sized, indexable collection of sentences; each sentence is a
            sequence of words.
        tags: Per-word tags aligned with ``texts``, or ``None`` for
            unlabeled data.
        tags_dict: Mapping from tag to integer id. When omitted and ``tags``
            is given, it is built from the sorted set of all tags.
        tokenizer: Model name to load with ``AutoTokenizer`` or a ready
            ``PreTrainedTokenizer`` instance.
        max_seq_len: Length of every returned sequence, including the two
            special tokens. Required; must be at least 3.
        lazy_mode: If ``True`` sentences are encoded on access; otherwise all
            sentences are encoded up front and the raw inputs are dropped.

    Raises:
        TypeError: If ``tokenizer`` has an unsupported type or
            ``max_seq_len`` is ``None``.
        ValueError: If ``max_seq_len`` is smaller than 3.
    """

    def __init__(
        self,
        texts: Iterable[Iterable[str]],
        tags: Iterable[Iterable[str]] = None,
        tags_dict: Mapping[str, int] = None,
        tokenizer: Union[
            str, transformers.tokenization_utils.PreTrainedTokenizer
        ] = 'distilbert-base-uncased',
        max_seq_len: int = None,
        lazy_mode: bool = True,
    ):
        self.tags = tags
        self.texts = texts
        self.tags_dict = tags_dict

        if isinstance(tokenizer, str):
            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
        elif isinstance(tokenizer, transformers.tokenization_utils.PreTrainedTokenizer):
            self.tokenizer = tokenizer
        else:
            raise TypeError(
                "You pass wrong type of tokenizer. It should be a model name or PreTrainedTokenizer."
            )

        # The default of None used to fail on `None < 3` with an opaque
        # message; keep the exception type but say what is wrong.
        if max_seq_len is None:
            raise TypeError("max_seq_len is required and must be an integer, got None")

        self.max_seq_len = max_seq_len
        self.length = len(texts)

        if self.max_seq_len < 3:
            raise ValueError("Max sequence length should be greather than 2")

        # Take special-token ids from the tokenizer. The previously hard-coded
        # 101/102/0 are BERT vocabulary ids and map to unrelated tokens in
        # other vocabularies (e.g. RoBERTa); they remain only as a fallback
        # for tokenizers that do not define the corresponding token.
        self._cls_id = self._special_id('cls_token_id', 101)
        self._sep_id = self._special_id('sep_token_id', 102)
        self._pad_id = self._special_id('pad_token_id', 0)

        if self.tags_dict is None and tags is not None:
            # {'class1': 0, 'class2': 1, 'class3': 2, ...}
            # using this instead of `sklearn.preprocessing.LabelEncoder`
            # to easily handle unknown target values
            unique_tags = sorted({item for sublist in tags for item in sublist})
            self.tags_dict = {tag: idx for idx, tag in enumerate(unique_tags)}

        if not lazy_mode:
            # Iterate over indices; wrapping the bare integer in tqdm is not iterable.
            pbar = tqdm(range(self.length), desc="tokenizing texts")
            self.encoded = [self._getitem_lazy(idx) for idx in pbar]
            del self.texts
            del self.tags

        self._getitem_fn = self._getitem_lazy if lazy_mode else self._getitem_encoded

    def _special_id(self, attribute: str, fallback: int) -> int:
        """Return the tokenizer's id stored in ``attribute``, or ``fallback`` if unset."""
        token_id = getattr(self.tokenizer, attribute, None)
        return fallback if token_id is None else token_id

    def __len__(self) -> int:
        return self.length

    def _getitem_encoded(self, index: int) -> Dict[str, torch.Tensor]:
        """Return the item that was encoded up front (non-lazy mode)."""
        # Items are already dicts of tensors; they cannot be passed to torch.tensor().
        return self.encoded[index]

    def _getitem_lazy(self, index: int) -> Dict[str, torch.Tensor]:
        """Encode sentence ``index`` into fixed-length tensors.

        Returns a dict with ``input_ids``, ``attention_mask``,
        ``token_type_ids`` and ``target_tag``. ``target_tag`` is an empty
        tensor when the dataset has no tags.
        """
        sentence = self.texts[index]
        # Only look tags up when they exist, so unlabeled datasets work.
        word_tags = self.tags[index] if self.tags is not None else None

        input_ids = []
        target_tag = []

        for i, word in enumerate(sentence):
            words_piece_ids = self.tokenizer.encode(
                word,
                max_length=self.max_seq_len,
                truncation=True,
                add_special_tokens=False,
            )
            input_ids.extend(words_piece_ids)
            if word_tags is not None:
                # Every word piece inherits the tag of the word it came from.
                target_tag.extend([word_tags[i]] * len(words_piece_ids))

        # Reserve two positions for the start/end special tokens.
        body_len = self.max_seq_len - 2
        input_ids = [self._cls_id] + input_ids[:body_len] + [self._sep_id]

        attention_mask = [1] * len(input_ids)
        token_type_ids = [0] * len(input_ids)

        padding_len = self.max_seq_len - len(input_ids)
        input_ids = input_ids + ([self._pad_id] * padding_len)
        attention_mask = attention_mask + ([0] * padding_len)
        token_type_ids = token_type_ids + ([0] * padding_len)

        if word_tags is not None:
            # Tags missing from the mapping are encoded as -1.
            target_tag = [self.tags_dict.get(y, -1) for y in target_tag]
            # Special-token and padding positions carry label id 0.
            target_tag = [0] + target_tag[:body_len] + [0]
            target_tag = target_tag + ([0] * padding_len)

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'target_tag': torch.tensor(target_tag, dtype=torch.long),
        }

    def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
        return self._getitem_fn(index)

In [13]:
from sklearn import model_selection, preprocessing

In [11]:
# Pull the sentence and tag columns out of the DataFrame as plain Python lists.
texts = data['text'].values.tolist()
tags = data['tags'].values.tolist()

In [11]:
# Dataset over the full data; no `tags_dict` is passed, so it is inferred from `tags`.
# NOTE(review): `dataset` is not used by any later cell shown here — presumably a smoke test.
dataset = NamedEntityRecognitionDataset(
    texts, tags=tags, tokenizer=MODEL_NAME, max_seq_len=16, lazy_mode=True
)

In [14]:
# One tag -> id mapping shared by both splits. If each dataset inferred its
# own mapping, label ids would disagree between train and validation whenever
# a tag is absent from one of the splits.
tag2idx = {
    tag: idx
    for idx, tag in enumerate(sorted({item for sublist in tags for item in sublist}))
}
num_tag = len(tag2idx)

# NOTE(review): with a length of 4 only two word pieces per sentence survive
# truncation (two positions go to special tokens) — confirm this is intended
# and not a debugging leftover.
MAX_SEQ_LEN = 4

train_texts, val_texts, train_tags, val_tags = model_selection.train_test_split(
    texts,
    tags,
    random_state=CONFIG['general']['seed'],
    test_size=CONFIG['general']['test_size'],
)

train_dataset = NamedEntityRecognitionDataset(
    texts=train_texts,
    tags=train_tags,
    tags_dict=tag2idx,
    tokenizer=MODEL_NAME,
    max_seq_len=MAX_SEQ_LEN,
    lazy_mode=True,
)
val_dataset = NamedEntityRecognitionDataset(
    texts=val_texts,
    tags=val_tags,
    tags_dict=tag2idx,
    tokenizer=MODEL_NAME,
    max_seq_len=MAX_SEQ_LEN,
    lazy_mode=True,
)

# NOTE(review): the training loader does not reshuffle between epochs
# (`shuffle` is left at its default) — confirm this is intended.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=CONFIG['training']['train_batch_size'], num_workers=1
)

val_data_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=CONFIG['training']['valid_batch_size'], num_workers=1
)

## Model

In [15]:
import torch
import torch.nn as nn
from transformers import AutoConfig, AutoModel


def loss_fn(
    output: torch.Tensor,
    target: torch.Tensor,
    attention_mask: torch.Tensor,
    num_labels: int,
):
    """
    Cross-entropy restricted to positions where ``attention_mask`` equals 1.

    Args:
        output: Logits; flattened to ``(-1, num_labels)``.
        target: Integer class ids; flattened to one id per position.
        attention_mask: Mask flattened alongside ``target``; positions whose
            value is not 1 are excluded from the loss.
        num_labels: Size of the trailing (class) dimension of ``output``.

    Returns:
        The loss produced by ``nn.CrossEntropyLoss`` with default settings.
    """
    criterion = nn.CrossEntropyLoss()

    flat_targets = target.view(-1)
    is_active = attention_mask.view(-1) == 1

    # Replace labels at inactive positions with the criterion's ignore index
    # so they do not contribute to the loss.
    ignored = torch.full_like(flat_targets, criterion.ignore_index)
    masked_targets = torch.where(is_active, flat_targets, ignored)

    return criterion(output.view(-1, num_labels), masked_targets)


class NamedEntityRecognitionBertModel(nn.Module):
    """
    Pretrained transformer encoder with a dropout + linear tagging head.

    Args:
        pretrained_model_name: Name or path passed to ``AutoConfig`` and
            ``AutoModel``.
        num_tag: Number of output classes of the tagging head.
    """

    def __init__(self, pretrained_model_name: str, num_tag: int):
        super().__init__()

        self.num_tag = num_tag

        config = AutoConfig.from_pretrained(pretrained_model_name)

        self.model = AutoModel.from_pretrained(pretrained_model_name, config=config)
        self.droupout1 = nn.Dropout(0.3)
        # Size the head from the encoder config rather than assuming 768;
        # 768 stays as the fallback for configs without `hidden_size`.
        hidden_size = getattr(config, 'hidden_size', 768)
        self.linear1 = nn.Linear(hidden_size, self.num_tag)

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        token_type_ids: torch.Tensor,
        target_tag: torch.Tensor = None,
    ):
        """
        Compute tag logits and, when targets are given, the masked loss.

        Returns:
            ``(tag, loss)`` where ``tag`` is the output of the linear head and
            ``loss`` is the result of ``loss_fn``, or ``None`` when
            ``target_tag`` is not provided.
        """
        outputs = self.model(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
        )
        # Index instead of tuple-unpacking: this works whether the encoder
        # returns a plain tuple or a ModelOutput, and regardless of how many
        # extra items it carries.
        sequence_output = outputs[0]

        bo_tag = self.droupout1(sequence_output)

        tag = self.linear1(bo_tag)

        if target_tag is None:
            return tag, None

        loss = loss_fn(tag, target_tag, attention_mask, self.num_tag)

        return tag, loss

In [16]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [18]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

model = NamedEntityRecognitionBertModel(
    pretrained_model_name=MODEL_NAME,
    num_tag=num_tag,
)
model.to(device)

cuda:0


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=506539003.0), HTML(value='')))




NamedEntityRecognitionBertModel(
  (model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm

In [None]:
# NOTE: `train_fn` and `eval_fn` are defined in the "Runners" cell further
# down; that cell has to be executed before this one.

# Split parameters into two groups: names matching `no_decay` get no weight decay.
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.001,
    },
    {
        "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]

# The scheduler is stepped once per batch, so the total is the number of
# batches per epoch times the number of epochs. Using len(train_texts) /
# batch_size drops the final partial batch and makes the schedule end early.
num_train_steps = len(train_data_loader) * CONFIG['training']['num_epochs']
optimizer = AdamW(optimizer_parameters, lr=CONFIG['training']['learning_rate'])
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
)

# Keep only the weights of the epoch with the lowest validation loss.
best_loss = np.inf
for epoch in range(CONFIG['training']['num_epochs']):
    train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler)
    test_loss = eval_fn(val_data_loader, model, device)
    print(f"Train Loss = {train_loss} Valid Loss = {test_loss}")
    if test_loss < best_loss:
        # NOTE(review): the config key is named `path_to_logdir`; torch.save
        # needs a file path, not a directory — confirm the configured value.
        torch.save(model.state_dict(), PATH2ROOT / CONFIG['data']['path_to_logdir'])
        best_loss = test_loss

print('Best loss:', best_loss)

## Runners

In [None]:
from tqdm.auto import tqdm


def train_fn(data_loader, model, optimizer, device, scheduler):
    """
    Run one training pass over ``data_loader``.

    Args:
        data_loader: Sized iterable of batches; each batch is a dict of
            tensors passed to the model as keyword arguments.
        model: Module whose call returns a tuple with the loss as its last item.
        optimizer: Optimizer stepped once per batch.
        device: Device every batch tensor is moved to.
        scheduler: Learning-rate scheduler stepped once per batch.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    final_loss = 0

    for data in tqdm(data_loader, total=len(data_loader)):
        batch = {k: v.to(device) for k, v in data.items()}
        optimizer.zero_grad()

        # The loss is the last element of the model output. Star-unpacking
        # works for the (tag, loss) pair the model in this file returns, as
        # well as for longer tuples.
        *_, loss = model(**batch)
        loss.backward()

        optimizer.step()
        scheduler.step()

        final_loss += loss.item()
    return final_loss / len(data_loader)


def eval_fn(data_loader, model, device):
    """
    Evaluate ``model`` on ``data_loader`` without updating it.

    Args:
        data_loader: Sized iterable of batches; each batch is a dict of
            tensors passed to the model as keyword arguments.
        model: Module whose call returns a tuple with the loss as its last item.
        device: Device every batch tensor is moved to.

    Returns:
        The mean of the per-batch losses.
    """
    model.eval()
    final_loss = 0

    # No gradients are needed for evaluation; disabling autograd saves
    # memory and time.
    with torch.no_grad():
        for data in tqdm(data_loader, total=len(data_loader)):
            batch = {k: v.to(device) for k, v in data.items()}

            # The loss is the last element of the model output. Star-unpacking
            # works for the (tag, loss) pair the model in this file returns,
            # as well as for longer tuples.
            *_, loss = model(**batch)

            final_loss += loss.item()
    return final_loss / len(data_loader)