# Preparing dataset

In [1]:
!pip install -q pyyaml==5.3.1

In [2]:
!pip install -q transformers==4.1.1

## Module importing

In [1]:
import os

import joblib
import numpy as np
import pandas as pd
import yaml

In [2]:
from pathlib import Path
from typing import Dict

from tqdm import tqdm

## Settings and useful functions

In [3]:
## проверить другой датасет
## улосжнить архитектуру: добавить Bi-LSTM
## попробовать другие лосс функции

In [4]:
def get_config(filename: Path) -> Dict:
    with open(filename, 'r') as file:
        config = yaml.load(file, Loader=yaml.FullLoader)
    return config

In [5]:
PATH2ROOT = Path('')
PATH2CONFIG = Path(PATH2ROOT / 'configs')

In [6]:
CONFIG = get_config(PATH2CONFIG / 'config.yml')

In [7]:
PATH2COURPUS = Path(PATH2ROOT / CONFIG['data']['path_to_corpus_folder'])

In [8]:
data = joblib.load(PATH2ROOT / CONFIG['data']['path_to_preproc_data'])

In [9]:
MODEL_NAME = 'youscan/ukr-roberta-base'

## Create Dataset

In [10]:
from typing import Dict, Iterable, Mapping, Union

import torch
import transformers
from torch.utils.data import Dataset
from tqdm.auto import tqdm
from transformers import AutoTokenizer


class NamedEntityRecognitionDataset(Dataset):
    """
    Dataset for NER task.
    """

    def __init__(
        self,
        texts: Iterable[Iterable[str]],
        tags: Iterable[Iterable[str]] = None,
        tokenizer: Union[
            str, transformers.tokenization_utils.PreTrainedTokenizer
        ] = 'distilbert-base-uncased',
        max_seq_len: int = None,
        lazy_mode: bool = True,
    ):
        self.tags = tags
        self.texts = texts

        if isinstance(tokenizer, str):
            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
        elif isinstance(tokenizer, transformers.tokenization_utils.PreTrainedTokenizer):
            self.tokenizer = tokenizer
        else:
            raise TypeError(
                "You pass wrong type of tokenizer. It should be a model name or PreTrainedTokenizer."
            )

        self.max_seq_len = max_seq_len
        self.length = len(texts)

        if self.max_seq_len < 3:
            raise ValueError("Max sequence length should be greather than 2")

        if not lazy_mode:
            pbar = tqdm(self.length, desc="tokenizing texts")
            self.encoded = [self._getitem_lazy(idx) for idx in pbar]
            del self.texts
            del self.tags

        self._getitem_fn = self._getitem_lazy if lazy_mode else self._getitem_encoded

    def __len__(self) -> int:
        return self.length

    def _getitem_encoded(self, index: int) -> Dict[str, torch.Tensor]:
        return torch.tensor(self.encoded[index])

    def _getitem_lazy(self, index: int) -> Dict[str, torch.Tensor]:
        sentence = self.texts[index]
        tag = self.tags[index]

        input_ids = []
        target_tag = []

        for i, word in enumerate(sentence):
            words_piece_ids = self.tokenizer.encode(
                word,
                max_length=self.max_seq_len,
                truncation=True,
                add_special_tokens=False,
            )
            input_ids.extend(words_piece_ids)
            target_tag.extend([tag[i]] * len(words_piece_ids))

        input_ids = (
            [self.tokenizer.cls_token_id]
            + input_ids[: self.max_seq_len - 2]
            + [self.tokenizer.sep_token_id]
        )

        attention_mask = [1] * len(input_ids)
        token_type_ids = [0] * len(input_ids)

        padding_len = self.max_seq_len - len(input_ids)
        input_ids = input_ids + ([self.tokenizer.pad_token_id] * padding_len)
        attention_mask = attention_mask + ([0] * padding_len)
        token_type_ids = token_type_ids + ([0] * padding_len)

        target_tag = [0] + target_tag[: self.max_seq_len - 2] + [0]
        target_tag = target_tag + ([0] * padding_len)

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'target_tag': torch.tensor(target_tag, dtype=torch.long),
        }

    def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
        return self._getitem_fn(index)

In [11]:
from sklearn import model_selection

In [12]:
texts = data['text'].values.tolist()
tags = data['tags'].values.tolist()

In [13]:
tag_map = {}
# For each label...
for (i, tag) in enumerate(set([item for sublist in tags for item in sublist])):
    # Map it to its integer.
    tag_map[tag] = i

In [14]:
for i in range(len(tags)):
    for j in range(len(tags[i])):
        tags[i][j] = tag_map[tags[i][j]]

In [15]:
num_tag = len(tag_map.keys())

In [16]:
train_texts, val_texts, train_tags, val_tags = model_selection.train_test_split(
    texts,
    tags,
    random_state=CONFIG['general']['seed'],
    test_size=CONFIG['general']['test_size'],
)

train_dataset = NamedEntityRecognitionDataset(
    texts=train_texts,
    tags=train_tags,
    tokenizer=MODEL_NAME,
    max_seq_len=CONFIG['model']['max_seq_length'],
    lazy_mode=True,
)

val_dataset = NamedEntityRecognitionDataset(
    texts=val_texts,
    tags=val_tags,
    tokenizer=MODEL_NAME,
    max_seq_len=CONFIG['model']['max_seq_length'],
    lazy_mode=True,
)

## попробовать больше воркеров
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=CONFIG['training']['train_batch_size'],
    num_workers=1,
    shuffle=True,
)

val_data_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=CONFIG['training']['valid_batch_size'],
    num_workers=1,
)

## Model

In [17]:
# loss_fct = CrossEntropyLoss(ignore_index=0)
#             # Only keep active parts of the loss
#             attention_mask_label = None
#             if attention_mask_label is not None:
#                 active_loss = attention_mask_label.view(-1) == 1
#                 active_logits = logits.view(-1, self.num_labels)[active_loss]
#                 active_labels = labels.view(-1)[active_loss]
#                 loss = loss_fct(active_logits, active_labels)
#             else:
#                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
#             return loss

In [60]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoConfig, AutoModel


def loss_fn(
    output: torch.Tensor,
    target: torch.Tensor,
    attention_mask: torch.Tensor,
    num_labels: int,
):
    lfn = nn.CrossEntropyLoss(ignore_index=0)
    active_loss = attention_mask.view(-1) == 1
    active_logits = output.view(-1, num_labels)[active_loss]
    active_labels = target.view(-1)[active_loss]

    loss = lfn(active_logits, active_labels)
    return loss


class NamedEntityRecognitionBertModel(nn.Module):
    def __init__(self, pretrained_model_name: str, num_tag: int):
        super().__init__()

        ## вынести все в конфиг файл
        hidden_dim = 256
        num_layers = 2
        dropout_rate_lstm = 0.3
        dropout_rate_fc = 0.4
        bidirectional = True
        output_dim = num_tag
        self.num_tag = num_tag

        config = AutoConfig.from_pretrained(pretrained_model_name)

        self.model = AutoModel.from_pretrained(pretrained_model_name, config=config)

        self.lstm = nn.LSTM(
            self.model.config.hidden_size,
            hidden_dim,
            num_layers=num_layers,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=dropout_rate_lstm if num_layers > 1 else 0,
        )

        self.droupout = nn.Dropout(dropout_rate_fc)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        token_type_ids: torch.Tensor,
        **kwargs: Dict,
    ):
        o1, _ = self.model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )

        # print('o1', o1.size()) [32, 128, 768] == [batch size, sent len, emb dim]
        # o1 = o1.permute(1, 0, 2) # [sent len, batch size, emb dim]

        logits, (hidden, cell) = self.lstm(o1)
        # logits = logits.permute(1, 0, 2)
        logits = self.fc(self.droupout(logits))
        return logits

    def freeze(self):
        for param in self.model.parameters():
            param.requires_grad = False

    def unfreeze(self):
        for param in self.model.parameters():
            param.requires_grad = True

In [61]:
## поскольку я больше не обучаю берт, то есть смысл переписать все на стандартные оптимизаторы торча
from transformers import AdamW, get_linear_schedule_with_warmup

In [62]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

model = NamedEntityRecognitionBertModel(pretrained_model_name=MODEL_NAME, num_tag=num_tag)
model = model.to(device)

cuda:0


## Runners

In [63]:
## клипать градиент
## добавить метрики
## переписать все на каталист

In [64]:
from tqdm.auto import tqdm


def train_fn(data_loader, model, optimizer, device, scheduler):
    model.train()
    final_loss = 0

    for data in tqdm(data_loader, total=len(data_loader)):
        for k, v in data.items():
            data[k] = v.to(device)
        optimizer.zero_grad()

        output_tag = model(**data)

        target_tag = data['target_tag']
        attention_mask = data['attention_mask']
        loss = loss_fn(output_tag, target_tag, attention_mask, model.num_tag)
        loss.backward()

        optimizer.step()
        scheduler.step()

        final_loss += loss.item()
    return final_loss / len(data_loader)


def eval_fn(data_loader, model, device):
    model.eval()
    final_loss = 0

    for data in tqdm(data_loader, total=len(data_loader)):
        for k, v in data.items():
            data[k] = v.to(device)

        output_tag = model(**data)

        target_tag = data['target_tag']
        attention_mask = data['attention_mask']
        loss = loss_fn(output_tag, target_tag, attention_mask, model.num_tag)

        final_loss += loss.item()
    return final_loss / len(data_loader)

In [65]:
model.freeze()
param_optimizer = list(filter(lambda p: p[1].requires_grad, model.named_parameters()))
model.unfreeze()

no_decay = [
    "bias",
    "LayerNorm.bias",
    "LayerNorm.weight",
    'gamma',
    'beta',
    'final_layer_norm.weight',
]
param_optimizer = [
    {
        "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.001,
    },
    {
        "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]

num_train_steps = int(
    len(train_texts)
    / CONFIG['training']['train_batch_size']
    * CONFIG['training']['num_epochs']
)
optimizer = AdamW(param_optimizer, lr=float(CONFIG['training']['learning_rate']))
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
)

In [66]:
model.freeze()

best_loss = np.inf
for epoch in range(CONFIG['training']['num_epochs']):
    train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler)
    test_loss = eval_fn(val_data_loader, model, device)
    print(f"Epoch {epoch}: Train Loss = {train_loss} Valid Loss = {test_loss}")
    if test_loss < best_loss:
        torch.save(model.state_dict(), PATH2ROOT / CONFIG['data']['path_to_logdir'])
        best_loss = test_loss

print('Best loss:', best_loss)

model.unfreeze()

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Epoch 0: Train Loss = 0.45387501269578934 Valid Loss = 0.3544324040412903


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Epoch 1: Train Loss = 0.24303041491657495 Valid Loss = 0.3122253119945526


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Epoch 2: Train Loss = 0.20226876717060804 Valid Loss = 0.3010270595550537


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Epoch 3: Train Loss = 0.17082868609577417 Valid Loss = 0.21730752289295197


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Epoch 4: Train Loss = 0.11663021799176931 Valid Loss = 0.19086824357509613


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Epoch 5: Train Loss = 0.10201627761125565 Valid Loss = 0.15390175580978394


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Epoch 6: Train Loss = 0.09400866134092212 Valid Loss = 0.14126679301261902


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Epoch 7: Train Loss = 0.07264558365568519 Valid Loss = 0.1264319270849228


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Epoch 8: Train Loss = 0.06904858537018299 Valid Loss = 0.146367609500885


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Epoch 9: Train Loss = 0.05500514735467732 Valid Loss = 0.13003167510032654


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Epoch 10: Train Loss = 0.04646294598933309 Valid Loss = 0.12456575781106949


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Epoch 11: Train Loss = 0.041842702543362975 Valid Loss = 0.12541718780994415


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Epoch 12: Train Loss = 0.04126089019700885 Valid Loss = 0.13212187588214874


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Epoch 13: Train Loss = 0.04701291909441352 Valid Loss = 0.1329248696565628


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Epoch 14: Train Loss = 0.034304189146496356 Valid Loss = 0.1329248696565628
Best loss: 0.12456575781106949


## Predition

In [67]:
# sentence = [['Звісно', 'Рома', 'дуже', 'хоче', 'аби', 'все', 'було', 'добре'], ['Організація', 'ООН', 'дуже', 'занепокоєна', 'станом', 'речей', 'в', 'Україні']]
sentence = 'всю округу фотографа , заповнив краєзнавець і тележурналіст Василь Нагірний , який народився в Україні та працював в ООН'
sentence = 'я був неймовірно здивований , коли Рома влаштувався в ТОВ " Київоблгазбуд "'
tokenized_sentence = val_dataset.tokenizer.encode(sentence, add_special_tokens=False)
sentence = sentence.split()
print(sentence)
print(tokenized_sentence)
print(len(sentence), len(tokenized_sentence))

['я', 'був', 'неймовірно', 'здивований', ',', 'коли', 'Рома', 'влаштувався', 'в', 'ТОВ', '"', 'Київоблгазбуд', '"']
[276, 994, 16920, 26726, 1421, 1393, 920, 47206, 11394, 2651, 281, 4299, 556, 1534, 368, 274, 10783, 2257, 556]
13 19


In [68]:
# model.load_state_dict(torch.load(PATH2ROOT / CONFIG['data']['path_to_logdir']))
# model = model.to(device)

In [69]:
test_dataset = NamedEntityRecognitionDataset(
    texts=[sentence],
    tags=[[0] * len(sentence)],
    tokenizer=MODEL_NAME,
    max_seq_len=CONFIG['model']['max_seq_length'],
    lazy_mode=True,
)

In [70]:
model.eval()

with torch.no_grad():
    for d in tqdm(test_dataset, total=len(test_dataset)):
        for k, v in d.items():
            d[k] = v.to(device).unsqueeze(0)

        outputs = model(**d)
        # tokens = train_dataset.tokenizer.convert_ids_to_tokens(outputs.argmax(2))
        print(
            outputs.argmax(2).cpu().numpy().reshape(-1)[1 : len(tokenized_sentence) + 1]
        )
        # print(tokens.cpu().numpy())

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

[2 2 2 2 2 2 2 2 2 4 4 2 2 2 2 2 3 3 3]



In [None]:
## добавить нормальную визуализацию результата

In [71]:
tag_map

{'LOC': 0, 'MISC': 1, 'O': 2, 'ORG': 3, 'PERS': 4}

In [None]:
outputs.size()

torch.Size([1, 128, 5])