In [45]:
from crf import CRF
from transformers import RobertaModel, AutoTokenizer
import torch.nn as nn
import numpy as np
import torch
from typing import List, Tuple
from torch.utils.data import Dataset
from torch.nn.functional import log_softmax
import evaluate
from dataclasses import dataclass
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score

@dataclass
class Instance:
	words: List[str]
	ori_words: List[str]
	labels: List[str] = None
	prediction: List[str]  = None

In [46]:
ROBERTA_PATH = '/home/dzigen/Desktop/medics2023/NLP_MODULE/models/RuBioRoBERTa'
BATCH_SIZE = 4
DEVICE = 'cuda'
EPOCHS = 30
LAYERS_TO_HOLD = ['23', '22', '21', 'pooler']

In [47]:
tagged_names = ["КТР","ЧСС","ТВП","ТРИКУСП РЕГУРГ","ХГЧ","PAPP-A","АФП",
                "Ингибин А","св эстрадиол","Возраст","Вес","Рост","ИМТ","ФК"]

LABEL_LIST = ['O']
for i, f_name in enumerate(tagged_names):
    f_b, f_i = f"B-L{i+1}", f"I-L{i+1}"
    LABEL_LIST.append(f_b)
    LABEL_LIST.append(f_i)
print(LABEL_LIST)

LABEL2ID = {label: i for i, label in enumerate(LABEL_LIST)}
ID2LABEL = {i: label for i, label in enumerate(LABEL_LIST)}

print(LABEL2ID)
print(ID2LABEL)

['O', 'B-L1', 'I-L1', 'B-L2', 'I-L2', 'B-L3', 'I-L3', 'B-L4', 'I-L4', 'B-L5', 'I-L5', 'B-L6', 'I-L6', 'B-L7', 'I-L7', 'B-L8', 'I-L8', 'B-L9', 'I-L9', 'B-L10', 'I-L10', 'B-L11', 'I-L11', 'B-L12', 'I-L12', 'B-L13', 'I-L13', 'B-L14', 'I-L14']
{'O': 0, 'B-L1': 1, 'I-L1': 2, 'B-L2': 3, 'I-L2': 4, 'B-L3': 5, 'I-L3': 6, 'B-L4': 7, 'I-L4': 8, 'B-L5': 9, 'I-L5': 10, 'B-L6': 11, 'I-L6': 12, 'B-L7': 13, 'I-L7': 14, 'B-L8': 15, 'I-L8': 16, 'B-L9': 17, 'I-L9': 18, 'B-L10': 19, 'I-L10': 20, 'B-L11': 21, 'I-L11': 22, 'B-L12': 23, 'I-L12': 24, 'B-L13': 25, 'I-L13': 26, 'B-L14': 27, 'I-L14': 28}
{0: 'O', 1: 'B-L1', 2: 'I-L1', 3: 'B-L2', 4: 'I-L2', 5: 'B-L3', 6: 'I-L3', 7: 'B-L4', 8: 'I-L4', 9: 'B-L5', 10: 'I-L5', 11: 'B-L6', 12: 'I-L6', 13: 'B-L7', 14: 'I-L7', 15: 'B-L8', 16: 'I-L8', 17: 'B-L9', 18: 'I-L9', 19: 'B-L10', 20: 'I-L10', 21: 'B-L11', 22: 'I-L11', 23: 'B-L12', 24: 'I-L12', 25: 'B-L13', 26: 'I-L13', 27: 'B-L14', 28: 'I-L14'}


In [48]:
import json

In [49]:
test_metrics = json.loads(open("./test_metrics.json", 'r', encoding='utf-8').read())

In [58]:
len(test_metrics['true_labels'])

11077

In [9]:
class RobertaCrf(nn.Module):
    def __init__(self,num_labels: int,base_path: str,):
        super().__init__()
        self.num_labels = num_labels
        self.roberta = RobertaModel.from_pretrained(base_path)

        # Замораживаем часть слоёв бекбона
        for name, param in self.roberta.named_parameters():
            param.requires_grad = False
        for name, param in self.roberta.named_parameters():
            for hold_l in LAYERS_TO_HOLD:
                if hold_l in name:
                    param.requires_grad = True

        self.dropout = nn.Dropout(0.2)
        self.hidden2label = nn.Linear(self.roberta.config.hidden_size, num_labels)

        self.start_transitions = nn.Parameter(torch.empty(num_labels))
        self.end_transitions = nn.Parameter(torch.empty(num_labels))
        self.transitions = nn.Parameter(torch.empty(num_labels, num_labels))

        nn.init.uniform_(self.start_transitions, -0.1, 0.1)
        nn.init.uniform_(self.end_transitions, -0.1, 0.1)
        nn.init.uniform_(self.transitions, -0.1, 0.1)

    def _compute_log_denominator(self, features: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        seq_len = features.shape[0]

        log_score_over_all_seq = self.start_transitions + features[0]

        for i in range(1, seq_len):
            next_log_score_over_all_seq = torch.logsumexp(
                log_score_over_all_seq.unsqueeze(2) + self.transitions + features[i].unsqueeze(1),
                dim=1,
            )
            log_score_over_all_seq = torch.where(
                mask[i].unsqueeze(1),
                next_log_score_over_all_seq,
                log_score_over_all_seq,
            )
        log_score_over_all_seq += self.end_transitions
        return torch.logsumexp(log_score_over_all_seq, dim=1)

    def _compute_log_numerator(self, features: torch.Tensor, labels: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        seq_len, bs, _ = features.shape

        score_over_seq = self.start_transitions[labels[0]] + features[0, torch.arange(bs), labels[0]]

        for i in range(1, seq_len):
            score_over_seq += (
                self.transitions[labels[i - 1], labels[i]] + features[i, torch.arange(bs), labels[i]]
            ) * mask[i]
        seq_lens = mask.sum(dim=0) - 1
        last_tags = labels[seq_lens.long(), torch.arange(bs)]
        score_over_seq += self.end_transitions[last_tags]
        return score_over_seq

    def get_roberta_features(
        self, input_ids: torch.Tensor, attention_mask: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        hidden = self.roberta(input_ids, attention_mask=attention_mask)["last_hidden_state"]
        hidden = self.dropout(hidden)
        return self.hidden2label(hidden), hidden

    def forward(self,input_ids: torch.Tensor,
                attention_mask: torch.Tensor,labels: torch.Tensor,) -> torch.Tensor:
        features, _ = self.get_roberta_features(input_ids=input_ids, attention_mask=attention_mask)
        attention_mask = attention_mask.bool()

        features = torch.swapaxes(features, 0, 1)
        attention_mask = torch.swapaxes(attention_mask, 0, 1)
        labels = torch.swapaxes(labels, 0, 1)

        log_numerator = self._compute_log_numerator(features=features, labels=labels, mask=attention_mask)
        log_denominator = self._compute_log_denominator(features=features, mask=attention_mask)

        return torch.mean(log_denominator - log_numerator)


    def _viterbi_decode(self, features: torch.Tensor, mask: torch.Tensor) -> List[List[int]]:
        seq_len, bs, _ = features.shape

        log_score_over_all_seq = self.start_transitions + features[0]

        backpointers = torch.empty_like(features)

        for i in range(1, seq_len):
            next_log_score_over_all_seq = (
                log_score_over_all_seq.unsqueeze(2) + self.transitions + features[i].unsqueeze(1)
            )

            next_log_score_over_all_seq, indices = next_log_score_over_all_seq.max(dim=1)

            log_score_over_all_seq = torch.where(
                mask[i].unsqueeze(1),
                next_log_score_over_all_seq,
                log_score_over_all_seq,
            )
            backpointers[i] = indices

        backpointers = backpointers[1:].int()

        log_score_over_all_seq += self.end_transitions
        seq_lens = mask.sum(dim=0) - 1

        best_paths = []
        for seq_ind in range(bs):
            best_label_id = torch.argmax(log_score_over_all_seq[seq_ind]).item()
            best_path = [best_label_id]

            for backpointer in reversed(backpointers[: seq_lens[seq_ind]]):
                best_path.append(backpointer[seq_ind][best_path[-1]].item())

            best_path.reverse()
            best_paths.append(best_path)

        return best_paths

    def decode(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> List[List[int]]:
        features, _ = self.get_roberta_features(input_ids=input_ids, attention_mask=attention_mask)
        attention_mask = attention_mask.bool()

        features = torch.swapaxes(features, 0, 1)
        mask = torch.swapaxes(attention_mask, 0, 1)
        return self._viterbi_decode(features=features, mask=mask)


In [4]:
class RobertaCRF(nn.Module):
    def __init__(self, label_size, roberta_path=ROBERTA_PATH, device=DEVICE):
        super(RobertaCRF, self).__init__()

        self.encoder = RobertaModel.from_pretrained(roberta_path).to(device)
        self.dropout = nn.Dropout(0.5)     
        self.linear = nn.Linear(self.encoder.config.hidden_size, label_size)
        self.crf = CRF(label_size)

        # Замораживаем часть слоёв бекбона
        for name, param in self.encoder.named_parameters():
            param.requires_grad = False
        for name, param in self.encoder.named_parameters():
            for hold_l in LAYERS_TO_HOLD:
                if hold_l in name:
                    param.requires_grad = True

    def forward(self, input_ids, attention_mask, labels=None, mode='train'):
        embeddings = self.encoder(input_ids=input_ids, attention_mask= attention_mask)
        drop_out = self.dropout(embeddings.last_hidden_state)
        linear_out = self.linear(drop_out)

        if mode == 'train':
            crf_out = self.crf(linear_out, mask=attention_mask, 
                               labels=labels)
        elif mode == 'eval':
            crf_out = self.crf.viterbi_decode(linear_out, mask=attention_mask)
        else:
            raise KeyError
        
        return crf_out

In [5]:
FINETUNED_MODEL_PATH = '/home/dzigen/Desktop/ITMO/sem1/DLtech/dl_tech_learn/lab4/saved_model/ner_model'

In [6]:
TOKENIZER = AutoTokenizer.from_pretrained(ROBERTA_PATH, add_prefix_space=True, use_fast=True)

In [7]:
METRIC = evaluate.load("seqeval")

In [10]:
MODEL = RobertaCrf(len(LABEL_LIST), ROBERTA_PATH).to(DEVICE)

In [None]:
MODEL = RobertaCRF(len(LABEL_LIST)).to(DEVICE)

MODEL.load_state_dict(torch.load(FINETUNED_MODEL_PATH))
MODEL.eval()

In [5]:
from transformers import RobertaForTokenClassification

In [None]:
FINETUNED_MODEL_PATH = "/home/dzigen/Desktop/medics2023/NLP_MODULE/models/tagged/ner_model_epoch5"
MODEL = RobertaForTokenClassification.from_pretrained(ROBERTA_PATH, id2label=ID2LABEL, label2id=LABEL2ID).to(DEVICE)
MODEL.load_state_dict(torch.load(FINETUNED_MODEL_PATH))
MODEL.eval()

In [10]:
class NerDataset(Dataset):

    def __init__(self, file, label2idx, tokenizer):
        """
        sents: we use sentences if we want to build dataset from sentences directly instead of file
        """
        ## read all the instances. sentences and labels
        insts = self.read_file(file=file) 
        self.insts = insts
        self.label2idx = label2idx
        self.tokenizer = tokenizer

        len_input_ids = [
            len(self.tokenizer(self.insts[i].words, is_split_into_words=True)['input_ids']) 
            for i in range(len(self.insts))
            ]
        long_samples_ids = sorted(list(map(lambda vv: vv[0], filter(lambda v: v[1] > 512, enumerate(len_input_ids)))), reverse=True)
        for long_train_id in long_samples_ids:
            del self.insts[long_train_id]
        print(f"{len(long_samples_ids)} samples was deletet because its long len")

    def read_file(self, file: str, number: int = -1) -> List[Instance]:
        insts = []
        with open(file, 'r', encoding='utf-8') as f:
            words = []
            ori_words = []
            labels = []
            for line in tqdm(f.readlines()):
                line = line.rstrip()
                if line == "":
                    insts.append(Instance(words=words, ori_words=ori_words, labels=labels))
                    words = []
                    ori_words = []
                    labels = []
                    if len(insts) == number:
                        break
                    continue
                ls = line.split()
                word, label = ls[0],ls[-1]
                ori_words.append(word)
                words.append(word)
                labels.append(label)
        return insts

    def __len__(self):
        return len(self.insts)
    
    def tokenize_and_align_labels(self, example, label_all_tokens = True):
        tokenized_inputs = TOKENIZER(example.words, truncation=True, padding='max_length', max_length=512, 
                                     is_split_into_words=True)

        word_ids = tokenized_inputs.word_ids()
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to 0 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(0)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(example.labels[word_idx])
            # For the other tokens in a word, we set the label to either the current label or 0, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(example.labels[word_idx] if label_all_tokens else 0)
            previous_word_idx = word_idx

        tokenized_inputs["labels"] = list(map(lambda v: LABEL2ID.get(v, 0), label_ids))
        return tokenized_inputs

    def __getitem__(self, index):
        item = self.tokenize_and_align_labels(self.insts[index])
        return {'input_ids': torch.tensor(item['input_ids']), 
                'attention_mask': torch.ByteTensor(item['attention_mask']),
                'labels': torch.tensor(item['labels'])}
    

In [9]:
TEST_FILE = '/home/dzigen/Desktop/ITMO/sem1/DLtech/dl_tech_learn/lab4/medics/train.txt'

In [11]:
test_ds = NerDataset(TEST_FILE,LABEL_LIST,TOKENIZER)
test_dataloader = DataLoader(test_ds, shuffle=False, batch_size=8)

100%|██████████| 9166325/9166325 [00:04<00:00, 1932499.05it/s]


29 samples was deletet because its long len


In [82]:
from collections import Counter

In [None]:
print(compute_metrics(accum_preds, accum_refs))

In [18]:
batch1 = next(iter(test_dataloader))

In [None]:
output = MODEL(batch1['input_ids'].to(DEVICE), 
               attention_mask=batch1['attention_mask'].to(DEVICE), mode='eval')

In [30]:
from functools import reduce

In [34]:
reduce(lambda acc, v: acc + [v], [1,2,3,4], [5,6])

[5, 6, 1, 2, 3, 4]

In [43]:
def compute_metrics(predictions, labels):

    # Remove ignored index (special tokens)
    true_predictions = reduce(lambda acc, v: acc + v, 
                              [[p for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)], [])
    true_labels = reduce(lambda acc, v: acc + v, 
                         [[l for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)], [])

    return { "f1_micro": f1_score(true_predictions, true_labels, average="micro"),
             "f1_weighted": f1_score(true_predictions, true_labels, average="weighted"),
             "accuracy": accuracy_score(true_predictions, true_labels)}

In [37]:
compute_metrics([[0,0,0,0]],[[0,0,0,0]])

1.0

In [44]:
accum_preds = []
accum_refs = []
MODEL.eval()
for batch in tqdm(test_dataloader):
    with torch.no_grad():
        output = MODEL(input_ids=batch['input_ids'].to(DEVICE), attention_mask=batch['attention_mask'].to(DEVICE), mode='eval')
    preds = output
    refs = batch['labels'].to('cpu').numpy().tolist()

    accum_preds += preds
    accum_refs += refs

    print(compute_metrics(accum_preds, accum_refs))

  0%|          | 1/11073 [00:02<6:17:48,  2.05s/it]

{'f1_micro': 1.0, 'f1_weighted': 1.0, 'accuracy': 1.0}


  0%|          | 2/11073 [00:04<6:12:31,  2.02s/it]

{'f1_micro': 1.0, 'f1_weighted': 1.0, 'accuracy': 1.0}


  0%|          | 2/11073 [00:05<8:19:43,  2.71s/it]


KeyboardInterrupt: 

In [69]:
output = MODEL(input_ids=input_ids.to(DEVICE), attention_mask=mask.to(DEVICE), mode='eval')

In [77]:
compute_metrics(output, [test_ds[20000]['labels'].cpu().numpy().tolist()])

{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'accuracy': 1.0}

In [None]:
print(output[0])
print(test_ds[20000]['labels'])

In [10]:
METRIC = evaluate.load("seqeval")

In [12]:
input_ids = torch.tensor(np.random.randint(100, size=(2,514)))
mask = torch.ByteTensor([[1]*100 + [0]*414]*2)
labels = torch.tensor([list(np.random.randint(3, size=100)) + [0]*414]*2)

In [13]:
output = model(input_ids=input_ids, attention_mask=mask, mode='eval')

In [19]:
compute_metrics(output, labels.to('cpu').numpy().tolist())

  _warn_prf(average, modifier, msg_start, len(result))


{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'accuracy': 0.09}

In [25]:
len(true_labels[0])

100

In [20]:
true_predictions = [
        [ID2LABEL[p] for (p, l) in zip(prediction, label)]
        for prediction, label in zip(output, labels.to('cpu').numpy().tolist())
    ]
true_labels = [
    [ID2LABEL[l] for (p, l) in zip(prediction, label.to('cpu').numpy().tolist())]
    for prediction, label in zip(output, labels)
]

In [27]:
METRIC.compute(predictions=true_labels, references=true_labels)

{'L1': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 94},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [58]:
target = torch.randint(0, 10, (10,))

In [59]:
loss = criterion(output, target)
print(loss)

tensor(2.4786)


In [None]:
criterion()

In [8]:
import warnings
warnings.filterwarnings("ignore")

In [9]:
out = model(input_ids, mask, labels, mode='train')

In [10]:
out

tensor([346.0506, 331.8098], grad_fn=<SubBackward0>)

In [9]:
out.mean()

tensor(339.4282, grad_fn=<MeanBackward0>)

In [34]:
out.backward()

In [25]:
output = encoder(input_ids=input_ids)

In [105]:
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.6007, -0.3789,  1.6257,  ..., -0.2636,  0.1943, -0.9833],
         [-0.2066,  1.4493, -0.2869,  ...,  0.4865,  1.6656,  0.2705],
         [-0.0813,  3.3444,  0.7160,  ..., -0.0481,  1.0002, -0.3339],
         ...,
         [-0.5181,  1.2309,  0.9647,  ...,  0.3538,  0.5873, -0.2277],
         [ 0.0342,  1.9868, -0.3185,  ...,  0.5481,  2.0470,  0.2225],
         [ 0.0752,  2.7654,  0.1172,  ..., -0.3603,  0.8225,  0.3939]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 0.0290,  0.5206, -0.6405,  ...,  0.5000, -0.3417, -0.4772]],
       grad_fn=<TanhBackward0>), hidden_states=None, past_key_values=None, attentions=None, cross_attentions=None)

In [34]:
linear = nn.Linear(encoder.config.hidden_size, 21)

In [37]:
l_out = linear(output.last_hidden_state)

In [47]:
l_out.size()

torch.Size([1, 514, 21])

In [62]:
crf = CRF(21)

In [64]:
loss = crf(l_out,labels=labels,mask=mask)

In [70]:
crf_out = crf.viterbi_decode(l_out,mask=mask)

In [73]:
len(crf_out[0])

100

In [None]:
labels=labels,