# NLP Information Extraction: Named Entity Recognition

In [5]:
from tqdm import tqdm, trange
from pprint import pprint
import pandas as pd
import numpy as np
import json
import re
import random
import torch
from datasets import load_from_disk

## Load data

In [3]:
raw_dataset = load_from_disk("../data/raw")
test = pd.read_json(f'../data/test.json')

In [7]:
with open(f'../data/train.json', 'rb') as f:
    train = json.load(f)

In [15]:
with open(f'../data/test.json', 'rb') as f:
    test = json.load(f)

In [5]:
LABELS = {
    'обеспечение исполнения контракта': {
        'short': 'CE',
        'full': 'CONTRACT-ENFORCEMENT',
        'id': 1,
        'emb': [],
        'emb_tuned': [],
        'regex_pattern': r'',
    },
    'обеспечение гарантийных обязательств': {
        'short': 'WO',
        'full': 'WARRANTY-OBLIGRATIONS',
        'id': 2,
        'emb': [],
        'emb_tuned': [],
        'regex_pattern': r'',
    },
}
LABELS_INV = {
    'CE': 'обеспечение исполнения контракта',
    'WO': 'обеспечение гарантийных обязательств',
}


In [8]:
train_data = []
for d in train:
    ext_d = d['extracted_part']
    ann = (ext_d['answer_start'][0], ext_d['answer_end'][0], d['label'])
    train_data.append((d['text'], ann))
train_data = pd.DataFrame(train_data, columns=['text', 'annotation'])
train_data.head()

Unnamed: 0,text,annotation
0,Извещение о проведении открытого конкурса в эл...,"(1279, 1343, обеспечение исполнения контракта)"
1,ТРЕБОВАНИЯ К СОДЕРЖАНИЮ ЗАЯВКИ участника запро...,"(1222, 1318, обеспечение исполнения контракта)"
2,Извещение о проведении электронного аукциона д...,"(1297, 1343, обеспечение исполнения контракта)"
3,Извещение о проведении электронного аукциона д...,"(1304, 1350, обеспечение исполнения контракта)"
4,Извещение о проведении электронного аукциона д...,"(1302, 1348, обеспечение исполнения контракта)"


In [9]:
test_data = []
for d in test:
    ext_d = d['extracted_part']
    ann = (ext_d['answer_start'][0], ext_d['answer_end'][0], d['label'])
    test_data.append((d['text'], ann))
test_data = pd.DataFrame(test_data, columns=['text', 'annotation'])
test_data.head()

KeyError: 'extracted_part'

## SpaCy Preprocessing

In [10]:
import spacy
from spacy.tokens import DocBin, Span, SpanGroup
from spacy.util import filter_spans
from sklearn.model_selection import train_test_split
from natasha import Segmenter, Doc, NewsEmbedding, MorphVocab, NewsMorphTagger
from collections import Counter

In [11]:
def prepare_spacy_data(data, filename):
  nlp = spacy.blank("ru")
  db = DocBin()
  for text, (start, end, label) in data.to_numpy():
      doc = nlp(text)
      if start == 0 and end == 0: # No labeled info in the doc
        db.add(doc)
        continue
      span = doc.char_span(start, end, label=label, alignment_mode="contract")
      if span:
          doc.ents = [span]
      db.add(doc)
  db.to_disk(filename)

In [12]:
# train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)
train_data, dev_data = train_test_split(train_data, test_size=0.1, random_state=42)

In [13]:
prepare_spacy_data(train_data, './train.spacy')
prepare_spacy_data(dev_data, './dev.spacy')

### Tok2Vec

- built-in in spaCy

### Transformers list

  - `bert-base-multilingual-uncased`
  - `DeepPavlov/rubert-base-cased`
  - `ai-forever/rugpt3large_based_on_gpt2`
  - `ai-forever/ruBert-large`
  - `ai-forever/ruRoberta-large`
  - `ai-forever/ruBert-base`
  - `ai-forever/rugpt3small_based_on_gpt2`
  - `cointegrated/rubert-tiny2`

---

  - `ai-forever/sbert_large_mt_nlu_ru`
  - `ai-forever/sbert_large_nlu_ru`

---

- `ai-forever/bert-base-NER-reptile-5-dataset`s (NER: не успел попробовать дообучить на свои лейблы)
- `Davlan/distilbert-base-multilingual-cased-ner-hrl` (NER: не успел попробовать дообучить на свои лейблы)
- `cointegrated/rut5-base-multitask` (не подходящий формат)

### Init

In [1]:
python -m spacy init fill-config ./base_config_tok2vec.cfg ./config.cfg

/home/commas/anaconda3/bin/python: No module named spacy


### Debug

In [None]:
!python -m spacy debug config ./config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy

In [None]:
python -m spacy debug data ./config.cfg --verbose --paths.train ./train.spacy --paths.dev ./dev.spacy

In [None]:
# !python -m spacy debug model ./config.cfg transformer --gpu-id 0 -PAR -P0 --paths.train ./train.spacy --paths.dev ./dev.spacy

### Train

In [None]:
python -m spacy train config.cfg --gpu-id -1 --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy
# !zip -r model-best.zip ./output/model-best

In [None]:
python -m spacy train config.cfg --gpu-id 0 --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy
# !zip -r model-best.zip ./output/model-best

In [None]:
!python -m spacy train config.cfg --gpu-id 0 --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy
!zip -r model-best.zip ./output/model-best

## Evaluation

### Model load

In [14]:
nlp = spacy.load(f"output/model-best")

### By hand

In [None]:
sample = test_data[:50]
for text, (start, end, label) in sample.to_numpy():
    doc = nlp(text)
    print('label:', label)
    print('answer:', start, end, text[start:end])
    if doc.ents:
      e = doc.ents[0]
      print('model: ', doc[e.start].idx, doc[e.end].idx - 1, e.text)
    else:
      if start != 0 and end != 0:
        print(text)
      print('model: 0 0')
    print('---')

### By function

In [None]:
def eval_accuracy_with_error_window(data, nlp, windows = [0]):
  correct = [0] * len(windows)

  for text, (start, end, label) in tqdm(data):
    doc = nlp(text)
    for w in windows:
      if doc.ents:
        for ent in doc.ents:
          answer_start, answer_end, answer_label = doc[ent.start].idx, doc[ent.end].idx - 1, ent.label_
          if LABELS[label]['short'] == answer_label\
              and answer_start in range(start - w, start + w + 1)\
              and answer_end in range(end - w, end + w + 1):
            correct[w] += 1
    elif (start, end) == (0, 0):
      correct[w] += 1

  
  print(f'\nAccuracy: {correct[0]}/{len(data)}: {correct[0]/len(data)*100}%\n')
  
  return [c/len(data)*100 for c in correct]

In [None]:
import matplotlib.pyplot as plt


MAX_WINDOW = 1
ws = range(MAX_WINDOW + 1)

In [None]:
accuracy = eval_accuracy_with_error_window(test_data_with_labels.to_numpy(), nlp, windows=ws)

plt.plot(ws, accuracy, **{'color': 'blue', 'marker': 'o'})
plt.title("Accuracy dependence on the size of the allowable deviation from the fragment index")
plt.xlabel("Window size (acceptable deviation from the fragment index)")
plt.ylabel("accuracy, %")
plt.show()

## `predictions.json`

In [17]:
for e in tqdm(test):
  doc = nlp(e['text'])
  e['extracted_part'] = {}
  if doc.ents:
    for ent in doc.ents:
        if e['label'] == ent.label_:
          e['extracted_part']['text'] = [ent.text]
          e['extracted_part']['answer_start'] = [doc[ent.start].idx]
          e['extracted_part']['answer_end'] = [doc[ent.end].idx - 1]
        else:
          e['extracted_part']['text'] = ['']
          e['extracted_part']['answer_start'] = [0]
          e['extracted_part']['answer_end'] = [0]
  else:
    e['extracted_part']['text'] = ['']
    e['extracted_part']['answer_start'] = [0]
    e['extracted_part']['answer_end'] = [0]

with open('predictions_tok2vec.json', 'w', encoding='utf-8') as f:
    json.dump(test, f, ensure_ascii=False, indent=4)

100%|██████████| 318/318 [00:14<00:00, 21.91it/s]


## Results of NER pipeline

### tok2vec
- Accuracy: 64.15%
### cointegrated/rubert-tiny2
- Accuracy: 67.60%
### base-base-multilingual-cased (spaCy default)
- Accuracy: 73.33%