# Введение в обработку естественного языка
## Урок 5. Part-of-Speech разметка, NER, извлечение отношений
### Задание 1. Написать теггер на данных с русским языком
- проверить UnigramTagger, BigramTagger, TrigramTagger и их комбмнации
- написать свой теггер как на занятии, попробовать разные векторайзеры, добавить знание не только букв но и слов
- сравнить все реализованные методы сделать выводы


In [1]:
# !pip uninstall pycocotools
# !pip install pycocotools --no-binary pycocotools
# !pip uninstall numpy
# !pip install numpy
# !pip install -U gensim

In [2]:
from gensim.models import Word2Vec, FastText

from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger, TrigramTagger
from nltk.tag import RegexpTagger
from nltk.corpus import names
import nltk
nltk.download('names')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
!pip install pyconll
!pip install corus
# !pip install deeppavlov
!pip install razdel
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyconll
  Downloading pyconll-3.1.0-py3-none-any.whl (26 kB)
Installing collected packages: pyconll
Successfully installed pyconll-3.1.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting corus
  Downloading corus-0.9.0-py3-none-any.whl (83 kB)
[K     |████████████████████████████████| 83 kB 1.3 MB/s 
[?25hInstalling collected packages: corus
Successfully installed corus-0.9.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting razdel
  Downloading razdel-0.5.0-py3-none-any.whl (21 kB)
Installing collected packages: razdel
Successfully installed razdel-0.5.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.4-py3-none-any.whl (4.2 MB)
[K     |██████████████

In [5]:
import pyconll
import corus
from corus import load_ne5

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

from scipy.sparse import hstack

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re
import string

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

import spacy
from spacy import displacy

# from deeppavlov import configs, build_model

from razdel import tokenize

In [6]:
%%capture
!wget -O /content/ru_syntagrus-ud-train.conllu https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master/ru_syntagrus-ud-train-a.conllu
!wget -O /content/ru_syntagrus-ud-dev.conllu https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master/ru_syntagrus-ud-dev.conllu

In [7]:
X_train = pyconll.load_from_file('/content/ru_syntagrus-ud-train.conllu')
X_test = pyconll.load_from_file('/content/ru_syntagrus-ud-dev.conllu')

In [8]:
data_train = []
for sent in X_train[:]:
    data_train.append([(token.form, token.upos) for token in sent])
    
data_test = []
for sent in X_test[:]:
    data_test.append([(token.form, token.upos) for token in sent])
    
data_sent = []
for sent in X_test[:]:
    data_sent.append([token.form for token in sent])

print('Max sentence length: ', max(len(sent) for sent in X_train))
print('Max token length: ', max(len(token.form) for sent in X_train for token in sent))

Max sentence length:  194
Max token length:  31


In [9]:
default_tagger = nltk.DefaultTagger('NOUN')
print("default_tagger evaluation: ", default_tagger.evaluate(data_test))

unigram_tagger = UnigramTagger(data_train)
print("unigram_tagger evaluation: ", unigram_tagger.evaluate(data_test))

bigram_tagger = BigramTagger(data_train, backoff=unigram_tagger)
print("bigram_tagger evaluation: ", bigram_tagger.evaluate(data_test))

trigram_tagger = TrigramTagger(data_train, backoff=bigram_tagger)
print("trigram_tagger evaluation: ", trigram_tagger.evaluate(data_test))

default_tagger evaluation:  0.23593983983332248
unigram_tagger evaluation:  0.823732013802982
bigram_tagger evaluation:  0.8292792499511688
trigram_tagger evaluation:  0.8291425222996289


In [10]:
def combination(train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff

In [11]:
backoff = DefaultTagger('NOUN') 
tag = combination(data_train,
                  [
                  UnigramTagger, 
                  BigramTagger, 
                  TrigramTagger
                  ],  
                  backoff = backoff
                  ) 
  
print("evaluation of combination: ", tag.evaluate(data_test))

evaluation of combination:  0.8787746598085813


In [12]:
token_tr = []
label_tr = []
for sent in data_train[:]:
    for tok in sent:
        token_tr.append(tok[0])
        label_tr.append('NO_TAG' if tok[1] is None else tok[1])
        
token_ts = []
label_ts = []
for sent in data_test[:]:
    for tok in sent:
        token_ts.append(tok[0] if tok[0] is not None else "")
        label_ts.append('NO_TAG' if tok[1] is None else tok[1])
        
        
le = LabelEncoder()
labels_tr = le.fit_transform(label_tr) 
labels_ts = le.transform(label_ts)

In [13]:
vectorizers = [CountVectorizer, HashingVectorizer, TfidfVectorizer]

for vect in vectorizers:

    vectorizer = vect(ngram_range=(1, 5), analyzer='char')
    X_train = vectorizer.fit_transform(token_tr)
    X_test = vectorizer.transform(token_ts)

    lr = LogisticRegression(random_state=0)
    lr.fit(X_train, labels_tr)

    pred = lr.predict(X_test)
    name_vect = ''.join([ch for ch in list(re.split('\.', str(vect))[-1] ) if ch not in string.punctuation])
    print(name_vect, accuracy_score(labels_ts, pred))

CountVectorizer 0.9394687154111596
HashingVectorizer 0.9203789309199818
TfidfVectorizer 0.9289862621264405


In [14]:
vectorizers = [CountVectorizer, HashingVectorizer, TfidfVectorizer]

for vect in vectorizers:

    vectorizer = vect(ngram_range=(1, 5), analyzer='word')
    X_train = vectorizer.fit_transform(token_tr)
    X_test = vectorizer.transform(token_ts)

    lr = LogisticRegression(random_state=0)
    lr.fit(X_train, labels_tr)

    pred = lr.predict(X_test)
    name_vect = ''.join([ch for ch in list(re.split('\.', str(vect))[-1] ) if ch not in string.punctuation])
    print(name_vect, accuracy_score(labels_ts, pred))

CountVectorizer 0.6331466892375806
HashingVectorizer 0.62416172927925
TfidfVectorizer 0.6400872452633635


In [15]:
scaler = StandardScaler(with_mean=False)
coder_1 = TfidfVectorizer(ngram_range=(1, 5), analyzer='char')
coder_2 = HashingVectorizer(ngram_range=(1, 5), analyzer='word')

X_train_1 = coder_1.fit_transform(token_tr)
X_test_1 = coder_1.transform(token_ts)

X_train_2 = coder_2.fit_transform(token_tr)
X_test_2 = coder_2.transform(token_ts)


X_train = hstack((X_train_1,X_train_2))
X_test = hstack((X_test_1,X_test_2))

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)    


print(X_train.shape)
lr = LogisticRegression(random_state=0, max_iter = 100, n_jobs=7)
lr.fit(X_train, labels_tr)

pred = lr.predict(X_test)

print('TfidfVectorizer_char + HashingVectorizer_word :', accuracy_score(labels_ts, pred))

(426182, 1160794)
TfidfVectorizer_char + HashingVectorizer_word : 0.9335503613516505


Выводы:  
- современные средства позволяют использовать теггеры с высокой эффективностью;
- теггер является весьма "тонким" инструментом и для работы с ним требуется весомый опыт работы.

### Задание 2. Проверить насколько хорошо работает NER данные брать из http://www.labinform.ru/pub/named_entities/  
* проверить NER из nltk/spacy/deeppavlov
* написать свой нер попробовать разные подходы  
  - передаём в сетку токен и его соседей  
  - передаём в сетку только токен  
  - свой вариант  
* сравнить ваши реализованные подходы на качество (вывести precision/recall/f1_score)


In [16]:
!wget http://www.labinform.ru/pub/named_entities/collection5.zip


--2022-06-16 14:31:26--  http://www.labinform.ru/pub/named_entities/collection5.zip
Resolving www.labinform.ru (www.labinform.ru)... 95.181.230.181
Connecting to www.labinform.ru (www.labinform.ru)|95.181.230.181|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1899530 (1.8M) [application/zip]
Saving to: ‘collection5.zip’


2022-06-16 14:31:27 (2.07 MB/s) - ‘collection5.zip’ saved [1899530/1899530]



In [17]:
!unzip collection5.zip

Archive:  collection5.zip
   creating: Collection5/
  inflating: Collection5/001.ann     
  inflating: Collection5/001.txt     
  inflating: Collection5/002.ann     
  inflating: Collection5/002.txt     
  inflating: Collection5/003.ann     
  inflating: Collection5/003.txt     
  inflating: Collection5/004.ann     
  inflating: Collection5/004.txt     
  inflating: Collection5/005.ann     
  inflating: Collection5/005.txt     
  inflating: Collection5/006.ann     
  inflating: Collection5/006.txt     
  inflating: Collection5/007.ann     
  inflating: Collection5/007.txt     
  inflating: Collection5/008.ann     
  inflating: Collection5/008.txt     
  inflating: Collection5/009.ann     
  inflating: Collection5/009.txt     
  inflating: Collection5/010.ann     
  inflating: Collection5/010.txt     
  inflating: Collection5/011.ann     
  inflating: Collection5/011.txt     
  inflating: Collection5/012.ann     
  inflating: Collection5/012.txt     
  inflating: Collection5/013.ann    

In [18]:
records = load_ne5('/content/Collection5')
document = next(records).text
{(' '.join(c[0] for c in chunk), chunk.label() ) for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(document))) if hasattr(chunk, 'label') }

{('Матвиенко', 'PERSON'),
 ('Напомним', 'PERSON'),
 ('Петровский', 'ORGANIZATION')}

In [19]:
dp_doc = document.split()
dp_doc = ' '.join(dp_doc[:200])

In [None]:
# ner_model = build_model(configs.ner.ner_ontonotes_bert_torch, download=True)

# ner_model([dp_doc])

In [25]:
!pip install -U spacy
!spacy download ru_core_news_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy
  Downloading spacy-3.3.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.2 MB)
[K     |████████████████████████████████| 6.2 MB 4.6 MB/s 
[?25hCollecting thinc<8.1.0,>=8.0.14
  Downloading thinc-8.0.17-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (660 kB)
[K     |████████████████████████████████| 660 kB 22.0 MB/s 
[?25hCollecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.2-py3-none-any.whl (7.2 kB)
Collecting typer<0.5.0,>=0.3.0
  Downloading typer-0.4.1-py3-none-any.whl (27 kB)
Collecting typing-extensions<4.2.0,>=3.7.4
  Downloading typing_extensions-4.1.1-py3-none-any.whl (26 kB)
Collecting pathy>=0.3.5
  Downloading pathy-0.6.1-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 837 kB/s 
Collecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
[K     |███████████

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ru-core-news-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.3.0/ru_core_news_sm-3.3.0-py3-none-any.whl (15.3 MB)
[K     |████████████████████████████████| 15.3 MB 5.1 MB/s 
[?25hCollecting pymorphy2>=0.9
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 1.9 MB/s 
Collecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 7.6 MB/s 
Collecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Installing collected packages: pymorphy2-dicts-ru, dawg-python, pymorphy2, ru-core-news-sm
Successfully installed dawg-python-0.7.2 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844 ru-core-news-sm-3.3.0
[38;5;2m✔ Download and installati

In [38]:
ru_core_news_sm

pathlib.Path

In [36]:
import importlib
import ru_core_news_sm

importlib.reload(spacy)
ner = ru_core_news_sm.load()

document = ner(document)
displacy.render(document, jupyter=True, style='ent')

ImportError: ignored

In [None]:
records = load_ne5(path)
words_docs = []
for ix, rec in enumerate(records):
    words = []
    for token in tokenize(rec.text):
       
        result = 'None'        
        
        for item in rec.spans:            
            if (token.start >= item.start) and (token.stop <= item.stop) and (item.type == 'PER'):
                result = 'PER'
                break
            if (token.start >= item.start) and (token.stop <= item.stop) and (item.type == 'ORG'):
                result = 'ORG'
                break
            if (token.start >= item.start) and (token.stop <= item.stop) and (item.type == 'MEDIA'):
                result = 'MEDIA'
                break
            if (token.start >= item.start) and (token.stop <= item.stop) and (item.type == 'LOC'):
                result = 'LOC'
                break
            if (token.start >= item.start) and (token.stop <= item.stop) and (item.type == 'GEOPOLIT'):
                result = 'GEOPOLIT'
                break
                
    
        words.append([token.text, result])
    words_docs.extend(words)

In [None]:
df_words = pd.DataFrame(words_docs, columns=['word', 'tag'])

train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df_words['word'], df_words['tag'])

encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

train_data = tf.data.Dataset.from_tensor_slices((train_x, train_y))
valid_data = tf.data.Dataset.from_tensor_slices((valid_x, valid_y))

train_data = train_data.batch(16)
valid_data = valid_data.batch(16)

AUTOTUNE = tf.data.AUTOTUNE

train_data = train_data.cache().prefetch(buffer_size=AUTOTUNE)
valid_data = valid_data.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
def custom_standardization(input_data):
        return input_data

def data_prep(train_data, seq_len=1, vocab_size = 30000):    
    
    vocab_size = 30000

    vectorize_layer = TextVectorization(
        standardize=custom_standardization,
        max_tokens=vocab_size,
        output_mode='int',
        output_sequence_length=seq_len)

    text_data = train_data.map(lambda x, y: x)
    vectorize_layer.adapt(text_data)
    return vectorize_layer

In [None]:
embedding_dim = 64

class modelNER(tf.keras.Model):
    def __init__(self):
        super(modelNER, self).__init__()
        self.emb = Embedding(vocab_size, embedding_dim)
        self.gPool = GlobalMaxPooling1D()
        self.fc1 = Dense(300, activation='relu')
        self.fc2 = Dense(50, activation='relu')
        self.fc3 = Dense(len(df_words['tag'].value_counts()), activation='softmax')

    def call(self, x):
        x = vectorize_layer(x)
        x = self.emb(x)
        pool_x = self.gPool(x)
        
        fc_x = self.fc1(pool_x)
        fc_x = self.fc2(fc_x)
        
        concat_x = tf.concat([pool_x, fc_x], axis=1)
        return self.fc3(concat_x)

In [None]:
vocab_size = 30000
vectorize_layer = data_prep(train_data, seq_len = 1, vocab_size = vocab_size)


mmodel = modelNER()
mmodel.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])
mmodel.fit(train_data, validation_data=valid_data, epochs=5)

In [None]:
vocab_size = 30000
vectorize_layer = data_prep(train_data, seq_len = 3, vocab_size = vocab_size)


mmodel = modelNER()
mmodel.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])
mmodel.fit(train_data, validation_data=valid_data, epochs=5)