# Seminar 4 - NER

### Install the environment

In [1]:
%%bash 
# Repoint /usr/local/cuda at the CUDA 10.1 toolkit: remove the existing path, then symlink it.
rm -rf /usr/local/cuda 
ln -s /usr/local/cuda-10.1 /usr/local/cuda

In [None]:
!pip install gensim==3.8.0

In [3]:
!wget https://rusvectores.org/static/models/rusvectores4/fasttext/araneum_none_fasttextskipgram_300_5_2018.tgz

--2022-03-02 23:34:11--  https://rusvectores.org/static/models/rusvectores4/fasttext/araneum_none_fasttextskipgram_300_5_2018.tgz
Resolving rusvectores.org (rusvectores.org)... 116.203.104.23
Connecting to rusvectores.org (rusvectores.org)|116.203.104.23|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2659449819 (2.5G) [application/x-gzip]
Saving to: ‘araneum_none_fasttextskipgram_300_5_2018.tgz’


2022-03-02 23:35:30 (32.2 MB/s) - ‘araneum_none_fasttextskipgram_300_5_2018.tgz’ saved [2659449819/2659449819]



In [4]:
# Unpack the downloaded archive into the working directory.
!tar -xzf araneum_none_fasttextskipgram_300_5_2018.tgz

In [None]:
!pip install sklearn-crfsuite==0.3.6

In [None]:
!pip install -U 'scikit-learn<0.24'

In [None]:
# Fetch the course repository; the seminar's train/test CSV files are read from it below.
!git clone https://github.com/king-menin/mipt-nlp2022.git

The data comes from FactRuEval-2016: https://github.com/dialogue-evaluation/factRuEval-2016

In [9]:
# Check that the seminar folder of the cloned repository contains the data files.
!ls mipt-nlp2022/seminars/sem4/

sem4_ner.ipynb	test.csv  train.csv


In [None]:
!pip install natasha

### Run experiments
FactRuEval-2016 NER competition:
http://www.dialog-21.ru/evaluation/2016/letter/



In [None]:
%load_ext autoreload
%autoreload 2


import sys
import warnings


warnings.filterwarnings("ignore")
sys.path.append("mipt-nlp2021/seminars/sem4/")

In [1]:
import gensim

In [2]:
# Load the word vectors used for the `embedding*` features in word2features.
# NOTE(review): gensim's load() unpickles the file -- only load model files from a trusted source.
model = gensim.models.KeyedVectors.load('araneum_none_fasttextskipgram_300_5_2018.model')

In [3]:
import pandas as pd

In [4]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [49]:
def word2features(sent, i, use_emd=False):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of token strings (one tokenised sentence).
        i: Index of the token to describe.
        use_emd: If true, also add one ``embedding{k}`` feature per component
            of the token's vector, looked up in the module-level ``model``.
            Tokens for which the lookup raises ``KeyError`` get no embedding
            features.

    Returns:
        dict mapping feature names to values: lower-cased form, suffixes and
        casing/digit flags of the token itself, lower-cased form and casing
        flags of the previous / next token (prefixed ``-1:`` / ``+1:``), and
        ``BOS`` / ``EOS`` markers at the sentence boundaries.
    """
    word = sent[i]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit()
    }

    if use_emd:
        try:
            vector = model[word]
        except KeyError:
            # No vector for this token: keep its lexical features and skip the
            # embedding ones instead of aborting extraction for the whole corpus.
            vector = ()
        for idx, e in enumerate(vector):
            features[f"embedding{idx}"] = e

    if i > 0:
        word1 = sent[i-1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        # First token of the sentence.
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper()
        })
    else:
        # Last token of the sentence.
        features['EOS'] = True

    return features


def sent2features(sent, use_emd=False):
    """Return one feature dict per token of ``sent``, in token order.

    ``use_emd`` is forwarded unchanged to ``word2features``.
    """
    token_features = []
    for position in range(len(sent)):
        token_features.append(word2features(sent, position, use_emd=use_emd))
    return token_features

### Load data

In [29]:
# Training split; the file is tab-separated despite its .csv extension.
train_df = pd.read_csv("mipt-nlp2022/seminars/sem4/train.csv", sep="\t")

In [30]:
# Peek at the first rows: space-joined labels and tokens per sentence.
train_df.head()

Unnamed: 0,labels,text,clf
0,O O O O O O B_ORG I_ORG O B_LOC I_LOC O O O O ...,В понедельник 28 июня у здания мэрии Москвы на...,False
1,O O O O O O O O O O O B_LOC B_PER I_PER O O O ...,"Среди требований , выдвигаемых организаторами ...",False
2,O O O O O O O O O O O O O O O O O O O O B_PER ...,Участникам акции предлагалось принести с собой...,False
3,O O O O O O O O O O O O O O O O O,Начало акции было намечено на 19 часов ; подчё...,True
4,O O O O O O O O O O O O O B_LOC I_LOC I_LOC O ...,"Освещающие акцию блоггеры сообщили , что автоб...",False


In [31]:
# Split every training sentence on whitespace into its list of tokens.
train_sents = [text.split() for text in train_df.text]

In [32]:
# Split every label string on whitespace into its list of tags.
train_labels = [tags.split() for tags in train_df.labels]

In [33]:
# Sanity check: the first ten (token, tag) pairs of the first training sentence.
print(list(zip(train_sents[0][:10], train_labels[0][:10])))

[('В', 'O'), ('понедельник', 'O'), ('28', 'O'), ('июня', 'O'), ('у', 'O'), ('здания', 'O'), ('мэрии', 'B_ORG'), ('Москвы', 'I_ORG'), ('на', 'O'), ('Тверской', 'B_LOC')]


In [34]:
# Distinct tags that occur anywhere in the training labels.
labels = {tag for sent_tags in train_labels for tag in sent_tags}

In [35]:
# Show the tag inventory.
labels

{'B_LOC', 'B_ORG', 'B_PER', 'I_LOC', 'I_ORG', 'I_PER', 'O'}

In [36]:
# sent2features(train_sents[0])[0]

### Prepare features

In [37]:
# Test split; tab-separated, read the same way as the training file.
test_df = pd.read_csv("mipt-nlp2022/seminars/sem4/test.csv", sep="\t")

In [38]:
# Split every test sentence on whitespace into its list of tokens.
test_sents = [text.split() for text in test_df.text]

In [39]:
# Split every test label string on whitespace into its list of tags.
test_labels = [tags.split() for tags in test_df.labels]

In [40]:
%%time
X_train = [sent2features(s) for s in train_sents]
y_train = train_labels

X_test = [sent2features(s) for s in test_sents]
y_test = test_labels

CPU times: user 428 ms, sys: 2.14 s, total: 2.57 s
Wall time: 2.56 s


### Train the CRF

In [41]:
# CRF trained with L-BFGS; c1 / c2 are the L1 / L2 regularisation coefficients.
# all_possible_transitions=True also generates transition features for label
# pairs that never occur in the training data.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

### Predict on test

In [42]:
# Predict a tag sequence for every test sentence.
y_pred = crf.predict(X_test)


In [43]:
# Weighted F1 over every class the CRF knows, including the 'O' tag.
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=list(crf.classes_))

0.9148296688647011

In [44]:
# Keep only the entity tags: drop 'O' so it is excluded from the scores below.
# Note: this rebinds `labels` (previously a set of training tags) to a list.
labels = list(crf.classes_)
labels.remove('O')
labels

['B_ORG', 'I_ORG', 'B_LOC', 'I_LOC', 'B_PER', 'I_PER']

In [45]:
# Weighted F1 restricted to the entity tags in `labels` ('O' excluded).
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

0.5611414601220519

In [46]:
# Per-tag precision / recall / F1 for the entity tags, printed to three decimals.
print(metrics.flat_classification_report(
    y_test, y_pred, labels=labels, digits=3
))



              precision    recall  f1-score   support

       B_ORG      0.735     0.304     0.430      1905
       I_ORG      0.624     0.347     0.446      1960
       B_LOC      0.681     0.735     0.707      1324
       I_LOC      0.402     0.176     0.245       233
       B_PER      0.737     0.637     0.683      1335
       I_PER      0.736     0.898     0.809       777

   micro avg      0.693     0.507     0.586      7534
   macro avg      0.652     0.516     0.553      7534
weighted avg      0.687     0.507     0.561      7534



## Add word embeddings

In [50]:
%%time
X_train = [sent2features(s, True) for s in train_sents]
y_train = train_labels

X_test = [sent2features(s, True) for s in test_sents]
y_test = test_labels

CPU times: user 12.9 s, sys: 2.11 s, total: 15 s
Wall time: 13.9 s


In [51]:
# Same hyper-parameters as the first CRF, refit on the current X_train
# (the features rebuilt with embeddings in the cell above).
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [52]:
# Predict a tag sequence for every test sentence with the refit model.
y_pred = crf.predict(X_test)

In [53]:
# Weighted F1 over every class the CRF knows, including the 'O' tag.
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=list(crf.classes_))

0.9194891513334035

In [54]:
# Keep only the entity tags: drop 'O' so it is excluded from the scores below.
labels = list(crf.classes_)
labels.remove('O')
labels

['B_ORG', 'I_ORG', 'B_LOC', 'I_LOC', 'B_PER', 'I_PER']

In [55]:
# Weighted F1 restricted to the entity tags in `labels` ('O' excluded).
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

0.5895331219058161

In [56]:
# Per-tag precision / recall / F1 for the entity tags, printed to three decimals.
print(metrics.flat_classification_report(
    y_test, y_pred, labels=labels, digits=3
))



              precision    recall  f1-score   support

       B_ORG      0.714     0.343     0.463      1905
       I_ORG      0.657     0.400     0.497      1960
       B_LOC      0.679     0.738     0.707      1324
       I_LOC      0.377     0.185     0.248       233
       B_PER      0.759     0.650     0.700      1335
       I_PER      0.781     0.918     0.844       777

   micro avg      0.706     0.536     0.609      7534
   macro avg      0.661     0.539     0.577      7534
weighted avg      0.697     0.536     0.590      7534



## Natasha

In [65]:
from natasha import Doc, NewsNERTagger, NewsEmbedding, Segmenter

In [66]:
# Natasha components: a segmenter, plus a NER tagger constructed from the news embedding.
segmenter = Segmenter()
emb = NewsEmbedding()

ner_tagger = NewsNERTagger(emb)

In [67]:
# Wrap the raw text (not the token list) of the first training sentence in a Natasha Doc.
doc = Doc(train_df.text[0])

In [68]:
# Segment the text first, then run the NER tagger on the same doc.
doc.segment(segmenter)
doc.tag_ner(ner_tagger)

In [72]:
# Entity spans produced by the tagger (character offsets, type, text).
print(doc.spans)

[DocSpan(start=37, stop=43, type='LOC', text='Москвы', tokens=[...]), DocSpan(start=47, stop=63, type='LOC', text='Тверской площади', tokens=[...])]


In [69]:
# Render the text with the detected entity spans marked underneath.
doc.ner.print()

В понедельник 28 июня у здания мэрии Москвы на Тверской площади 
                                     LOC───    LOC───────────── 
состоялась очередная несанкционированная акция протеста « День гнева »
 , в этот раз направленная , главным образом , против политики 
московских и подмосковных властей


In [74]:
# Gold (token, tag) pairs for the same sentence, to compare with Natasha's spans.
print(list(zip(train_sents[0][:15], train_labels[0][:15])))

[('В', 'O'), ('понедельник', 'O'), ('28', 'O'), ('июня', 'O'), ('у', 'O'), ('здания', 'O'), ('мэрии', 'B_ORG'), ('Москвы', 'I_ORG'), ('на', 'O'), ('Тверской', 'B_LOC'), ('площади', 'I_LOC'), ('состоялась', 'O'), ('очередная', 'O'), ('несанкционированная', 'O'), ('акция', 'O')]
