In [35]:
import corus
import matplotlib
import nltk
import numpy as np
import os
import pandas as pd
import pyconll
import ru_core_news_sm
import spacy
import tensorflow
import warnings

from corus import load_ne5
from nltk.tokenize import word_tokenize
from nltk.corpus import brown
from nltk.tag import DefaultTagger, RegexpTagger, UnigramTagger
from nltk.tag import BigramTagger, TrigramTagger
from razdel import tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, classification_report
from spacy import displacy
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Input
from tensorflow.keras.layers import GlobalMaxPooling1D, Conv1D, GRU, LSTM, Dropout
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [2]:
warnings.filterwarnings("ignore")
%matplotlib inline

## POS

In [59]:
nltk.download('tagsets')
# Требуется для токенизации
nltk.download('punkt')
# Требуется для parts of speech tagging
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\xiaomi\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\xiaomi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\xiaomi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\xiaomi\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\xiaomi\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [4]:
nltk.help.upenn_tagset('RB')
nltk.help.upenn_tagset('NN')
nltk.help.upenn_tagset('VB')

RB: adverb
    occasionally unabatingly maddeningly adventurously professedly
    stirringly prominently technologically magisterially predominately
    swiftly fiscally pitilessly ...
NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
VB: verb, base form
    ask assemble assess assign assume atone attention avoid bake balkanize
    bank begin behold believe bend benefit bevel beware bless boil bomb
    boost brace break bring broil brush build ...


In [20]:
patterns = [
    (r'.*ет$', 'VB'),                
    (r'.*ал$', 'VBD'),                 
    (r'.*ешь$', 'VB'),                 
    (r'.*ое$', 'NN'),                  
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  
    (r'.*ая', 'NN'), 
    (r'.*', 'NN'),                      # nouns (default) 
    (r'.*ий$', 'NN'),                 
    (r'.*ую$', 'NN')                   
]

In [6]:
full_train = pyconll.load_from_file('ru_syntagrus-ud-train-a.conllu')
full_test = pyconll.load_from_file('ru_syntagrus-ud-dev.conllu')

In [7]:
for sent in full_train[:2]:
    for token in sent:
        print(token.form, token.upos)

Анкета NOUN
. PUNCT
Начальник NOUN
областного ADJ
управления NOUN
связи NOUN
Семен PROPN
Еремеевич PROPN
был AUX
человек NOUN
простой ADJ
, PUNCT
приходил VERB
на ADP
работу NOUN
всегда ADV
вовремя ADV
, PUNCT
здоровался VERB
с ADP
секретаршей NOUN
за ADP
руку NOUN
и CCONJ
иногда ADV
даже PART
писал VERB
в ADP
стенгазету NOUN
заметки NOUN
под ADP
псевдонимом NOUN
" PUNCT
Муха NOUN
" PUNCT
. PUNCT


In [8]:
fdata_train = []
for sent in full_train[:]:
    fdata_train.append([(token.form, token.upos) for token in sent])
    
fdata_test = []
for sent in full_test[:]:
    fdata_test.append([(token.form, token.upos) for token in sent])
    
fdata_sent_test = []
for sent in full_test[:]:
    fdata_sent_test.append([token.form for token in sent])

In [9]:
len(fdata_train), len(fdata_test), len(fdata_sent_test)

(24516, 8906, 8906)

In [25]:
# DefaultTagger
D_TAG = DefaultTagger(fdata_train)
accuracy_D_TAG = D_TAG.evaluate(fdata_test)

# R_TAG = RegexpTagger(patterns)
# R_TAG.evaluate(fdata_test)

# UnigramTagger
U_TAG = UnigramTagger(fdata_train)
accuracy_U_TAG = U_TAG.evaluate(fdata_test)

# BigramTagger
Bi_TAG = BigramTagger(fdata_train)
accuracy_Bi_TAG = Bi_TAG.evaluate(fdata_test)

# TrigramTagger
T_TAG = TrigramTagger(fdata_train)
accuracy_T_TAG = T_TAG.evaluate(fdata_test)

In [27]:
print(f'accuracy Default tagger={accuracy_D_TAG:.3f}')
print(f'accuracy Unigram tagger={accuracy_U_TAG:.3f}')
print(f'accuracy Bigram tagger={accuracy_Bi_TAG:.3f}')
print(f'accuracy Trigram tagger={accuracy_T_TAG:.3f}')

accuracy Default tagger=0.000
accuracy Unigram tagger=0.824
accuracy Bigram tagger=0.609
accuracy Trigram tagger=0.178


#### Комбинация тэггеров

In [32]:
def backoff_tagger(train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff


backoff = DefaultTagger('NN') 
Combi_TAG = backoff_tagger(fdata_train,  
                     [UnigramTagger, BigramTagger, TrigramTagger],  
                     backoff = backoff) 
  
accuracy_Combi_TAG = Combi_TAG.evaluate(fdata_test)

In [34]:
print(f'accuracy Combi tagger={accuracy_Combi_TAG:.3f}')

accuracy Combi tagger=0.828


In [35]:
train_tok = []
train_label = []
for sent in fdata_train[:]:
    for tok in sent:
        train_tok.append(tok[0])
        train_label.append('NO_TAG' if tok[1] is None else tok[1])
        
test_tok = []
test_label = []
for sent in fdata_test[:]:
    for tok in sent:
        test_tok.append(tok[0])
        test_label.append('NO_TAG' if tok[1] is None else tok[1])

In [38]:
le = LabelEncoder()
train_enc_labels = le.fit_transform(train_label)
test_enc_labels = le.transform(test_label)
le.classes_

array(['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN',
       'NO_TAG', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
       'VERB', 'X'], dtype='<U6')

In [39]:
vectorizers = [CountVectorizer(ngram_range=(1, 3), analyzer='char'), 
               TfidfVectorizer(ngram_range=(1, 3), analyzer='char'), 
               HashingVectorizer(ngram_range=(1, 3), analyzer='char', n_features=1000)] 
vectorizers_word = [CountVectorizer(ngram_range=(1, 3), analyzer='word'), 
               TfidfVectorizer(ngram_range=(1, 3), analyzer='word'), 
               HashingVectorizer(ngram_range=(1, 3), analyzer='word', n_features=1000)] 
n_features = [2000, 3000, 5000, 10000]
hvectorizer = [HashingVectorizer(ngram_range=(1, 3), analyzer='char', n_features=feat) for feat in n_features]
hvectorizer_word = [HashingVectorizer(ngram_range=(1, 3), analyzer='word', n_features=feat) for feat in n_features]

In [40]:
f1_scores = []
acc_scores = []

for vectorizer in vectorizers + vectorizers_word + hvectorizer + hvectorizer_word:
    X_train = vectorizer.fit_transform(train_tok)
    X_test = vectorizer.transform(test_tok[:115000])
    
    lr = LogisticRegression(random_state=0, max_iter=100)
    lr.fit(X_train, train_enc_labels)
    pred = lr.predict(X_test)
    f1 = f1_score(test_enc_labels[:115000], pred, average='weighted')
    f1_scores.append(f1)
    acc = accuracy_score(test_enc_labels[:115000], pred)
    acc_scores.append(acc)
    
    print(vectorizer)
    print(classification_report(test_enc_labels[:115000], pred, target_names=le.classes_))

CountVectorizer(analyzer='char', ngram_range=(1, 3))
              precision    recall  f1-score   support

         ADJ       0.92      0.91      0.92     11247
         ADP       0.98      1.00      0.99     10255
         ADV       0.92      0.90      0.91      5986
         AUX       0.81      0.97      0.88      1058
       CCONJ       0.88      0.98      0.93      4276
         DET       0.88      0.75      0.81      2978
        INTJ       0.36      0.36      0.36        11
        NOUN       0.92      0.95      0.94     27241
      NO_TAG       1.00      1.00      1.00       197
         NUM       0.85      0.91      0.88      1436
        PART       0.95      0.78      0.86      3762
        PRON       0.83      0.89      0.86      5346
       PROPN       0.79      0.59      0.67      4315
       PUNCT       1.00      1.00      1.00     21941
       SCONJ       0.81      0.91      0.86      2176
         SYM       1.00      0.68      0.81        53
        VERB       0.94     

HashingVectorizer(analyzer='char', n_features=3000, ngram_range=(1, 3))
              precision    recall  f1-score   support

         ADJ       0.87      0.88      0.87     11247
         ADP       0.98      0.99      0.98     10255
         ADV       0.87      0.82      0.85      5986
         AUX       0.81      0.97      0.88      1058
       CCONJ       0.89      0.97      0.93      4276
         DET       0.87      0.72      0.79      2978
        INTJ       0.00      0.00      0.00        11
        NOUN       0.87      0.93      0.90     27241
      NO_TAG       1.00      1.00      1.00       197
         NUM       0.83      0.83      0.83      1436
        PART       0.94      0.78      0.85      3762
        PRON       0.82      0.91      0.86      5346
       PROPN       0.79      0.38      0.51      4315
       PUNCT       1.00      1.00      1.00     21941
       SCONJ       0.81      0.90      0.85      2176
         SYM       1.00      0.79      0.88        53
        V

In [43]:
result_model = pd.DataFrame({'Vectorizer': vectorizers + vectorizers_word + hvectorizer + hvectorizer_word,
                            'f1_score': f1_scores})
result_model.sort_values('f1_score', ascending=False)

Unnamed: 0,Vectorizer,f1_score
0,"CountVectorizer(analyzer='char', ngram_range=(...",0.927971
1,"TfidfVectorizer(analyzer='char', ngram_range=(...",0.921185
9,"HashingVectorizer(analyzer='char', n_features=...",0.903654
8,"HashingVectorizer(analyzer='char', n_features=...",0.901192
7,"HashingVectorizer(analyzer='char', n_features=...",0.896959
6,"HashingVectorizer(analyzer='char', n_features=...",0.895273
2,"HashingVectorizer(analyzer='char', n_features=...",0.882215
4,"TfidfVectorizer(ngram_range=(1, 3))",0.662748
3,"CountVectorizer(ngram_range=(1, 3))",0.650922
13,"HashingVectorizer(n_features=10000, ngram_rang...",0.620016


In [44]:
result_model_acc = pd.DataFrame({'Vectorizer': vectorizers + vectorizers_word + hvectorizer + hvectorizer_word,
                            'Accuracy': acc_scores})
result_model_acc.sort_values('Accuracy', ascending=False)

Unnamed: 0,Vectorizer,Accuracy
0,"CountVectorizer(analyzer='char', ngram_range=(...",0.929643
1,"TfidfVectorizer(analyzer='char', ngram_range=(...",0.923609
9,"HashingVectorizer(analyzer='char', n_features=...",0.907417
8,"HashingVectorizer(analyzer='char', n_features=...",0.904974
7,"HashingVectorizer(analyzer='char', n_features=...",0.901243
6,"HashingVectorizer(analyzer='char', n_features=...",0.898939
2,"HashingVectorizer(analyzer='char', n_features=...",0.885157
4,"TfidfVectorizer(ngram_range=(1, 3))",0.65293
3,"CountVectorizer(ngram_range=(1, 3))",0.642148
13,"HashingVectorizer(n_features=10000, ngram_rang...",0.629365


#### Вывод: лучшую метрику показали символьные N-граммы на CountVectorizer

## NER

#### NLTK

In [5]:
# Просматриваем содержание коллекции
print(os.listdir("Collection5"))

['001.ann', '001.txt', '002.ann', '002.txt', '003.ann', '003.txt', '004.ann', '004.txt', '005.ann', '005.txt', '006.ann', '006.txt', '007.ann', '007.txt', '008.ann', '008.txt', '009.ann', '009.txt', '010.ann', '010.txt', '011.ann', '011.txt', '012.ann', '012.txt', '013.ann', '013.txt', '014.ann', '014.txt', '015 (!).ann', '015 (!).txt', '016.ann', '016.txt', '017.ann', '017.txt', '018.ann', '018.txt', '019.ann', '019.txt', '020.ann', '020.txt', '021.ann', '021.txt', '022.ann', '022.txt', '023.ann', '023.txt', '025.ann', '025.txt', '026.ann', '026.txt', '027.ann', '027.txt', '028.ann', '028.txt', '029.ann', '029.txt', '030.ann', '030.txt', '031.ann', '031.txt', '032.ann', '032.txt', '033.ann', '033.txt', '034.ann', '034.txt', '035.ann', '035.txt', '036.ann', '036.txt', '037.ann', '037.txt', '038.ann', '038.txt', '039.ann', '039.txt', '03_12_12a.ann', '03_12_12a.txt', '03_12_12b.ann', '03_12_12b.txt', '03_12_12c.ann', '03_12_12c.txt', '03_12_12d.ann', '03_12_12d.txt', '03_12_12g.ann', '0

In [6]:
# Собираем только текстовые файлы коллекции
fileDir = r"Collection5"
fileExt = r".txt"
documents_txt = [_ for _ in os.listdir(fileDir) if _.endswith(fileExt)]
print(documents_txt)

['001.txt', '002.txt', '003.txt', '004.txt', '005.txt', '006.txt', '007.txt', '008.txt', '009.txt', '010.txt', '011.txt', '012.txt', '013.txt', '014.txt', '015 (!).txt', '016.txt', '017.txt', '018.txt', '019.txt', '020.txt', '021.txt', '022.txt', '023.txt', '025.txt', '026.txt', '027.txt', '028.txt', '029.txt', '030.txt', '031.txt', '032.txt', '033.txt', '034.txt', '035.txt', '036.txt', '037.txt', '038.txt', '039.txt', '03_12_12a.txt', '03_12_12b.txt', '03_12_12c.txt', '03_12_12d.txt', '03_12_12g.txt', '03_12_12h.txt', '040.txt', '041.txt', '042.txt', '043.txt', '044.txt', '045.txt', '046.txt', '047.txt', '048.txt', '049.txt', '04_02_13a_abdulatipov.txt', '04_03_13a_sorokin.txt', '04_12_12b.txt', '04_12_12d.txt', '04_12_12f.txt', '04_12_12g.txt', '04_12_12h_corr.txt', '050.txt', '051.txt', '052.txt', '053.txt', '054.txt', '055.txt', '056.txt', '057.txt', '058.txt', '059.txt', '060.txt', '061.txt', '062.txt', '063.txt', '064.txt', '065.txt', '066.txt', '067.txt', '068.txt', '069.txt', '

In [7]:
# Заносим данные файлов txt в датасет
text_list = []
for file in documents_txt:
    doc = open('Collection5/' + file, encoding='utf-8')
    text = doc.read()
    text_list.append(text)
    
data_text = pd.DataFrame({'text': text_list })
data_text

Unnamed: 0,text
0,Россия рассчитывает на конструктивное воздейст...
1,Комиссар СЕ критикует ограничительную политику...
2,"Пулеметы, автоматы и снайперские винтовки изъя..."
3,4 октября назначены очередные выборы Верховног...
4,Следственное управление при прокуратуре требуе...
...,...
994,"Депутат от ""ЕР"": К отставке А.Сердюкова причас..."
995,\nСи Цзиньпин избран генсеком Коммунистической...
996,"""Ведомости"" узнали о смене лидера московских е..."
997,СМИ узнали о кутежах туркменского чиновника на...


In [18]:
# Пример текста
document = data_text.text[0]

# Разбиваем документ на токены и применяем pos tagging (на выходе список кортежей (токен, часть речи))
nltk.pos_tag(nltk.word_tokenize(document))

[('Россия', 'JJ'),
 ('рассчитывает', 'NNP'),
 ('на', 'NNP'),
 ('конструктивное', 'NNP'),
 ('воздействие', 'NNP'),
 ('США', 'NNP'),
 ('на', 'NNP'),
 ('Грузию', 'VBD'),
 ('04/08/2008', 'CD'),
 ('12:08', 'CD'),
 ('МОСКВА', 'NN'),
 (',', ','),
 ('4', 'CD'),
 ('авг', 'SYM'),
 ('-', ':'),
 ('РИА', 'NN'),
 ('Новости', 'NN'),
 ('.', '.'),
 ('Россия', 'JJ'),
 ('рассчитывает', 'NN'),
 (',', ','),
 ('что', 'NNP'),
 ('США', 'NNP'),
 ('воздействуют', 'NNP'),
 ('на', 'NNP'),
 ('Тбилиси', 'NNP'),
 ('в', 'NNP'),
 ('связи', 'NNP'),
 ('с', 'NNP'),
 ('обострением', 'NNP'),
 ('ситуации', 'NNP'),
 ('в', 'NNP'),
 ('зоне', 'NNP'),
 ('грузино-осетинского', 'JJ'),
 ('конфликта', 'NNP'),
 ('.', '.'),
 ('Об', 'VB'),
 ('этом', 'JJ'),
 ('статс-секретарь', 'JJ'),
 ('-', ':'),
 ('заместитель', 'NN'),
 ('министра', 'JJ'),
 ('иностранных', 'NNP'),
 ('дел', 'NNP'),
 ('России', 'NNP'),
 ('Григорий', 'NNP'),
 ('Карасин', 'NNP'),
 ('заявил', 'NNP'),
 ('в', 'NNP'),
 ('телефонном', 'NNP'),
 ('разговоре', 'NNP'),
 ('с', 'NNP

In [57]:
# Распознаем именнованные сущности с помощью классификатора (Person, Organization, GPE)
{(' '.join(c[0] for c in chunk), chunk.label()) for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(document))) if hasattr(chunk, 'label')}

{('МИД России', 'ORGANIZATION'),
 ('МОСКВА', 'ORGANIZATION'),
 ('РИА Новости', 'ORGANIZATION'),
 ('России Григорий Карасин', 'PERSON'),
 ('Россия', 'PERSON'),
 ('Тбилиси', 'PERSON')}

In [58]:
# Разметка из Collection5
pd.read_csv('Collection5/2003.ann', delimiter='\t')

Unnamed: 0,T1,PER 20 27,Ассанжу
0,T2,PER 51 67,Джулиана Ассанжа
1,T3,PER 220 226,Ассанж
2,T4,ORG 514 523,WikiLeaks
3,T5,PER 649 658,Д. Ассанж
4,T6,PER 1038 1047,Д. Ассанж
...,...,...,...
122,T124,GEOPOLIT 11243 11249,Швеции
123,T125,PER 11349 11358,Д. Ассанж
124,T126,PER 11473 11483,Д. Ассанжа
125,T127,GEOPOLIT 11849 11857,Эквадора


#### SPACY

In [70]:
nlp = spacy.load("ru_core_news_sm")
ny_bb = data_text.text[1]
article = nlp(ny_bb)

In [71]:
displacy.render(article, jupyter=True, style='ent')

In [72]:
for token in article:
    print(token.text, token.pos_, token.dep_)

Комиссар NOUN nsubj
СЕ PROPN nmod
критикует VERB ROOT
ограничительную ADJ amod
политику NOUN obj
в ADP case
отношении NOUN fixed
беженцев NOUN nmod
в ADP case
европейских ADJ amod
странах NOUN nmod


 SPACE dep
05/08/2008 NUM appos
10:32 NUM appos


 SPACE dep
МОСКВА PROPN obl
, PUNCT punct
5 ADJ obl
августа NOUN flat
/Новости NOUN flat
- NOUN nsubj
Грузия/. PROPN nsubj
  SPACE dep
Проводимая VERB acl
в ADP case
европейских ADJ amod
странах NOUN obl
ограничительная ADJ amod
политика NOUN nsubj
в ADP case
отношении NOUN fixed
беженцев NOUN nmod
нарушает VERB conj
ряд NOUN obj
международных ADJ amod
стандартов NOUN nmod
, PUNCT punct
в ADP discourse
частности NOUN fixed
, PUNCT punct
право NOUN parataxis
на ADP case
воссоединение NOUN nmod
семей NOUN nmod
, PUNCT punct
заявляет VERB parataxis
Комиссар PROPN nsubj
Совета PROPN nmod
Европы PROPN nmod
по ADP case
правам NOUN nmod
человека NOUN nmod
Томас PROPN appos
Хаммарберг PROPN flat:name
( PUNCT punct
Thomas PROPN appos
Hammarberg PROP

#### Deeppavlov

###### pip install pytorch-crf==0.4.0 needed.

In [1]:
import deeppavlov
from deeppavlov import configs, build_model
deeppavlov_ner = build_model(configs.ner.ner_rus_bert, download=True)
rus_document = "Нью-Йорк, США, 30 апреля 2020, 01:01 — REGNUM В администрации президента США Дональда Трампа планируют пройти все этапы создания вакцины от коронавируса в ускоренном темпе и выпустить 100 млн доз до конца 2020 года, передаёт агентство Bloomberg со ссылкой на осведомлённые источники"
deeppavlov_ner([rus_document])

2023-07-18 18:58:36.358 INFO in 'deeppavlov.core.data.utils'['utils'] at line 95: Downloading from http://files.deeppavlov.ai/v1/ner/ner_rus_bert_torch_new.tar.gz to C:\Users\xiaomi\.deeppavlov\models\ner_rus_bert_torch_new.tar.gz
100%|█████████████████████████████████████████████████████████████████████████████| 1.44G/1.44G [10:45<00:00, 2.23MB/s]
2023-07-18 19:09:22.373 INFO in 'deeppavlov.core.data.utils'['utils'] at line 276: Extracting C:\Users\xiaomi\.deeppavlov\models\ner_rus_bert_torch_new.tar.gz archive into C:\Users\xiaomi\.deeppavlov\models\ner_rus_bert_torch


Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initializ

RuntimeError: CUDA out of memory. Tried to allocate 352.00 MiB (GPU 0; 2.00 GiB total capacity; 1.66 GiB already allocated; 0 bytes free; 1.74 GiB reserved in total by PyTorch)

### К сожалению не хватает памяти((

In [8]:
# Собираем только ann файлы коллекции
fileDir = r"Collection5"
fileExt = r".ann"
documents_ann = [_ for _ in os.listdir(fileDir) if _.endswith(fileExt)]
print(documents_ann)

['001.ann', '002.ann', '003.ann', '004.ann', '005.ann', '006.ann', '007.ann', '008.ann', '009.ann', '010.ann', '011.ann', '012.ann', '013.ann', '014.ann', '015 (!).ann', '016.ann', '017.ann', '018.ann', '019.ann', '020.ann', '021.ann', '022.ann', '023.ann', '025.ann', '026.ann', '027.ann', '028.ann', '029.ann', '030.ann', '031.ann', '032.ann', '033.ann', '034.ann', '035.ann', '036.ann', '037.ann', '038.ann', '039.ann', '03_12_12a.ann', '03_12_12b.ann', '03_12_12c.ann', '03_12_12d.ann', '03_12_12g.ann', '03_12_12h.ann', '040.ann', '041.ann', '042.ann', '043.ann', '044.ann', '045.ann', '046.ann', '047.ann', '048.ann', '049.ann', '04_02_13a_abdulatipov.ann', '04_03_13a_sorokin.ann', '04_12_12b.ann', '04_12_12d.ann', '04_12_12f.ann', '04_12_12g.ann', '04_12_12h_corr.ann', '050.ann', '051.ann', '052.ann', '053.ann', '054.ann', '055.ann', '056.ann', '057.ann', '058.ann', '059.ann', '060.ann', '061.ann', '062.ann', '063.ann', '064.ann', '065.ann', '066.ann', '067.ann', '068.ann', '069.ann', '

In [9]:
ann = pd.read_csv('Collection5/003.ann', delimiter='\t', header=None)
ann

Unnamed: 0,0,1,2
0,T1,LOC 82 89,Бишкеке
1,T2,LOC 113 119,БИШКЕК
2,T3,MEDIA 132 146,Новости-Грузия
3,T4,GEOPOLIT 175 183,Киргизии
4,T5,GEOPOLIT 225 228,США
5,T6,LOC 231 238,Бишкеке
6,T7,ORG 316 319,МВД
7,T8,GEOPOLIT 320 328,Киргизии
8,T9,GEOPOLIT 492 500,Киргизии
9,T10,GEOPOLIT 525 528,США


In [10]:
# Составляем списки токенов и интенсов (из файла .ann делается словарь {слово : интенс}, из словаря каждому токену сопоствляем интенс)
docs = []
for i in range(len(documents_ann)):
    words = []
    labels = []
    # Подготавливаем текст
    text = data_text['text'][i]
    
    df = pd.read_csv('Collection5/' + documents_ann[i], delimiter='\t', header=None)
    df_ann = pd.DataFrame()
    df_ann['Token'] = df.loc[:, 2]
    split_1 = [loc.split() for loc in df.loc[:, 1].values]
    df_ann['Entity'] = [loc[0] for loc in split_1]
       
    dic = {}
    for j in range(len(df)):
        token = df_ann['Token'][j].lower().split()
        entity = df_ann['Entity'][j]
        for tok in token:
            dic[tok] = entity

    for token in tokenize(text):
        if (token.text.lower() in dic.keys()):
            words.append(token.text)
            labels.append(dic[token.text.lower()])
        else:
            words.append(token.text)
            labels.append('OUT')
    
    docs.append([words, labels])

In [11]:
data, labels = list(zip(*docs))
for w, e in zip(data[0], labels[0]):
    print(f'{w}\t{e}')

Россия	GEOPOLIT
рассчитывает	OUT
на	OUT
конструктивное	OUT
воздействие	OUT
США	GEOPOLIT
на	OUT
Грузию	GEOPOLIT
04/08/2008	OUT
12	OUT
:	OUT
08	OUT
МОСКВА	LOC
,	OUT
4	OUT
авг	OUT
-	OUT
РИА	MEDIA
Новости	MEDIA
.	OUT
Россия	GEOPOLIT
рассчитывает	OUT
,	OUT
что	OUT
США	GEOPOLIT
воздействуют	OUT
на	OUT
Тбилиси	GEOPOLIT
в	OUT
связи	OUT
с	OUT
обострением	OUT
ситуации	OUT
в	OUT
зоне	OUT
грузино-осетинского	OUT
конфликта	OUT
.	OUT
Об	OUT
этом	OUT
статс-секретарь	OUT
-	OUT
заместитель	OUT
министра	OUT
иностранных	OUT
дел	OUT
России	GEOPOLIT
Григорий	PER
Карасин	PER
заявил	OUT
в	OUT
телефонном	OUT
разговоре	OUT
с	OUT
заместителем	OUT
госсекретаря	OUT
США	GEOPOLIT
Дэниэлом	PER
Фридом	PER
.	OUT
"	OUT
С	OUT
российской	OUT
стороны	OUT
выражена	OUT
глубокая	OUT
озабоченность	OUT
в	OUT
связи	OUT
с	OUT
новым	OUT
витком	OUT
напряженности	OUT
вокруг	OUT
Южной	GEOPOLIT
Осетии	GEOPOLIT
,	OUT
противозаконными	OUT
действиями	OUT
грузинской	OUT
стороны	OUT
по	OUT
наращиванию	OUT
своих	OUT
вооруженных	OUT
сил	OUT

In [12]:
df = pd.DataFrame({'sent_id': [i for j in [[i] * len(s) for i, s in enumerate(data)] for i in j],
                   'data': [i for j in data for i in j],
                   'entities': [i for j in labels for i in j]})
df.head(50)

Unnamed: 0,sent_id,data,entities
0,0,Россия,GEOPOLIT
1,0,рассчитывает,OUT
2,0,на,OUT
3,0,конструктивное,OUT
4,0,воздействие,OUT
5,0,США,GEOPOLIT
6,0,на,OUT
7,0,Грузию,GEOPOLIT
8,0,04/08/2008,OUT
9,0,12,OUT


In [13]:
from sklearn import model_selection, preprocessing, linear_model

train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df['data'], df['entities'])

In [14]:
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [15]:
encoder.classes_

array(['GEOPOLIT', 'LOC', 'MEDIA', 'ORG', 'OUT', 'PER'], dtype=object)

In [16]:
train_x.apply(len).max(axis=0)

55

In [18]:
train_data = tensorflow.data.Dataset.from_tensor_slices((train_x, train_y))
valid_data = tensorflow.data.Dataset.from_tensor_slices((valid_x, valid_y))

train_data = train_data.batch(8)
valid_data = valid_data.batch(8)

In [20]:
AUTOTUNE = tensorflow.data.experimental.AUTOTUNE

train_data = train_data.cache().prefetch(buffer_size=AUTOTUNE)
valid_data = valid_data.cache().prefetch(buffer_size=AUTOTUNE)

In [21]:
def custom_standardization(input_data):
    # Здесь может быть предобработка текста
    return input_data

vocab_size = 30000
seq_len = 10

vectorize_layer = TextVectorization(  
                            standardize=custom_standardization,
                            max_tokens=vocab_size,
                            output_mode='int',
                            #ngrams=(1, 3),
                            output_sequence_length=seq_len)

# Make a text-only dataset (no labels) and call adapt to build the vocabulary.
text_data = train_data.map(lambda x, y: x)
vectorize_layer.adapt(text_data)

In [31]:
mmodel = modelNER()

In [32]:
mmodel.compile(optimizer='adam',
              loss=tensorflow.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])

In [33]:
mmodel.fit(train_data, validation_data=valid_data, epochs=3)

Epoch 1/3


KeyboardInterrupt: 

In [36]:
pred_y = mmodel.predict(valid_x)
y_pred_classes = np.argmax(pred_y,axis=1)



In [37]:
f1 = f1_score(valid_y, y_pred_classes, average= "weighted")
f1

0.940405093291417