In [1]:
import os
import pathlib
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from gensim.models import FastText

from lightgbm import LGBMClassifier

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report


DATA = pathlib.Path('data') / 'coll'

__load data__

In [2]:
def loader(dir):
    texts = []
    spans = []
    files = list(set(f.split('.')[0] for f in os.listdir(dir)))
    for f in tqdm(files, desc='loading'):
        txt = open(DATA.joinpath(f + '.txt').as_posix(), 'r', encoding='utf-8').read()
        ann = open(DATA.joinpath(f + '.ann').as_posix(), 'r', encoding='utf-8').readlines()
        ann = [item.replace(u'\t' , ' ').strip().split(maxsplit=4) for item in ann]
        sp = [[idx, tag, int(start), int(stop), text] for idx, tag, start, stop, text in ann]
        texts.append(txt)
        spans.append(sp)
    return texts, spans

In [3]:
texts, spans = loader(DATA.as_posix())

loading: 100%|██████████| 1000/1000 [00:00<00:00, 4123.54it/s]


In [4]:
# build dataframe
docs = []
for ix, (txt, ann) in enumerate(zip(texts, spans)):
    words = []
    for token in word_tokenize(txt, language='russian'):
        tag = 'OUT'
        for item in ann:
            if txt[item[2]:item[3]] == token == item[4]:    # если токен в тексте совпадает с токеном разметки
                tag = item[1]
                break
        words.append([ix, token, tag])
    docs.extend(words)

data = pd.DataFrame(docs, columns=['sent', 'word', 'tag'])
data.head()

Unnamed: 0,sent,word,tag
0,0,Министра,OUT
1,0,Белуджистана,LOC
2,0,отправили,OUT
3,0,в,OUT
4,0,отставку,OUT


In [5]:
data['tag'].value_counts()

OUT         255273
ORG           1397
PER           1356
GEOPOLIT      1188
LOC            468
MEDIA           62
Name: tag, dtype: int64

__natasha NER__

In [6]:
from natasha import Segmenter, MorphVocab, NewsEmbedding, NewsNERTagger, Doc

In [7]:
# prepare
segmenter = Segmenter()
emb = NewsEmbedding()
ner_tagger = NewsNERTagger(emb)

In [8]:
doc = Doc(texts[0])
doc.segment(segmenter)

doc.tag_ner(ner_tagger)
doc.ner.print()

Министра Белуджистана отправили в отставку из-за терактов
         PER─────────                                    
Премьер-министр Пакистана Раджа Первез Ашраф 14 января отправил в 
                LOC────── PER───────────────                      
отставку главного министра провинции Белуджистан, где 10 января в 
                                     LOC────────                  
результате серии терактов погибли не менее 115 человек. Об этом 
сообщает Agence France-Presse.
         ORG───────────────── 
Такое решение премьер-министр принял после встречи в городе Кветта с 
                                                            LOC───   
мусульманами-шиитами, протестующими против бездействия властей в сфере
 безопасности. Как заявил премьер-министр, в течение следующих двух 
месяцев после отставки министра Белуджистана контролировать провинцию 
                                PER─────────                          
будет специально назначенный губернатор.
Кроме того, премьер-минист

__prepare data__

In [9]:
# сборщик эмбеддингов соседних токенов
def neighbours(arr, *, n=1):
    length = arr.shape
    for i in range(length[0]):
        lpad = n - i
        rpad = n - (length[0] - i - 1)
        if lpad > 0:
            res = np.hstack([np.zeros(length[1] * lpad, dtype=arr.dtype), *arr[:i+n+1]])
        if rpad > 0:
            res = np.hstack([*arr[i-n:], np.zeros(length[1] * rpad, dtype=arr.dtype)])
        if (lpad <= 0) and (rpad <= 0):
            res = np.hstack([*arr[i-n:i+n+1]])
        yield res

In [10]:
VEC_SIZE = 500

# embeddings
sentences = data.groupby('sent')['word'].agg(list)
ft = FastText(sentences, vector_size=VEC_SIZE, min_count=5)
embs = np.array(data['word'].apply(lambda val: ft.wv[val]).tolist())
embs = np.array(list(neighbours(embs, n=2)))      # собрать по 2 токена с каждой стороны

# encode tokens
VOCAB_SIZE = data['word'].unique().size
idx2word = dict(enumerate(data['word'].unique()))
word2idx = {v: k for k, v in idx2word.items()}
data['word_idx'] = data['word'].map(word2idx)

# encode labels
enc = LabelEncoder()
data['tag_idx'] = enc.fit_transform(data['tag'])
labels = np.eye(enc.classes_.size)[data['tag_idx'].values]

data.head()

Unnamed: 0,sent,word,tag,word_idx,tag_idx
0,0,Министра,OUT,0,4
1,0,Белуджистана,LOC,1,1
2,0,отправили,OUT,2,4
3,0,в,OUT,3,4
4,0,отставку,OUT,4,4


In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# vectorizer = CountVectorizer(ngram_range=(1, 3), analyzer='word', max_df=0.9, max_features=2500)
vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='word', max_df=0.9, max_features=2500)
cnvec = vectorizer.fit_transform(data['word']).astype('float')

__custom NER__

In [12]:
# fit
train_x, valid_x, train_y, valid_y = train_test_split(embs, data['tag_idx'].values, test_size=0.2, shuffle=False)

model = LGBMClassifier(n_estimators=500, num_leaves=29, random_state=19, n_jobs=-1)
model.fit(train_x, train_y)

# evaluate
predicts = model.predict(valid_x)
print(classification_report(valid_y, predicts))

              precision    recall  f1-score   support

           0       0.28      0.22      0.24       271
           1       0.09      0.01      0.02       120
           2       0.00      0.00      0.00        13
           3       0.30      0.19      0.23       327
           4       0.98      0.99      0.99     50900
           5       0.20      0.16      0.17       318

    accuracy                           0.97     51949
   macro avg       0.31      0.26      0.28     51949
weighted avg       0.97      0.97      0.97     51949



In [13]:
# fit
train_x, valid_x, train_y, valid_y = train_test_split(cnvec, data['tag_idx'].values, test_size=0.2, shuffle=False)

model = LGBMClassifier(n_estimators=500, num_leaves=29, random_state=19, n_jobs=-1)
model.fit(train_x, train_y)

# evaluate
predicts = model.predict(valid_x)
print(classification_report(valid_y, predicts))

              precision    recall  f1-score   support

           0       0.75      0.14      0.24       271
           1       0.44      0.12      0.18       120
           2       0.00      0.00      0.00        13
           3       0.68      0.30      0.42       327
           4       0.98      1.00      0.99     50900
           5       0.36      0.03      0.06       318

    accuracy                           0.98     51949
   macro avg       0.54      0.27      0.32     51949
weighted avg       0.97      0.98      0.97     51949



__NN keras__

In [20]:
import tensorflow as tf
import tensorflow.keras.layers as layer
from tensorflow.keras import Sequential
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [15]:
# train/valid split
train_x, valid_x, train_y, valid_y = train_test_split(data['word'].values, labels, test_size=0.2, shuffle=False)

BATCH_SIZE = 64
AUTOTUNE = tf.data.AUTOTUNE

train_data = tf.data.Dataset.from_tensor_slices((train_x, train_y)).batch(BATCH_SIZE).cache().prefetch(buffer_size=AUTOTUNE)
valid_data = tf.data.Dataset.from_tensor_slices((valid_x, valid_y)).batch(BATCH_SIZE).cache().prefetch(buffer_size=AUTOTUNE)

2022-10-22 20:31:27.003539: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-22 20:31:27.198429: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-10-22 20:31:27.198464: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-10-22 20:31:27.229857: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN

In [16]:
def custom_standardization(input_data):
    return input_data

# VOCAB_SIZE = 30000
SEQ_LEN = 32

vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    #ngrams=(1, 3),
    output_sequence_length=SEQ_LEN)

# Make a text-only dataset (no labels) and call adapt to build the vocabulary.
text_data = train_data.map(lambda x, y: x)
vectorize_layer.adapt(text_data)

In [17]:
class KerasNER(tf.keras.Model):
    def __init__(self, vocab_size, embdim, hdim, out):
        super().__init__()
        self.emb = layer.Embedding(vocab_size, embdim)
        self.gPool = layer.GlobalMaxPooling1D()
        self.fc1 = layer.Dense(2 * hdim, activation='relu')
        self.fc2 = layer.Dense(hdim, activation='relu')
        self.fc3 = layer.Dense(out, activation='softmax')

    def call(self, x):
        x = vectorize_layer(x)
        x = self.emb(x)
        pool_x = self.gPool(x)
        
        fc_x = self.fc1(pool_x)
        fc_x = self.fc2(fc_x)
        
        concat_x = tf.concat([pool_x, fc_x], axis=1)
        prob = self.fc3(concat_x)
        return prob

In [18]:
# class KerasNER(tf.keras.Model):
#     def __init__(self, vocab_size, embdim, hdim, out, layers=2):
#         super().__init__()
#         self.emb = layer.Embedding(vocab_size, embdim)
#         self.lstm = layer.LSTM(layers)
#         self.fc1 = layer.Dense(2 * hdim, activation='relu')
#         self.fc2 = layer.Dense(hdim, activation='relu')
#         self.fc3 = layer.Dense(out, activation='softmax')

#     def call(self, x):
#         x = vectorize_layer(x)
#         x = self.emb(x)
#         x_lstm = self.lstm(x)

#         x_fc = self.fc1(x_lstm)
#         x_fc = self.fc2(x_fc)
#         x_cat = tf.concat([x_lstm, x_fc], axis=1)
#         prob = self.fc3(x_cat)
#         return prob

In [21]:
model = KerasNER(VOCAB_SIZE, embdim=256, hdim=128, out=enc.classes_.size)
# model = KerasNER(VOCAB_SIZE, embdim=256, hdim=128, out=enc.classes_.size, layers=3)

model.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=['Precision'])
model.fit(train_data, validation_data=valid_data, epochs=3)

Epoch 1/3


2022-10-22 20:42:35.162807: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 36549632 exceeds 10% of free system memory.
2022-10-22 20:42:35.178033: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 36549632 exceeds 10% of free system memory.
2022-10-22 20:42:35.205864: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 36549632 exceeds 10% of free system memory.
2022-10-22 20:42:35.342599: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 36549632 exceeds 10% of free system memory.
2022-10-22 20:42:35.349021: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 36549632 exceeds 10% of free system memory.


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f0b8c3338e0>

In [22]:
# v1: pool
predicts = model.predict(valid_data)
print(classification_report(valid_y.argmax(axis=1), predicts.argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.57      0.17      0.26       271
           1       0.36      0.40      0.38       120
           2       0.00      0.00      0.00        13
           3       0.54      0.37      0.44       327
           4       0.98      0.99      0.99     50900
           5       0.50      0.16      0.24       318

    accuracy                           0.98     51949
   macro avg       0.49      0.35      0.38     51949
weighted avg       0.98      0.98      0.98     51949



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


__NN torch__

Не работает

In [23]:
import torch
from common import TaggerDataset, TorchTrainable

In [68]:
# class Net(torch.nn.Module, TorchTrainable):
#     def __init__(self, vocab_size, dim, out, drop=0.2):
#         super().__init__()
#         self.emb = torch.nn.Embedding(vocab_size, 2 * dim)
#         self.pool = torch.nn.MaxPool1d(2)
#         self.fc1 = torch.nn.Linear(dim, 2 * dim)
#         self.fc2 = torch.nn.Linear(2 * dim, dim)
#         self.fc3 = torch.nn.Linear(2 * dim, out)
#         self.dp = torch.nn.Dropout(drop)

#     def forward(self, x):
#         x = self.emb(x)
#         pool_x = self.pool(x)
        
#         fc_x = self.fc1(pool_x)
#         fc_x = torch.relu(fc_x)
#         fc_x = self.fc2(fc_x)
#         fc_x = torch.relu(fc_x)
#         concat_x = torch.cat([pool_x, fc_x], axis=1)
        
#         x = self.fc3(concat_x)
#         x = torch.softmax(x, dim=1)
#         return x

In [28]:
class Net(torch.nn.Module, TorchTrainable):
    def __init__(self, vocab_size, inp, dim, out, drop=0.2):
        super().__init__()
        self.pool = torch.nn.AvgPool1d(2)
        self.fc1 = torch.nn.Linear(inp // 2, 2 * dim)
        self.fc2 = torch.nn.Linear(2 * dim, dim)
        self.fc3 = torch.nn.Linear(dim + inp // 2, out)
        self.dp = torch.nn.Dropout(drop)

    def forward(self, x):
        x_pool = self.pool(x)
        x_fc = self.fc1(x_pool)
        x_fc = torch.relu(x_fc)
        x_fc = self.fc2(x_fc)
        x_fc = torch.relu(x_fc)
        concat_x = torch.cat([x_pool, x_fc], axis=1)
        
        x = self.fc3(concat_x)
        x = torch.softmax(x, dim=1)
        return x

In [16]:
# class Net(torch.nn.Module, TorchTrainable):
#     def __init__(self, vocab_size, dim, out, drop=0.2, layers=2, avg=torch.mean):
#         super().__init__()
#         self.emb = torch.nn.Embedding(vocab_size, 2 * dim)        
#         self.lstm = torch.nn.LSTM(2 * dim, dim, num_layers=layers, batch_first=True, bidirectional=True, dropout=drop)
#         # self.gru = torch.nn.GRU(2 * dim, dim, num_layers=layers, batch_first=True, bidirectional=True, dropout=drop)
#         self.avg = avg
#         self.linear = torch.nn.Linear(2 * dim, out)
#         self.dp = torch.nn.Dropout(drop)

#     def forward(self, x):
#         x = self.emb(x)
#         x = self.dp(x)
#         x, ht = self.lstm(x)
#         # x, ht = self.gru(x)
#         # x = x[:, -1, :] if self.avg is None else self.avg(x, dim=1)
#         x = self.linear(x)
#         x = torch.softmax(x, dim=0)
#         return x

In [29]:
# train_x, valid_x, train_y, valid_y = train_test_split(data['word_idx'].values, labels, test_size=0.2, shuffle=False)
train_x, valid_x, train_y, valid_y = train_test_split(cnvec.toarray(), labels, test_size=0.2, shuffle=False)
# train_x, valid_x, train_y, valid_y = train_test_split(embs, labels, test_size=0.2, shuffle=False)

# VOCAB_SIZE = 30000
BATCH_SIZE = 256

train_dataset = TaggerDataset(train_x, train_y, dtype=torch.float)
valid_dataset = TaggerDataset(valid_x, valid_y, dtype=torch.float)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [30]:
# fit
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Selected device: {device}')
# net = Net(VOCAB_SIZE, dim=512, out=enc.classes_.size, drop=0.1).to(device)
net = Net(VOCAB_SIZE, inp=2500, dim=512, out=enc.classes_.size, drop=0.1).to(device)
# net = Net(VOCAB_SIZE, dim=512, out=enc.classes_.size, drop=0.1, layers=3).to(device)

optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

net.fit(train_loader, optimizer, criterion, epochs=3, device=device)

Selected device: cuda


Epoch 1/3: 100%|██████████| 812/812 [00:20<00:00, 40.28it/s, cumulative loss per item=0.00415]
Epoch 2/3: 100%|██████████| 812/812 [00:08<00:00, 96.19it/s, cumulative loss per item=0.00414]
Epoch 3/3: 100%|██████████| 812/812 [00:12<00:00, 65.68it/s, cumulative loss per item=0.00414]


Done.





In [31]:
# predict
predicts = net.predict(valid_loader)
lb_pred = predicts.argmax(axis=1)
print(classification_report(valid_y.argmax(axis=1), lb_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       271
           1       0.00      0.00      0.00       120
           2       0.00      0.00      0.00        13
           3       0.00      0.00      0.00       327
           4       0.98      1.00      0.99     50900
           5       0.00      0.00      0.00       318

    accuracy                           0.98     51949
   macro avg       0.16      0.17      0.16     51949
weighted avg       0.96      0.98      0.97     51949



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
#