In [45]:
import pathlib
import pyconll
import nltk
import numpy as np
import pandas as pd
from nltk import tag

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, precision_score

from gensim.models import FastText

DATA = pathlib.Path('data') / 'tags'

In [4]:
# load (https://github.com/UniversalDependencies/UD_Russian-SynTagRus)
train_raw = pyconll.iter_from_file(DATA / 'ru_syntagrus-ud-train-a.conllu')
valid_raw = pyconll.iter_from_file(DATA / 'ru_syntagrus-ud-dev.conllu')

# train/valid prepare
# исходные данные содержат ошибки
train = [[(token.form, token.upos) for token in sentence if token.upos] for sentence in train_raw]
valid = [[(token.form, token.upos) for token in sentence if token.upos] for sentence in valid_raw]

# train = [[(token.form, token.upos if token.upos else 'NO_TAG') for token in sentence] for sentence in train_raw]
# valid = [[(token.form, token.upos if token.upos else 'NO_TAG') for token in sentence] for sentence in valid_raw]

In [5]:
# проверка на None
for n, sent in enumerate(train):
    for token in sent:
        if token[0] is None:
            print(f'#{n}: has None token')
        if token[1] is None:
            print(f'#{n}: has None upos')

In [6]:
# проверка на None
for n, sent in enumerate(valid):
    for token in sent:
        if token[0] is None:
            print(f'#{n}: has None token')
        if token[1] is None:
            print(f'#{n}: has None upos')

__default tagging__

In [7]:
metrics = {}
# unigram
tagger = tag.UnigramTagger(train, backoff=tag.DefaultTagger('PUNCT'))
metrics['unigram'] = tagger.precision(valid)     # evaluate is deprecated
metrics['unigram']['accuracy'] = tagger.accuracy(valid)

# bigram
tagger = tag.BigramTagger(train, backoff=tag.DefaultTagger('PUNCT'))
metrics['bigram'] = tagger.precision(valid)
metrics['bigram']['accuracy'] = tagger.accuracy(valid)

# trigram
tagger = tag.TrigramTagger(train, backoff=tag.DefaultTagger('PUNCT'))
metrics['trigram'] = tagger.precision(valid)
metrics['trigram']['accuracy'] = tagger.accuracy(valid)

# combined
def backoff_tagger(sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(sents, backoff=backoff)
    return backoff

tagger = backoff_tagger(train, [tag.UnigramTagger, tag.BigramTagger, tag.TrigramTagger], backoff=tag.DefaultTagger('PUNCT'))
metrics['combo'] = tagger.precision(valid)
metrics['combo']['accuracy'] = tagger.accuracy(valid)

# precision comparison
pd.DataFrame(metrics).T

Unnamed: 0,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,PROPN,PUNCT,SCONJ,SYM,VERB,X,accuracy
unigram,0.949051,0.996358,0.937281,0.822987,0.892897,0.892938,1.0,0.993293,0.887717,0.958795,0.868977,0.958674,0.56875,0.773733,1.0,0.988468,0.777778,0.823466
bigram,0.950179,0.995835,0.948523,0.87083,0.95312,0.900887,1.0,0.996152,0.884233,0.938561,0.876818,0.986184,0.403005,0.823565,1.0,0.976067,0.666667,0.69367
trigram,0.919094,0.997784,0.943699,0.879776,0.945858,0.810033,0.545455,0.995101,0.874656,0.955359,0.880085,0.994275,0.286398,0.839043,1.0,0.944337,0.0,0.505697
combo,0.951696,0.995631,0.942861,0.887384,0.952992,0.877184,1.0,0.993996,0.898977,0.943256,0.890181,0.965021,0.56875,0.84222,1.0,0.979816,0.777778,0.828717


__custom tagger__

In [8]:
# разделение на tokens и tags
tokens = {
    'train': [token[0] for sentence in train for token in sentence],
    'valid': [token[0] for sentence in valid for token in sentence]
}
tags = {
    'train': [token[1] for sentence in train for token in sentence],
    'valid': [token[1] for sentence in valid for token in sentence]
}

# подготовка тренировочного корпуса для FastText
corpus = {
    'train': [[token[0] for token in sent] for sent in train],
    'valid': [[token[0] for token in sent] for sent in valid]
}
corp_tags = {
    'train': [[token[1] for token in sent] for sent in train],
    'valid': [[token[1] for token in sent] for sent in valid]
}

In [9]:
# подготовка FT и получение эмбеддингов слов
ft = FastText(corpus['train'], vector_size=200, window=5, min_count=2)
embeddings = {key: np.array([ft.wv[token] for token in values]) for key, values in tokens.items()}

In [10]:
# кодируем теги
le = LabelEncoder()
lb = {
    'train': le.fit_transform(tags['train']),
    'valid': le.transform(tags['valid'])
}

labels = {
    'train': np.eye(lb['train'].max() + 1)[lb['train']],
    'valid': np.eye(lb['valid'].max() + 1)[lb['valid']],
}

In [11]:
# prepare train/valid vectorized data
vectorizer = CountVectorizer(ngram_range=(1, 3), analyzer='char', max_df=1.0, max_features=300)
# vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='char', max_df=1.0, max_features=300)
matrix = {
    'train': vectorizer.fit_transform(tokens['train']),
    'valid': vectorizer.transform(tokens['valid'])
}

In [12]:
# concat features
features = {
    'train': np.hstack([embeddings['train'], matrix['train'].toarray()]),
    'valid': np.hstack([embeddings['valid'], matrix['valid'].toarray()])
}

In [13]:
# fit LGBM
model = LGBMClassifier(n_estimators=50, num_leaves=23, random_state=17)
model.fit(features['train'], lb['train'])

# evaluate
lb_pred = model.predict(features['valid'])
print(classification_report(lb['valid'], lb_pred))

              precision    recall  f1-score   support

           0       0.81      0.76      0.78     15103
           1       0.95      0.93      0.94     13717
           2       0.74      0.58      0.65      7783
           3       0.67      0.60      0.63      1390
           4       0.77      0.97      0.86      5672
           5       0.73      0.52      0.61      4265
           6       0.00      0.00      0.00        24
           7       0.79      0.88      0.83     36238
           8       0.68      0.64      0.66      1734
           9       0.61      0.64      0.62      5125
          10       0.58      0.77      0.66      7444
          11       0.58      0.38      0.46      5473
          12       0.99      0.99      0.99     29186
          13       0.30      0.20      0.24      2865
          14       0.06      0.03      0.04        62
          15       0.80      0.73      0.76     17110
          16       0.06      0.08      0.07       134

    accuracy              

__NN approach__

In [14]:
import torch
from tqdm import tqdm
from common import TaggerDataset, TorchTrainable

In [41]:
class NetLSTM(torch.nn.Module, TorchTrainable):
    def __init__(self, inp, dim, out, drop=0.2, layers=2):
        super().__init__()
        self.lstm = torch.nn.LSTM(inp, dim, num_layers=layers, batch_first=True, bidirectional=True, dropout=0.2)
        self.linear = torch.nn.Linear(2*dim, out)
        self.dp = torch.nn.Dropout(drop)
        
    def forward(self, x):
        x, ht = self.lstm(x)
        x = self.dp(x)
        x = self.linear(x)
        x = torch.softmax(x, dim=1)
        return x

In [42]:
BATCH_SIZE = 256

train_dataset = TaggerDataset(features['train'], labels['train'], dtype=torch.float)
valid_dataset = TaggerDataset(features['valid'], labels['valid'], dtype=torch.float)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)


device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Selected device: {device}')
net = NetLSTM(inp=500, dim=256, out=le.classes_.size, drop=0.2, layers=2).to(device)

optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()

net.fit(train_loader, optimizer, criterion, epochs=3, device=device)

Selected device: cuda


Epoch 1/3: 100%|██████████| 1663/1663 [01:21<00:00, 20.37it/s, cumulative loss per item=0.00937]
Epoch 2/3: 100%|██████████| 1663/1663 [01:20<00:00, 20.57it/s, cumulative loss per item=0.00889]
Epoch 3/3: 100%|██████████| 1663/1663 [01:20<00:00, 20.61it/s, cumulative loss per item=0.00855]


Done.





In [54]:
# predict
predicts = net.predict(valid_loader)
lb_pred = predicts.argmax(axis=1)
print(classification_report(lb['valid'], lb_pred))

              precision    recall  f1-score   support

           0       0.85      0.77      0.81     15103
           1       1.00      0.94      0.97     13717
           2       0.71      0.79      0.75      7783
           3       0.00      0.00      0.00      1390
           4       0.87      0.99      0.92      5672
           5       0.90      0.60      0.72      4265
           6       0.00      0.00      0.00        24
           7       0.78      0.96      0.86     36238
           8       0.47      0.59      0.52      1734
           9       0.89      0.70      0.79      5125
          10       0.69      0.98      0.81      7444
          11       0.00      0.00      0.00      5473
          12       1.00      1.00      1.00     29186
          13       0.00      0.00      0.00      2865
          14       0.00      0.00      0.00        62
          15       0.81      0.85      0.83     17110
          16       0.00      0.00      0.00       134

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#