# Тестирование TreeTagger

Если TreeTagger не установлен, прогоните следующую ячейку (скрипт установки в папке с заданием):

In [None]:
!sh get_treetagger.sh

In [2]:
!echo 'Я хочу съесть яблоко!' | cmd/tree-tagger-russian 

	reading parameters ...
	tagging ...
Я	P-1-snn	я
хочу	Vmip1s-a-e	хотеть
съесть	Vmn----a-p	съесть
яблоко	Ncnsan	яблоко
!	SENT	!
	 finished.


In [1]:
from lxml import etree
from collections import defaultdict
from tqdm import tqdm

open_corpora = etree.fromstring(open('annot.opcorpora.no_ambig_strict.xml', 'rb').read())

Итого у нас 10590 предложений. В таком случае разделим выборку на 10 частей: в каждой итерации будет 1059 примеров на тестовой выборке и 9531 предложение в тренировочной. Создадим массив кортежей (форматированое предложение для обучение, стандартное предложение для теста, верный грамматический разбор)

In [54]:
vocab = defaultdict(set)
tags = set()

corpus = []

for sentence in open_corpora.xpath('//tokens'):
    formatted_sent = ""
    standart_sent = []
    gram_infos = []
    length = len(sentence.xpath('token'))
    ended = False
    for i,token in enumerate(sentence.xpath('token')):
        word = token.xpath('@text')
        gram_info = token.xpath('tfr/v/l/g/@v')
        if (i + 1) == length and gram_info[0] == 'PNCT':
            gram_info = ['SENT']
            ended = True
        formatted_sent += word[0] + '\t' + ','.join(gram_info) + '\n'
        standart_sent.append(word[0])
        lemma = token.xpath('tfr/v/l/@t')[0]
        vocab[word[0].lower()].add((','.join(gram_info), lemma.lower()))
        tags.add(','.join(gram_info))
        gram_infos.append((word[0], gram_info))
    if not ended:
        formatted_sent += '.\tSENT\n'
    standart_sent = " ".join(standart_sent)
    corpus.append((formatted_sent, standart_sent, gram_infos))

print(len(corpus))
for i in range(2):
    print(corpus[i])

10590
('«\tPNCT\nШкола\tNOUN,inan,femn,sing,nomn\nзлословия\tNOUN,inan,neut,sing,gent\n»\tPNCT\nучит\tVERB,impf,tran,sing,3per,pres,indc\nприкусить\tINFN,perf,tran\nязык\tNOUN,inan,masc,sing,accs\n.\tSENT\n', '« Школа злословия » учит прикусить язык', [('«', ['PNCT']), ('Школа', ['NOUN', 'inan', 'femn', 'sing', 'nomn']), ('злословия', ['NOUN', 'inan', 'neut', 'sing', 'gent']), ('»', ['PNCT']), ('учит', ['VERB', 'impf', 'tran', 'sing', '3per', 'pres', 'indc']), ('прикусить', ['INFN', 'perf', 'tran']), ('язык', ['NOUN', 'inan', 'masc', 'sing', 'accs'])])
('Сохранится\tVERB,perf,intr,sing,3per,futr,indc\nли\tPRCL\nградус\tNOUN,inan,masc,sing,nomn\nдискуссии\tNOUN,inan,femn,sing,gent\nв\tPREP\nновом\tADJF,Qual,masc,sing,loct\nсезоне\tNOUN,inan,masc,sing,loct\n?\tSENT\n', 'Сохранится ли градус дискуссии в новом сезоне ?', [('Сохранится', ['VERB', 'perf', 'intr', 'sing', '3per', 'futr', 'indc']), ('ли', ['PRCL']), ('градус', ['NOUN', 'inan', 'masc', 'sing', 'nomn']), ('дискуссии', ['NOUN', '

In [55]:
f = open('lexicon.txt', 'w')

for word in vocab:
    f.write(word + '\t')
    f.write('\t'.join([' '.join(pair) for pair in vocab[word]]))
    f.write('\n')
f.close()

f = open('open_class.txt', 'w')

f.write('\n'.join([tag for tag in tags if 'NOUN' in tag or 'VERB' in tag or 'ADJF' in tag]))
f.close()

In [56]:
cross_val_folds = []

fold = []

for i in range(len(corpus)):
    fold.append(corpus[i])
    if i % (len(corpus) / 10) == 0:
        cross_val_folds.append(fold)
        fold = []
        
print(len(cross_val_folds))

10


Функция для парсинга выхода модели: возвращает массив предсказанных частей речи.

In [76]:
import numpy as np

def get_predicted_pos(filename):
    with open(filename, 'r') as fd:
        content = fd.read()
    content = content.split("\n")
    pos = []
    for word in content:
        if len(word) == 0:
            pass
        else:
            pos.append(word.split(',')[0])
    return pos

def get_true_pos(cross_val_fold):
    pos = []
    for sent in cross_val_fold:
        for word in sent[2]:
            pos.append((word[0], word[1][0]))
    return pos

Функция для определения ошибки: сравнивает предсказанный результат с истинным и возвращает массив из нулей и единиц.

In [80]:
import sys
from collections import Counter

mistakes = Counter()

def if_mistake(output_filename, cross_val_fold):
    fold_mistake = Counter()
    score = []
    predicted = get_predicted_pos(output_filename)
    true = get_true_pos(cross_val_fold)
    for pos in range(len(predicted)):
        try:
            if predicted[pos] == true[pos][1]:
                score.append(1)
            else:
                global mistakes
                mistakes.update([(true[pos][0], true[pos][1], predicted[pos])])
                fold_mistake.update([(true[pos][0], true[pos][1], predicted[pos])])
                score.append(0)
        except IndexError:
            sys.exit(0)
    return (score, fold_mistake)

In [81]:
mistake_score = []

for i in tqdm(range(len(cross_val_folds))):
    !rm corpus_train.txt
    !rm corpus_test.txt
    fd_train = open('corpus_train.txt', 'w')
    fd_test = open('corpus_test.txt', 'w')
    j = len(cross_val_folds) - 1
    while j != -1:
        if j == i:
            for sent in cross_val_folds[i]:
                fd_test.write('\n' + '\n'.join(sent[1].split()))
        else:
            for sent in cross_val_folds[j]:
                fd_train.write(sent[0])
        j -= 1
    !./bin/train-tree-tagger lexicon.txt open_class.txt corpus_train.txt model_oc
    !./bin/tree-tagger model_oc corpus_test.txt output.txt
    mistake_score.append(if_mistake("output.txt", cross_val_folds[i]))

  0%|          | 0/10 [00:00<?, ?it/s]


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
53000	making affix tree ...
prefix lexicon: 807 nodes
suffix lexicon: 1796 nodes
	reading classes ...
	making ngram table ...
65627	32360
finished.
	making decision tree ...
86	saving parameters ...

Number of nodes: 87
Max. path length: 15

done.
	reading parameters ...
	tagging ...
	 finished.


 10%|█         | 1/10 [00:03<00:33,  3.73s/it]


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
46000	making affix tree ...
prefix lexicon: 807 nodes
suffix lexicon: 1799 nodes
	reading classes ...
	making ngram table ...
57000	28980
finished.
	making decision tree ...
86	saving parameters ...

Number of nodes: 87
Max. path length: 15

done.
	reading parameters ...
	tagging ...
7000	 finished.


 20%|██        | 2/10 [00:08<00:31,  3.92s/it]


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
46000	making affix tree ...
prefix lexicon: 807 nodes
suffix lexicon: 1796 nodes
	reading classes ...
	making ngram table ...
57383	28907
finished.
	making decision tree ...
90	saving parameters ...

Number of nodes: 91
Max. path length: 16

done.
	reading parameters ...
	tagging ...
7000	 finished.


 30%|███       | 3/10 [00:11<00:26,  3.85s/it]


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
46000	making affix tree ...
prefix lexicon: 807 nodes
suffix lexicon: 1796 nodes
	reading classes ...
	making ngram table ...
57461	28792
finished.
	making decision tree ...
86	saving parameters ...

Number of nodes: 87
Max. path length: 15

done.
	reading parameters ...
	tagging ...
7000	 finished.


 40%|████      | 4/10 [00:16<00:24,  4.09s/it]


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
46000	making affix tree ...
prefix lexicon: 807 nodes
suffix lexicon: 1793 nodes
	reading classes ...
	making ngram table ...
57290	28718
finished.
	making decision tree ...
94	saving parameters ...

Number of nodes: 95
Max. path length: 16

done.
	reading parameters ...
	tagging ...
7000	 finished.


 50%|█████     | 5/10 [00:21<00:22,  4.50s/it]


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
46000	making affix tree ...
prefix lexicon: 807 nodes
suffix lexicon: 1797 nodes
	reading classes ...
	making ngram table ...
57574	29328
finished.
	making decision tree ...
88	saving parameters ...

Number of nodes: 89
Max. path length: 14

done.
	reading parameters ...
	tagging ...
7000	 finished.


 60%|██████    | 6/10 [00:25<00:17,  4.32s/it]


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
47000	making affix tree ...
prefix lexicon: 807 nodes
suffix lexicon: 1795 nodes
	reading classes ...
	making ngram table ...
58850	30259
finished.
	making decision tree ...
70	saving parameters ...

Number of nodes: 71
Max. path length: 15

done.
	reading parameters ...
	tagging ...
6000	 finished.


 70%|███████   | 7/10 [00:29<00:12,  4.04s/it]


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
48000	making affix tree ...
prefix lexicon: 806 nodes
suffix lexicon: 1795 nodes
	reading classes ...
	making ngram table ...
59098	30045
finished.
	making decision tree ...
90	saving parameters ...

Number of nodes: 91
Max. path length: 15

done.
	reading parameters ...
	tagging ...
5000	 finished.


 80%|████████  | 8/10 [00:33<00:08,  4.02s/it]


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
49000	making affix tree ...
prefix lexicon: 807 nodes
suffix lexicon: 1798 nodes
	reading classes ...
	making ngram table ...
60215	30625
finished.
	making decision tree ...
82	saving parameters ...

Number of nodes: 83
Max. path length: 14

done.
	reading parameters ...
	tagging ...
4000	 finished.


 90%|█████████ | 9/10 [00:38<00:04,  4.30s/it]


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
48000	making affix tree ...
prefix lexicon: 807 nodes
suffix lexicon: 1799 nodes
	reading classes ...
	making ngram table ...
59143	29956
finished.
	making decision tree ...
92	saving parameters ...

Number of nodes: 93
Max. path length: 18

done.
	reading parameters ...
	tagging ...
5000	 finished.


100%|██████████| 10/10 [00:43<00:00,  4.51s/it]


Посмотрим на последнем фолде, адекватно ли вообще выглядит выход. Вроде более-менее.

In [89]:
demo = open('demo.txt', 'w')

print(cross_val_folds[len(cross_val_folds) - 1][0][1] + "\n")
demo.write('\n' + '\n'.join(cross_val_folds[len(cross_val_folds) - 1][0][1].split()))
    
!./bin/tree-tagger model_oc demo.txt output.txt

!cat output.txt

— Не смей ругать мою землю .

	reading parameters ...
	tagging ...
	 finished.
PNCT
NOUN,inan,masc,sing,nomn
VERB,impf,intr,sing,impr,excl
INFN,impf,tran
ADJF,Apro,femn,sing,accs
NOUN,inan,femn,sing,accs
SENT


Усередним ошибку по каждому фолду и посмотрим на общую ошибку. Вроде ничего так.

In [87]:
folds_result = []

for i in range(1, len(mistake_score)):
    fold_result = np.mean(mistake_score[i][0])
    print("fold #", i, "\t score: ", fold_result)
    folds_result.append(fold_result)
    
print("\nTotal score: ", np.mean(folds_result[1:]))
print("Std score: ", np.std(folds_result[1:]))

fold # 1 	 score:  0.8967030211859095
fold # 2 	 score:  0.9005208333333333
fold # 3 	 score:  0.9017787078210829
fold # 4 	 score:  0.8982542991141219
fold # 5 	 score:  0.8901786878831355
fold # 6 	 score:  0.9148299748110831
fold # 7 	 score:  0.8904109589041096
fold # 8 	 score:  0.8724179829890644
fold # 9 	 score:  0.8854296388542964

Total score:  0.8942276354637784
Std score:  0.011839130678523721


В общем картина по ошибкам следующая: проблемы с определением, последний ли знак препинания в предложении (?) и почему-то местоимения и предлоги - это существительные, а не что-то еще.

In [79]:
mistakes.most_common(10)

[(('В', 'PREP', 'NOUN'), 413),
 (('.', 'PNCT', 'SENT'), 324),
 (('Я', 'NPRO', 'NOUN'), 174),
 ((',', 'SENT', 'PNCT'), 124),
 (('Не', 'PRCL', 'NOUN'), 109),
 (('На', 'PREP', 'NOUN'), 93),
 ((':', 'PNCT', 'SENT'), 91),
 (('Он', 'NPRO', 'NOUN'), 85),
 (('»', 'SENT', 'PNCT'), 78),
 (('С', 'PREP', 'NOUN'), 71)]

В среднем по фолдам ситуация одинакова, не очень понятно, почему так. Существуют ли вообще теги для местоимений и, союзов и предлогов?

In [86]:
for i in range(1, len(mistake_score)):
    print("fold #", i)
    for el in mistake_score[i][1].most_common(5):
        print(el)
    print("\n")

fold # 1
(('.', 'PNCT', 'SENT'), 61)
(('В', 'PREP', 'NOUN'), 43)
((':', 'PNCT', 'SENT'), 23)
(('»', 'SENT', 'PNCT'), 16)
(('Не', 'PRCL', 'NOUN'), 16)


fold # 2
(('В', 'PREP', 'NOUN'), 68)
(('Я', 'NPRO', 'NOUN'), 15)
(('И', 'CONJ', 'NOUN'), 14)
(('.', 'PNCT', 'SENT'), 14)
(('Мы', 'NPRO', 'NOUN'), 14)


fold # 3
(('В', 'PREP', 'NOUN'), 47)
(('.', 'PNCT', 'SENT'), 39)
(('Я', 'NPRO', 'NOUN'), 24)
(('На', 'PREP', 'NOUN'), 17)
(('Мы', 'NPRO', 'NOUN'), 15)


fold # 4
(('В', 'PREP', 'NOUN'), 60)
(('Я', 'NPRO', 'NOUN'), 30)
(('Мы', 'NPRO', 'NOUN'), 13)
((',', 'SENT', 'PNCT'), 13)
(('Не', 'PRCL', 'NOUN'), 13)


fold # 5
(('.', 'PNCT', 'SENT'), 65)
(('В', 'PREP', 'NOUN'), 43)
(('Я', 'NPRO', 'NOUN'), 27)
((')', 'SENT', 'PNCT'), 24)
((',', 'SENT', 'PNCT'), 15)


fold # 6
(('.', 'PNCT', 'SENT'), 40)
(('В', 'PREP', 'NOUN'), 33)
(('.', 'SENT', 'PNCT'), 31)
((',', 'SENT', 'PNCT'), 19)
(('Российской', 'ADJF', 'NOUN'), 16)


fold # 7
((',', 'SENT', 'PNCT'), 38)
(('.', 'PNCT', 'SENT'), 38)
(('В', 'PREP',

In [99]:
from pymorphy2 import MorphAnalyzer

morph = MorphAnalyzer()

morph.tag("слово")[0].POS

'NOUN'

In [119]:
pm2_mistakes = Counter()
score = []

for el in cross_val_folds[9]:
    sent = el[2]
    for word in sent:
        if word[1][0] == 'PNCT' or word[1][0] == 'SENT':
            pass
        # пропустим определение знаков препинания
        else:
            analysis = morph.tag(word[0])[0].POS
            if analysis == word[1][0]:
                score.append(1)
            else:
                score.append(0)
                pm2_mistakes.update([(word[0], analysis, word[1][0])])

In [120]:
print("Pymorphy score: ", np.mean(score))

Pymorphy score:  0.8912466843501327


In [121]:
pm2_mistakes.most_common(5)

[(('1', None, 'NUMB'), 12),
 (('также', 'CONJ', 'PRCL'), 9),
 (('2', None, 'NUMB'), 9),
 (('a', None, 'LATN'), 8),
 (('3', None, 'NUMB'), 8)]

In [127]:
morph.parse('1')[0].tag

OpencorporaTag('NUMB,intg')

Качество у Pymorphy, скорее всего, выше тритеггера - эти штуки, которые посчитались неправильно, на самом деле, опознаны верно, просто часть речи оттуда так просто не достать. Просто у такого рода входа не парсится часть речи как отдельный атрибут класса.