# Тестирование TreeTagger

Если TreeTagger не установлен, прогоните следующую ячейку (скрипт установки в папке с заданием):

In [None]:
!sh get_treetagger.sh

In [2]:
!echo 'Я хочу съесть яблоко!' | cmd/tree-tagger-russian 

	reading parameters ...
	tagging ...
Я	P-1-snn	я
хочу	Vmip1s-a-e	хотеть
съесть	Vmn----a-p	съесть
яблоко	Ncnsan	яблоко
!	SENT	!
	 finished.


In [1]:
from lxml import etree
from collections import defaultdict
from tqdm import tqdm

open_corpora = etree.fromstring(open('annot.opcorpora.no_ambig_strict.xml', 'rb').read())

Итого у нас 10590 предложений. В таком случае разделим выборку на 10 частей: в каждой итерации будет 1059 примеров на тестовой выборке и 9531 предложение в тренировочной. Создадим массив кортежей (форматированое предложение для обучение, стандартное предложение для теста, верный грамматический разбор)

In [2]:
vocab = defaultdict(set)
tags = set()

corpus = []

for sentence in open_corpora.xpath('//tokens'):
    formatted_sent = ""
    standart_sent = []
    gram_infos = []
    length = len(sentence.xpath('token'))
    ended = False
    for i,token in enumerate(sentence.xpath('token')):
        word = token.xpath('@text')
        gram_info = token.xpath('tfr/v/l/g/@v')
        if (i + 1) == length and gram_info[0] == 'PNCT':
            gram_info = ['SENT']
            ended = True
        formatted_sent += word[0] + '\t' + ','.join(gram_info) + '\n'
        standart_sent.append(word[0])
        lemma = token.xpath('tfr/v/l/@t')[0]
        vocab[word[0].lower()].add((','.join(gram_info), lemma.lower()))
        tags.add(','.join(gram_info))
        gram_infos.append(gram_info)
    if not ended:
        formatted_sent += '.\tSENT\n'
    standart_sent = " ".join(standart_sent)
    corpus.append((formatted_sent, standart_sent, gram_infos))

print(len(corpus))
for i in range(2):
    print(corpus[i])

10590
('«\tPNCT\nШкола\tNOUN,inan,femn,sing,nomn\nзлословия\tNOUN,inan,neut,sing,gent\n»\tPNCT\nучит\tVERB,impf,tran,sing,3per,pres,indc\nприкусить\tINFN,perf,tran\nязык\tNOUN,inan,masc,sing,accs\n.\tSENT\n', '« Школа злословия » учит прикусить язык', [['PNCT'], ['NOUN', 'inan', 'femn', 'sing', 'nomn'], ['NOUN', 'inan', 'neut', 'sing', 'gent'], ['PNCT'], ['VERB', 'impf', 'tran', 'sing', '3per', 'pres', 'indc'], ['INFN', 'perf', 'tran'], ['NOUN', 'inan', 'masc', 'sing', 'accs']])
('Сохранится\tVERB,perf,intr,sing,3per,futr,indc\nли\tPRCL\nградус\tNOUN,inan,masc,sing,nomn\nдискуссии\tNOUN,inan,femn,sing,gent\nв\tPREP\nновом\tADJF,Qual,masc,sing,loct\nсезоне\tNOUN,inan,masc,sing,loct\n?\tSENT\n', 'Сохранится ли градус дискуссии в новом сезоне ?', [['VERB', 'perf', 'intr', 'sing', '3per', 'futr', 'indc'], ['PRCL'], ['NOUN', 'inan', 'masc', 'sing', 'nomn'], ['NOUN', 'inan', 'femn', 'sing', 'gent'], ['PREP'], ['ADJF', 'Qual', 'masc', 'sing', 'loct'], ['NOUN', 'inan', 'masc', 'sing', 'loct'],

In [3]:
f = open('lexicon.txt', 'w')

for word in vocab:
    f.write(word + '\t')
    f.write('\t'.join([' '.join(pair) for pair in vocab[word]]))
    f.write('\n')
f.close()

f = open('open_class.txt', 'w')

f.write('\n'.join([tag for tag in tags if 'NOUN' in tag or 'VERB' in tag or 'ADJF' in tag]))
f.close()

In [4]:
cross_val_folds = []

fold = []

for i in range(len(corpus)):
    fold.append(corpus[i])
    if i % (len(corpus) / 10) == 0:
        cross_val_folds.append(fold)
        fold = []
        
print(len(cross_val_folds))

10


Функция для парсинга выхода модели: возвращает массив предсказанных частей речи.

In [17]:
import numpy as np

def get_predicted_pos(filename):
    with open(filename, 'r') as fd:
        content = fd.read()
    content = content.split("\n")
    pos = []
    for word in content:
        if len(word) == 0:
            pass
        else:
            pos.append(word.split(',')[0])
    return pos

def get_true_pos(cross_val_fold):
    pos = []
    for sent in cross_val_fold:
        for word in sent[2]:
            pos.append(word[0])
    return pos

Функция для определения ошибки: сравнивает предсказанный результат с истинным и возвращает массив из нулей и единиц.

In [18]:
import sys

def if_mistake(output_filename, cross_val_fold):
    score = []
    predicted = get_predicted_pos(output_filename)
    true = get_true_pos(cross_val_fold)
    for pos in range(len(predicted)):
        try:
            if predicted[pos] == true[pos]:
                score.append(1)
            else:
                score.append(0)
        except IndexError:
            sys.exit(0)
    return score

In [None]:
mistake_score = []

for i in tqdm(range(len(cross_val_folds))):
    !rm corpus_train.txt
    !rm corpus_test.txt
    fd_train = open('corpus_train.txt', 'w')
    fd_test = open('corpus_test.txt', 'w')
    j = len(cross_val_folds) - 1
    while j != -1:
        if j == i:
            for sent in cross_val_folds[i]:
                fd_test.write('\n' + '\n'.join(sent[1].split()))
        else:
            for sent in cross_val_folds[j]:
                fd_train.write(sent[0])
        j -= 1
    !./bin/train-tree-tagger lexicon.txt open_class.txt corpus_train.txt model_oc
    !./bin/tree-tagger model_oc corpus_test.txt output.txt
    mistake_score.append(if_mistake("output.txt", cross_val_folds[i]))

  0%|          | 0/10 [00:00<?, ?it/s]


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
53000	making affix tree ...
prefix lexicon: 807 nodes
suffix lexicon: 1796 nodes
	reading classes ...
	making ngram table ...
65627	32360
finished.
	making decision tree ...
86	saving parameters ...

Number of nodes: 87
Max. path length: 15

done.
	reading parameters ...
	tagging ...
	 finished.


 10%|█         | 1/10 [00:03<00:35,  3.96s/it]


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
46000	making affix tree ...
prefix lexicon: 807 nodes
suffix lexicon: 1799 nodes
	reading classes ...
	making ngram table ...
57000	28980
finished.
	making decision tree ...
86	saving parameters ...

Number of nodes: 87
Max. path length: 15

done.
	reading parameters ...
	tagging ...
7000	 finished.


 20%|██        | 2/10 [00:08<00:32,  4.09s/it]


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
46000	making affix tree ...
prefix lexicon: 807 nodes
suffix lexicon: 1796 nodes
	reading classes ...
	making ngram table ...
57383	28907
finished.
	making decision tree ...
24

Посмотрим на последнем фолде, адекватно ли вообще выглядит выход. Вроде более-менее.

In [80]:
demo = open('demo.txt', 'w')

print(cross_val_folds[len(cross_val_folds) - 1][0][1] + "\n")
demo.write('\n' + '\n'.join(cross_val_folds[len(cross_val_folds) - 1][0][1].split()))
    
!./bin/tree-tagger model_oc demo.txt output.txt

!cat output.txt

— Не смей ругать мою землю .

	reading parameters ...
	tagging ...
	 finished.
PNCT
NOUN,inan,masc,sing,nomn
VERB,impf,intr,sing,impr,excl
INFN,impf,tran
ADJF,Apro,femn,sing,accs
NOUN,inan,femn,sing,accs
SENT


Усередним ошибку по каждому фолду и посмотрим на общую ошибку.

In [24]:
folds_result = []

for i in range(len(mistake_score)):
    fold_result = np.mean(mistake_score[i])
    print("fold #", i + 1, "\t score: ", fold_result)
    folds_result.append(fold_result)
    
print("\nTotal score: ", np.mean(folds_result))

fold # 1 	 score:  nan
fold # 2 	 score:  0.8967030211859095
fold # 3 	 score:  0.9005208333333333
fold # 4 	 score:  0.9017787078210829
fold # 5 	 score:  0.8982542991141219
fold # 6 	 score:  0.8901786878831355
fold # 7 	 score:  0.9148299748110831
fold # 8 	 score:  0.8904109589041096
fold # 9 	 score:  0.8724179829890644
fold # 10 	 score:  0.8854296388542964

Total score:  nan
