# Тестирование TreeTagger

Если TreeTagger не установлен, прогоните следующую ячейку (скрипт установки в папке с заданием):

In [None]:
!sh get_treetagger.sh

In [2]:
!echo 'Я хочу съесть яблоко!' | cmd/tree-tagger-russian 

	reading parameters ...
	tagging ...
Я	P-1-snn	я
хочу	Vmip1s-a-e	хотеть
съесть	Vmn----a-p	съесть
яблоко	Ncnsan	яблоко
!	SENT	!
	 finished.


In [64]:
from lxml import etree
from collections import defaultdict
from tqdm import tqdm

open_corpora = etree.fromstring(open('annot.opcorpora.no_ambig_strict.xml', 'rb').read())

Итого у нас 10590 предложений. В таком случае разделим выборку на 10 частей: в каждой итерации будет 1059 примеров на тестовой выборке и 9531 предложение в тренировочной. Создадим массив кортежей (форматированое предложение для обучение, стандартное предложение для теста)

In [57]:
vocab = defaultdict(set)
tags = set()

corpus = []

for sentence in open_corpora.xpath('//tokens'):
    formatted_sent = ""
    standart_sent = []
    length = len(sentence.xpath('token'))
    for i,token in enumerate(sentence.xpath('token')):
        word = token.xpath('@text')
        gram_info = token.xpath('tfr/v/l/g/@v')
        if (i + 1) == length and gram_info[0] == 'PNCT':
            gram_info = ['SENT']
            ended = True
        formatted_sent += word[0] + '\t' + ','.join(gram_info) + '\n'
        standart_sent.append(word[0])
        lemma = token.xpath('tfr/v/l/@t')[0]
        vocab[word[0].lower()].add((','.join(gram_info), lemma.lower()))
        tags.add(','.join(gram_info))
    if not ended:
        formatted_sent += '.\tSENT\n'
    standart_sent = " ".join(standart_sent)
    corpus.append((formatted_sent, standart_sent))

print(len(corpus))
for i in range(2):
    print(corpus[i])

10590
('«\tPNCT\nШкола\tNOUN,inan,femn,sing,nomn\nзлословия\tNOUN,inan,neut,sing,gent\n»\tPNCT\nучит\tVERB,impf,tran,sing,3per,pres,indc\nприкусить\tINFN,perf,tran\nязык\tNOUN,inan,masc,sing,accs\n', '« Школа злословия » учит прикусить язык')
('Сохранится\tVERB,perf,intr,sing,3per,futr,indc\nли\tPRCL\nградус\tNOUN,inan,masc,sing,nomn\nдискуссии\tNOUN,inan,femn,sing,gent\nв\tPREP\nновом\tADJF,Qual,masc,sing,loct\nсезоне\tNOUN,inan,masc,sing,loct\n?\tSENT\n', 'Сохранится ли градус дискуссии в новом сезоне ?')


In [58]:
f = open('lexicon.txt', 'w')

for word in vocab:
    f.write(word + '\t')
    f.write('\t'.join([' '.join(pair) for pair in vocab[word]]))
    f.write('\n')
f.close()

f = open('open_class.txt', 'w')

f.write('\n'.join([tag for tag in tags if 'NOUN' in tag or 'VERB' in tag or 'ADJF' in tag]))
f.close()

In [42]:
cross_val_folds = []

fold = []

for i in range(len(corpus)):
    fold.append(corpus[i])
    if i % (len(corpus) / 10) == 0:
        cross_val_folds.append(fold)
        fold = []
        
print(len(cross_val_folds))

10


In [67]:
for i in tqdm(range(len(cross_val_folds))):
    !rm corpus_train.txt
    !rm corpus_test.txt
    !rm output.txt
    fd_train = open('corpus_train.txt', 'w')
    fd_test = open('corpus_test.txt', 'w')
    j = len(cross_val_folds) - 1
    while j != -1:
        if j == i:
            for sent in cross_val_folds[i]:
                fd_test.write('\n' + '\n'.join(sent[1].split()))
        else:
            for sent in cross_val_folds[j]:
                fd_train.write(sent[0])
        j -= 1
    !./bin/train-tree-tagger lexicon.txt open_class.txt corpus_train.txt model_oc
    !./bin/tree-tagger model_oc corpus_test.txt output.txt


  0%|          | 0/10 [00:00<?, ?it/s][A


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
51000	making affix tree ...
prefix lexicon: 807 nodes
suffix lexicon: 1796 nodes
	reading classes ...
	making ngram table ...
63639	33659
finished.
	making decision tree ...
84	saving parameters ...

Number of nodes: 85
Max. path length: 14

done.
	reading parameters ...
	tagging ...
	 finished.



 10%|█         | 1/10 [00:04<00:38,  4.26s/it][A


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
44000	making affix tree ...
prefix lexicon: 807 nodes
suffix lexicon: 1799 nodes
	reading classes ...
	making ngram table ...
55150	30018
finished.
	making decision tree ...
80	saving parameters ...

Number of nodes: 81
Max. path length: 15

done.
	reading parameters ...
	tagging ...
7000	 finished.



 20%|██        | 2/10 [00:08<00:34,  4.34s/it][A


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
44000	making affix tree ...
prefix lexicon: 807 nodes
suffix lexicon: 1796 nodes
	reading classes ...
	making ngram table ...
55786	30143
finished.
	making decision tree ...
82	saving parameters ...

Number of nodes: 83
Max. path length: 16

done.
	reading parameters ...
	tagging ...
7000	 finished.



 30%|███       | 3/10 [00:12<00:29,  4.27s/it][A


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
44000	making affix tree ...
prefix lexicon: 807 nodes
suffix lexicon: 1796 nodes
	reading classes ...
	making ngram table ...
55427	29913
finished.
	making decision tree ...
84	saving parameters ...

Number of nodes: 85
Max. path length: 14

done.
	reading parameters ...
	tagging ...
7000	 finished.



 40%|████      | 4/10 [00:17<00:25,  4.23s/it][A


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
44000	making affix tree ...
prefix lexicon: 807 nodes
suffix lexicon: 1795 nodes
	reading classes ...
	making ngram table ...
55636	29975
finished.
	making decision tree ...
92	saving parameters ...

Number of nodes: 93
Max. path length: 16

done.
	reading parameters ...
	tagging ...
7000	 finished.



 50%|█████     | 5/10 [00:21<00:20,  4.16s/it][A


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
44000	making affix tree ...
prefix lexicon: 807 nodes
suffix lexicon: 1797 nodes
	reading classes ...
	making ngram table ...
55677	30386
finished.
	making decision tree ...
84	saving parameters ...

Number of nodes: 85
Max. path length: 15

done.
	reading parameters ...
	tagging ...
7000	 finished.



 60%|██████    | 6/10 [00:24<00:16,  4.01s/it][A


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
46000	making affix tree ...
prefix lexicon: 807 nodes
suffix lexicon: 1795 nodes
	reading classes ...
	making ngram table ...
57216	31508
finished.
	making decision tree ...
70	saving parameters ...

Number of nodes: 71
Max. path length: 15

done.
	reading parameters ...
	tagging ...
6000	 finished.



 70%|███████   | 7/10 [00:28<00:11,  3.84s/it][A


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
46000	making affix tree ...
prefix lexicon: 806 nodes
suffix lexicon: 1795 nodes
	reading classes ...
	making ngram table ...
57336	31217
finished.
	making decision tree ...
96	saving parameters ...

Number of nodes: 97
Max. path length: 15

done.
	reading parameters ...
	tagging ...
5000	 finished.



 80%|████████  | 8/10 [00:32<00:08,  4.05s/it][A


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
47000	making affix tree ...
prefix lexicon: 807 nodes
suffix lexicon: 1798 nodes
	reading classes ...
	making ngram table ...
58597	31683
finished.
	making decision tree ...
82	saving parameters ...

Number of nodes: 83
Max. path length: 14

done.
	reading parameters ...
	tagging ...
4000	 finished.



 90%|█████████ | 9/10 [00:37<00:04,  4.17s/it][A


train-tree-tagger -cl 2 -dtg 0.50 -sw 1.00 -ecw 0.15 -atg 1.20 lexicon.txt open_class.txt corpus_train.txt model_oc

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
46000	making affix tree ...
prefix lexicon: 807 nodes
suffix lexicon: 1799 nodes
	reading classes ...
	making ngram table ...
57621	31148
finished.
	making decision tree ...
88	saving parameters ...

Number of nodes: 89
Max. path length: 14

done.
	reading parameters ...
	tagging ...
5000	 finished.



100%|██████████| 10/10 [00:41<00:00,  4.28s/it][A

In [None]:
!./bin/tree-tagger model_oc corpus_test.txt output.txt


