In [19]:
%load_ext autoreload
%autoreload 2

from sys import intern

from stanford_postagger import StanfordPOSTagger
from syntactic_parser_source import *
from tqdm import tqdm_notebook as tqdm

to_exclude = ['read_conll', 'train', 'main']
for name in to_exclude:
    del globals()[name]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
def read_conll(loc):
    for sent_str in open(loc, mode='r', encoding='utf-8').read().strip().split('\n\n'):
        lines = [line.split() for line in sent_str.split('\n')]
        words = DefaultList(''); tags = DefaultList('')
        heads = [None]; labels = [None]
        
        for i, (_, word, _, pos, _, _, head, label, _, _) in enumerate(lines):
            words.append(intern(word))
            #words.append(intern(normalize(word)))
            tags.append(intern(pos))
            heads.append(int(head) + 1 if head != '-1' else len(lines) + 1)
            labels.append(label)
        pad_tokens(words); pad_tokens(tags)
        yield words, tags, heads, labels

In [21]:
def train(parser, sentences, nr_iter):
    parser.tagger.start_training(sentences)
    for itn in range(nr_iter):
        corr = 0; total = 0
        random.shuffle(sentences)
        
        t = tqdm(total=len(sentences))        
        for words, gold_tags, gold_parse, gold_label in sentences:
            corr += parser.train_one(itn, words, gold_tags, gold_parse)
            if itn < 5:
                parser.tagger.train_one(words, gold_tags)
            total += len(words)
            t.update(1)
        print(itn, '%.3f' % (float(corr) / float(total)))
        if itn == 4:
            parser.tagger.model.average_weights()
    print('Averaging weights')
    parser.model.average_weights()

In [22]:
def main(model_dir, train_loc, heldout_gold, nr_iter=15):
    parser = Parser(load=False)
    sentences = list(read_conll(train_loc))
    train(parser, sentences, nr_iter=15)
    parser.save()
    
    c = 0
    t = 0
    gold_sentences = list(read_conll(heldout_gold))
    t1 = time.time()
    
    for words, tags, gold_heads, gold_labels in gold_sentences:
        _, heads = parser.parse(words)
        for i, w in list(enumerate(words))[1:-1]:
            if gold_labels[i] in ('P', 'punct'):
                continue
            if heads[i] == gold_heads[i]:
                c += 1
            t += 1
            
    t2 = time.time()
    print('Parsing took %0.3f ms' % ((t2-t1)*1000.0))
    print(c, t, float(c)/t)

In [23]:
model_dir = path.join("tmp", "dep_parser_src.pkl")
train_loc = path.join("datasets", "UD_English-EWT", "en_ewt-ud-train.conll")
heldout_gold = path.join("datasets", "UD_English-EWT", "en_ewt-ud-test.conll")
main(model_dir, train_loc, heldout_gold, 15)

A Jupyter Widget

0 0.117


A Jupyter Widget

1 0.116


A Jupyter Widget

2 0.116


A Jupyter Widget

3 0.116


A Jupyter Widget

4 0.116


A Jupyter Widget

5 0.115


A Jupyter Widget

6 0.115


A Jupyter Widget

7 0.115


A Jupyter Widget

8 0.115


A Jupyter Widget

9 0.116


A Jupyter Widget

10 0.116


A Jupyter Widget

11 0.115


A Jupyter Widget

12 0.115


A Jupyter Widget

13 0.116


A Jupyter Widget

14 0.116
Averaging weights
Saving model to D:\Programming\question-answer-generation\NER\parser.pickle
Parsing took 4966.760 ms
1930 22028 0.08761576175776285
