In [50]:
from formal import *
from alg import * 
from earley import *
from time import time
import libitg
import numpy as np
import sys
import dill as pickle

from utils import read_lexicon, read_corpus, reduce_corpus, unk
from features import simple_features


def test(lexicon, src_str, tgt_str, verbose=False):
    # if verbose: print("{} - {}".format(src_str, tgt_str))
        
    # Make a source CFG using the whole lexicon
    src_cfg = libitg.make_source_side_finite_itg(lexicon)

    # Make a source FSA
    src_fsa = libitg.make_fsa(src_str)

    # Make a target FSA
    tgt_fsa = libitg.make_fsa(tgt_str)

    # Intersect source FSA and source CFG
    _Dx = libitg.earley(src_cfg, src_fsa, 
            start_symbol=Nonterminal('S'), 
            sprime_symbol=Nonterminal("D(x)"),
            clean=True)  # to illustrate the difference between clean and dirty forests I will disable clean here

    # projection over target vocabulary - D(x) is now finite
    Dx_clean = libitg.make_target_side_finite_itg(_Dx, lexicon)
    
    Dxy_clean = libitg.earley(Dx_clean, tgt_fsa,
            start_symbol=Nonterminal("D(x)"), 
            sprime_symbol=Nonterminal('D(x,y)'),
            clean=True)
    
    # pickle the data
    if len(Dxy_clean) > 0:       
        # print('D(x) (cleaned): %d rules in %.4f secs' % (len(Dx_clean), times['D(x)']))
        # print('D(x,y) (cleaned): %d rules in %.4f secs ' % (len(Dxy_clean), times['D(x,y)']))
        # if verbose: print('D(x): %d rules, D(x,y): %d rules \n' % (len(Dx_clean), len(Dxy_clean)))
        # with open('pickle-test', 'wb') as f:
        #     pickle.dump(Dxy_clean, f)
        # TODO: Save to file
        return True
        
    else:
        # if verbose: print ('Empty D(x,y) \n')
        return False    

In [43]:
# Load the lexicon from 'data/lexicon' (called with top=10) and show a short excerpt.
lexicon, prob = read_lexicon('data/lexicon', top=10)

print('LEXICON (excerpt)')
limit = 10  # number of lexicon entries to display
# Pairing with a bounded range stops the listing after `limit` entries
# without an explicit counter and break.
for counter, (src_word, tgt_words) in zip(range(1, limit + 1), lexicon.items()):
    print('%s: %s' % (src_word, tgt_words))
# The '-EPS-' entry is always shown, whether or not it was in the excerpt.
print('-EPS-: %s' % lexicon['-EPS-'])
print()

LEXICON (excerpt)
能: {'able', 'will', 'me', 'could', '-', 'would', 'get', 'may', 'possible', 'can'}
折: {'discounted', 'marked', 'wouldn', '-', 'percent', 'impossible', 'cut-price', 'lowered', 'reduced', 'gimme'}
我: {'me', 'i', '-', "'d", '.', "'m", 'to', 'like', 'a', 'my'}
段: {'short', 'longer', 'lately', '-', 'while', 'things', 'tricks', 'been', 'period', 'park'}
或者: {'articles', 'hmmm', 'bump', 'mmm-hmm', '-', 'contacts', 'ahold', 'liquors', 'or', 'lenny'}
椭圆: {'-', 'standing', 'an', 'oval', 'one', 'counter', 'be', 'next', 'right', 'rent-a-car'}
明早: {'o', 'a.m.', '-', 'morning', 'eight', 'wake-up', 'tomorrow', 'wake', "'clock", 'sets'}
大约: {'-', 'ten-minute', 'about', 'approximately', 'national', 'takes', 'holidays', 'around', 'or', 'so'}
一一一四: {'will', 'yoshida', '-', 'of', 'eleven', 'hand', 'room', 'fourteen', 'ms.', 'note'}
药片: {'-', 'tablet', 'indigestion', 'whole', 'cough', 'swallow', 'some', 'drops', 'tablets', 'must'}
-EPS-: {'me', 'here', 'could', 'how', "'d", "'ll", 'does', 

In [38]:
# Read the zh-en training corpus; the parsing loop further down unpacks each
# item as a (source sentence, target sentence) pair of strings.
corpus = read_corpus('data/training.zh-en')

In [51]:
# Try to parse the first training pairs, reporting progress every 10 instances.
limit = 20       # index of the last instance processed (indices 0..limit inclusive)
parse_count = 0  # pairs for which test() returned True so far
last_count = 0   # parse_count as of the previous progress report

# times[k] first stores the start timestamp of the batch that ends at index k;
# when that batch is reported the entry is overwritten with its elapsed seconds.
times = {10: time()}
for i, (s_zh, s_en) in enumerate(corpus):
    if i > 0 and i % 10 == 0:
        times[i] = time() - times[i]
        print('\n%i parsed in %.4f secs - %i additional parses\n'
              % (i, times[i], parse_count - last_count))
        times[i + 10] = time()
        last_count = parse_count

    print('TRAINING INSTANCE %i: |x|=%d |y|=%d'
          % (i, len(s_zh.split()), len(s_en.split())))
    parse = test(lexicon, s_zh, s_en, verbose=False)

    if parse is True:
        parse_count += 1
        print("Parse successful")

    # The check runs after the instance is processed, so index `limit` itself is included.
    if i == limit:
        break

print('Total of %i pairs parsed' % (parse_count))

TRAINING INSTANCE 0: |x|=4 |y|=7
TRAINING INSTANCE 1: |x|=8 |y|=7
TRAINING INSTANCE 2: |x|=11 |y|=13
TRAINING INSTANCE 3: |x|=4 |y|=5
Parse successful
TRAINING INSTANCE 4: |x|=13 |y|=12
TRAINING INSTANCE 5: |x|=5 |y|=5
TRAINING INSTANCE 6: |x|=7 |y|=5
TRAINING INSTANCE 7: |x|=8 |y|=10
TRAINING INSTANCE 8: |x|=10 |y|=13
Parse successful
TRAINING INSTANCE 9: |x|=7 |y|=6

10 parsed in 28.9333 secs - 2 additional parses

TRAINING INSTANCE 10: |x|=20 |y|=13
TRAINING INSTANCE 11: |x|=4 |y|=4
Parse successful
TRAINING INSTANCE 12: |x|=6 |y|=6
TRAINING INSTANCE 13: |x|=8 |y|=7
TRAINING INSTANCE 14: |x|=13 |y|=10
TRAINING INSTANCE 15: |x|=5 |y|=7
TRAINING INSTANCE 16: |x|=9 |y|=8
TRAINING INSTANCE 17: |x|=9 |y|=9
TRAINING INSTANCE 18: |x|=2 |y|=3
Parse successful
TRAINING INSTANCE 19: |x|=13 |y|=14

20 parsed in 32.8524 secs - 2 additional parses

TRAINING INSTANCE 20: |x|=6 |y|=7
Parse successful
Total of 5 pairs parsed


In [53]:
# Reload the object stored in 'pickle-test' and report how many items it holds.
# NOTE(review): nothing visible in this notebook currently writes this file (the
# pickle.dump in test() is commented out), so it must already exist on disk.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
with open('pickle-test', 'rb') as pickle_file:
    Dloaded = pickle.load(pickle_file)

# One call emits the count line followed by the blank line.
print(len(Dloaded), 'loaded', end='\n\n')

111 loaded



In [54]:
# Print every item of the loaded object, one per line.
# NOTE(review): presumably these are the rules of a D(x,y) forest pickled by an
# earlier version of test() - confirm against whatever wrote 'pickle-test'.
for d in Dloaded:
    print(d)

[D(x)]:0-7 ||| [S]:0-6:0-7
[S]:0-6:0-7 ||| [X]:0-6:0-7
[X]:0-6:0-7 ||| [X]:5-6:0-1 [X]:0-5:1-7
[X]:0-6:0-7 ||| [X]:0-3:0-4 [X]:3-6:4-7
[X]:0-6:0-7 ||| [X]:0-4:0-5 [X]:4-6:5-7
[X]:0-6:0-7 ||| [X]:0-1:0-1 [X]:1-6:1-7
[X]:0-6:0-7 ||| [X]:0-1:0-2 [X]:1-6:2-7
[X]:0-6:0-7 ||| [X]:0-5:0-6 [X]:5-6:6-7
[X]:0-6:0-7 ||| [X]:0-2:0-2 [X]:2-6:2-7
[X]:0-6:0-7 ||| [X]:0-2:0-3 [X]:2-6:3-7
[X]:0-6:0-7 ||| [X]:5-6:0-2 [X]:0-5:2-7
[X]:0-6:0-7 ||| [X]:1-6:0-6 [X]:0-1:6-7
[X]:5-6:0-1 ||| [T]:5-6:0-1
[X]:5-6:0-2 ||| [T]:5-6:0-1 [I]:6-6:1-2
[X]:5-6:0-2 ||| [T]:5-6:0-1 [I]:5-5:1-2
[X]:0-5:1-7 ||| [X]:1-5:1-6 [X]:0-1:6-7
[X]:0-3:0-4 ||| [X]:0-1:0-1 [X]:1-3:1-4
[X]:0-3:0-4 ||| [X]:0-2:0-2 [X]:2-3:2-4
[X]:0-3:0-4 ||| [X]:0-1:0-2 [X]:1-3:2-4
[X]:0-3:0-4 ||| [X]:0-2:0-3 [X]:2-3:3-4
[X]:3-6:4-7 ||| [X]:3-5:4-6 [X]:5-6:6-7
[X]:3-6:4-7 ||| [X]:3-4:4-5 [X]:4-6:5-7
[X]:0-4:0-5 ||| [X]:0-1:0-2 [X]:1-4:2-5
[X]:0-4:0-5 ||| [X]:0-3:0-4 [X]:3-4:4-5
[X]:0-4:0-5 ||| [X]:0-2:0-2 [X]:2-4:2-5
[X]:0-4:0-5 ||| [X]:0-2:0-3 [X]:2-4:3