In [1]:
import sys
import os
import numpy as np
from aer import read_naacl_alignments
import aer

# reading training data
training_en = open('./training/hansards.36.2.e').read().splitlines()
training_fr = open('./training/hansards.36.2.f').read().splitlines()

# reading validation data
validation_en = open('./validation/dev.e').read().splitlines()
validation_fr = open('./validation/dev.f').read().splitlines()

#path for validation goldset
path = 'validation/dev.wa.nonullalign'

print(len(training_en), len(training_fr), len(validation_en), len(validation_fr))

sentence_num = len(training_en)
print(sentence_num)

231164 231164 37 37
231164


In [2]:
#take all sents and return a (list)set of vocab 
def vocab_set(list_of_sent):
    all_words =[]
    for x in list_of_sent:
        all_words+=x.split()
    return list(set(all_words))

In [3]:
#take a set of all vocabs and return a dictionary for indexing
#the position of numpy.array (theta) will be based on this indexing
def vocab2dict(vocab_set):
    return {vocab_set[x]:x for x in range(len(vocab_set))} 

In [11]:
#training 
def training(en_data, fr_data, iteration=10):
    #get #total vocab
    vocab_set_fr = vocab_set(fr_data)
    vocab_set_en = vocab_set(en_data)

    fr_vocab_total = len(vocab_set_fr)
    en_vocab_total = len(vocab_set_en)
    print('Fr word total = %d'%fr_vocab_total)
    print('En word total = %d'%en_vocab_total)
    
    #get word index dicts
    fr2index = vocab2dict(vocab_set_fr)
    en2index = vocab2dict(vocab_set_en)
    
    #theta initialization
    uniform_distribution = 1/fr_vocab_total

    #row = en, column=fr
    theta = np.full((en_vocab_total,fr_vocab_total), uniform_distribution, np.float64)    
    
    for iter_num in range(iteration):
        count =np.zeros((en_vocab_total,fr_vocab_total), np.float64)
        count_en = np.zeros(en_vocab_total, np.float64)

        for n in range(len(en_data)):
            en_sent = en_data[n].split()
            fr_sent = fr_data[n].split()
            #print(fr_sent,en_sent)

            for i in range(len(fr_sent)):
                fr_word_index = fr2index[fr_sent[i]]
                Z = 0

                for j in range(len(en_sent)):
                    en_word_index = en2index[en_sent[j]]
                    Z+= theta[en_word_index][fr_word_index]

                for j in range(len(en_sent)):
                    en_word_index = en2index[en_sent[j]]
                    c = theta[en_word_index][fr_word_index]/Z
                    count[en_word_index][fr_word_index] +=c
                    count_en[en_word_index]+=c

        #for each row (of e word), divide the col values (fr words) by total e 
        normalized = count/count_en[:,None]
        theta=normalized
        
        eval(fr2index, en2index, theta)
        
    return 'Done' #theta


In [12]:
training(training_en[:10000], training_fr[:10000])


Fr word total = 12235
En word total = 9659
0.8423040604343721
0.858356940509915
0.8630783758262512
0.8630783758262512
0.8668555240793201
0.8677998111425873
0.8659112370160529
0.8611898016997167
0.8602455146364495
0.858356940509915


'Done'

In [69]:
print(theta[en2index['she']][fr2index['elle']])
print(theta[en2index['he']][fr2index['il']])


0.822982617624
0.69883946096


In [5]:
def eval(fr2index, en2index, theta):
    predictions = []
    for n in range(len(validation_en)):
        fr_sent=validation_fr[n].split()
        en_sent=validation_en[n].split()
        align = set()
        
        for i in range(len(fr_sent)):
            best_p = 0 #en-fr prob
            best_j = 0 #eng position
            if fr2index.get(fr_sent[i], 'empty')!= 'empty':
                fr_ind = fr2index[fr_sent[i]]
            
                for j in range(len(en_sent)):
                    if en2index.get(en_sent[j], 'empty')!= 'empty':
                        en_ind = en2index[en_sent[j]]

                        if theta[en_ind][fr_ind]>best_p:
                            best_p = theta[en_ind][fr_ind]
                            best_j = j
            align.add((best_j, i+1))   
        predictions.append(align)
        
    gold_sets = read_naacl_alignments(path)


    # 3. Compute AER

    # first we get an object that manages sufficient statistics 
    metric = aer.AERSufficientStatistics()
    # then we iterate over the corpus 
    for gold, pred in zip(gold_sets, predictions):
        metric.update(sure=gold[0], probable=gold[1], predicted=pred)
    # AER
    print(metric.aer())


In [109]:
def test(path):
    from random import random
    # 1. Read in gold alignments
    gold_sets = read_naacl_alignments(path)

    # 2. Here you would have the predictions of your own algorithm, 
    #  for the sake of the illustration, I will cheat and make some predictions by corrupting 50% of sure gold alignments
    predictions = []
    for s, p in gold_sets:
        links = set()
        for link in s:
            if random() < 0.1:
                links.add(link)
        predictions.append(links)
    print(predictions)

    # 3. Compute AER

    # first we get an object that manages sufficient statistics 
    metric = aer.AERSufficientStatistics()
    # then we iterate over the corpus 
    for gold, pred in zip(gold_sets, predictions):
        metric.update(sure=gold[0], probable=gold[1], predicted=pred)
    # AER
    print(metric.aer())

In [102]:
test('validation/dev.wa.nonullalign')

[{(9, 10), (20, 24)}, set(), {(16, 16), (22, 23), (15, 15)}, set(), {(4, 12), (20, 25)}, set(), {(15, 2)}, set(), {(12, 13), (8, 9), (3, 3)}, {(4, 6)}, set(), {(16, 23)}, {(9, 8)}, set(), set(), set(), set(), set(), set(), {(13, 12), (12, 11)}, set(), {(16, 14), (15, 13)}, {(3, 4)}, set(), {(14, 12), (15, 13), (1, 1)}, {(14, 10), (16, 13)}, {(4, 3)}, set(), {(9, 11), (1, 1)}, {(24, 30), (17, 29), (10, 16)}, set(), {(14, 16)}, set(), {(16, 15)}, set(), set(), set()]
0.827027027027027
