In [None]:
# Toy parallel corpus: English sentences (target side) and their Irish
# translations (source side). The two lists are aligned by position.
targetraw = [
    "he broke rocks",
    "she broke rocks",
    "he broke his hand",
    "she broke her hand",
    "he ate rocks",
    "he ate his bread",
    "she ate her bread",
]
sourceraw = [
    "bhris sé clocha",
    "bhris sí clocha",
    "bhris sé a lámh",
    "bhris sí a lámh",
    "d'ith sé clocha",
    "d'ith sé a arán",
    "d'ith sí a harán",
]

In [None]:
# Tokenise every sentence on whitespace: each corpus becomes a list of
# word lists, still aligned sentence-by-sentence.
targetcorpus = list(map(str.split, targetraw))
sourcecorpus = list(map(str.split, sourceraw))

In [None]:
# Vocabularies: every distinct word that occurs in each corpus.
# Stored as sorted lists rather than bare sets: the iteration order of a set
# of strings changes from one interpreter session to the next (string hashes
# are randomised), which would reshuffle the rows/columns of every printed
# table after a kernel restart. Sorting makes the output reproducible.
sourcevocab = sorted({w for s in sourcecorpus for w in s})
targetvocab = sorted({w for s in targetcorpus for w in s})

In [None]:
# Lexical translation table: lexP[(s, t)] holds the probability P(s|t) of
# source word s given target word t.
# Start from a uniform distribution over the source vocabulary for every
# target word.
lexP = {
    (s, t): 1 / len(sourcevocab)
    for s in sourcevocab
    for t in targetvocab
}

In [None]:
def printTable(P, sv, tv):
    """Print P as a plain-text table.

    One row is printed per word in sv and one column per word in tv; P is
    looked up with (s, t) tuples and each value is shown to five decimals.
    Row labels are padded to 12 characters and column headers to 7.
    """
    header_cells = []
    for t in tv:
        header_cells.append(t.ljust(7))
    # Blank 12-character corner cell keeps the headers above their columns.
    print(' ' * 12 + ' '.join(header_cells))
    for s in sv:
        row_cells = (format(P[(s, t)], '.5f') for t in tv)
        print(s.ljust(12) + ' '.join(row_cells))

In [None]:
# Every column is a distribution P(.|t) over the source words (rows),
# so each column of the table sums to 1.0.
printTable(P=lexP, sv=sourcevocab, tv=targetvocab)

In [None]:
# one iteration of EM
def EM(P, sc, tc, sv, tv):
    """Run one EM iteration on the lexical translation table, updating P in place.

    This is the IBM Model 1 style update (without a NULL target word):
    the E-step collects fractional alignment counts from every sentence
    pair, and the M-step renormalises them per target word.

    Args:
        P: dict mapping (s, t) -> current estimate of P(s|t). It must contain
            an entry for every source/target word pair that co-occurs in a
            sentence pair. Entries for sv x tv are overwritten.
        sc: source sentences, each a sequence of words.
        tc: target sentences, paired with sc by position (via zip).
        sv: source vocabulary (must be re-iterable).
        tv: target vocabulary (must be re-iterable).

    Returns:
        None. The re-estimated probabilities are written back into P.

    Target words in tv that never occur in tc receive no counts; their
    entries in P are left unchanged instead of dividing by zero.
    """
    # Expected (fractional) alignment counts; keys are (s, t) pairs.
    C = {(s, t): 0 for s in sv for t in tv}
    # Total expected count per *target* word: the M-step normaliser.
    total = {t: 0 for t in tv}

    # E-step: P is only read here, so each source word's normaliser can be
    # computed right before its counts are distributed.
    for ssent, tsent in zip(sc, tc):
        for s in ssent:
            # Probability mass linking s to the target words of this sentence.
            norm = 0
            for t in tsent:
                norm += P[(s, t)]
            if norm == 0:
                # Empty target sentence, or every P(s|t) here is zero:
                # there is nothing to distribute for s.
                continue
            for t in tsent:
                share = P[(s, t)] / norm
                C[(s, t)] += share
                total[t] += share

    # M-step: renormalise the counts so each target word's column sums to 1.
    for t in tv:
        if total[t] == 0:
            # t collected no counts in this pass; keep its previous estimates.
            continue
        for s in sv:
            P[(s, t)] = C[(s, t)] / total[t]

In [None]:
# Apply a single EM update to lexP (modified in place) and show the
# re-estimated table.
EM(P=lexP, sc=sourcecorpus, tc=targetcorpus, sv=sourcevocab, tv=targetvocab)
printTable(P=lexP, sv=sourcevocab, tv=targetvocab)

In [None]:
# Apply 1000 further EM updates to the same table, then display the result.
# Note: lexP is updated in place, so re-running this cell continues from the
# current table rather than restarting from the uniform initialisation.
for i in range(1000):
    EM(P=lexP, sc=sourcecorpus, tc=targetcorpus, sv=sourcevocab, tv=targetvocab)
printTable(P=lexP, sv=sourcevocab, tv=targetvocab)