In [1]:
# Toy parallel corpus: sentence i of targetcorpus (English) pairs with
# sentence i of sourcecorpus (Irish). Every sentence is a list of word tokens.
targetcorpus = [
    ['he', 'broke', 'rocks'],
    ['she', 'broke', 'rocks'],
    ['he', 'broke', 'his', 'hand'],
    ['she', 'broke', 'her', 'hand'],
    ['he', 'ate', 'rocks'],
    ['he', 'ate', 'his', 'bread'],
    ['she', 'ate', 'her', 'bread'],
]
sourcecorpus = [
    ['bhris', 'sé', 'clocha'],
    ['bhris', 'sí', 'clocha'],
    ['bhris', 'sé', 'a', 'lámh'],
    ['bhris', 'sí', 'a', 'lámh'],
    ["d'ith", 'sé', 'clocha'],
    ["d'ith", 'sé', 'a', 'arán'],
    ["d'ith", 'sí', 'a', 'harán'],
]

In [2]:
# Vocabularies: the distinct words occurring on each side of the parallel corpus.
sourcevocab = {word for sentence in sourcecorpus for word in sentence}
targetvocab = {word for sentence in targetcorpus for word in sentence}

In [3]:
# Lexical translation table: lexP[(s, t)] holds P(s|t) for source word s
# and target word t.
# Every entry starts at the uniform value 1/|source vocabulary|.
lexP = {
    (s, t): 1/len(sourcevocab)
    for s in sourcevocab
    for t in targetvocab
}

In [4]:
def printTable(P, sv, tv):
    """Print the table P as a plain-text grid, one row per source word.

    P  -- mapping from (s, t) pairs to a number; every pair in sv x tv must be
          present (a missing pair raises KeyError).
    sv -- iterable of row labels (source words).
    tv -- iterable of column labels (target words).

    Values are formatted with five decimal places. Columns are at least 7
    characters wide and the label column at least 12, so short words produce
    the same layout as fixed 12/7 widths; both widen when a longer word would
    otherwise push the grid out of alignment.
    """
    # Materialise once: the columns are walked for the header and again for
    # every row, which would exhaust a one-shot iterator.
    rows, cols = list(sv), list(tv)
    col_width = max([7] + [len(t) for t in cols])
    # +1 keeps at least one space between a long label and the first value.
    label_width = max([12] + [len(s) + 1 for s in rows])
    print(' '.ljust(label_width) + ' '.join([t.ljust(col_width) for t in cols]))
    for s in rows:
        print(s.ljust(label_width) + ' '.join(["{0:.5f}".format(P[(s,t)]).ljust(col_width) for t in cols]))

In [5]:
# Show the freshly initialised table.
printTable(P=lexP, sv=sourcevocab, tv=targetvocab)

            broke   his     she     he      ate     bread   hand    rocks   her    
bhris       0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111
d'ith       0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111
sí          0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111
a           0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111
clocha      0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111
harán       0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111
lámh        0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111
arán        0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111
sé          0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111 0.11111


In [8]:
# one iteration of EM
def EM(P, sc, tc, sv, tv):
    """Run one EM iteration over the parallel corpus, re-estimating P in place.

    This is the lexical-translation update used by IBM Model 1; no NULL word
    is added to the target sentences.

    P  -- dict keyed by (s, t) holding the current estimate of P(s|t); its
          entries are overwritten with the re-estimated values.
    sc -- source corpus: a sequence of sentences, each a sequence of words.
    tc -- target corpus, paired with sc position by position (zip stops at
          the shorter of the two).
    sv -- source vocabulary; must contain every word used in sc.
    tv -- target vocabulary; must contain every word used in tc.

    Returns None. Target words that receive no expected count in this pass
    (for example vocabulary entries that never occur in tc) keep their
    previous P(.|t) values instead of triggering a division by zero.
    """
    C = dict()   # keys are pairs (s,t); expected (fractional) count of s aligning to t
    for s in sv:
        for t in tv:
            C[(s,t)]=0
    total = dict()  # keys are target language words; sum of C[(s,t)] over all s
    for t in tv:
        total[t]=0

    # E-step: for each sentence pair, share one unit of count for every source
    # word among the target words of that sentence, in proportion to P(s|t).
    for ssent, tsent in zip(sc,tc):
        sent_totals=dict()
        for s in ssent:
            sent_totals[s]=0
            for t in tsent:
                sent_totals[s] += P[(s,t)]
        for s in ssent:
            norm = sent_totals[s]
            if norm == 0:
                # No target word in this sentence gives s any probability mass
                # (or the target sentence is empty): nothing to distribute.
                continue
            for t in tsent:
                share = P[(s,t)]/norm
                C[(s,t)] += share
                total[t] += share

    # M-step: renormalise the expected counts per target word.
    for t in tv:
        if total[t] == 0:
            # t collected no counts in this pass; keep its previous estimates.
            continue
        for s in sv:
            P[(s,t)] = C[(s,t)]/total[t]

In [9]:
# Apply a single EM step (lexP is updated in place, so re-running this cell
# advances the table by one more iteration), then show the result.
EM(P=lexP, sc=sourcecorpus, tc=targetcorpus, sv=sourcevocab, tv=targetvocab)
printTable(P=lexP, sv=sourcevocab, tv=targetvocab)

            broke   his     she     he      ate     bread   hand    rocks   her    
bhris       0.29167 0.12500 0.19444 0.14583 0.00000 0.00000 0.25000 0.22222 0.12500
d'ith       0.00000 0.12500 0.08333 0.14583 0.27778 0.25000 0.00000 0.11111 0.12500
sí          0.14583 0.00000 0.27778 0.00000 0.08333 0.12500 0.12500 0.11111 0.25000
a           0.12500 0.25000 0.16667 0.12500 0.16667 0.25000 0.25000 0.00000 0.25000
clocha      0.16667 0.00000 0.11111 0.16667 0.11111 0.00000 0.00000 0.33333 0.00000
harán       0.00000 0.00000 0.08333 0.00000 0.08333 0.12500 0.00000 0.00000 0.12500
lámh        0.12500 0.12500 0.08333 0.06250 0.00000 0.00000 0.25000 0.00000 0.12500
arán        0.00000 0.12500 0.00000 0.06250 0.08333 0.12500 0.00000 0.00000 0.00000
sé          0.14583 0.25000 0.00000 0.29167 0.19444 0.12500 0.12500 0.22222 0.00000


In [10]:
# Apply 1000 further EM steps on top of the current lexP (each call updates
# the table in place), then print the resulting probabilities.
for i in range(1000):
    EM(P=lexP, sc=sourcecorpus, tc=targetcorpus, sv=sourcevocab, tv=targetvocab)
printTable(P=lexP, sv=sourcevocab, tv=targetvocab)

            broke   his     she     he      ate     bread   hand    rocks   her    
bhris       1.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
d'ith       0.00000 0.00000 0.00000 0.00000 1.00000 0.00174 0.00000 0.00000 0.00000
sí          0.00000 0.00000 1.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00072
a           0.00000 0.99437 0.00000 0.00000 0.00000 0.00737 0.00332 0.00000 0.99437
clocha      0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 1.00000 0.00000
harán       0.00000 0.00000 0.00000 0.00000 0.00000 0.49545 0.00000 0.00000 0.00491
lámh        0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.99668 0.00000 0.00000
arán        0.00000 0.00492 0.00000 0.00000 0.00000 0.49544 0.00000 0.00000 0.00000
sé          0.00000 0.00071 0.00000 1.00000 0.00000 0.00000 0.00000 0.00000 0.00000
