# NLP2 project 1: Lexical Alignment

In [3]:
import sys
import os
import numpy as np
import operator
import aer
from collections import *
from decimal import *
from aer import read_naacl_alignments

## Read Data

In [4]:
def _read_lines(file_path):
    """Return the lines of a UTF-8 text file, without line terminators.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the content has been read, and the encoding is stated explicitly so the
    accented French text does not depend on the platform default.
    """
    with open(file_path, encoding='utf-8') as handle:
        return handle.read().splitlines()


# reading training data
en_train = _read_lines('./training/hansards.36.2.e')
fr_train = _read_lines('./training/hansards.36.2.f')

# reading validation data
en_val = _read_lines('./validation/dev.e')
fr_val = _read_lines('./validation/dev.f')

# path of gold standard alignments
path = 'validation/dev.wa.nonullalign'

# The corpora are sentence-aligned: line n of one file pairs with line n of the other.
assert len(en_train) == len(fr_train), 'training corpora differ in length'
assert len(en_val) == len(fr_val), 'validation corpora differ in length'

print('Number of English sentences in training set:', len(en_train))
print('Number of French sentences in training set:', len(fr_train))
print('Number of English sentences in validation set:', len(en_val))
print('Number of French sentences in validation set:', len(fr_val))

Number of English sentences in training set: 231164
Number of French sentences in training set: 231164
Number of English sentences in validation set: 37
Number of English sentences in validation set: 37


In [5]:
# Sanity check: show the first English training line as read from disk (still an untokenised string).
print('The first English sentence:', en_train[0])

The first English sentence: 36 th Parliament , 2 nd Session 


In [6]:
# Sanity check: show the first French training line as read from disk (still an untokenised string).
print('The first French sentence:', fr_train[0])

The first French sentence: 36 e Législature , 2 ième Session 


In [7]:
# theta = np.load('theta_0.npy').item()
# theta_10 = np.load('theta_10.npy').item()
# count_f_e_0 = np.load('count_f_e_0.npy').item()
# count_e_0 = np.load('count_e_0.npy').item()

# '''
# np.save('theta_0.npy', theta_0) 
# np.save('count_f_e_0.npy', count_f_e) 
# np.save('count_e_0.npy', count_e)
# np.save('theta_10.npy', theta) 
# '''

## Data Preprocessing

In [8]:
def preprocess(s):
    """Tokenise a sentence by splitting it on runs of whitespace.

    Args:
        s: the sentence as a single string.

    Returns:
        A list of the whitespace-separated tokens; an empty or all-blank
        string yields an empty list.
    """
    tokens = s.split()
    return tokens

In [9]:
# Placeholder token stored at index 0 of every English sentence.
NULL_TOKEN = 'NULLINDICATOR'

# Tokenise every sentence. English sentences (training AND validation) get the
# NULL placeholder at index 0, so the index of a real English word equals its
# 1-based position in the sentence. train_EM relies on this when it skips
# index 0 and reports the remaining indices as alignment positions.
# Note: this cell turns the string lists into token lists, so it must be run
# exactly once after the data-loading cell.
en_val = [[NULL_TOKEN] + preprocess(sentence) for sentence in en_val]
fr_val = [preprocess(sentence) for sentence in fr_val]
en_train = [[NULL_TOKEN] + preprocess(sentence) for sentence in en_train]
fr_train = [preprocess(sentence) for sentence in fr_train]

In [10]:
# Show the first English training sentence after tokenisation and NULL-prefixing.
print('Proccessed example sentence:', en_train[0])

Proccessed example sentence: ['NULLINDICATOR', '36', 'th', 'Parliament', ',', '2', 'nd', 'Session']


## IBM1 - Initialize Parameters

In [11]:
def init_theta(en_train, fr_train):
    """Build the initial translation table, uniform over co-occurring words.

    For every English word ``e`` and every French word ``f`` that appear
    together in at least one sentence pair, ``theta[e][f]`` is set to
    ``1 / (number of distinct French words co-occurring with e)``, so each
    row ``theta[e]`` sums to one.

    Args:
        en_train: list of tokenised English sentences (lists of words).
        fr_train: list of tokenised French sentences, parallel to ``en_train``.

    Returns:
        A nested ``defaultdict`` mapping English word -> French word ->
        ``Decimal``. Looking up an unseen pair yields ``Decimal(0)``.
        English words whose sentences are all paired with empty French
        sentences get no row.

    Raises:
        ValueError: if the two corpora do not contain the same number of
            sentences.
    """
    if len(en_train) != len(fr_train):
        raise ValueError('en_train and fr_train must contain the same number of sentences')

    zero = Decimal(0)
    theta = defaultdict(lambda: defaultdict(lambda: Decimal(0)))

    # Pass 1: register every co-occurring (English, French) pair.
    for en_sentence, fr_sentence in zip(en_train, fr_train):
        if not fr_sentence:
            # Nothing co-occurs here; do not create empty rows.
            continue
        fr_placeholders = dict.fromkeys(fr_sentence, zero)
        for en_word in en_sentence:
            theta[en_word].update(fr_placeholders)

    # Pass 2: spread the probability mass evenly over each row.
    for fr_row in theta.values():
        # Divide in Decimal arithmetic: Decimal(1 / n) would first round 1/n
        # to a binary float and then store that rounding error exactly.
        uniform = Decimal(1) / Decimal(len(fr_row))
        for fr_word in fr_row:
            fr_row[fr_word] = uniform
    return theta

In [12]:
theta = init_theta(en_train, fr_train)
print('Number of English words in the training set:', len(theta))

Number of English words in the training set: 36636


## IBM1 - EM

In [13]:
def _em_step(en_train, fr_train, theta):
    """Run one EM pass over the training corpus and update ``theta`` in place."""
    pair_counts = defaultdict(Decimal)  # (fr, en) -> expected count
    fr_totals = defaultdict(Decimal)    # fr -> expected count summed over English words

    # E-step: every English token distributes one unit of expected count over
    # the French words of its sentence, proportionally to theta[en][fr].
    for en_sentence, fr_sentence in zip(en_train, fr_train):
        for en_word in en_sentence:
            row = theta[en_word]
            weights = [row[fr_word] for fr_word in fr_sentence]
            # The normaliser is computed per token, so a word repeated in a
            # sentence contributes a full unit of count for each occurrence.
            norm = sum(weights, Decimal(0))
            if not norm:
                # Empty French sentence or all-zero weights: nothing to distribute.
                continue
            for fr_word, weight in zip(fr_sentence, weights):
                posterior = weight / norm
                pair_counts[(fr_word, en_word)] += posterior
                fr_totals[fr_word] += posterior

    # M-step: renormalise the expected counts per French word. Visiting the
    # collected pairs once covers every co-occurring pair of the corpus.
    for (fr_word, en_word), pair_count in pair_counts.items():
        total = fr_totals[fr_word]
        if total:
            theta[en_word][fr_word] = pair_count / total


def _predict_alignments(en_val, fr_val, theta):
    """Link every non-NULL English position to its best-scoring French position."""
    zero = Decimal(0)
    predictions = []
    for en_sentence, fr_sentence in zip(en_val, fr_val):
        links = set()
        if fr_sentence:
            for en_pos, en_word in enumerate(en_sentence):
                if en_pos == 0:
                    # Index 0 is reserved for the NULL placeholder.
                    continue
                # .get() keeps unseen validation words from being inserted
                # into the nested defaultdict as zero entries.
                row = theta.get(en_word, {})
                scores = [row.get(fr_word, zero) for fr_word in fr_sentence]
                # First maximum wins; French positions are reported 1-based.
                links.add((en_pos, scores.index(max(scores)) + 1))
        predictions.append(links)
    return predictions


def train_EM(en_train, fr_train, theta, en_val, fr_val, K, gold_path=None):
    """Train the lexical translation table with EM and print the AER per iteration.

    Each iteration prints its index, runs one EM pass over the training
    corpus (updating ``theta`` in place), predicts alignments for the
    validation sentences and prints the resulting alignment error rate.

    NOTE(review): counts are normalised per French word, so after an update
    ``theta[en][fr]`` sums to one over English words for a fixed French
    word, i.e. it behaves like p(en | fr). Confirm this is the intended
    direction.

    Args:
        en_train: tokenised English training sentences.
        fr_train: tokenised French training sentences, parallel to ``en_train``.
        theta: nested mapping English word -> French word -> ``Decimal``
            containing an entry for every co-occurring training pair, as
            produced by ``init_theta``.
        en_val: tokenised English validation sentences. Index 0 of each
            sentence is skipped and the remaining indices are used directly
            as 1-based alignment positions, so callers must prepend the NULL
            token; otherwise the first real word is dropped and every
            position is shifted by one.
        fr_val: tokenised French validation sentences, parallel to ``en_val``.
        K: number of EM iterations.
        gold_path: path of the gold alignment file in NAACL format. Defaults
            to the notebook-level ``path`` variable.

    Returns:
        The updated ``theta`` (the same object that was passed in).
    """
    if K <= 0:
        # Nothing to train, so the gold alignments are not needed either.
        return theta

    # The gold standard does not change between iterations: read it once.
    gold_sets = list(read_naacl_alignments(path if gold_path is None else gold_path))

    for k in range(K):
        print('Iteration:', k)
        _em_step(en_train, fr_train, theta)
        predictions = _predict_alignments(en_val, fr_val, theta)

        metric = aer.AERSufficientStatistics()
        for gold, pred in zip(gold_sets, predictions):
            metric.update(sure=gold[0], probable=gold[1], predicted=pred)
        # AER
        print(metric.aer())
    return theta

In [14]:
# Run 10 EM iterations. train_EM updates the table in place and returns it, so
# re-running this cell continues training from the current theta rather than
# from the initial table.
theta = train_EM(en_train, fr_train, theta, en_val, fr_val, K=10)

Iteration: 0


KeyboardInterrupt: 

## Test

In [None]:
# English words that have a row in theta.
# NOTE(review): this displays every key (tens of thousands); consider showing a slice.
theta.keys()

In [None]:
# Row of the table for the English word 'read': French word -> theta value.
a = theta['read']

In [None]:
# (French word, value) pairs of that row, sorted by ascending value.
sorted_x = sorted(a.items(), key=operator.itemgetter(1))

In [None]:
# Last element of the ascending sort: the French word with the highest value for 'read'.
sorted_x[-1]