# NLP2 project 1: Lexical Alignment
###  Kai Liang, Yijie Zhang, Billy Chan

In [1]:
# -*- coding:utf-8 -*-
import sys
import os
import numpy as np
import operator
import aer
from collections import *
from decimal import *
from aer import read_naacl_alignments
import matplotlib.pyplot as plt

## * Read data

In [2]:
# reading training data
en_train = open('./training/hansards.36.2.e', mode='r', encoding='utf-8').read().splitlines() # utf-8 encoding
fr_train = open('./training/hansards.36.2.f', mode='r', encoding='utf-8').read().splitlines() # utf-8 encoding

# reading validation data
en_val = open('./validation/dev.e', mode='r', encoding='utf-8').read().splitlines() # utf-8 encoding
fr_val = open('./validation/dev.f', mode='r', encoding='utf-8').read().splitlines() # utf-8 encoding

# reading test data
en_test = open('./testing/test/test.e', mode='r', encoding='utf-8').read().splitlines() # utf-8 encoding
fr_test = open('./testing/test/test.f', mode='r', encoding='utf-8').read().splitlines() # utf-8 encoding

# path of gold standrad
path = 'validation/dev.wa.nonullalign'

print('Number of English sentences in training set:', len(en_train))
print('Number of French sentences in training set:', len(fr_train))
print('Number of English sentences in validation set:', len(en_val))
print('Number of French sentences in validation set:', len(fr_val))
print('Number of English sentences in testing set:', len(en_test))
print('Number of French sentences in testing set:', len(fr_test))

Number of English sentences in training set: 231164
Number of French sentences in training set: 231164
Number of English sentences in validation set: 37
Number of French sentences in validation set: 37
Number of English sentences in testing set: 447
Number of French sentences in testing set: 447


In [3]:
print('The first English sentence:', en_train[0])

The first English sentence: 36 th Parliament , 2 nd Session 


In [4]:
print('The first French sentence:', fr_train[0])

The first French sentence: 36 e Législature , 2 ième Session 


In [5]:
# theta = np.load('theta_0.npy').item()
# theta_10 = np.load('theta_10.npy').item()
# count_f_e_0 = np.load('count_f_e_0.npy').item()
# count_e_0 = np.load('count_e_0.npy').item()

# '''
# np.save('theta_0.npy', theta_0) 
# np.save('count_f_e_0.npy', count_f_e) 
# np.save('count_e_0.npy', count_e)
# np.save('theta_10.npy', theta) 
# '''

## * Data preprocessing

In [6]:
# split sentence into words
def preprocess(s):
    return s.split()

In [7]:
# preprocess training data, validation data and testing data
for i in range(len(en_train)):
    en_train[i] = preprocess(en_train[i])
    en_train[i] = ['NULLINDICATOR'] + en_train[i]
    fr_train[i] = preprocess(fr_train[i])
for i in range(len(en_val)):
    en_val[i] = preprocess(en_val[i])
    fr_val[i] = preprocess(fr_val[i])
for i in range(len(en_test)):
    en_test[i] = preprocess(en_test[i])
    fr_test[i] = preprocess(fr_test[i])

In [8]:
print('Proccessed training example sentence:', fr_train[0])
print('Proccessed validation example sentence:', fr_val[0])
print('Proccessed testing example sentence:', fr_test[0])

Proccessed training example sentence: ['36', 'e', 'Législature', ',', '2', 'ième', 'Session']
Proccessed validation example sentence: ['chacun', 'en', 'lui', '-', 'même', 'est', 'très', 'complexe', 'et', 'le', 'lien', 'entre', 'les', 'deux', 'le', 'est', 'encore', 'davantage', 'de', 'sorte', 'que', 'pour', 'beaucoup', 'la', 'situation', 'présente', 'est', 'confuse', '.']
Proccessed testing example sentence: ['2', '.']


## * Log-likelihood Computation

In [9]:
def get_loglikelihood(en_train, fr_train, theta):
    N = len(en_train)
    loglikelihood_total = 0
    for n in range(N): # for all pairs of sentences
        
        cleaned_en_n = en_train[n]
        len_en_n = len(cleaned_en_n)
        
        cleaned_fr_n = fr_train[n]
        len_fr_n = len(cleaned_fr_n)
        
        loglikelihood_n = np.log(len_fr_n/len_en_n) + len_fr_n*np.log(1/(len_en_n+1))
        for fr_word in cleaned_fr_n: # every french word
            # find the english word that maximize p(fr/en) 
            max_p_fe = 0
            for en_word in cleaned_en_n:
                p_fe = theta[en_word][fr_word]
                if p_fe > max_p_fe:
                    max_p_fe = p_fe
            
            loglikelihood_n += np.log(float(max_p_fe))
        loglikelihood_total += loglikelihood_n
    return loglikelihood_total

## * Compute AER

In [10]:
def compute_aer(en_data, fr_data, theta):
    # compute AER score for the given data
    predictions = []
    for s in range(len(en_data)):
        alignments=[]
        l_e = len(en_data[s])
        l_f = len(fr_data[s])
        for i in range(l_f):
            vals = []
            for j in range(l_e):
                vals.append(theta[en_data[s][j]][fr_data[s][i]])
            max_idx = vals.index(max(vals))
            alignments.append((max_idx+1, i+1))
        predictions.append(set(alignments))
    gold_sets = read_naacl_alignments(path)
    metric = aer.AERSufficientStatistics()
    for gold, pred in zip(gold_sets, predictions):
        metric.update(sure=gold[0], probable=gold[1], predicted=pred)
    # AER
    return metric.aer(), predictions  

## * Export NAACL

In [11]:
def export_naacl(predictions, filename):
    with open('{}.naacl'.format(filename), 'w') as file:
        for index, sen in enumerate(predictions):
            for aligns in sen:
                for align in aligns:
                    file.write('{} {} {} S\n'.format(index+1, align[0], align[1]))
    file.close()

## * IBM1 - Initialize parameters

In [12]:
def init_1():
    return Decimal(0)

def init_2():
    return defaultdict(init_1)

def init_theta(en_train, fr_train):
    # initialize theta
    theta = defaultdict(init_2)
    N = len(en_train)
    for n in range(N):
        cleaned_en_n = en_train[n]
        cleaned_fr_n = fr_train[n]     
        for j_en_word in cleaned_en_n:
            for i_fr_word in cleaned_fr_n:
                theta[j_en_word][i_fr_word] = 0      
    for en_word in theta:
        count = len(theta[en_word])
        for fr_word in theta[en_word]:
            theta[en_word][fr_word] = Decimal(1 / count)
    return theta

In [13]:
theta = init_theta(en_train, fr_train)
print('Number of English words in the training set:', len(theta))

Number of English words in the training set: 36636


## * IBM1 - update via EM

In [14]:
def train_EM(en_train, fr_train, theta, en_val, fr_val, en_test, fr_test, K):
    N = len(en_train)
    AER = []
    iteration = []
    t_log = []
    for k in range(K):
        iteration.append(k+1)
        print('Iteration {}:'.format(k))
        count_f_e = defaultdict(init_1)
        total_f = defaultdict(init_1)
        for n in range(N):
            cleaned_en_n = en_train[n]
            cleaned_fr_n = fr_train[n]
            count_e = defaultdict(lambda: Decimal(0))
            for j in range(len(cleaned_en_n)):
                for i in range(len(cleaned_fr_n)):
                    count_e[cleaned_en_n[j]] += Decimal(theta[cleaned_en_n[j]][cleaned_fr_n[i]])
            for j in range(len(cleaned_en_n)):
                for i in range(len(cleaned_fr_n)):
                    f_e = (cleaned_fr_n[i], cleaned_en_n[j])
                    count_f_e[f_e] += Decimal(theta[cleaned_en_n[j]][cleaned_fr_n[i]] / count_e[cleaned_en_n[j]])
                    total_f[cleaned_fr_n[i]] += Decimal(theta[cleaned_en_n[j]][cleaned_fr_n[i]] / count_e[cleaned_en_n[j]])
        for n in range(N):
            cleaned_en_n = en_train[n]
            cleaned_fr_n = fr_train[n]
            for en in cleaned_en_n:
                for fr in cleaned_fr_n:
                    theta[en][fr] = Decimal(count_f_e[(fr, en)] / total_f[fr])
        # compute AER on the validation set
        aer, _ = compute_aer(en_val, fr_val, theta)
        print('Validation AER:', aer)
        AER.append(aer)
        
        log_likelihood = get_loglikelihood(en_train, fr_train, theta)
        t_log.append(log_likelihood)
        print("Training log-likelihood", log_likelihood)
        
    aer_test, test_alignment = compute_aer(en_test, fr_test, theta)
    print('Test AER:', aer_test)
    export_naacl(test_alignment, 'ibm1.mle')
    print('File saved successfully.')
    np.save("theta_10.npy", theta)
    return theta, AER, t_log, iteration

In [None]:
theta_10, AER, t_log, iteration = train_EM(en_train, fr_train, theta, en_val, fr_val, en_test, fr_test, K=10)

Iteration 0:
Validation AER: 0.8508026440037771
Training log-likelihood -28094857.93114429
Iteration 1:


## * Plotting

In [None]:
plt.plot(iteration, AER, figsize=(10,5))
xlabel('Iteration')
ylabel('AER')
plt.show()
savefig('AER-IBM1.eps')

In [None]:
plt.plot(iteration, t_log, figsize=(10,5))
xlabel('Iteration')
ylabel('Log-likelihood')
plt.show()
savefig('LogLikelihood-IBM1.eps')

## * IBM2 

In [None]:
# starts here