In [1]:
from collections import Counter
import numpy as np 
from more_itertools import pairwise
from collections import defaultdict
import itertools

### **(a) Warmup**

**Print the 10 most frequent words in each language.**

In [2]:
with open('dat410_europarl/europarl-v7.sv-en.lc.en') as f:
    eng = f.readlines()

with open('dat410_europarl/europarl-v7.sv-en.lc.sv') as f:
    swe = f.readlines()

swe_sentences = []
eng_sentences = []

for str in swe:
        swe_sentences.append(str.split(' '))

for str in eng:
        eng_sentences.append(str.split(' '))

swe_words = [item for sublist in swe_sentences for item in sublist]
eng_words = [item for sublist in eng_sentences for item in sublist]

swe_words = [i for i in swe_words if i.isalpha()]
eng_words = [i for i in eng_words if i.isalpha()]

n = 10

counted_swe = Counter(swe_words).most_common(n)
counted_eng = Counter(eng_words).most_common(n)

print(f'The {n} most frequent words in swedish is: {counted_swe}')
print(f'The {n} most frequent words in english is: {counted_eng}')

The 10 most frequent words in swedish is: [('att', 9181), ('och', 7038), ('i', 5949), ('det', 5687), ('som', 5028), ('för', 4959), ('av', 4013), ('är', 3840), ('en', 3724), ('vi', 3211)]
The 10 most frequent words in english is: [('the', 19322), ('of', 9312), ('to', 8801), ('and', 6946), ('in', 6090), ('is', 4400), ('that', 4357), ('a', 4269), ('we', 3223), ('this', 3222)]


**Let's assume that we pick a word completely randomly from the European parliament proceedings. According to your estimate, what is the probability that it is speaker? What is the probability that it is zebra?**

In [3]:
n_zebra = eng_words.count('zebra')
prob_zebra = n_zebra/len(eng_words)
print(f'The probability that the word is zebra is {prob_zebra*100}%, as zebra is not a word in our data set.')


n_speaker = eng_words.count('speaker')
prob_speaker = n_speaker/len(eng_words)
print(f'The probability that the word is speaker is around {round(prob_speaker*100, 5)}%.')


The probability that the word is zebra is 0.0%, as zebra is not a word in our data set.
The probability that the word is speaker is around 0.00399%.


### **(b) Language modeling**
**Implement a bigram language model as described in the lecture, and use it to compute the probability of a short sentence.**

In [4]:
class BigramModel():

    def __init__(self, word_list, word_pair_list):
        self.word_list = word_list
        self.word_pair_list = word_pair_list

    # Calculate probability of a sentence
    def predict(self, sentence):
        words = sentence.split()
        prob = self.prob_of_word(words[0])
    
        for i in range(0, len(words) - 1):
            w1 = words[i]
            w2 = words[i+1]
    
            prob = prob * self.count_of_word_pair(w1, w2)/self.count_of_word(w1)
    
        return prob
    
    def count_of_word(self, word):
        return self.word_list.count(word)
    
    def count_of_word_pair(self, word1, word2):
        return self.word_pair_list.count((word1, word2))
    
    # Calculate probability of a word appearing
    # all_words should be a list 
    # ['hello', 'my', 'name', 'is', 'lisa']
    def prob_of_word(self, word):
        n_word = self.word_list.count(word)
        n_words = len(self.word_list)
        prob = n_word/n_words
        return prob

# data should be list of lists with elements that are words
# [['hello', 'my', 'name', 'is', 'lisa'], ['hej', 'jag', 'gillar', 'att', 'äta']]
def get_word_pairs(data):
    sentence_pairs = []
    for d in data:
        sentence_pairs.append(list(pairwise(d)))
    
    word_pairs = [item for sublist in sentence_pairs for item in sublist]
    return word_pairs

# lst should be list of lists with elements that are words
# [['hello', 'my', 'name', 'is', 'lisa'], ['hej', 'jag', 'gillar', 'att', 'äta']]
def clean_data(lst):
    new_list = []
    for sentence in lst:
        sentence_list = []
        for word in sentence:
            if word.isalpha():
                sentence_list.append(word)

        new_list.append(sentence_list)
    return new_list

In [5]:
with open('dat410_europarl/europarl-v7.sv-en.lc.en') as f:
    eng = f.read()

eng_lst = eng.split('.')

list_of_sentences = [sentence.split() for sentence in eng_lst]
list_of_sentences = clean_data(list_of_sentences)

word_pairs = get_word_pairs(list_of_sentences)     

test_sentence = 'in the meeting'

model = BigramModel(eng_words, word_pairs)
prob = model.predict(test_sentence)
print(f'The probability of the sentence \'{test_sentence}\' is {round(prob, 10)}')

The probability of the sentence 'in the meeting' is 1.7272e-06


**What happens if you try to compute the probability of a sentence that contains a word that did not appear in the training texts? And what happens if your sentence is very long (e.g. 100 words or more)?** 

If we try to compute the probability of a sentence that contains a word that doesn't appear in the training texts, we will get an error. This is because we will perform division by zero, as this word will have a count of 0. If our sentence is very long, the probability will converge to zero. It is likely that we will eventually get a word pair that doesn't exist in the training texts, and then we multiply by 0. Even if all word pairs exists the probability will of course be very low if the sentence is long.

### **(c) Translation modeling**
**Write code that implements the estimation algorithm for IBM model 1. Then print, for either Swedish, German, or French, the 10 words that the English word european is most likely to be translated into, according to your estimate.**


In [6]:
with open('dat410_europarl/europarl-v7.sv-en.lc.en') as f:
    eng = f.readlines()

with open('dat410_europarl/europarl-v7.sv-en.lc.sv') as f:
    swe = f.readlines()

# Clean the data
eng_clean = []
for word in eng: 
    eng_clean.append(word.replace(',', "").replace('.', "").replace('\n', ""))
eng = eng_clean

# Clean the data
swe_clean = []
for word in swe: 
    swe_clean.append(word.replace(',', "").replace('.', "").replace('\n', ""))
swe = swe_clean

all_eng_words = eng_words
all_swe_words = swe_words

In [7]:
n_words_to_ignore = 25

# Get unique elements and their counts
swe_unique, swe_counts = np.unique(all_swe_words, return_counts=True)

# Only keep words that appear more than 25 times
all_swe_words = swe_unique[swe_counts > n_words_to_ignore]

# Get unique elements and their counts
eng_unique, eng_counts = np.unique(all_eng_words, return_counts=True)

# Only keep words that appear more than 25 times
all_eng_words = eng_unique[eng_counts > n_words_to_ignore]

In [8]:
# EM iterations
n_iterations = 30
t_init = 0.0001

# Initialize t 
t = defaultdict(lambda: t_init)   

for i in range(n_iterations):
    count_swe_eng = defaultdict(float)
    count_eng = defaultdict(float)

    # for each sentence pair
    for k in range(len(eng)):
        swe_words_in_sentence = np.array(swe[k].split())
        eng_words_in_sentence = np.array(eng[k].split())
       
        eng_words_in_sentence = np.append(eng_words_in_sentence, 'NULL')

        for swe_word in swe_words_in_sentence: 
        
            # compute alignment prob
            lst_t = np.array([t[(swe_word, word)] for word in eng_words_in_sentence])
            sum_t = np.sum(lst_t)

            for eng_word in eng_words_in_sentence:
                alignment_prob = t[(swe_word, eng_word)]/sum_t

                # update pseudocount
                count_swe_eng[(eng_word, swe_word)] += alignment_prob

                # update pseudocount
                count_eng[eng_word] += alignment_prob   

                # reestimate probabilities
                t[(swe_word, eng_word)] = count_swe_eng[(eng_word, swe_word)]/count_eng[eng_word]


In [11]:
# Getting the probabilitie for each word pair
keys = []
probs = []
for word in set(all_swe_words):
    keys.append(word)
    probs.append(t[(word, 'european')])
    
n = 10

# getting the n most probable translations
n_most_probable = sorted(zip(keys, probs), key=lambda x: x[1], reverse=True)[0:n]
print(f'The ten Swedish words it is most probable that "european" is translated to are: {[i[0] for i in n_most_probable]}')


The ten Swedish words it is most probable that "european" is translated to are: ['europeiska', 'den', 'på', 'europaparlamentets', 'efter', 'jag', 'särskilda', 'israel', 'alldeles', 'ytterst']


### **(d) Decoding**
**Define and implement an algorithm to find a translation, given a sentence in the source language.**


In [16]:
def get_most_probable(eng_words, target):
    keys = []
    probs = []
    for word in set(eng_words):
        keys.append(word)
        probs.append(t[(target, word)])

    most_probable = sorted(zip(keys, probs), key=lambda x: x[1], reverse=True)[0:10]
    return most_probable[0][0]

def get_translation(sentence):

    translation = ''
    for word in sentence.split(): 
        translation += get_most_probable(all_eng_words, word) + ' '

    return translation

def get_permutations(sentence):
    words = sentence.split()
    return itertools.permutations(words, len(words))


def translate_sentence(sentence):
    translation = get_translation(sentence)

    permutations = get_permutations(translation)
    permutations = list(permutations)
    max_prob = 0
    most_probable_translation = []

    for perm_sentence in permutations:
        perm_sentence = ' '.join(perm_sentence)
        prob = model.predict(perm_sentence)
        
        if prob > max_prob:
            most_probable_translation = perm_sentence
            max_prob = prob
        
    if max_prob == 0:
        print('No probable translation found, translation word by word is:')
        return translation
    
    translation = ''.join(most_probable_translation) 
    
    return translation

sentence = 'ni är från europa'
sentence = 'vi kan tala'

translate_sentence(sentence)

'can we talk'