# Statistical Machine Translation

<br>
Corpora:
<ol>
<li>German – English</li>
<li>French – English</li>
</ol>

<br>
Implementing a function that will output a table containing the word translation probabilities that were learned.

In [0]:
# ----- Importing libraries ----- 


import re
import pandas as pd
from copy import deepcopy
import tqdm
import operator

In [2]:
# ----- Mounting drive to access data stored in the drive -----


from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [0]:
# ----- Reading files & preprocessing the data -----
# ----- Removing all punctuation except apostrophes -----


def preprocessing(text):
    """Split raw corpus text into cleaned, lower-cased lines.

    Args:
        text: The whole corpus as one string, one sentence per line.

    Returns:
        A list with one string per input line (split on "\\n"), where the
        punctuation characters listed in ``exclude`` are removed and the
        remainder is lower-cased. Apostrophes are kept.
    """
    # Characters to strip. The backslash is written as an explicit "\\" so the
    # literal contains no invalid escape sequence (the previous "\(" spelling
    # triggers a SyntaxWarning on recent Python versions).
    exclude = '!"#$%&\\()*+,-./:;<=>?@[]^_`{|}~'

    # Build the translation table once instead of once per line.
    strip_punctuation = str.maketrans('', '', exclude)

    return [line.translate(strip_punctuation).lower()
            for line in text.split("\n")]




# The language files are in the same directory as the colab file

# Read both sides of the French-English corpus. The context managers close
# each file even if reading raises, which the manual open()/close() pair did not.
with open('/content/drive/My Drive/NLP_Assignment_3/fr-en.fr', 'r', encoding = "utf-8") as f:
    french = f.read()
french = preprocessing(french)

with open('/content/drive/My Drive/NLP_Assignment_3/fr-en.en', 'r', encoding = "utf-8") as f:
    french_english = f.read()
french_english = preprocessing(french_english)


# Reference: https://python-forum.io/Thread-Removing-punctuation-from-strings-in-lists

In [0]:
# ----- Creating sets of unique words for each language -----


# Vocabulary of the French side: every distinct whitespace-separated token.
fre_set = {token for line in french for token in line.split()}

# Vocabulary of the English side of the French-English corpus.
fre_eng_set = {token for line in french_english for token in line.split()}

![IBM EM Algorithm](https://slideplayer.com/slide/14108470/86/images/69/IBM+Model+1+and+EM+Algorithm.jpg)

In [0]:
# ----- Function to find Euclidean distance -----
# ----- Takes in two dict of dicts and checks for each combination of words the difference in probabilities -----
# ----- Squares the differences, adds them up and returns the square root of the sum -----
# ----- Euclidean distance formula:  sqrt((a[0] - b[0])**2 + (a[1] - b[1])**2 + ... + (a[n] - b[n])**2) -----


def euclid_dist(t1, t2):
    """Return the Euclidean distance between two dict-of-dict tables.

    Every (row key, column key) cell of ``t1`` is compared with the cell at
    the same keys in ``t2``.

    Args:
        t1: Dict of dicts holding numeric values.
        t2: Dict of dicts containing at least every key pair found in ``t1``.

    Returns:
        The square root of the summed squared cell differences; 0 when
        ``t1`` is empty.

    Raises:
        KeyError: If ``t2`` lacks a row or column key that ``t1`` has.
    """
    squared_sum = 0
    # Walk every cell. Zipping row keys with column keys would only pair the
    # i-th row with the i-th column and skip almost all of the table.
    for row_key, row in t1.items():
        other_row = t2[row_key]
        for col_key, value in row.items():
            squared_sum += (value - other_row[col_key]) ** 2

    return squared_sum ** 0.5

In [0]:
# ----- Implementing IBM model 1 and EM algorithm -----
# ----- Initializing translational probabilities t(e|f) uniformly, i.e., to a single number, 1/len(target_word_set) -----
# ----- t is a dict of dicts with each English word having its own dict of French words with probabilities -----


def trans_prob(source_word_Set, target_word_set):
    """Build the uniform starting table of translation probabilities.

    Args:
        source_word_Set: Collection of source-language words (inner keys).
        target_word_set: Collection of target-language words (outer keys).

    Returns:
        A dict keyed by target word; each value is its own dict keyed by
        source word, with every entry equal to 1 / len(target_word_set).
    """
    table = {}
    for target_word in target_word_set:
        # A fresh inner dict per target word so rows can be updated independently.
        table[target_word] = dict.fromkeys(source_word_Set,
                                           1 / len(target_word_set))
    return table



# ----- print_align_table prints the word alignment values of the language pairs -----

def print_align_table(source_word_Set, target_word_set):
    """Return the table produced by trans_prob as a pandas DataFrame.

    Args:
        source_word_Set: Collection of source-language words.
        target_word_set: Collection of target-language words.

    Returns:
        The DataFrame that ``pd.DataFrame.from_dict`` builds from the
        dict of dicts returned by ``trans_prob``.
    """
    initial_probabilities = trans_prob(source_word_Set, target_word_set)
    table = pd.DataFrame.from_dict(initial_probabilities)
    return table



# ----- Printing word alignment table for French-English -----
# NOTE: print_align_table builds its table from trans_prob, so this displays
# the uniform initial probabilities rather than values returned by word_align.

print_align_table(fre_set, fre_eng_set)


# Reference: https://stackoverflow.com/questions/33157522/create-pandas-dataframe-from-dictionary-of-dictionaries

<br>

Implementing a function that outputs the alignment for each sentence pair in the training data based on the IBM Model 1.

In [0]:
# ----- Function that outputs word alignment for different language pairs -----


def word_align(source_word_set, target_word_set, source_sentences, target_sentences, epsilon=0.001):
    """Estimate IBM Model 1 translation probabilities with the EM algorithm.

    Args:
        source_word_set: Unique words of the source language.
        target_word_set: Unique words of the target language.
        source_sentences: Source sentences as whitespace-separated strings.
        target_sentences: Target sentences, parallel to ``source_sentences``.
            Sentences are paired positionally; pairing stops at the shorter
            of the two lists.
        epsilon: Convergence threshold. Iteration stops once the distance
            reported by ``euclid_dist`` between two successive tables drops
            below this value.

    Returns:
        A dict of dicts indexed as ``t[target_word][source_word]`` holding
        the translation probabilities after the final iteration.

    Side effects:
        Prints the iteration counter after every EM iteration.
    """
    # ----- Initializing -----
    # Start from the uniform probabilities.
    t = trans_prob(source_word_set, target_word_set)

    # Deep copy, because t is modified in place below and the previous
    # iteration's values are needed for the convergence check.
    prev_trans_prob = deepcopy(t)

    # Tokenise every sentence pair once instead of on every EM iteration.
    sentence_pairs = [(source.split(), target.split())
                      for source, target in zip(source_sentences, target_sentences)]

    converged = False
    iterations = 0

    # Loop until the probabilities stop changing by more than epsilon.
    while not converged:

        # count[e][f]: fractional co-occurrence counts, reset every iteration.
        count = {word_en: {word_fr: 0
                           for word_fr in source_word_set}
                 for word_en in target_word_set}

        # total[f]: sum of the fractional counts credited to source word f.
        total = {f: 0 for f in source_word_set}

        # Collect fractional counts from every sentence pair.
        for source_words, target_words in sentence_pairs:
            for e in target_words:
                # Normalising constant: sum of t[e][f] over this sentence's source words.
                temp_sum = sum(t[e][f] for f in source_words)

                for f in source_words:
                    fraction = t[e][f] / temp_sum
                    count[e][f] += fraction
                    total[f] += fraction

        # Re-estimate the probabilities from the collected counts.
        for f in source_word_set:
            # A source word that was never paired with any target word has no
            # counts; keep its previous values instead of dividing by zero.
            if total[f] == 0:
                continue
            for e in target_word_set:
                t[e][f] = count[e][f] / total[f]

        # Distance between the previous and the new probabilities.
        delta = euclid_dist(prev_trans_prob, t)
        converged = delta < epsilon
        prev_trans_prob = deepcopy(t)
        iterations += 1
        print(iterations)

    return t




# ----- Calling function word_align for the French-English language pair -----
# French is passed as the source language and English as the target language.


t = word_align(fre_set, fre_eng_set, french, french_english)

In [0]:
# ----- Printing word alignments of French-English language pair -----


# Three left-aligned columns, each 20 characters wide.
row_format = "{0:20}{1:20}{2:20}"

print("")
print(row_format.format('English Words', 'French Words', 'Probabilities'))
print("")
for english_word in fre_eng_set:
    # Highest-probability entry in this English word's row of t.
    best_french, best_prob = max(t[english_word].items(), key=operator.itemgetter(1))
    print(row_format.format(english_word, best_french, best_prob))


# Reference: https://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary

In [0]:
# ----- Deleting data for space ----- 


# Release the French-English data before loading the next corpus.
# The English vocabulary is named fre_eng_set; the previous fre_word_set was
# never defined and made this statement raise NameError.
del t, fre_set, fre_eng_set, french, french_english

In [0]:
# ----- Repeating the steps for the German_English language pair -----
# ----- Reading files & preprocessing the data -----


# Read both sides of the German-English corpus. The context managers close
# each file even if reading raises, which the manual open()/close() pair did not.
with open('/content/drive/My Drive/NLP_Assignment_3/de-en.de', 'r', encoding = "utf-8") as f:
    german = f.read()
german = preprocessing(german)

with open('/content/drive/My Drive/NLP_Assignment_3/de-en.en', 'r', encoding = "utf-8") as f:
    german_english = f.read()
german_english = preprocessing(german_english)

In [0]:
# ----- Creating sets of unique words for each language -----


# Vocabulary of the German side: every distinct whitespace-separated token.
ger_set = {token for line in german for token in line.split()}

# Vocabulary of the English side of the German-English corpus.
ger_eng_set = {token for line in german_english for token in line.split()}

In [0]:
# ----- Printing word alignment table for German-English -----
# NOTE: print_align_table builds its table from trans_prob, so this displays
# the uniform initial probabilities rather than values returned by word_align.


print_align_table(ger_set, ger_eng_set)

In [0]:
# ----- Calling function word_align for the German-English language pair -----
# German is passed as the source language and English as the target language.


t = word_align(ger_set, ger_eng_set, german, german_english)

In [0]:
# ----- Printing word alignments of German-English language pair -----


# Three left-aligned columns, each 20 characters wide.
row_format = "{0:20}{1:20}{2:20}"

print("")
print(row_format.format('German Words', 'English Words', 'Probabilities'))
print("")
for english_word in ger_eng_set:
    # Highest-probability entry in this English word's row of t.
    best_german, best_prob = max(t[english_word].items(), key=operator.itemgetter(1))
    print(row_format.format(best_german, english_word, best_prob))

<br>

### Translating a sentence from German to English:

In [0]:
# ----- Making a toy data set ger_toy and eng_toy with the 2nd sentence of "german" and  2nd sentence of "german_english" -----

ger_toy = german[1]
eng_toy = german_english[1]


# ----- Printing sentences -----

print("German:")
print(ger_toy)
print("Translated English:")
# Translate word by word. The table is indexed as t[english_word][german_word],
# so for each German token we pick the English word whose entry for that token
# is largest. The sentence must be split first: iterating over the string
# itself yields single characters, not words.
eng = []
for word in ger_toy.split():
    eng.append(max(t, key=lambda english_word: t[english_word][word]))
eng = ' '.join(eng)
print(eng)
print("Actual English")
print(eng_toy)


# References: https://stackoverflow.com/questions/12453580/concatenate-item-in-list-to-strings