In [2]:
# Import the essential libraries
import collections
from decimal import Decimal
import dill as pickle
import re

In [3]:
# --- Global configuration ---

# Raw parallel corpora read by the cleaning step.
inputEnglishDataset = "./English_Updated.txt"  # src
inputdutchDataset = "./Dutch_Updated.txt"

# Cleaned corpora: written by the cleaning step, read back for training.
engFileName = 'final_eng.txt'
dutchFileName = 'final_dutch.txt'

# Maximum number of lines read from each cleaned corpus for training;
# also embedded in the names of the pickled model dumps.
size = 100000

# Number of training iterations passed to each IBM model trainer.
iterations = 120

## Tokenizing

Tokenizes a sentence into a list of words.

In [4]:
def tokenize(sentence):
    """
    Converts a sentence to a list of lower-cased words.

    The sentence is split on whitespace and punctuation. Empty fragments,
    purely numeric tokens and single-letter consonants are dropped.

    Args:
        sentence: str, the text to split.
    Returns:
        list of str: the remaining words, lower-cased, in their original order.
    """
    consonants = "bcdfghjklmnpqrstvwxyz"
    # Split on whitespace and punctuation so only word-like fragments remain.
    words = re.split(r'[` \t\-=~!@#$%^&*()_+\[\]{};\\\:"|<,./<>?,\n\']', sentence)
    output = list()
    for w in words:
        # Strip first so fragments made only of leftover whitespace (e.g. "\r")
        # are recognised as empty by the check below.
        w = w.strip()
        # Consecutive delimiters and a trailing newline yield empty fragments;
        # skip those together with purely numeric tokens.
        if not w or w.isdigit():
            continue
        # A lone consonant is dropped; single vowels such as "a" are kept.
        if len(w) == 1 and w.lower() in consonants:
            continue
        output.append(w.lower())
    return output

# 2. Cleaning the input files


In [5]:
def cleaningInputFiles():
    """
    Cleans the raw English and Dutch corpora and writes them to new files.

    Reads inputEnglishDataset and inputdutchDataset, keeps only the
    [a-zA-Z]+ words of every line (lower-cased by tokenize) and writes the
    result, one sentence per line, to engFileName and dutchFileName.

    Returns:
        None. If either input file cannot be read, a message is printed and
        no output file is written.
    """
    def readFile(filename):
        """Returns the lines of filename as a list of strings."""
        try:
            with open(filename, "r") as fo:
                return fo.readlines()
        except FileNotFoundError:
            print("Please mention the file correctly!! Current filename: {}".format(filename))
            # Bare raise keeps the original exception (filename and traceback).
            raise

    def clean(outfileName, sen_list):
        """Writes the cleaned form of every sentence in sen_list to outfileName."""
        with open(outfileName, "w") as outf:
            for line in sen_list:
                # Replace every character outside a-z/A-Z by a space.
                # NOTE: this also replaces accented letters (e.g. "ë").
                line = re.sub(r'[^a-zA-Z]', " ", line)
                words = [w for w in tokenize(line) if len(w) > 0]
                # A line is written even when no word survives, so the output
                # keeps exactly one line per input line.
                outf.write(" ".join(words) + "\n")

    try:
        english = readFile(inputEnglishDataset)
        dutch = readFile(inputdutchDataset)
    except Exception as exc:
        # Best-effort: report the cause and stop without writing any output.
        # (Exception, not a bare except, so Ctrl-C is not swallowed.)
        print("Something went wrong: {}".format(exc))
        return None

    clean(engFileName, english)
    clean(dutchFileName, dutch)
# Run the cleaning step; the cleaned corpora are written when both inputs are readable.
cleaningInputFiles()
# 5:15-> 46
# NOTE(review): the line above looks like a leftover scratch note — confirm its meaning or remove it.

In [5]:
# Modifying collections.defaultdict so the factory receives the missing key
class customDict(collections.defaultdict):
    """defaultdict whose default_factory is called with the missing key."""

    def __missing__(self, key):
        factory = self.default_factory
        if factory is None:
            raise KeyError(key)
        # Build the default from the key itself, store it, then hand it back.
        value = factory(key)
        self[key] = value
        return value

In [35]:
def aux(key):
    """
    Default value for an alignment key that has not been set yet.

    Args: key: 4-tuple (eng_ptr, du_ptr, eng_len, du_len)
    Returns 1.0 / du_len
    """
    # Only the last field is used; unpacking still enforces a 4-element key.
    _, _, _, du_len = key
    return 1.0 / du_len

def _constant_factory(value):
    return lambda: value

In [4]:
def ibmModel1Train(english, dutch, transition_prob, iterations):
    """
    Trains the IBM Model 1 translation table with EM.

    Args:
        english: list of str, English sentences.
        dutch: list of str, Dutch sentences, paired by index with english.
        transition_prob: mapping (english_word, dutch_word) -> probability.
            It must yield a value for pairs it has not seen yet (e.g. a
            defaultdict). It is updated in place.
        iterations: int, number of EM iterations to run.
    Returns:
        transition_prob after the last iteration.

    After every iteration the table is pickled to
    'output/new_map1_<size>_<iteration>.pickle'.
    """
    import os

    print("In Model1....")
    # The per-iteration dumps below need the output directory to exist.
    os.makedirs('output', exist_ok=True)
    # Tokenize once: the token lists are identical in every iteration. Distinct
    # names are used so the english/dutch corpora are not overwritten.
    sentence_pairs = [(tokenize(e_sen), tokenize(d_sen))
                      for (e_sen, d_sen) in zip(english, dutch)]

    for i in range(iterations):
        print("Running Iteration {}.......: ".format(i+1))
        count = collections.defaultdict(float)
        total = collections.defaultdict(float)

        for (eng_words, du_words) in sentence_pairs:
            # Normaliser: for each English word, the summed probability of
            # pairing it with any Dutch word of this sentence.
            sum_total = {}
            for e in eng_words:
                sum_total[e] = 0.0
                for f in du_words:
                    sum_total[e] += transition_prob[(e, f)]

            # Collect fractional counts.
            for e in eng_words:
                if not sum_total[e]:
                    # Empty Dutch sentence or all-zero probabilities:
                    # nothing to distribute for this word.
                    continue
                for f in du_words:
                    delta = transition_prob[(e, f)] / sum_total[e]
                    count[(e, f)] += delta
                    total[f] += delta

        # Re-estimate the table from the collected counts.
        for (e, f) in count.keys():
            transition_prob[(e, f)] = count[(e, f)] / total[f]

        # 'with' closes the dump file even if pickling fails.
        with open('output/new_map1_{}_{}.pickle'.format(size, i+1), 'wb') as outfile:
            pickle.dump(transition_prob, outfile)

    return transition_prob

In [11]:
def ibmModel2Train(english, dutch, mapper, iters):
    """
    Trains the IBM Model 2 translation and alignment tables with EM.

    Args:
        english: list of str, English sentences.
        dutch: list of str, Dutch sentences, paired by index with english.
        mapper: mapping (english_word, dutch_word) -> probability. It must
            yield a value for pairs it has not seen yet (e.g. a defaultdict).
            It is updated in place.
        iters: int, number of EM iterations to run.
    Returns:
        (mapper, align), where align maps
        (eng_ptr, du_ptr, eng_len, du_len) -> alignment probability, with
        1-based word positions and per-sentence lengths. Keys never updated
        default to 1.0 / du_len.

    After every iteration both tables are pickled to
    'output/new_map2_<size>_<iteration>.pickle' and
    'output/new_align_<size>_<iteration>.pickle'.
    """
    import os

    align = customDict(aux)
    print("In Model2....")
    # The per-iteration dumps below need the output directory to exist.
    os.makedirs('output', exist_ok=True)
    # Tokenize once: the token lists are identical in every iteration.
    sentence_pairs = [(tokenize(e_sen), tokenize(d_sen))
                      for (e_sen, d_sen) in zip(english, dutch)]

    for x in range(iters):
        print("Running Iteration: {}.....".format(x+1))
        count_map = collections.defaultdict(float)
        count_align = collections.defaultdict(float)
        total_map = collections.defaultdict(float)
        total_align = collections.defaultdict(float)

        for (eng, du) in sentence_pairs:
            # Lengths of this sentence pair, not of the whole corpus.
            eng_len = len(eng)
            du_len = len(du)
            for eng_ptr, eng_word in enumerate(eng, 1):
                # The normaliser depends on the position (through align), so
                # it is computed per position rather than per word. A word
                # occurring twice would otherwise reuse the wrong value.
                weights = [
                    mapper[(eng_word, word)] * align[(eng_ptr, ptr, eng_len, du_len)]
                    for ptr, word in enumerate(du, 1)
                ]
                norm = sum(weights)
                if not norm:
                    # Empty Dutch sentence or all-zero weights:
                    # nothing to distribute for this position.
                    continue
                for (ptr, word), weight in zip(enumerate(du, 1), weights):
                    temp = weight / norm
                    count_map[(eng_word, word)] += temp
                    total_map[word] += temp
                    count_align[(eng_ptr, ptr, eng_len, du_len)] += temp
                    total_align[(eng_ptr, eng_len, du_len)] += temp

        # update mapper
        for key in count_map.keys():
            try:
                mapper[key] = count_map[key] / total_map[key[1]]
            except ZeroDivisionError:
                # Float division raises the built-in ZeroDivisionError.
                print('Error at', key)
                continue

        # update alignment
        for key in count_align.keys():
            try:
                align[key] = count_align[key] / total_align[(key[0], key[2], key[3])]
            except ZeroDivisionError:
                print('Error at', key)
                continue

        # 'with' closes each dump file even if pickling fails.
        with open('output/new_map2_{}_{}.pickle'.format(size, x+1), 'wb') as map_file:
            pickle.dump(mapper, map_file)
        with open('output/new_align_{}_{}.pickle'.format(size, x+1), 'wb') as align_file:
            pickle.dump(align, align_file)

    return (mapper, align)

In [36]:
def trainIBM1_2Model():
    """
    Trains IBM Model 1 and then IBM Model 2 on the cleaned corpora.

    Reads at most `size` lines from engFileName and dutchFileName, trains
    Model 1 starting from a constant translation probability and passes the
    resulting table on to Model 2. Nothing is returned.
    """
    with open(engFileName, 'r') as eng_file:
        english = eng_file.readlines()[:size]

    with open(dutchFileName, 'r') as du_file:
        dutch = du_file.readlines()[:size]

    print("Training starts for total {} sentences".format(len(english)))
    print("Pre-trained IBMModel1 not found..start training")

    # Every (english, dutch) pair not seen before starts at the same value.
    # NOTE(review): 163497 is presumably a vocabulary size — confirm.
    initial_prob = 1.0/163497
    initial_table = collections.defaultdict(_constant_factory(initial_prob))
    model1_table = ibmModel1Train(english, dutch, initial_table, iterations)
    print("IBMModel 1 training done...")

    final_map, final_align = ibmModel2Train(english, dutch, model1_table, iterations)
    print("IBMModel2 training done...")

In [None]:
# Train Model 1 followed by Model 2 on the cleaned corpora.
trainIBM1_2Model()

In [None]:
def getTranslations(model_path="dumps/new_map2_300000_25.pickle"):
    """
    Get the translations from a trained model: for every English word, the
    Dutch word with the highest probability.

    Args:
        model_path: str, path of a pickled mapping
            (english_word, dutch_word) -> probability. Defaults to the dump
            this notebook originally loaded.
    Returns:
        dict mapping each English word to its best Dutch word. On equal
        probabilities the pair encountered first wins.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted dumps.
    # 'pickle' is the module alias imported at the top of the notebook.
    with open(model_path, "rb") as model_file:
        x = pickle.load(model_file)

    my_dict_prob = {}
    my_dict = {}
    for key, val in x.items():
        eng_word, du_word = key[0], key[1]
        # Strict '<' keeps the first candidate when probabilities tie.
        if eng_word not in my_dict_prob or my_dict_prob[eng_word] < val:
            my_dict_prob[eng_word] = val
            my_dict[eng_word] = du_word
    return my_dict