In [1]:
# Import the essential libraries
import collections
import os
import re
from decimal import Decimal

import dill as pickle

***************************************
*****************

# Training ----------------------------------------------------------------

***********

In [2]:
# Global configuration shared by the cleaning and training cells.

# Cleaned corpus files: written by the cleaning step, read back for training.
engFileName = 'final_eng.txt'
dutchFileName = 'final_dutch.txt'

# Raw corpus files that the cleaning step reads.
inputEnglishDataset = "./English_Updated.txt"  # source side
inputdutchDataset = "./Dutch_Updated.txt"

# Upper bound on the number of sentences read from each cleaned file;
# also embedded in the names of the pickle snapshots.
size = 100000

# Number of training iterations passed to both IBM models.
iterations = 120

## Tokenizing

Tokenizes a sentence into a list of words.

In [3]:
def tokenize(sentence):
    """
    Split a sentence into a list of lowercase word tokens.

    The sentence is split on whitespace and ASCII punctuation. Tokens that
    are empty, consist only of digits, or are a single consonant letter are
    dropped.

    Args:
        sentence: str, the raw text to tokenize.
    Returns:
        list of str: the remaining tokens, lowercased, in their original order.
    """
    consonants = frozenset("bcdfghjklmnpqrstvwxyz")
    # Split on punctuation and whitespace so only the word-like pieces remain.
    pieces = re.split(r'[` \t\-=~!@#$%^&*()_+\[\]{};\\\:"|<,./<>?,\n\']', sentence)
    output = []
    for piece in pieces:
        # Normalise first so the filters below see the final form of the token.
        word = piece.strip().lower()
        # Adjacent delimiters produce empty strings; skip those and pure numbers.
        # (The previous test `w in [''] and w.isdigit()` could never be true.)
        if not word or word.isdigit():
            continue
        # A lone consonant carries no meaning as a word.
        if len(word) == 1 and word in consonants:
            continue
        output.append(word)
    return output

## 2. Cleaning the input files


In [4]:
def cleaningInputFiles():
    """
    Clean the raw corpus files and write the cleaned copies to disk.

    Reads inputEnglishDataset and inputdutchDataset, replaces every character
    that is not an ASCII letter with a space, tokenizes each line and writes
    the remaining words (space separated, one sentence per line) to
    engFileName and dutchFileName.

    Returns:
        None. If an input file cannot be read, a message is printed and no
        output file is written.
    """
    def readFile(filename):
        """Return the lines of `filename`; report and re-raise if it is missing."""
        try:
            with open(filename, "r") as fo:
                return fo.readlines()
        except FileNotFoundError:
            print("Please mention the file correctly!! Current filename: {}".format(filename))
            # A bare raise keeps the original exception (and its filename) intact.
            raise

    def clean(outfileName, sen_list):
        """Write the cleaned form of every line of `sen_list` to `outfileName`."""
        with open(outfileName, "w") as outf:
            for line in sen_list:
                line = re.sub(r'[^a-zA-Z]', " ", line)
                words = [w for w in tokenize(line) if len(w) > 0]
                outf.write(" ".join(words) + "\n")

    try:
        english = readFile(inputEnglishDataset)
        dutch = readFile(inputdutchDataset)
    except Exception:
        # Exception rather than a bare except, so that an interrupt
        # (KeyboardInterrupt / SystemExit) is not swallowed here.
        print("Something went wrong")
        return None

    clean(engFileName, english)
    clean(dutchFileName, dutch)

# Run the cleaning step first: it writes the cleaned corpus files
# (engFileName / dutchFileName) that the training step reads.
cleaningInputFiles()

Please mention the file correctly!! Current filename: ./English_Updated.txt
Something went wrong


In [5]:
# A defaultdict whose factory is called with the missing key.
class customDict(collections.defaultdict):
    """defaultdict variant that passes the missing key to default_factory."""

    def __missing__(self, key):
        """Create, store and return the default value for `key`.

        Raises KeyError when no default_factory is set.
        """
        factory = self.default_factory
        if factory is None:
            raise KeyError(key)
        value = factory(key)
        self[key] = value
        return value

In [6]:
def aux(key):
    """
    Return 1.0 divided by the Dutch-length component of an alignment key.

    Args:
        key: 4-tuple (eng_ptr, du_ptr, eng_len, du_len).
    Returns:
        float: 1.0 / du_len.
    """
    # Unpack all four fields so a malformed key still fails loudly.
    _eng_pos, _du_pos, _eng_length, dutch_length = key
    return 1.0 / dutch_length

def _constant_factory(value):
    """Build a zero-argument callable that always returns `value`."""
    def constant():
        return value
    return constant

## 3. IBM Model 1

In [8]:
def ibmModel1Train(english, dutch, transition_prob, iterations):
    """
    Train IBM Model 1 word-translation probabilities with EM.

    Args:
        english: list of English sentences (str), aligned by index with `dutch`.
        dutch: list of Dutch sentences (str).
        transition_prob: mapping (english_word, dutch_word) -> probability.
            It must supply a value for unseen pairs (e.g. a defaultdict) and
            is updated in place.
        iterations: number of EM iterations to run.
    Returns:
        The updated `transition_prob` mapping.
    Side effects:
        After every iteration the table is pickled to
        output/new_map1_<size>_<iteration>.pickle.
    """
    print("In Model1....")
    # The snapshots are written under ./output; make sure it exists.
    os.makedirs('output', exist_ok=True)
    # Tokenization does not change between iterations, so do it once.
    pairs = [(tokenize(en), tokenize(du)) for (en, du) in zip(english, dutch)]

    for i in range(iterations):
        print("Running Iteration {}.......: ".format(i+1))
        count = collections.defaultdict(float)
        total = collections.defaultdict(float)

        for (eng, dut) in pairs:
            # Normaliser per English word, computed over the words of THIS
            # sentence pair (not over the whole corpus).
            sum_total = {}
            for e in eng:
                sum_total[e] = 0.0
                for f in dut:
                    sum_total[e] += transition_prob[(e, f)]

            # Collect fractional counts.
            for e in eng:
                for f in dut:
                    delta = transition_prob[(e, f)] / sum_total[e]
                    count[(e, f)] += delta
                    total[f] += delta

        # Re-estimate the probabilities from the collected counts.
        for (e, f), pair_count in count.items():
            transition_prob[(e, f)] = pair_count / total[f]

        with open('output/new_map1_{}_{}.pickle'.format(size, i+1), 'wb') as outfile:
            pickle.dump(transition_prob, outfile)

    return transition_prob

## 4. IBM Model 2

In [9]:
def ibmModel2Train(english, dutch, mapper, iters):
    """
    Train IBM Model 2 (translation + alignment probabilities) with EM.

    Args:
        english: list of English sentences (str), aligned by index with `dutch`.
        dutch: list of Dutch sentences (str).
        mapper: mapping (english_word, dutch_word) -> probability, typically
            the result of ibmModel1Train. It must supply a value for unseen
            pairs and is updated in place.
        iters: number of EM iterations to run.
    Returns:
        tuple (mapper, align) where align maps
        (eng_ptr, du_ptr, eng_len, du_len) -> alignment probability.
    Side effects:
        After every iteration both tables are pickled to
        output/new_map2_<size>_<iteration>.pickle and
        output/new_align_<size>_<iteration>.pickle.
    """
    # Unseen alignment keys default to aux(key), i.e. 1.0 / du_len.
    align = customDict(aux)
    print("In Model2....")
    # The snapshots are written under ./output; make sure it exists.
    os.makedirs('output', exist_ok=True)
    # Tokenization does not change between iterations, so do it once.
    pairs = [(tokenize(en), tokenize(du)) for (en, du) in zip(english, dutch)]

    for x in range(iters):
        print("Running Iteration: {}.....".format(x+1))
        count_map = collections.defaultdict(float)
        count_align = collections.defaultdict(float)
        total_map = collections.defaultdict(float)
        total_align = collections.defaultdict(float)

        for (eng, du) in pairs:
            # Lengths of this sentence pair (not of the whole corpus).
            eng_len = len(eng)
            du_len = len(du)

            # Normaliser per English POSITION: the alignment term depends on
            # the position, so a repeated word needs one entry per occurrence.
            total_map_s = {}
            for eng_ptr, eng_word in enumerate(eng, 1):
                total_map_s[eng_ptr] = 0
                for ptr, word in enumerate(du, 1):
                    total_map_s[eng_ptr] += mapper[(eng_word, word)] * align[(eng_ptr, ptr, eng_len, du_len)]

            # Collect fractional counts.
            for eng_ptr, eng_word in enumerate(eng, 1):
                for ptr, word in enumerate(du, 1):
                    temp = mapper[(eng_word, word)] * align[(eng_ptr, ptr, eng_len, du_len)] / total_map_s[eng_ptr]
                    count_map[(eng_word, word)] += temp
                    total_map[word] += temp
                    count_align[(eng_ptr, ptr, eng_len, du_len)] += temp
                    total_align[(eng_ptr, eng_len, du_len)] += temp

        # update mapper
        for key, value in count_map.items():
            try:
                mapper[key] = value / total_map[key[1]]
            except ZeroDivisionError:
                # Float division raises the built-in ZeroDivisionError.
                print('Error at', key)
                continue

        # update alignment
        for key, value in count_align.items():
            align[key] = value / total_align[(key[0], key[2], key[3])]

        with open('output/new_map2_{}_{}.pickle'.format(size, x+1), 'wb') as map_file:
            pickle.dump(mapper, map_file)
        with open('output/new_align_{}_{}.pickle'.format(size, x+1), 'wb') as align_file:
            pickle.dump(align, align_file)

    return (mapper, align)

## Driver Code

In [36]:
def trainIBM1_2Model():
    """
    Train IBM Model 1 and then IBM Model 2 on the cleaned corpus files.

    Reads at most `size` lines from engFileName and dutchFileName, runs
    ibmModel1Train for `iterations` rounds starting from a constant
    probability table, then passes the result to ibmModel2Train.

    Returns:
        None.
    """
    with open(engFileName, 'r') as eng_handle:
        english = eng_handle.readlines()[:size]
    with open(dutchFileName, 'r') as dutch_handle:
        dutch = dutch_handle.readlines()[:size]

    print("Training starts for total {} sentences".format(len(english)))
    print("Pre-trained IBMModel1 not found..start training")

    # Every unseen word pair starts with the same probability.
    # NOTE(review): 163497 is presumably a vocabulary size -- confirm.
    initial_table = collections.defaultdict(_constant_factory(1.0/163497))
    mapper = ibmModel1Train(english, dutch, initial_table, iterations)
    print("IBMModel 1 training done...")

    final_map, final_align = ibmModel2Train(english, dutch, mapper, iterations)
    print("IBMModel2 training done...")

In [None]:
# Train IBM Model 1 followed by IBM Model 2 on the cleaned corpus files.
trainIBM1_2Model()

***************************************
*****************

# Testing ----------------------------------------------------------------

***********

## 1. Get Translations

Get the translations from the trained model, keeping for each word the translation with the highest probability.

Returns the translations from English to Dutch.


In [10]:
def getTranslations():
    """
    Load the trained translation table and keep the best translation per word.

    The table is unpickled from "translations.pickle"; if that file does not
    exist the user is asked for another location. Its keys are indexed as
    (source_word, target_word) and its values are probabilities.

    Args: None
    Returns:
        dict mapping each source (English) word to the target (Dutch) word
        with the highest probability. On equal probabilities the entry seen
        first is kept.
    """
    print("Reading Translations.....")
    print("Reading the pickle time may takes time....")

    finalTranslationsOut = "translations.pickle"
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    try:
        with open(finalTranslationsOut, "rb") as handle:
            x = pickle.load(handle)
    except FileNotFoundError:
        print("File not found on appropriate location")
        filename = input("Input the finalTrained pickle file location: ")
        with open(filename, "rb") as handle:
            x = pickle.load(handle)

    translations_prob = {}
    translations = {}
    for key, val in x.items():
        source, target = key[0], key[1]
        # Strict comparison: a later entry only wins with a higher probability.
        if source not in translations_prob or translations_prob[source] < val:
            translations_prob[source] = val
            translations[source] = target
    return translations

## 2. Get cosine similarity and Jaccard coefficient

Get the cosine similarity between 2 documents.
Cosine similarity ranges from 0 to 1.

Get the Jaccard coefficient between 2 documents.
The Jaccard coefficient ranges from 0 to 1.
    

In [11]:
def getCosineSimilarity(src, target):
    """
    Get the cosine similarity between 2 documents.

    Each document is reduced to the set of its distinct tokens (a binary
    bag of words), so the similarity is
    |src & target| / sqrt(|src| * |target|) and lies between 0 and 1.

    Args:
        src: iterable of hashable tokens of the first document.
        target: iterable of hashable tokens of the second document.
    Returns:
        float in [0, 1], or None (after printing "Zero error") when either
        document has no tokens.
    """
    x_set = set(src)
    y_set = set(target)

    # Dot product of two binary vectors == number of shared tokens.
    shared = len(x_set & y_set)
    try:
        return shared / float((len(x_set) * len(y_set)) ** 0.5)
    except ZeroDivisionError:
        # Float division raises the built-in ZeroDivisionError.
        print("Zero error")
        return None

    
def getJacardCoeficient(src, target):
    """
    Get the Jaccard coefficient between 2 documents.

    The coefficient is |src & target| / |src | target| computed on the sets
    of distinct tokens, so it lies between 0 and 1.

    Args:
        src: iterable of hashable tokens of the first document.
        target: iterable of hashable tokens of the second document.
    Returns:
        float in [0, 1]; 0.0 when both documents are empty.
    """
    d1 = set(src)
    d2 = set(target)
    union = d1 | d2
    if not union:
        # Two empty documents: avoid dividing by zero and report no overlap.
        return 0.0
    return float(len(d1 & d2)) / len(union)

## 3. Driver Program

In [12]:
def run():
    """
    Interactive driver: translate documents word by word and score them.

    For every round the user picks a direction, a source document and a
    reference (target) document. The source is translated with the table from
    getTranslations(), compared with the reference using cosine similarity
    and the Jaccard coefficient, and written to translated_<n>.txt. Running
    averages of both scores are reported after every round.

    The session ends when the user answers N/n to the continue prompt or
    enters an invalid menu choice.

    Raises:
        FileNotFoundError: if the source or target document does not exist.
    """
    print("Welcome to Cross Language Translations(English<->Dutch)")

    averageJC_score = 0
    averageCS_score = 0
    total_tests = 0
    translations_e_d = getTranslations()
    # Inverted table for Dutch -> English. When several English words share a
    # Dutch translation, the one seen last wins.
    translations_d_e = {value: key for key, value in translations_e_d.items()}

    while True:
        print("Which Translation you want?")
        print("1. ENGLISH TO DUTCH")
        print("2. DUTCH TO ENGLISH")
        try:
            ch = int(input("Please select the option: "))
        except ValueError:
            # Non-numeric input is handled like any other invalid choice.
            ch = None

        if ch == 1:
            translations = translations_e_d
        elif ch == 2:
            translations = translations_d_e
        else:
            print("You entered wrong choice")
            # End the session by returning instead of terminating the kernel.
            return

        src_path = input("Enter source document path: ")
        trg_path = input("Enter target document path: ")
        try:
            with open(src_path, 'r') as src_file:
                src = src_file.read()
        except FileNotFoundError:
            print("Src file does not exists")
            raise
        try:
            with open(trg_path, 'r') as targ_file:
                trg = targ_file.read()
        except FileNotFoundError:
            print("Destination file does not exists")
            raise

        src_words = tokenize(src)
        trg_words = tokenize(trg)

        # Words without a known translation are skipped.
        translated_list = [
            translations[w] for w in src_words
            if len(w) > 0 and w in translations
        ]

        cs = getCosineSimilarity(trg_words, translated_list)
        jc = getJacardCoeficient(trg_words, translated_list)

        print("This document has cosine similarity: {}".format(cs))
        print("This document has Jacard similarity: {}".format(jc))

        translated_doc = " ".join(translated_list)
        print("Ouputing your result into filename: translated_{}.txt".format(total_tests+1))
        with open("translated_{}.txt".format(total_tests+1), "w") as fout:
            fout.write(translated_doc)

        print("Do you want to continue? ")
        ch = input("Please select the option(Y/N)?: ")
        total_tests += 1
        # A score can be None when a document has no usable tokens; count it as 0.
        averageCS_score += cs if cs is not None else 0.0
        averageJC_score += jc if jc is not None else 0.0

        if ch == 'Y' or ch == 'y':
            # Report the running averages and start another round.
            print("Current Average cosine similarity: {}".format(averageCS_score/total_tests))
            print("Current Average Jacard similarity: {}".format(averageJC_score/total_tests))
        elif ch == 'N' or ch == 'n':
            # Report the final averages and leave the loop.
            print("Final Average cosine similarity: {}".format(averageCS_score/total_tests))
            print("Final Average Jacard similarity: {}".format(averageJC_score/total_tests))
            break

In [None]:
# Start the interactive translation and evaluation session (prompts for input).
run()

Welcome to Cross Language Translations(English<->Dutch)
Reading Translations.....
Reading the pickle time may takes time....
Which Translation you want?
1. ENGLISH TO DUTCH
2. DUTCH TO ENGLISH
