# Translation Memory Retrieval using Weighted N-Grams

In [6]:
import nltk
import math
from collections import Counter
import string

In [7]:
# 'punkt' backs word_tokenize; 'stopwords' backs stopwords.words('english'),
# which is used a few cells below and fails with LookupError if the corpus is missing.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/ashes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# English stop-word list from nltk.corpus.stopwords; used below to filter the tokenised input.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus installed locally — confirm it is downloaded.
stop_words = stopwords.words('english')

In [11]:
# Read the sentence to look up from the user.
input_line = input()

#convert input to lowercase
input_line = input_line.lower()

#tokenise
input_tokens = word_tokenize(input_line)

# Keep only content words. (A stray token inside this comprehension previously
# made the cell a SyntaxError.)
content_words = [word for word in input_tokens if word not in stop_words] #Removing Stopwords

print(content_words)

please remove the drive safely
['please', 'remove', 'drive', 'safely']


## Weighted N-Gram Precision

### Load TM

In [353]:
def get_doc(path="tm_data/source_text.txt"):
    """
    Load the translation memory and build one word-count dict per sentence.

    Every line of the file is treated as one sentence ("document"):
    punctuation listed in string.punctuation is stripped, the line is
    tokenised with nltk.word_tokenize, and its tokens are counted against
    the vocabulary of the whole file. All returned dicts therefore share
    the same keys; words that do not occur in a sentence map to 0.

    path: text file with one source sentence per line (defaults to the
          translation-memory file used throughout this notebook).

    Returns a list of {word: count} dicts, one per line of the file except
    the first (see the note above the return statement). An empty file
    yields an empty list.
    """
    with open(path) as source_file:
        sents = source_file.read().splitlines()

    # Build the translation table once instead of once per line.
    strip_punct = str.maketrans('', '', string.punctuation)

    sent_words = []
    word_set = set()
    for sent in sents:
        words = nltk.word_tokenize(sent.translate(strip_punct))
        # '.' is already removed by the punctuation stripping; filter kept as a safety net.
        words = [x for x in words if x != '.']
        word_set.update(words)
        sent_words.append(words)

    sent_word_dict_array = []
    for words in sent_words:
        # Start from a zero count for every vocabulary word so all dicts share keys.
        counts = dict.fromkeys(word_set, 0)
        for word in words:
            counts[word] += 1
        sent_word_dict_array.append(counts)

    # NOTE(review): the counts of the first line are dropped, exactly as before
    # (its words still contribute to the vocabulary). Presumably the first line
    # is a header or placeholder — confirm against the data file. Slicing is
    # used instead of `del` so an empty file returns [] rather than raising.
    return sent_word_dict_array[1:]

In [354]:
# array = get_doc()
# import pandas as pd
# pd.DataFrame(array)

In [355]:
def computeIDF(docList):
    """
    Compute a smoothed inverse document frequency for every word.

        idf(word) = log10(N / (df(word) + e))

    where N is the number of documents, df(word) is the number of documents
    in which the word has a count greater than 0, and e is a tiny constant
    that avoids division by zero for words with df == 0.
    In our case, docs are sentences.

    docList: list of {word: count} dicts, one per document.

    Returns a {word: idf} dict covering every word that appears as a key in
    any document. An empty docList yields an empty dict.
    """
    e = 0.00000000000001
    N = len(docList)

    # Document frequency per word. Keys are collected from every document
    # (not just the first), so documents with differing key sets are handled.
    doc_freq = {}
    for doc in docList:
        for word, val in doc.items():
            doc_freq[word] = doc_freq.get(word, 0) + (1 if val > 0 else 0)

    return {word: math.log10(N / (float(df) + e)) for word, df in doc_freq.items()}

In [356]:
# print(computeIDF(get_doc()))

In [357]:
def get_ngrams_mtbt(sentence=None, n=4):
    """
    N-grams of the sentence to be translated (MTBT).

    sentence: text to split into n-grams; when omitted, the notebook's
              built-in example sentence is used.
    n:        n-gram length (default 4).

    Punctuation listed in string.punctuation is stripped first, mirroring
    get_ngrams_candidates (a no-op for the built-in sentence), and the text
    is split on whitespace.

    Returns a list of n-tuples of tokens, in sentence order; empty when the
    sentence has fewer than n tokens.
    """
    if sentence is None:
        sentence = "There are a few controversies surrounding the the company may keep changing its business strategy topic how many songs did Rafi sing during his lifetime"

    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    return list(nltk.ngrams(sentence.split(), n))



In [358]:
def get_ngrams_candidates(candidate_sentence, n=4):
    """
    N-grams of a candidate sentence from the translation memory.

    candidate_sentence: raw sentence text.
    n:                  n-gram length (default 4).

    Punctuation listed in string.punctuation is stripped and the text is
    split on whitespace before the n-grams are formed.

    Returns a list of n-tuples of tokens, in sentence order; empty when the
    sentence has fewer than n tokens.
    """
    candidate_sentence = candidate_sentence.translate(str.maketrans('', '', string.punctuation))
    return list(nltk.ngrams(candidate_sentence.split(), n))


In [359]:
# get_ngrams_candidates("The company may keep changing its business strategy in a steady pace to adapt to the pressure and competition.")

In [360]:
def ngrams_intersection(candidate_sentence):
    """
    Return the distinct n-grams that the MTBT sentence and the given
    candidate sentence have in common, as a list of tuples.
    """
    mtbt_unique = set(get_ngrams_mtbt())
    candidate_unique = set(get_ngrams_candidates(candidate_sentence))

    shared = mtbt_unique.intersection(candidate_unique)
    return list(shared)

In [361]:
# IDF table cache: building it means reading and tokenising the whole TM file,
# so it is done once and reused. Re-running this cell resets the cache.
_IDF_CACHE = {}


def compute_w_sum(ngrams_list, idfs=None):
    """
    Sum the IDF weights of every token in every n-gram of ngrams_list.

    ngrams_list: iterable of n-grams, each an iterable of tokens. A token
                 that occurs in several n-grams is counted once per
                 occurrence.
    idfs:        optional {token: idf} mapping. When omitted, the table is
                 built from the translation memory via
                 computeIDF(get_doc()) on first use and cached.

    Returns the total weight (0 for an empty ngrams_list).
    Raises KeyError if a token is missing from the IDF table.
    """
    if idfs is None:
        if "idfs" not in _IDF_CACHE:
            _IDF_CACHE["idfs"] = computeIDF(get_doc())
        idfs = _IDF_CACHE["idfs"]

    w = 0
    for ngram in ngrams_list:
        for token in ngram:
            w += idfs[token]
    return w

In [362]:
# Sanity check: total IDF weight over all n-grams of the MTBT sentence.
# The bare expression on the last line is what the cell displays.
M_ngrams = get_ngrams_mtbt()

compute_w_sum(M_ngrams)

152.98964151085798

In [363]:
def compute_wpn(candidate_sentence, Z=0.75):
    """
    Weighted n-gram precision of a candidate sentence against the MTBT sentence.

        wpn = w(M and C) / (Z * w(M) + (1 - Z) * w(C))

    where M are the n-grams of the MTBT sentence, C the n-grams of the
    candidate, "M and C" their shared n-grams, and w(.) the summed IDF
    weight returned by compute_w_sum.

    candidate_sentence: raw candidate sentence text.
    Z: interpolation weight between the MTBT weight and the candidate
       weight in the denominator (default 0.75, the value previously
       hard-coded here).

    Returns the score as a float; 0.0 when the denominator is zero.
    """
    M_ngrams = get_ngrams_mtbt()
    C_ngrams = get_ngrams_candidates(candidate_sentence)
    intersection_ngrams = ngrams_intersection(candidate_sentence)

    w_M_ngrams = compute_w_sum(M_ngrams)
    w_C_ngrams = compute_w_sum(C_ngrams)
    w_intersection_ngrams = compute_w_sum(intersection_ngrams)

    denominator = (Z*w_M_ngrams) + ((1-Z)*w_C_ngrams)
    if denominator == 0:
        # Nothing to normalise by (both sides carry zero weight): no match.
        return 0.0

    return w_intersection_ngrams / denominator

In [364]:
# compute_wpn()


In [367]:
# Score every TM sentence against the MTBT sentence and keep the best match.
with open("tm_data/source_text.txt") as source_file:
    sentences = source_file.read().splitlines()

# The file is closed at this point; scoring does not need it open.
max_wpn = 0
best_sentence = None  # stays None when no candidate scores above 0 (previously a NameError)

for sentence in sentences:
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    wpn = compute_wpn(sentence)
    if wpn > max_wpn:
        max_wpn = wpn
        best_sentence = sentence

print(max_wpn)
print(best_sentence)

0.562765780206217
There are a few controversies surrounding the topic how many songs did Rafi sing during his lifetime 
