# Translation Memory Retrieval using Weighted N-Grams

In [240]:
import nltk
import math
from collections import Counter
import string
import numpy as np

In [241]:
# Fetch the NLTK resources this notebook relies on: the 'stopwords' corpus
# (read by stopwords.words further down) and the 'punkt' tokenizer models
# (used by word_tokenize). Packages that are already up to date are not
# downloaded again.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ashes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [242]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# English stop-word list from NLTK; used below to filter the input tokens.
# NOTE(review): this needs the NLTK 'stopwords' corpus to be available
# locally -- confirm it has been downloaded before running this cell.
stop_words = stopwords.words('english')

In [243]:
# Read the query sentence from the user and normalise it to lowercase.
input_line = input().lower()

# Split the normalised sentence into word tokens.
input_tokens = word_tokenize(input_line)

# Keep only the tokens that are not English stop words.
content_words = list(filter(lambda token: token not in stop_words, input_tokens))

print(input_tokens)

ilkncls csdnclsknc
['ilkncls', 'csdnclsknc']


## Weighted N-Gram Precision

In [244]:
# Build an IDF table over the source side of the translation memory (TM):
#     idf(t) = 1 + ln(N / df(t))
# where N is the number of TM sentences and df(t) is the number of sentences
# that contain token t at least once.
idf_values = {}
with open("../project/tm_data/tm_src.txt", encoding="utf-8") as source_file:
    # Split on "\n" only, so that sentences[i] is physical line i of the file.
    # str.splitlines() also breaks on characters such as \x0b, \x0c, \x85 and
    # \u2028, which would shift indices relative to the target file that is
    # read line by line later in the notebook.
    sentences = [line.rstrip("\n") for line in source_file]

tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

# Count every token once per sentence (document frequency) in a single pass
# instead of re-scanning the whole corpus for each vocabulary entry.
document_frequency = Counter()
for sentence_tokens in tokenized_sentences:
    document_frequency.update(set(sentence_tokens))

all_tokens_set = set(document_frequency)
num_sentences = len(tokenized_sentences)
for tkn, doc_count in document_frequency.items():
    idf_values[tkn] = 1 + math.log(num_sentences / doc_count)

### Getting the M_ngrams and C_ngrams

In [245]:
def get_M_ngrams(sentence=None, n=4):
    """Return the word n-grams of the sentence to be matched against the TM.

    Args:
        sentence: Sentence to split on whitespace. When omitted, the
            notebook's built-in example sentence is used, which is the
            behaviour of the original zero-argument call.
        n: Order of the n-grams; defaults to 4.

    Returns:
        A list of n-tuples of tokens in sentence order. Repeated n-grams are
        kept. The list is empty when the sentence has fewer than ``n`` tokens.
    """
    if sentence is None:
        sentence = "There are a few controversies surrounding the the company may keep changing its business strategy topic how many songs did Rafi sing during his lifetime"

    # Tokens come from a plain whitespace split, mirroring get_C_ngrams.
    return list(nltk.ngrams(sentence.split(), n))



In [246]:
# Module-level n-grams of the sentence being matched; the scoring functions
# defined in later cells read this global.
M_ngrams = get_M_ngrams()

In [247]:
def get_C_ngrams(candidate_sentence, n=4):
    """Return the word n-grams of a candidate TM sentence.

    Args:
        candidate_sentence: Sentence to split on whitespace.
        n: Order of the n-grams; defaults to 4.

    Returns:
        A list of n-tuples of tokens in sentence order. Repeated n-grams are
        kept. The list is empty when the sentence has fewer than ``n`` tokens.
    """
    return list(nltk.ngrams(candidate_sentence.split(), n))


### To compute numerator and denominator

In [248]:
def ngrams_intersection(candidate_sentence):
    """List the distinct n-grams shared by the candidate and ``M_ngrams``.

    The candidate's n-grams come from ``get_C_ngrams``; both sides are
    reduced to sets first, so each shared n-gram appears once in the result.
    The order of the returned list is unspecified.
    """
    candidate_set = set(get_C_ngrams(candidate_sentence))
    reference_set = set(M_ngrams)
    return list(reference_set.intersection(candidate_set))

In [249]:
def compute_w_sum(ngrams_list, idf=None):
    """Sum the IDF weights of every token occurrence in a list of n-grams.

    Args:
        ngrams_list: Iterable of n-grams, each an iterable of tokens.
        idf: Optional mapping from token to weight. When omitted, the
            module-level ``idf_values`` table is used (original behaviour).

    Returns:
        The accumulated weight. Tokens missing from the table contribute
        nothing; the result is 0 when no token is found.
    """
    weights = idf_values if idf is None else idf

    w = 0
    for ngram in ngrams_list:
        for token in ngram:
            if token in weights:
                w += weights[token]
    return w

### Final score for each sentence with respect to the input sentence

In [250]:
def compute_wpn(candidate_sentence, Z=0.75):
    """Score a candidate TM sentence against ``M_ngrams``.

    The score is

        w(M & C) / (Z * w(M) + (1 - Z) * w(C))

    where ``w`` is ``compute_w_sum``, ``M`` is the module-level ``M_ngrams``
    and ``C`` is the candidate's n-grams from ``get_C_ngrams``.

    Args:
        candidate_sentence: TM source sentence to score.
        Z: Interpolation weight between the two sides of the denominator;
            defaults to 0.75.

    Returns:
        The weighted score, or 0.0 when the denominator is zero (neither side
        contributes any weighted n-gram token), instead of raising
        ZeroDivisionError.
    """
    C_ngrams = get_C_ngrams(candidate_sentence)
    intersection_ngrams = ngrams_intersection(candidate_sentence)

    w_M_ngrams = compute_w_sum(M_ngrams)
    w_C_ngrams = compute_w_sum(C_ngrams)
    w_intersection_ngrams = compute_w_sum(intersection_ngrams)

    denominator = (Z * w_M_ngrams) + ((1 - Z) * w_C_ngrams)
    if denominator == 0:
        # Nothing to compare against: treat as "no overlap".
        return 0.0

    return w_intersection_ngrams / denominator

In [251]:
N = 5  # number of best-scoring TM entries to report

max_wpn = 0
best_sentence = None  # remains None when no TM sentence scores above zero
wnp_all = []

# Score every TM source sentence and remember the first one that reaches the
# highest score.
for sentence in sentences:
    wpn = compute_wpn(sentence)
    wnp_all.append(wpn)
    if wpn > max_wpn:
        max_wpn = wpn
        best_sentence = sentence

wnp_all = np.array(wnp_all)
# argsort is ascending, so the last N positions are the N highest scores,
# with the best match last. Despite its name, least_N_indices therefore holds
# the top-N entries; the name is kept because the retrieval cell reads it.
sorted_indices = np.argsort(wnp_all)
least_N_indices = sorted_indices[-N:]

print()
for i in least_N_indices:
    print(sentences[i], wnp_all[i])


There are a few societies 0.04892725559201201
There are a few societies 0.04892725559201201
The company may keep changing its business strategy in a steady pace to adapt to the pressure and competition. 0.21756950755179016
The company may keep changing its business strategy in a steady pace to adapt to the pressure and competition. 0.21756950755179016
There are a few controversies surrounding the topic how many songs did Rafi sing during his lifetime . 0.5544249929472864


### Retrieval of Target from TM

In [252]:
# Load the target side of the TM, one entry per physical line.
# NOTE(review): line i is presumably the translation of source sentence i --
# confirm that both TM files are line-aligned.
with open('../project/tm_data/tm_tgt.txt', encoding='utf-8') as tgt_tm:
    # Drop the trailing newline so printing does not add a blank line after
    # every entry.
    tgt_tm_array = [line.rstrip('\n') for line in tgt_tm]

# Show the target segments for the selected source sentences, in the same
# order as least_N_indices. int(i) keeps the printed index a plain integer
# regardless of how the NumPy version formats its scalar types.
for i in least_N_indices:
    print([int(i)], tgt_tm_array[i])

[756457] आपके नौकरी के आवेदन को कैसे परखेगा.

[708745] रेफ डॉक:  IIT(BHU/IWD/ET/04/Hostel/2018 -19/813-

[769884] उसका मन तो एक ही बात में लगा था कि जो कुछ भी उसने बीते सालों में सीखा है उस सबको एक बार दोहरा ले ।<s> वह कीमियागर जरूर उसकी परीक्षा लेगा ।

[3] कंपनी दबाव और प्रतिस्पर्धा के अनुकूल बनने के लिए धीमी गति से अपनी व्यावसायिक रणनीति को बदल सकती है।

[9] रफ़ी ने अपने जीवन में कुल कितने गाने गाए इस पर कुछ विवाद है ।

