# Translation Memory Retrieval using Weighted N-Grams

In [98]:
import nltk
import math
from collections import Counter
import string
import numpy as np
import json
import ast

In [99]:
# Fetch every NLTK resource this notebook loads. 'stopwords' is needed by the
# stopwords.words('english') call in a later cell; without it a fresh
# environment raises LookupError there.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/ashes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [100]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# English stop-word list taken from NLTK's stopwords corpus.
# NOTE(review): no cell shown in this notebook reads `stop_words` — confirm
# whether it is still needed.
stop_words = stopwords.words('english')

In [101]:
# Read the query sentence interactively.
input_line = input()

# Lower-case the query. The TM source file name ("..._lower.txt") suggests the
# stored sentences are lower-cased as well — TODO confirm.
sentence = input_line.lower()
# Example query that can be enabled instead of typing one:
# sentence = "There were many controversies about the songs he performed during his lifetime ."

# Echo the normalised query.
print(sentence)

I request you to please remove the drive safely.
i request you to please remove the drive safely.


## Weighted N-Gram Precision

### Get sentences and IDF values

In [102]:
# Source side of the translation memory, one sentence per line.
# Lines are split on '\n' only (by iterating the file) so that line numbers agree
# with the target file, which is read line by line further down. str.splitlines()
# would additionally break on characters such as \x0b, \x0c, \x85 and \u2028 and
# thereby shift the source indices relative to the target ones.
# The encoding is explicit so decoding does not depend on the platform default.
with open("./tm_data/tm_src_2000_lower.txt", encoding="utf-8") as source_file:
    sentences = [line.rstrip("\n") for line in source_file]


with open('./tm_data/idf_values_2000.json', encoding="utf-8") as json_file:
    idf_values_str = json.load(json_file)

# The JSON document is expected to hold a *string* containing a Python dict
# literal, hence the second parsing step. A file that stores the mapping directly
# as a JSON object is accepted too.
idf_values = (
    ast.literal_eval(idf_values_str)
    if isinstance(idf_values_str, str)
    else idf_values_str
)


### Getting the M_ngrams and C_ngrams

In [103]:
def get_M_ngrams(sentence, n=2):
    """Return the word n-grams of the input (query) sentence.

    Args:
        sentence: Text to split; tokens are obtained with ``str.split()``,
            i.e. they are whitespace-separated.
        n: Size of each n-gram. Defaults to 2 (bigrams).

    Returns:
        A list of ``n``-tuples of tokens in order of appearance. The list is
        empty when the sentence has fewer than ``n`` tokens.
    """
    return list(nltk.ngrams(sentence.split(), n))



In [104]:
# N-grams of the user's query. Stored as a module-level global because the
# scoring functions defined below read `M_ngrams` directly.
M_ngrams = get_M_ngrams(sentence)

In [105]:
def get_C_ngrams(candidate_sentence, n=2):
    """Return the word n-grams of a candidate sentence from the translation memory.

    Args:
        candidate_sentence: Text to split; tokens are obtained with
            ``str.split()``, i.e. they are whitespace-separated.
        n: Size of each n-gram. Defaults to 2 (bigrams).

    Returns:
        A list of ``n``-tuples of tokens in order of appearance. The list is
        empty when the sentence has fewer than ``n`` tokens.
    """
    return list(nltk.ngrams(candidate_sentence.split(), n))


### To compute numerator and denominator

In [106]:
def ngrams_intersection(candidate_sentence):
    """Return the distinct n-grams shared by the query and a candidate sentence.

    The query n-grams are read from the module-level ``M_ngrams``; those of
    the candidate come from ``get_C_ngrams``. Both sides are turned into sets,
    so duplicates collapse and the order of the returned list is unspecified.
    """
    candidate_grams = get_C_ngrams(candidate_sentence)
    shared = set(M_ngrams) & set(candidate_grams)
    return list(shared)

In [107]:
def compute_w_sum(ngrams_list):
    """Add up the IDF weights of all tokens contained in the given n-grams.

    Weights are looked up in the module-level ``idf_values`` mapping. A token
    is counted once per occurrence, so a word that appears in two n-grams
    contributes twice; tokens without an entry contribute nothing.

    Returns 0 when no token has a weight.
    """
    total = 0
    tokens = (token for ngram in ngrams_list for token in ngram)
    for token in tokens:
        if token not in idf_values:
            continue
        total += idf_values[token]
    return total

### Final score for each sentence with respect to the input sentence

In [108]:
def compute_wpn(candidate_sentence, Z=0.75):
    """Score a candidate TM sentence against the query (weighted n-gram precision).

    The score is::

        w(M & C) / (Z * w(M) + (1 - Z) * w(C))

    where ``M`` is the module-level ``M_ngrams`` (query n-grams), ``C`` are the
    candidate's n-grams from ``get_C_ngrams``, ``M & C`` are the distinct
    n-grams present in both, and ``w`` is ``compute_w_sum``.

    Args:
        candidate_sentence: TM source sentence to score.
        Z: Weight of the query side in the denominator; the candidate side
            gets ``1 - Z``. Defaults to 0.75.

    Returns:
        The score, or ``0.0`` when the denominator is zero (neither side
        contains a weighted token) instead of raising ZeroDivisionError.
    """
    C_ngrams = get_C_ngrams(candidate_sentence)
    # Same set intersection that ngrams_intersection() performs, computed from
    # the n-grams already at hand so the candidate is not tokenised twice.
    intersection_ngrams = list(set(M_ngrams) & set(C_ngrams))

    w_M_ngrams = compute_w_sum(M_ngrams)
    w_C_ngrams = compute_w_sum(C_ngrams)

    denominator = (Z * w_M_ngrams) + ((1 - Z) * w_C_ngrams)
    if denominator == 0:
        # Nothing to compare (e.g. both sentences are shorter than one n-gram).
        return 0.0

    return compute_w_sum(intersection_ngrams) / denominator

In [109]:
N = 5  # how many of the best matches to report

# Score every TM source sentence against the query. The loop variable is
# deliberately not called `sentence`: that name holds the user's query, and
# rebinding it here would make a re-run of the M_ngrams cell use a TM sentence.
max_wpn = 0
best_sentence = None  # stays None when no candidate scores above zero
wnp_all = []

for candidate in sentences:
    wpn = compute_wpn(candidate)
    wnp_all.append(wpn)
    if wpn > max_wpn:
        max_wpn = wpn
        best_sentence = candidate

wnp_all = np.array(wnp_all)
# argsort is ascending, so the last N positions are the N highest scores.
# Despite its name, `least_N_indices` therefore holds the best matches,
# ordered from weakest to strongest.
sorted_indices = np.argsort(wnp_all)
least_N_indices = sorted_indices[-N:]

print()
for i in least_N_indices:
    # int() keeps the label printed as "[811]"; a NumPy scalar inside a list is
    # shown as "np.int64(811)" by NumPy 2.
    print([int(i) + 1], sentences[i], wnp_all[i])


[811] we have sent down the book to you to make everything clear 0.06851292170457951
[1133] here is cool water for you to wash in and drink 0.0715390087369681
[1425] enabled tv screen that allows you to play pre 0.07441939850239553
[103] and had he showed them to you to be numerous 0.074573352760082
[208] safely remove the selected drive 0.12471474617158834


### Retrieval of Target from TM

In [110]:
# Target side of the translation memory.
# NOTE(review): line k is assumed to be the translation of source line k —
# confirm against how the TM files were produced.
# The encoding is explicit because the text is non-ASCII (Devanagari in the
# recorded output) and must not depend on the platform's default codec.
with open('./tm_data/tm_tgt_2000.txt', encoding='utf-8') as tgt_tm:
    # Every element keeps its trailing '\n', exactly as readline() returns it.
    tgt_tm_array = tgt_tm.readlines()

# Show the stored translations of the best-matching source sentences, using
# the same 1-based labels as the ranking above.
for i in least_N_indices:
    # int() keeps the label printed as "[811]" under NumPy 2 as well.
    print([int(i) + 1], tgt_tm_array[i])

[811] हमने तुमपर किताब अवतरित की हर चीज़ को खोलकर बयान करने के लिए और मुस्लिम (आज्ञाकारियों) के लिए मार्गदर्शन

[1133] 

[1425] इन दरवाजों का नियंत्रण ट्रेन के गार्ड के पास रखा गया है जिससे यात्रियों की सुरक्षा और सुदृढ़ हुई है।

[103] निश्चय ही वह तो जो कुछ दिलों में होता है उसे भी जानता है

[208] चयनित ड्राइव सुरक्षित रूप से निकालें

