# Translation Memory Retrieval

Note: Preprocessing is a separate module and must be done before using this!

In [1]:
import sys
import nltk
import numpy as np

In [2]:
# Fetch the NLTK resources this notebook relies on: 'punkt' backs
# word_tokenize, and 'stopwords' backs stopwords.words('english').
# Without the second download a fresh environment raises LookupError
# as soon as the stop-word list is loaded.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/khannatanmai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Keep the English stop words in a set: the collection is only used for
# membership tests while filtering tokens, and a set lookup avoids scanning
# the whole stop-word list for every token.
stop_words = set(stopwords.words('english'))

In [4]:
# Read the query sentence and normalise it to lowercase in one step.
input_line = input().lower()

# Split the query into word tokens.
input_tokens = word_tokenize(input_line)

# Keep only the content words, i.e. skip every token that is a stop word.
content_words = []
for token in input_tokens:
    if token in stop_words:
        continue
    content_words.append(token)

print(content_words)

I want to safely remove THE selected drive if it is POSSIBLE
['want', 'safely', 'remove', 'selected', 'drive', 'possible']


## Edit Distance

### Load TM

We now work with the whole TM file, which contains ~800,000 sentences.

Approach:
We take each sentence in the TM and check if any of the content words are present in it. If they are, we then calculate edit-distance and store it. This way we save time as we don't have to calculate edit distance for each sentence in the TM.

Once we have the list of edit distances, we take the N lowest (i.e. the N best matches) and print the corresponding entries from the target TM.

In [5]:
# Load two views of the source TM. Entry k of each list comes from line k of
# its file; the matching step indexes src_tm_tokenized with positions taken
# from src_tm_words, so the two files are assumed to be line-aligned.
src_tm_words = [] #Content Words in Source TM
src_tm_tokenized = []

# The encoding is given explicitly so that reading does not depend on the
# platform's default locale encoding.
with open('../tm_data/tm_src_pp.txt', encoding='utf-8') as src_tm:
    for line in src_tm:
        # Drop trailing whitespace (including the newline), then split the
        # tab-separated words of this entry.
        src_tm_words.append(line.rstrip().split('\t'))

with open('../tm_data/tm_src_lower.txt', encoding='utf-8') as org_src_tm:
    for line in org_src_tm:
        # Token lists are what the edit distance is later computed over.
        src_tm_tokenized.append(word_tokenize(line))
        

## Execute Edit Distance

In [6]:
N = 5 #Top N matches returned

edit_distance_all = []
indices_all = []

# Walk the source TM once. An entry becomes a candidate as soon as it shares
# at least one content word with the query; only candidates pay for the
# token-level edit distance against the full query.
for position, candidate in enumerate(src_tm_words):
    if not any(word in candidate for word in content_words):
        continue

    ed = nltk.edit_distance(input_tokens, src_tm_tokenized[position])
    edit_distance_all.append(ed)
    indices_all.append(position)

count = len(indices_all)
total = len(src_tm_words)

print('Running Edit Distance on ' + str(count) + ' Candidates out of a possible ' + str(total) + '!\n')

# Rank the candidates by ascending distance. The sorted values are positions
# into indices_all / edit_distance_all, not TM line numbers.
edit_distance_all = np.array(edit_distance_all)
sorted_indices = np.argsort(edit_distance_all)
least_N_indices = sorted_indices[:N] #Smallest distances come first

for rank in least_N_indices:
    tm_index = indices_all[rank]
    # Show the 1-based TM line number, its content words and the distance.
    print(tm_index + 1, src_tm_words[tm_index], edit_distance_all[rank])

Running Edit Distance on 8013 Candidates out of a possible 772820!

208 ['safely', 'remove', 'selected', 'drive'] 7
103529 ['want', 'see', 'manager', '.'] 8
767540 ['want', 'introduce', 'creators'] 8
95387 ['want', '.'] 8
382203 ['want', 'explore', 'world', 'fullest'] 8


## Retrieval of Target from TM

In [7]:
# Load the target side of the TM, one entry per line. The encoding is given
# explicitly because the target text is non-ASCII and would otherwise be
# decoded with the platform's default locale encoding.
with open('../tm_data/tm_tgt.txt', encoding='utf-8') as tgt_tm:
    # Lines keep their trailing newline, so each printed match is followed by
    # a blank line.
    tgt_tm_array = tgt_tm.readlines()

# least_N_indices holds positions into indices_all, which maps them back to
# TM line positions; the printed line number is 1-based.
for i in least_N_indices:
    print(indices_all[i]+1, tgt_tm_array[indices_all[i]])

208 चयनित ड्राइव सुरक्षित रूप से निकालें

103529 अौर ध्यान दें

767540 माता-पिता और कानूनी अभिभावकों को अपनी आस्था और विश्वास के मुताबिक अपने बच्चों को धार्मिक और नैतिक शिक्षा दिलाने की आज़ादी है।

95387 फिर वहाँ उन्हें एक दीवार मिली जो गिरा चाहती थी

382203 विभाग का नाम: खनन इंजीनियरिंग विभाग

