**Reference**: <https://hub.packtpub.com/create-an-rnn-based-python-machine-translation-system-tutorial/>

In [8]:
from nltk.corpus import comtrans
print(comtrans.aligned_sents('alignment-de-en.txt')[0])

<AlignedSent: 'Wiederaufnahme der S...' -> 'Resumption of the se...'>


In [9]:
print(comtrans.aligned_sents()[0].words)
print(comtrans.aligned_sents()[0].mots)

['Wiederaufnahme', 'der', 'Sitzungsperiode']
['Resumption', 'of', 'the', 'session']


In [10]:
print(comtrans.aligned_sents()[0].alignment)

0-0 1-1 1-2 2-3


In [11]:
import pickle
import re
from collections import Counter
from nltk.corpus import comtrans

In [12]:
def retrieve_corpora(translated_sentences_l1_l2='alignment-de-en.txt'):
    print("Retrieving corpora: {}".format(translated_sentences_l1_l2))
    als = comtrans.aligned_sents(translated_sentences_l1_l2)
    sentences_l1 = [sent.words for sent in als]
    sentences_l2 = [sent.mots for sent in als]
    return sentences_l1, sentences_l2

In [13]:
sen_l1, sen_l2 = retrieve_corpora()
print("# A sentence in the two languages DE & EN")
print("DE:", sen_l1[0])
print("EN:", sen_l2[0])
print("# Corpora length (i.e. number of sentences)")
print(len(sen_l1))
assert len(sen_l1) == len(sen_l2)

Retrieving corpora: alignment-de-en.txt


# A sentence in the two languages DE & EN
DE: ['Wiederaufnahme', 'der', 'Sitzungsperiode']
EN: ['Resumption', 'of', 'the', 'session']
# Corpora length (i.e. number of sentences)
33334


In [14]:
import re

def clean_sentence(sentence):
    regex_splitter = re.compile(r"([!?.,:;$'\")( ])")
    clean_words = [re.split(regex_splitter, word.lower()) for word in sentence]
    return [w for words in clean_words for w in words if words and w]


In [15]:
clean_sen_l1 = [clean_sentence(s) for s in sen_l1]
clean_sen_l2 = [clean_sentence(s) for s in sen_l2]
print("# Same sentence as before, but chunked and cleaned")
print("DE:", clean_sen_l1[0])
print("EN:", clean_sen_l2[0])

# Same sentence as before, but chunked and cleaned
DE: ['wiederaufnahme', 'der', 'sitzungsperiode']
EN: ['resumption', 'of', 'the', 'session']


In [16]:
def filter_sentence_length(sentences_l1, sentences_l2, min_len=0, max_len=20):
    filtered_sentences_l1 = []
    filtered_sentences_l2 = []
    for i in range(len(sentences_l1)):
        if min_len <= len(sentences_l1[i]) <= max_len and min_len <= len(sentences_l2[i]) <= max_len:
            filtered_sentences_l1.append(sentences_l1[i])
            filtered_sentences_l2.append(sentences_l2[i])
    return filtered_sentences_l1, filtered_sentences_l2


In [17]:
filt_clean_sen_l1, filt_clean_sen_l2 = filter_sentence_length(clean_sen_l1, 
          clean_sen_l2)
print("# Filtered Corpora length (i.e. number of sentences)")
print(len(filt_clean_sen_l1))
assert len(filt_clean_sen_l1) == len(filt_clean_sen_l2)

# Filtered Corpora length (i.e. number of sentences)
14788


In [18]:
import data_utils

def create_indexed_dictionary(sentences, dict_size=10000, storage_path=None):
    count_words = Counter()
    dict_words = {}
    opt_dict_size = len(data_utils.OP_DICT_IDS)
    
    for sen in sentences:
        for word in sen:
            count_words[word] += 1

    dict_words[data_utils._PAD] = data_utils.PAD_ID
    dict_words[data_utils._GO] = data_utils.GO_ID
    dict_words[data_utils._EOS] = data_utils.EOS_ID
    dict_words[data_utils._UNK] = data_utils.UNK_ID

    for idx, item in enumerate(count_words.most_common(dict_size)):
        dict_words[item[0]] = idx + opt_dict_size

    if storage_path:
        pickle.dump(dict_words, open(storage_path, "wb"))
        
    return dict_words


In [19]:
def sentences_to_indexes(sentences, indexed_dictionary):
    indexed_sentences = []
    not_found_counter = 0
    
    for sent in sentences:
        idx_sent = []
        for word in sent:
            try:
                idx_sent.append(indexed_dictionary[word])
            except KeyError:
                idx_sent.append(data_utils.UNK_ID)
                not_found_counter += 1
        indexed_sentences.append(idx_sent)
    
    print('[sentences_to_indexes] Did not find {} words'.format(not_found_counter))
    return indexed_sentences


In [21]:
# Example of defining filt_clean_sen_l1 and filt_clean_sen_l2 with actual data
filt_clean_sen_l1 = [
    ["sentence", "one", "for", "language", "1"],
    ["another", "sentence", "for", "language", "1"],
    # Add more sentences as needed
]

filt_clean_sen_l2 = [
    ["sentence", "one", "for", "language", "2"],
    ["another", "sentence", "for", "language", "2"],
    # Add more sentences as needed
]

# Rest of your code remains the same
dict_l1 = create_indexed_dictionary(filt_clean_sen_l1, dict_size=15000, storage_path="/tmp/l1_dict.p")
dict_l2 = create_indexed_dictionary(filt_clean_sen_l2, dict_size=10000, storage_path="/tmp/l2_dict.p")
idx_sentences_l1 = sentences_to_indexes(filt_clean_sen_l1, dict_l1)
idx_sentences_l2 = sentences_to_indexes(filt_clean_sen_l2, dict_l2)

print("# Same sentences as before, with their dictionary ID")
print("DE:", list(zip(filt_clean_sen_l1[0], idx_sentences_l1[0])))


[sentences_to_indexes] Did not find 0 words
[sentences_to_indexes] Did not find 0 words
# Same sentences as before, with their dictionary ID
DE: [('sentence', 4), ('one', 8), ('for', 5), ('language', 6), ('1', 7)]


In [22]:
# Same sentences as before, with their dictionary ID
DE: [('wiederaufnahme', 1616), ('der', 7), ('sitzungsperiode', 618)]
EN: [('resumption', 1779), ('of', 8), ('the', 5), ('session', 549)]

In [23]:
def extract_max_length(corpora):
    return max([len(sentence) for sentence in corpora])

In [24]:
max_length_l1 = extract_max_length(idx_sentences_l1)
max_length_l2 = extract_max_length(idx_sentences_l2)
print("# Max sentence sizes:")
print("DE:", max_length_l1)
print("EN:", max_length_l2)

# Max sentence sizes:
DE: 5
EN: 5


In [25]:
def prepare_sentences(sentences_l1, sentences_l2, len_l1, len_l2):
    assert len(sentences_l1) == len(sentences_l2)
    data_set = []
    for i in range(len(sentences_l1)):
        padding_l1 = len_l1 - len(sentences_l1[i])
        pad_sentence_l1 = ([data_utils.PAD_ID]*padding_l1) + sentences_l1[i]
        padding_l2 = len_l2 - len(sentences_l2[i])
        pad_sentence_l2 = [data_utils.GO_ID] + sentences_l2[i] + [data_utils.EOS_ID] + ([data_utils.PAD_ID] * padding_l2)
        data_set.append([pad_sentence_l1, pad_sentence_l2])
    return data_set

In [26]:
data_set = prepare_sentences(idx_sentences_l1, idx_sentences_l2, max_length_l1, max_length_l2)
print("# Prepared minibatch with paddings and extra stuff")
print("DE:", data_set[0][0])
print("EN:", data_set[0][1])
print("# The sentence pass from X to Y tokens")
print("DE:", len(idx_sentences_l1[0]), "->", len(data_set[0][0]))
print("EN:", len(idx_sentences_l2[0]), "->", len(data_set[0][1]))

# Prepared minibatch with paddings and extra stuff
DE: [4, 8, 5, 6, 7]
EN: [1, 4, 8, 5, 6, 7, 2]
# The sentence pass from X to Y tokens
DE: 5 -> 5
EN: 5 -> 7
