In [2]:
import os, sys
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, GRU, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt
# Record the TensorFlow version this notebook was executed with.
print('tensorflow version: ',tf.__version__)

tensorflow version:  2.0.0


In [13]:
# --- Training settings (not referenced in the cells shown here; presumably
# --- consumed by the model-building / fitting cells -- TODO confirm) ---
BATCH_SIZE = 64
EPOCHS = 20
LSTM_NODES = 256

# --- Corpus limits ---
NUM_SENTENCES = 2500        # cap on how much of the corpus file is loaded
MAX_SENTENCE_LENGTH = 50    # NOTE(review): unused in the visible cells
MAX_NUM_WORDS = 2500        # `num_words` passed to both Keras tokenizers

# --- Embeddings ---
EMBEDDING_SIZE = 100        # column count of the embedding matrix (GloVe 100d file)

### Data processing

In [4]:
# Peek at the first two raw lines of the corpus to confirm its layout
# (tab-separated: English text, Hindi text, attribution).
with open(r'/Users/user/Desktop/python_stuff/NLP/hin-eng/hin.txt', encoding="utf-8") as file:
    # zip() stops at the shorter iterable, so a file with fewer than two lines
    # yields a shorter preview instead of raising StopIteration.
    head = [line for _, line in zip(range(2), file)]
print(head)

['Wow!\tवाह!\tCC-BY 2.0 (France) Attribution: tatoeba.org #52027 (Zifre) & #6179147 (fastrizwaan)\n', 'Help!\tबचाओ!\tCC-BY 2.0 (France) Attribution: tatoeba.org #435084 (lukaszpp) & #459377 (minshirui)\n']


In [7]:
# Load at most NUM_SENTENCES sentence pairs from the corpus.
# Each line of the file holds an English sentence, its corresponding Hindi
# sentence and an attribution field, separated by '\t'.
# Three parallel lists are built:
#   input_sentences         - the English sentences
#   output_sentences        - the Hindi sentences with an end-of-sentence ' <eos>' suffix
#   output_sentences_inputs - the same Hindi sentences with a start-of-sentence '<sos> ' prefix

input_sentences = []
output_sentences = []
output_sentences_inputs = []

# `with` guarantees the file handle is closed, even if parsing fails part-way.
with open(r'/Users/user/Desktop/python_stuff/NLP/hin-eng/hin.txt', encoding="utf-8") as corpus_file:
    for line in corpus_file:
        # Count collected pairs (not raw lines) so skipped lines do not eat
        # into the NUM_SENTENCES budget.
        if len(input_sentences) >= NUM_SENTENCES:
            break

        fields = line.rstrip().split('\t')

        # Need at least the English and Hindi columns. Lines with a missing
        # attribution (or extra tabs) no longer crash the tuple unpacking;
        # lines without a translation are skipped.
        if len(fields) < 2:
            continue

        input_sentence, output = fields[0], fields[1]

        output_sentence = output + ' <eos>'
        output_sentence_input = '<sos> ' + output

        input_sentences.append(input_sentence)
        output_sentences.append(output_sentence)
        output_sentences_inputs.append(output_sentence_input)

print("num samples input:", len(input_sentences))
print("num samples output:", len(output_sentences))
print("num samples output input:", len(output_sentences_inputs))

num samples input: 2500
num samples output: 2500
num samples output input: 2500


In [8]:
# Show sample 250 in each of the three parallel lists (one line per list).
print(
    *('{} {}'.format(label, sentences[250]) for label, sentences in (
        ('Input sentence: ', input_sentences),
        ('Output sentence: ', output_sentences),
        ('Output sentence as input: ', output_sentences_inputs),
    )),
    sep='\n'
)

Input sentence:  He has long legs.
Output sentence:  उसके पैर लम्बे हैं। <eos>
Output sentence as input:  <sos> उसके पैर लम्बे हैं।


### Tokenization

In [34]:
# Tokenize the input sentences (English language): fit a word-level tokenizer
# and convert every sentence into a list of integer word ids.

input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
input_tokenizer.fit_on_texts(input_sentences)
input_integer_seq = input_tokenizer.texts_to_sequences(input_sentences)

# Bind the vocabulary BEFORE printing its size; previously the print ran first
# and only worked when the name was left over from an earlier kernel run.
input_word_index = input_tokenizer.word_index
print('Total unique words: ',len(input_word_index))

max_input_len = max(len(x) for x in input_integer_seq)
print('Longest sentence has {} words'.format(max_input_len))

Total unique words:  2096
Longest sentence has 12 words


In [33]:
# Tokenize the output sentences (Hindi language).
# filters='' switches off the tokenizer's character filtering so that the
# '<sos>' / '<eos>' markers are kept as tokens.
# NOTE(review): the vocabulary reported below (2755) exceeds MAX_NUM_WORDS
# (2500); with `num_words` set, texts_to_sequences is documented to keep only
# the most frequent words, so rarer Hindi tokens are presumably dropped from
# the sequences -- confirm this is intended.

output_tokenizer = Tokenizer(filters='', num_words=MAX_NUM_WORDS)
# Fit on both variants so '<sos>' and '<eos>' are both part of the vocabulary.
output_tokenizer.fit_on_texts(output_sentences + output_sentences_inputs)
output_integer_seq, output_input_integer_seq = (
    output_tokenizer.texts_to_sequences(texts)
    for texts in (output_sentences, output_sentences_inputs)
)

output_word_index = output_tokenizer.word_index
print('Total unique words are: ', len(output_word_index))

max_out_len = max(map(len, output_integer_seq))
print("Length of longest sentence in the output: %g" % max_out_len)

Total unique words are:  2755
Length of longest sentence in the output: 17


###  Padding

<p>We need to pad our inputs and outputs because the sentences have different lengths.
<p>The LSTM model expects all of its inputs to be the same size.

In [47]:
# Input padding: bring every English sequence to the length of the longest
# one. 'pre' (the Keras default) puts the zeros in front of the words.

encoder_input_sequences = pad_sequences(
    input_integer_seq,
    maxlen=max_input_len,
    padding='pre',
)
print('padded input sequence shape: ',encoder_input_sequences.shape)
print('padded input sequence example 250: ', encoder_input_sequences[250])


padded input sequence shape:  (2500, 12)
padded input sequence example 250:  [  0   0   0   0   0   0   0   0   7  37  77 614]


In [46]:
# Look up the ids of the words in sample 250 ("He has long legs.") to check
# them against the padded sequence; the tokenizer's keys are lower-cased.
print(
    *('{} {}'.format(label, input_word_index[token]) for label, token in (
        ('Integer associated with "He" is: ', 'he'),
        ('Integer associated with "has" is: ', 'has'),
        ('Integer associated with "long" is: ', 'long'),
        ('Integer associated with "legs" is: ', 'legs'),
    )),
    sep='\n'
)

Integer associated with "He" is:  7
Integer associated with "has" is:  37
Integer associated with "long" is:  77
Integer associated with "legs" is:  614


In [58]:
# Output padding: the '<sos>'-prefixed Hindi sequences are padded with
# trailing zeros ('post') up to the longest output length.

decoder_input_sequences = pad_sequences(
    output_input_integer_seq,
    padding='post',
    maxlen=max_out_len,
)
print('padded output sequence shape: ', decoder_input_sequences.shape)
print('padded output sequence example 250: ', decoder_input_sequences[250])

padded output sequence shape:  (2500, 17)
padded output sequence example 250:  [  2  49 537 547  12   0   0   0   0   0   0   0   0   0   0   0   0]


In [59]:
# Look up the ids of the tokens in decoder-input sample 250 to check them
# against the padded sequence.
print(
    *('{} {}'.format(label, output_word_index[token]) for label, token in (
        ('Integer associated with "<sos>" is: ', '<sos>'),
        ('Integer associated with "उसके" is: ', 'उसके'),
        ('Integer associated with "पैर" is: ', 'पैर'),
        ('Integer associated with "लम्बे" is: ', 'लम्बे'),
        ('Integer associated with "हैं।" is: ', 'हैं।'),
    )),
    sep='\n'
)

Integer associated with "<sos>" is:  2
Integer associated with "उसके" is:  49
Integer associated with "पैर" is:  537
Integer associated with "लम्बे" is:  547
Integer associated with "हैं।" is:  12


In [117]:
# Decoder targets: the '<eos>'-suffixed Hindi sequences, padded with trailing
# zeros to the same length as the decoder inputs.
decoder_output_sequences = pad_sequences(
    output_integer_seq,
    padding='post',
    maxlen=max_out_len,
)
print(decoder_output_sequences.shape)
decoder_output_sequences[250]

(2500, 17)


array([ 49, 537, 547,  12,   1,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0], dtype=int32)

In [None]:
"""
In the encoder, zeros were padded at the beginning. 
The reason behind this is that encoder output is based on the words occurring at the end of the sentence, 
therefore the original words were kept at the end of the sentence and zeros were padded at the beginning. 
In the case of the decoder, the post-padding is applied,
which means that zeros are appended at the end of the sentence. 
We do this because in the decoder the processing starts from the beginning of a sentence, 
hence post-padding is performed in the decoder.
"""

### Word Embeddings

<p>For the English sentences, i.e. the inputs, we will use the GloVe word embeddings. For the translated Hindi sentences in the output, we will use custom word embeddings.

In [65]:
# Create a dictionary where words are the keys and the corresponding GloVe
# vectors are the values. (numpy is already imported as `np` in the import
# cell at the top of the notebook, so it is not re-imported here.)

embeddings_dictionary = dict()

# `with` closes the file even if a malformed line makes the parsing raise.
with open(r"/Users/user/Desktop/python_stuff/NLP/glove.6B/glove.6B.100d.txt", encoding='utf8') as glove_file:
    for line in glove_file:
        records = line.split()
        # Skip blank lines, which would otherwise fail on records[0].
        if not records:
            continue
        # First field is the word, the remaining fields are its vector components.
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [102]:
# Build the encoder embedding matrix: row i holds the GloVe vector of the word
# whose tokenizer index is i. Row 0 and words missing from GloVe stay all-zero.
num_words = min(MAX_NUM_WORDS, len(input_word_index)+1)
embedding_matrix = np.zeros((num_words, EMBEDDING_SIZE))
for word, index in input_word_index.items():
    # word_index holds the full vocabulary, so when it is larger than
    # MAX_NUM_WORDS some indices fall outside the matrix; skip them instead of
    # raising IndexError.
    if index >= num_words:
        continue
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
        


In [107]:
# Display the GloVe vector stored for the word 'he'.
embeddings_dictionary['he']

array([ 0.1225   , -0.058833 ,  0.23658  , -0.28877  , -0.028181 ,
        0.31524  ,  0.070229 ,  0.16447  , -0.027623 ,  0.25214  ,
        0.21174  , -0.059674 ,  0.36133  ,  0.13607  ,  0.18755  ,
       -0.1487   ,  0.31315  ,  0.13368  , -0.59703  , -0.030161 ,
        0.080656 ,  0.26162  , -0.055924 , -0.35351  ,  0.34722  ,
       -0.0055801, -0.57935  , -0.88007  ,  0.42931  , -0.15695  ,
       -0.51256  ,  1.2684   , -0.25228  ,  0.35265  , -0.46419  ,
        0.55648  , -0.57556  ,  0.32574  , -0.21893  , -0.13178  ,
       -1.1027   , -0.039591 ,  0.89643  , -0.9845   , -0.47393  ,
       -0.12855  ,  0.63506  , -0.94888  ,  0.40088  , -0.77542  ,
       -0.35153  , -0.27788  ,  0.68747  ,  1.458    , -0.38474  ,
       -2.8937   , -0.29523  , -0.38836  ,  0.94881  ,  1.3891   ,
        0.054591 ,  0.70486  , -0.65699  ,  0.075648 ,  0.7655   ,
       -0.63365  ,  0.86556  ,  0.42441  ,  0.14796  ,  0.4156   ,
        0.29354  , -0.51295  ,  0.19635  , -0.45568  ,  0.0080

In [108]:
# Row 7 of the embedding matrix; 7 is the tokenizer index printed for 'he',
# so this row should match the GloVe vector for 'he' (up to float precision).
embedding_matrix[7]

array([ 0.1225    , -0.058833  ,  0.23658   , -0.28876999, -0.028181  ,
        0.31524   ,  0.070229  ,  0.16447   , -0.027623  ,  0.25213999,
        0.21174   , -0.059674  ,  0.36133   ,  0.13607   ,  0.18754999,
       -0.1487    ,  0.31314999,  0.13368   , -0.59702998, -0.030161  ,
        0.080656  ,  0.26161999, -0.055924  , -0.35350999,  0.34722   ,
       -0.0055801 , -0.57934999, -0.88006997,  0.42930999, -0.15695   ,
       -0.51256001,  1.26839995, -0.25228   ,  0.35264999, -0.46419001,
        0.55647999, -0.57555997,  0.32574001, -0.21893001, -0.13178   ,
       -1.1027    , -0.039591  ,  0.89643002, -0.98449999, -0.47393   ,
       -0.12854999,  0.63506001, -0.94888002,  0.40088001, -0.77542001,
       -0.35152999, -0.27788001,  0.68747002,  1.45799994, -0.38474   ,
       -2.89369988, -0.29523   , -0.38835999,  0.94880998,  1.38909996,
        0.054591  ,  0.70485997, -0.65698999,  0.075648  ,  0.76550001,
       -0.63365   ,  0.86556   ,  0.42440999,  0.14796001,  0.41

### Creating the model

In [109]:
# Embedding layer for the English input, seeded with `embedding_matrix`
# and sized to the padded input length.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_SIZE,
    weights=[embedding_matrix],
    input_length=max_input_len,
)

In [112]:
# Size of the output (Hindi) vocabulary, +1 because tokenizer indices start at
# 1 and index 0 is the padding value. This name was never defined in the
# notebook, so the cell failed on a fresh kernel; 2755 unique words + 1 matches
# the recorded shape (2500, 17, 2756).
num_words_output = len(output_word_index) + 1

# One-hot decoder targets: (number of samples, output sequence length, output vocabulary size).
decoder_targets_one_hot = np.zeros((len(input_sentences),
                                    max_out_len,
                                    num_words_output),
                                    dtype='float32')

In [113]:
# Sanity-check the shape of the one-hot target tensor.
decoder_targets_one_hot.shape

(2500, 17, 2756)

In [118]:
# Set decoder_targets_one_hot[i, t, word_id] = 1 for every sample i and time
# step t, where word_id = decoder_output_sequences[i, t] (padding id 0 included).
# Vectorised with NumPy integer-array indexing instead of a nested Python loop:
# the (n, 1) row index and (T,) step index broadcast against the (n, T) id array.
sample_idx = np.arange(decoder_output_sequences.shape[0])[:, None]
step_idx = np.arange(decoder_output_sequences.shape[1])
decoder_targets_one_hot[sample_idx, step_idx, decoder_output_sequences] = 1