In [1]:
#Ignore warnings
import warnings
warnings.filterwarnings('ignore')
import os, sys
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt

In [2]:
BATCH_SIZE = 128
EPOCHS = 20
LSTM_NODES =256
NUM_SENTENCES = 20000
MAX_SENTENCE_LENGTH = 50
MAX_NUM_WORDS = 20000
EMBEDDING_SIZE = 100

In [3]:
input_sentences = []
output_sentences = []
output_sentences_inputs = []

count = 0
for line in open('fra.txt', encoding="utf-8"):
    count += 1
    if count > NUM_SENTENCES:
        break
    if '\t' not in line:
        continue
    input_sentence = line.rstrip().split('\t')[0]
    output = line.rstrip().split('\t')[1]

    output_sentence = output + ' <eos>'
    output_sentence_input = '<sos> ' + output

    input_sentences.append(input_sentence)
    output_sentences.append(output_sentence)
    output_sentences_inputs.append(output_sentence_input)

print("Number of sample input:", len(input_sentences))
print("Number of sample output:", len(output_sentences))
print("Number of sample output input:", len(output_sentences_inputs))

Number of sample input: 20000
Number of sample output: 20000
Number of sample output input: 20000


In [4]:
print("English sentence: ",input_sentences[180])
print("French translation: ",output_sentences[180])

English sentence:  Beat it.
French translation:  Casse-toi de là. <eos>


In [5]:
#tokenize the input sentences(input language) 
input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
input_tokenizer.fit_on_texts(input_sentences)
input_integer_seq = input_tokenizer.texts_to_sequences(input_sentences)
print(input_integer_seq)

[[19], [19], [19], [531], [531], [130], [130], [130], [130], [130], [130], [130], [130], [130], [130], [130], [130], [130], [130], [130], [130], [67], [2398], [1432], [1432], [1432], [725], [59], [851], [851], [663], [663], [54], [54], [54], [104], [104], [104], [104], [104], [104], [104], [778], [778], [19, 44], [19, 44], [19, 44], [615], [615], [1, 62], [1, 62], [1, 75], [1, 187], [1, 187], [1, 187], [726, 28], [370], [370], [370], [370], [370], [370], [370], [370], [370], [370], [370], [370], [559], [559], [559], [224], [616], [616], [616], [229, 4], [229, 4], [229, 4], [229, 4], [1214], [1214], [1214], [1214], [106, 4], [106, 4], [31, 35], [31, 35], [31, 35], [19, 73], [19, 73], [19, 73], [60, 4], [60, 4], [60, 4], [60, 4], [60, 4], [60, 4], [1760, 40], [1760, 40], [480, 8], [480, 8], [1, 340], [1, 340], [1, 2399], [1, 1761], [1, 74], [1, 147], [1, 147], [1, 371], [1, 85], [1, 326], [1, 326], [1, 326], [1, 200], [1, 200], [7, 2400], [7, 142], [7, 142], [283], [28, 213], [28, 213], 

In [6]:
word2idx_inputs = input_tokenizer.word_index
print('Total unique words in the input: %s' % len(word2idx_inputs))

Total unique words in the input: 3441


In [7]:
max_input_len = max(len(sen) for sen in input_integer_seq)
print("Length of longest sentence in input: %g" % max_input_len)

Length of longest sentence in input: 5


In [8]:
encoder_input_sequences = pad_sequences(input_integer_seq, maxlen=max_input_len)
print("encoder_input_sequences.shape:", encoder_input_sequences.shape)
print("encoder_input_sequences[180]:", encoder_input_sequences[180])

encoder_input_sequences.shape: (20000, 5)
encoder_input_sequences[180]: [  0   0   0 304   4]


In [9]:
print(word2idx_inputs["join"])
print(word2idx_inputs["us"])

534
56


In [10]:
#tokenize the output sentences(Output language)
output_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
output_tokenizer.fit_on_texts(output_sentences + output_sentences_inputs)
output_integer_seq = output_tokenizer.texts_to_sequences(output_sentences)
output_input_integer_seq = output_tokenizer.texts_to_sequences(output_sentences_inputs)
print(output_input_integer_seq)

[[2, 51, 4], [2, 703], [2, 428, 4], [2, 895, 4], [2, 4508], [2, 2670], [2, 2671], [2, 130, 238, 1024, 19, 238, 1949, 4], [2, 1505, 4], [2, 1950, 4], [2, 704, 4], [2, 2672, 4], [2, 2673, 4], [2, 2670], [2, 2671], [2, 130, 238, 1024, 19, 238, 1949, 4], [2, 1505, 4], [2, 1950, 4], [2, 704, 4], [2, 2672, 4], [2, 2673, 4], [2, 37, 6], [2, 27, 4509], [2, 19, 4510], [2, 4511], [2, 4512], [2, 65, 781, 4], [2, 19, 2674], [2, 4513], [2, 4514], [2, 1951], [2, 1951], [2, 27, 2675], [2, 4515], [2, 1200, 4], [2, 287, 4], [2, 341, 4], [2, 1025], [2, 287, 4], [2, 341, 4], [2, 4516], [2, 1025], [2, 1952], [2, 2676], [2, 4517], [2, 4518], [2, 4519], [2, 1026, 4], [2, 895, 4], [2, 3, 896], [2, 4520], [2, 2677], [2, 17, 429, 4], [2, 3, 68, 1027, 4], [2, 204, 384], [2, 2678, 556, 4], [2, 1201], [2, 1953], [2, 1954], [2, 2679, 2680], [2, 2681, 2682], [2, 46, 1506], [2, 1955, 4], [2, 1202, 4], [2, 2683], [2, 1203, 4], [2, 46, 187], [2, 217], [2, 4521], [2, 2684, 93, 18, 1204], [2, 4522], [2, 1507, 6], [2, 26

In [11]:
word2idx_outputs = output_tokenizer.word_index
print('Total unique words in the output: %s' % len(word2idx_outputs))

Total unique words in the output: 9499


In [12]:
num_words_output = len(word2idx_outputs) + 1
max_out_len = max(len(sen) for sen in output_integer_seq)
print("Length of longest sentence in the output: %g" % max_out_len)

Length of longest sentence in the output: 12


In [13]:
decoder_input_sequences = pad_sequences(output_input_integer_seq, maxlen=max_out_len, padding='post')
print("decoder_input_sequences.shape:", decoder_input_sequences.shape)
print("decoder_input_sequences[180]:", decoder_input_sequences[180])

decoder_input_sequences.shape: (20000, 12)
decoder_input_sequences[180]: [   2 4555   15  127    0    0    0    0    0    0    0    0]


In [14]:
print(word2idx_outputs["<sos>"])
print(word2idx_outputs["joignez-vous"])
print(word2idx_outputs["à"])
print(word2idx_outputs["nous."])

2
2711
19
219


In [15]:
decoder_output_sequences = pad_sequences(output_integer_seq, maxlen=max_out_len, padding='post')
print("decoder_output_sequences.shape:", decoder_output_sequences.shape)

decoder_output_sequences.shape: (20000, 12)


In [16]:
from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()

glove_file = open('C:/Users/karthick/Downloads/glove.6B.100d.txt/glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

In [17]:
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embedding_matrix = zeros((num_words, EMBEDDING_SIZE))
for word, index in word2idx_inputs.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [18]:
print(embeddings_dictionary["ill"])

[ 0.12648    0.1366     0.22192   -0.025204  -0.7197     0.66147
  0.48509    0.057223   0.13829   -0.26375   -0.23647    0.74349
  0.46737   -0.462      0.20031   -0.26302    0.093948  -0.61756
 -0.28213    0.1353     0.28213    0.21813    0.16418    0.22547
 -0.98945    0.29624   -0.62476   -0.29535    0.21534    0.92274
  0.38388    0.55744   -0.14628   -0.15674   -0.51941    0.25629
 -0.0079678  0.12998   -0.029192   0.20868   -0.55127    0.075353
  0.44746   -0.71046    0.75562    0.010378   0.095229   0.16673
  0.22073   -0.46562   -0.10199   -0.80386    0.45162    0.45183
  0.19869   -1.6571     0.7584    -0.40298    0.82426   -0.386
  0.0039546  0.61318    0.02701   -0.3308    -0.095652  -0.082164
  0.7858     0.13394   -0.32715   -0.31371   -0.20247   -0.73001
 -0.49343    0.56445    0.61038    0.36777   -0.070182   0.44859
 -0.61774   -0.18849    0.65592    0.44797   -0.10469    0.62512
 -1.9474    -0.60622    0.073874   0.50013   -1.1278    -0.42066
 -0.37322   -0.50538    0

In [19]:
print(embedding_matrix[539])

[-1.23510003e-01  5.82360029e-01  9.39920008e-01 -5.65240011e-02
  1.67850003e-01 -2.40789995e-01  3.60240005e-02 -1.39980003e-01
 -1.35020003e-01  4.10090014e-02  1.94969997e-01  3.83430004e-01
  1.10760003e-01 -4.15609986e-01  1.50639996e-01 -5.81780016e-01
 -3.76370013e-01 -9.37810019e-02 -3.81440014e-01  4.19869989e-01
  1.01559997e+00  7.40190029e-01  2.21249998e-01 -1.41049996e-01
 -4.16350007e-01 -9.45720002e-02 -1.30559996e-01 -5.32060027e-01
 -1.02069996e-01 -3.11580002e-01  3.11659992e-01 -3.99710014e-02
  1.58960000e-01 -5.45970015e-02  5.97019970e-01  4.45329994e-01
  4.51750010e-02 -1.13820001e-01 -7.92980015e-01  2.15570003e-01
 -4.88249987e-01 -3.22939992e-01 -4.70840000e-02 -1.71920002e-01
 -9.04999971e-02 -3.88779998e-01  1.04020000e+00  1.26980003e-02
  1.27130002e-01 -3.56429994e-01  6.26600027e-01 -1.56069994e-01
  1.12539999e-01  1.04149997e+00 -3.17950010e-01 -2.34139991e+00
 -1.59590006e-01 -2.40899995e-01  1.75010002e+00  2.64629990e-01
 -9.47749987e-02  1.17460

In [20]:
embedding_layer = Embedding(num_words, EMBEDDING_SIZE, weights=[embedding_matrix], input_length=max_input_len)

In [21]:
decoder_targets_one_hot = np.zeros((
        len(input_sentences),
        max_out_len,
        num_words_output
    ),
    dtype='float32'
)

In [22]:
decoder_targets_one_hot.shape

(20000, 12, 9500)

In [23]:
for i, d in enumerate(decoder_output_sequences):
    for t, word in enumerate(d):
        decoder_targets_one_hot[i, t, word] = 1

In [24]:
encoder_inputs_placeholder = Input(shape=(max_input_len,))
x = embedding_layer(encoder_inputs_placeholder)
encoder = LSTM(LSTM_NODES, return_state=True)

encoder_outputs, h, c = encoder(x)
encoder_states = [h, c]

In [25]:
decoder_inputs_placeholder = Input(shape=(max_out_len,))

decoder_embedding = Embedding(num_words_output, LSTM_NODES)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

decoder_lstm = LSTM(LSTM_NODES, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs_x, initial_state=encoder_states)

In [26]:
decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [28]:
model = Model([encoder_inputs_placeholder,
  decoder_inputs_placeholder], decoder_outputs)
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [29]:
r = model.fit(
    [encoder_input_sequences, decoder_input_sequences],
    decoder_targets_one_hot,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [30]:
encoder_model = Model(encoder_inputs_placeholder, encoder_states)

In [31]:
decoder_state_input_h = Input(shape=(LSTM_NODES,))
decoder_state_input_c = Input(shape=(LSTM_NODES,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

In [32]:
decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_x = decoder_embedding(decoder_inputs_single)

In [33]:
decoder_outputs, h, c = decoder_lstm(decoder_inputs_single_x, initial_state=decoder_states_inputs)

In [34]:
decoder_states = [h, c]
decoder_outputs = decoder_dense(decoder_outputs)

In [35]:
decoder_model = Model(
    [decoder_inputs_single] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)

In [36]:
idx2word_input = {v:k for k, v in word2idx_inputs.items()}
idx2word_target = {v:k for k, v in word2idx_outputs.items()}

In [37]:
def translate_sentence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx_outputs['<sos>']
    eos = word2idx_outputs['<eos>']
    output_sentence = []

    for _ in range(max_out_len):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        idx = np.argmax(output_tokens[0, 0, :])

        if eos == idx:
            break

        word = ''

        if idx > 0:
            word = idx2word_target[idx]
            output_sentence.append(word)

        target_seq[0, 0] = idx
        states_value = [h, c]

    return ' '.join(output_sentence)

In [39]:
i = np.random.choice(len(input_sentences))
input_seq = encoder_input_sequences[i:i+1]
translation = translate_sentence(input_seq)
print('-')
print('Input:', input_sentences[i])
print('Response:', translation)

-
Input: I'm so tired.
Response: je suis tellement fatigué.


In [40]:
i = np.random.choice(len(input_sentences))
input_seq = encoder_input_sequences[i:i+1]
translation = translate_sentence(input_seq)
print('-')
print('Input:', input_sentences[i])
print('Response:', translation)

-
Input: It's an outrage.
Response: c'est un accord.


In [27]:
from tensorflow.keras.optimizers import Adam

In [28]:
model2 = Model([encoder_inputs_placeholder,decoder_inputs_placeholder], decoder_outputs)
model2.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [29]:
r = model2.fit(
    [encoder_input_sequences, decoder_input_sequences],
    decoder_targets_one_hot,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [27]:
from tensorflow.keras.layers import Bidirectional

In [28]:
encoder_inputs_placeholder = Input(shape=(max_input_len,))
x = embedding_layer(encoder_inputs_placeholder)
encoder = LSTM(LSTM_NODES, return_state=True)

encoder_outputs, h, c = encoder(x)
encoder_states = [h, c]

In [29]:
decoder_inputs_placeholder = Input(shape=(max_out_len,))

decoder_embedding = Embedding(num_words_output, LSTM_NODES)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

decoder_lstm = LSTM(LSTM_NODES, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs_x, initial_state=encoder_states)

In [30]:
from tensorflow.keras.layers import Dropout
dropout = Dropout(rate=0.3)
decoder_outputs = dropout(decoder_outputs)

In [31]:
decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [42]:
model2 = Model([encoder_inputs_placeholder,decoder_inputs_placeholder], decoder_outputs)
model2.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [45]:
r = model2.fit(
    [encoder_input_sequences, decoder_input_sequences],
    decoder_targets_one_hot,
    batch_size=BATCH_SIZE,
    epochs=50,
    validation_split=0.3,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [32]:
model3 = Model([encoder_inputs_placeholder,decoder_inputs_placeholder], decoder_outputs)
model3.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [58]:
r = model2.fit(
    [encoder_input_sequences, decoder_input_sequences],
    decoder_targets_one_hot,
    batch_size=BATCH_SIZE,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [33]:
r = model3.fit(
    [encoder_input_sequences, decoder_input_sequences],
    decoder_targets_one_hot,
    batch_size=BATCH_SIZE,
    epochs=20,
    validation_split=0.2,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [34]:
from keras.layers import GRU

In [36]:
encoder_inputs = Input(shape=(max_input_len,))
encoder = GRU(LSTM_NODES, return_state=True)
encoder_outputs, state_h = encoder(encoder_inputs)

decoder_inputs = Input(shape=(max_out_len,))
decoder_gru = GRU(latent_dim, return_sequences=True)
decoder_outputs = decoder_gru(decoder_inputs, initial_state=state_h)
dropout = Dropout(rate=0.3)
decoder_outputs = dropout(decoder_outputs)
decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

ValueError: Input 0 of layer gru_1 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 5)

In [None]:
model4 = Model([encoder_inputs_placeholder,decoder_inputs_placeholder], decoder_outputs)
model4.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [None]:
r = model4.fit(
    [encoder_input_sequences, decoder_input_sequences],
    decoder_targets_one_hot,
    batch_size=BATCH_SIZE,
    epochs=50,
    validation_split=0.3,
)