### Neural Machine Translation using word level language model and embeddings in Keras

In [16]:
import pandas as pd
import numpy as np
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re
from sklearn.model_selection import train_test_split

In [17]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [18]:
%cd drive/MyDrive/Colab\ Notebooks/data

[Errno 2] No such file or directory: 'drive/MyDrive/Colab Notebooks/data'
/content/drive/MyDrive/Colab Notebooks/data


In [19]:
f = open("abstract_mod.txt", "r")
lines = f.read().split(',')

In [20]:
lines = lines[:4000]

In [21]:
len(lines)

4000

#### Generate synthetic data

In [22]:
all_eng_words = set()
for eng in lines:
    for word in eng.split():
        all_eng_words.add(word)

In [23]:
len(all_eng_words)

6462

In [24]:
lenght_list=[]
for l in lines:
    lenght_list.append(len(l.split(' ')))
np.max(lenght_list)

47

In [25]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_eng_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_eng_words)
# del all_eng_words, all_french_words

In [26]:
input_token_index = dict(
    [(word, i) for i, word in enumerate(input_words)])
target_token_index = input_token_index.copy()

In [27]:
len(lines)*16*num_decoder_tokens

413568000

In [28]:
encoder_input_data = np.zeros(
    (len(lines), 47),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(lines), 47),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(lines), 47, num_decoder_tokens),
    dtype='float32')

In [29]:
for i, (input_text, target_text) in enumerate(zip(lines, lines)):
    for t, word in enumerate(input_text.split()):
        encoder_input_data[i, t] = input_token_index[word]
    for t, word in enumerate(target_text.split()):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t] = target_token_index[word]
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[word]] = 1.

In [53]:
encoder_input_data

array([[1536., 4861., 6039., ...,    0.,    0.,    0.],
       [1159., 1512., 5633., ...,    0.,    0.,    0.],
       [1398., 3834., 3022., ...,    0.,    0.,    0.],
       ...,
       [5032., 2235.,  812., ...,    0.,    0.,    0.],
       [ 812., 4598., 6432., ...,    0.,    0.,    0.],
       [1401., 2235., 1361., ...,    0.,    0.,    0.]], dtype=float32)

In [34]:
decoder_input_data

array([[1536., 4861., 6039., ...,    0.,    0.,    0.],
       [1159., 1512., 5633., ...,    0.,    0.,    0.],
       [1398., 3834., 3022., ...,    0.,    0.,    0.],
       ...,
       [5032., 2235.,  812., ...,    0.,    0.,    0.],
       [ 812., 4598., 6432., ...,    0.,    0.,    0.],
       [1401., 2235., 1361., ...,    0.,    0.,    0.]], dtype=float32)

### Build keras encoder-decoder model

In [15]:
embedding_size = 50

In [42]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import load_model

#### Encoder model

In [36]:
encoder_inputs = Input(shape=(None,))
en_x=  Embedding(num_encoder_tokens, embedding_size)(encoder_inputs)
encoder = LSTM(50, return_state=True)
encoder_outputs, state_h, state_c = encoder(en_x)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

#### Decoder model

In [37]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))

dex=  Embedding(num_decoder_tokens, embedding_size)

final_dex= dex(decoder_inputs)
decoder_lstm = LSTM(50, return_sequences=True, return_state=True)

decoder_outputs, _, _ = decoder_lstm(final_dex,
                                     initial_state=encoder_states)

decoder_dense = Dense(num_decoder_tokens, activation='softmax')

decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])


In [38]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 50)     323100      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 50)     323100      input_2[0][0]                    
______________________________________________________________________________________________

#### Fit the model

In [None]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=128,
          epochs=10,
          validation_split=0.05)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7eff053c6310>

In [43]:
#model.save('encoder_word.h5')

model = load_model('encoder_word.h5')

In [44]:
encoder_model = Model(encoder_inputs, encoder_states)
encoder_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 50)          323100    
_________________________________________________________________
lstm (LSTM)                  [(None, 50), (None, 50),  20200     
Total params: 343,300
Trainable params: 343,300
Non-trainable params: 0
_________________________________________________________________


#### Create sampling model

In [45]:
decoder_state_input_h = Input(shape=(50,))
decoder_state_input_c = Input(shape=(50,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

final_dex2= dex(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(final_dex2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())


In [54]:
reverse_input_char_index

{0: 'aa',
 1: 'aac',
 2: 'aarhus',
 3: 'aase',
 4: 'aav',
 5: 'ab',
 6: 'abated',
 7: 'abc',
 8: 'abdomen',
 9: 'abdominal',
 10: 'aberration',
 11: 'ability',
 12: 'abl',
 13: 'ablation',
 14: 'able',
 15: 'abnormal',
 16: 'abnormalities',
 17: 'abnormality',
 18: 'abortion',
 19: 'abovementioned',
 20: 'abpa',
 21: 'abrupt',
 22: 'abruption',
 23: 'abscess',
 24: 'abscesses',
 25: 'absence',
 26: 'absent',
 27: 'absolute',
 28: 'absorptiometry',
 29: 'absorption',
 30: 'abstract',
 31: 'abundant',
 32: 'abuse',
 33: 'ac',
 34: 'academy',
 35: 'accelerated',
 36: 'acceleration',
 37: 'acceptable',
 38: 'accepted',
 39: 'access',
 40: 'accessibility',
 41: 'accompanied',
 42: 'accompanying',
 43: 'accordance',
 44: 'according',
 45: 'accordingly',
 46: 'account',
 47: 'accountability',
 48: 'accounting',
 49: 'accounts',
 50: 'accumulate',
 51: 'accumulating',
 52: 'accumulation',
 53: 'accuracy',
 54: 'accurate',
 55: 'acd',
 56: 'ace',
 57: 'acetaminophen',
 58: 'acetate',
 59: 'acet

#### Function to generate sequences

In [60]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '.' or
           len(decoded_sentence) > 52):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

#### Look at the some sentences:

In [49]:
encoder_input_data[2012: 2012+ 1]

array([[4273., 3930., 5257., 4462.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.]], dtype=float32)

In [61]:
for seq_index in [77,2012,3035,400,2056,408,590,1095,2153]:
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', lines[seq_index: seq_index + 1])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: ['additionally vitro assay demonstrated expression glut increased rcc cells surviving hypoxia condition via mtor pathway']
Decoded sentence:  urealyticum urealyticum macula needed dlbcl somewhat
-
Input sentence: ['patients nih scores points']
Decoded sentence:  urealyticum urealyticum macula needed dlbcl somewhat
-
Input sentence: ['study aimed clarify efficacy safety interferon free therapy sofosbuvir ledipasvir without ribavirin weeks japanese patients hcv genotype infection living donor liver transplantation']
Decoded sentence:  urealyticum urealyticum threatening polymerase cfdna
-
Input sentence: ['myasthenia gravis foundation america postintervention status scale used assess outcomes']
Decoded sentence:  urealyticum urealyticum macula needed dlbcl somewhat
-
Input sentence: ['without locally available active therapeutic option patient maintenance therapy drv mg day tdf tc patient family centered adherence reinforced']
Decoded sentence:  urealyticum urealyticum 