Word Token LSTM

In [None]:
import os, sys

from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
import pandas as pd
import string
import matplotlib.pyplot as plt

Set the parameters for the model

In [None]:
# Training hyper-parameters (handed to model.fit).
BATCH_SIZE = 64
EPOCHS = 20

# Model dimensions.
LSTM_NODES = 256      # units of the encoder/decoder LSTMs; also the decoder embedding width
EMBEDDING_SIZE = 100  # number of columns of the encoder embedding matrix

# Data limits.
NUM_SENTENCES = 20_000   # number of rows read from the training CSV
MAX_SENTENCE_LENGTH = 50
MAX_NUM_WORDS = 20_000   # vocabulary cap passed to the tokenizers

Import the text

In [None]:
# Mount Google Drive at /content/drive; the training CSV path used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the training CSV on the mounted Drive.
path="/content/drive/MyDrive/Colab Notebooks/NLP_Assignment/data/training.csv"

In [None]:
# Read at most NUM_SENTENCES rows and name the two text columns explicitly.
training = pd.read_csv(
    path,
    nrows=NUM_SENTENCES,
    names=['unnormalised', 'normalised'],
    encoding="utf8",
)

# Lift pandas' display limits on rows and columns, then preview ten random rows.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
training.sample(10)

Unnamed: 0,unnormalised,normalised
11349,Harta yang kita miliki hari ini hanyalah sekad...,harta yang kita miliki hari ini hanyalah sekad...
5246,Zull Hanif) sampai (ke sekolah) sama masa deng...,zull hanif sampai ke sekolah sama masa dengan ...
6340,“Rasulullah bertanya mengenai usaha yang palin...,rasulullah bertanya mengenai usaha yang paling...
18287,"Mengikut pertuduhan, Tian Chua didakwa sengaja...",mengikut pertuduhan tian chua didakwa sengaja ...
16534,‘Ops Suri’ yang dilancarkan minggu lepas bertu...,ops suri yang dilancarkan minggu lepas bertuju...
15097,CADANGKAN ketua fleet kenderaan di kem pertaha...,cadangkan ketua fleet kenderaan di kem pertaha...
17091,Antara personaliti tersebut ialah Pengarah dan...,antara personaliti tersebut ialah pengarah dan...
6986,"Ooi berkata, kediaman itu menawarkan tujuh jen...",ooi berkata kediaman itu menawarkan tujuh jeni...
6304,"Mekanisme ini diumumkan oleh Perdana Menteri, ...",mekanisme ini diumumkan oleh perdana menteri d...
9846,“Saya biasa rehat di situ setiap selepas waktu...,saya biasa rehat di situ setiap selepas waktu ...


In [None]:
# Keep only the two text columns used by the model.
df = pd.DataFrame(training, columns=['unnormalised','normalised'])

# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   unnormalised  20000 non-null  object
 1   normalised    20000 non-null  object
dtypes: object(2)
memory usage: 312.6+ KB
None


In [None]:

# Build the three parallel sentence lists used by the encoder-decoder model.
#
# The start/end markers must be separated from the sentence by a space: the
# output tokenizer is created with filters='' and splits on whitespace, so
# without the space the marker fuses with the neighbouring word and no
# standalone '<sos>' / '<eos>' token exists in the vocabulary.

# Decoder target: the sentence followed by the end-of-sentence marker.
output_sentence = training.unnormalised.apply(lambda x: x + ' <eos>')
# Decoder input: the same sentence preceded by the start-of-sentence marker.
output_sentence_input = training.unnormalised.apply(lambda x: '<sos> ' + x)

# Encoder input is the 'normalised' column; decoder side is 'unnormalised'.
input_sentences = df['normalised'].to_list()
output_sentences = output_sentence.to_list()
output_sentences_inputs = output_sentence_input.to_list()

print("num samples input:", len(input_sentences))
print("num samples output:", len(output_sentences))
print("num samples output input:", len(output_sentences_inputs))


num samples input: 20000
num samples output: 20000
num samples output input: 20000


Tokenization and Padding

Tokenizer: separate each sentence into words and convert the words to integers

In [None]:
# Input tokenizer: learn the vocabulary of the input sentences and encode every
# sentence as a list of integer word ids.
input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
input_tokenizer.fit_on_texts(input_sentences)
input_integer_seq = input_tokenizer.texts_to_sequences(input_sentences)

word2idx_inputs = input_tokenizer.word_index
# Preview only the first entries; printing the whole vocabulary floods the
# notebook output with tens of thousands of items.
print(dict(list(word2idx_inputs.items())[:20]))
print('Total unique words in the input: %s' % len(word2idx_inputs))

# Longest encoded input sentence; used as the encoder padding length.
max_input_len = max(len(sen) for sen in input_integer_seq)
print("Length of longest sentence in input: %g" % max_input_len)

{'yang': 1, 'dan': 2, 'di': 3, 'ini': 4, 'itu': 5, 'dalam': 6, 'tidak': 7, 'dengan': 8, 'untuk': 9, 'pada': 10, 'kepada': 11, 'akan': 12, 'saya': 13, 'mereka': 14, 'oleh': 15, 'negara': 16, 'tahun': 17, 'juga': 18, 'beliau': 19, 'bagi': 20, 'ke': 21, 'daripada': 22, 'malaysia': 23, 'katanya': 24, 'kerana': 25, 'dari': 26, 'kita': 27, 'telah': 28, 'lebih': 29, 'sebagai': 30, 'tersebut': 31, 'hari': 32, 'berkata': 33, 'ada': 34, 'ia': 35, 'satu': 36, 'lagi': 37, 'atau': 38, 'menjadi': 39, 'tetapi': 40, 'adalah': 41, 'orang': 42, 'boleh': 43, 'dia': 44, 'lalu': 45, 'baru': 46, 'seperti': 47, 'kerajaan': 48, 'selepas': 49, 'lain': 50, 'islam': 51, 'perlu': 52, 'apabila': 53, 'ketika': 54, 'kata': 55, 'seorang': 56, 'sini': 57, 'dua': 58, 'pihak': 59, 'turut': 60, 'raya': 61, 'minyak': 62, 'jika': 63, 'kami': 64, 'dapat': 65, 'rakyat': 66, 'kuala': 67, 'antara': 68, 'serta': 69, 'sekolah': 70, 'menteri': 71, 'juta': 72, 'masa': 73, 'datuk': 74, 'bukan': 75, 'sama': 76, 'secara': 77, 'jan': 

In [None]:
# Output tokenizer: filters='' keeps punctuation and the '<sos>' / '<eos>'
# markers intact instead of stripping their angle brackets.
output_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
# Fit on both variants so the vocabulary contains '<sos>' as well as '<eos>'.
output_tokenizer.fit_on_texts(output_sentences + output_sentences_inputs)
output_integer_seq = output_tokenizer.texts_to_sequences(output_sentences)
output_input_integer_seq = output_tokenizer.texts_to_sequences(output_sentences_inputs)

word2idx_outputs = output_tokenizer.word_index
print('Total unique words in the output: %s' % len(word2idx_outputs))

# texts_to_sequences only emits ids below num_words, so the decoder vocabulary
# size is capped the same way as the input side. Using the full word_index size
# here would widen the softmax layer and the one-hot targets with classes that
# can never occur in the training data.
num_words_output = min(MAX_NUM_WORDS, len(word2idx_outputs) + 1)

# Longest encoded target sentence; used as the decoder padding length.
max_out_len = max(len(sen) for sen in output_integer_seq)
print("Length of longest sentence in the output: %g" % max_out_len)

Total unique words in the output: 45023
Length of longest sentence in the output: 93


In [None]:
# Pad every encoded input sentence to the length of the longest one.
# Zeros are added at the front ('pre' is the pad_sequences default).
encoder_input_sequences = pad_sequences(
    input_integer_seq,
    maxlen=max_input_len,
    padding='pre',
)
print("encoder_input_sequences.shape:", encoder_input_sequences.shape)
print("encoder_input_sequences[172]:", encoder_input_sequences[172])

encoder_input_sequences.shape: (20000, 98)
encoder_input_sequences[172]: [    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0   588   313  1354     4  1856  4090  1610     1
  1381   779  4374  7135 13332  4717     1 10034     7  4375     8  2323
     1 10035]


In [None]:
# Pad the '<sos>'-prefixed decoder inputs to the longest target length.
# Zeros go at the end so the real tokens start at time step 0.
decoder_input_sequences = pad_sequences(
    output_input_integer_seq,
    padding='post',
    maxlen=max_out_len,
)
print("decoder_input_sequences.shape:", decoder_input_sequences.shape)
print("decoder_input_sequences[172]:", decoder_input_sequences[172])

decoder_input_sequences.shape: (20000, 93)
decoder_input_sequences[172]: [  318  1351    10  1788  4473  1547     1  1322  4868  4756  7954 18425
  4869     1 18426     6  4474     5  2555     1     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0]


Embedding layer

In [None]:
from numpy import array
from numpy import asarray
from numpy import zeros

# `embeddings_dictionary` (word -> pretrained vector) is not created by any cell
# visible in this notebook, which made this cell fail with a NameError.
# TODO: load the pretrained word vectors before this cell. Until then, fall
# back to an empty mapping so the notebook still runs (all rows stay zero).
embeddings_dictionary = globals().get('embeddings_dictionary')
if embeddings_dictionary is None:
    print("Warning: no pretrained embeddings loaded; embedding_matrix is all zeros.")
    embeddings_dictionary = {}

# One row per word id, capped at MAX_NUM_WORDS (row 0 is the padding id).
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embedding_matrix = zeros((num_words, EMBEDDING_SIZE))
for word, index in word2idx_inputs.items():
    # word_index holds the full vocabulary, including ids beyond the cap;
    # those have no row in the matrix and would raise an IndexError.
    if index >= num_words:
        continue
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

NameError: ignored

In [None]:
# NOTE(review): this cell repeats the embedding-matrix construction of the
# previous cell; one of the two can probably be removed.

# `embeddings_dictionary` (word -> pretrained vector) is not created by any cell
# visible in this notebook. TODO: load the pretrained word vectors first. Until
# then, fall back to an empty mapping so the cell runs (all rows stay zero).
embeddings_dictionary = globals().get('embeddings_dictionary')
if embeddings_dictionary is None:
    print("Warning: no pretrained embeddings loaded; embedding_matrix is all zeros.")
    embeddings_dictionary = {}

# One row per word id, capped at MAX_NUM_WORDS (row 0 is the padding id).
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_SIZE))
for word, index in word2idx_inputs.items():
    # Ids beyond the cap have no row in the matrix; skip them instead of
    # raising an IndexError.
    if index >= num_words:
        continue
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
# Encoder embedding layer, initialised from the prepared embedding matrix.
embedding_layer = Embedding(
    num_words,
    EMBEDDING_SIZE,
    input_length=max_input_len,
    weights=[embedding_matrix],
)

In [None]:
# Allocate the one-hot target tensor: one slot per (sentence, time step, output word id).
# NOTE(review): this dense float32 tensor grows with all three dimensions and
# can be very large — confirm it fits in memory.
decoder_targets_one_hot = np.zeros(
    shape=(len(input_sentences), max_out_len, num_words_output),
    dtype=np.float32,
)

In [None]:
# Show the target tensor shape: (number of sentences, max_out_len, num_words_output).
decoder_targets_one_hot.shape

In [None]:
# Padded decoder targets (the '<eos>'-terminated sentences). This array was
# never created anywhere, so the loop below failed with a NameError. It is
# padded exactly like the decoder inputs so the time steps line up.
decoder_output_sequences = pad_sequences(output_integer_seq, maxlen=max_out_len, padding='post')

# Set a 1 at the word id of every (sentence, time step) position.
# Padding positions carry id 0 and are therefore marked as class 0.
for i, d in enumerate(decoder_output_sequences):
    for t, word in enumerate(d):
        decoder_targets_one_hot[i, t, word] = 1

Encoder LSTM

In [None]:
# Encoder: embed the padded input ids and run them through an LSTM that also
# returns its final hidden and cell states.
encoder_inputs_placeholder = Input(shape=(max_input_len,))
x = embedding_layer(encoder_inputs_placeholder)
encoder = LSTM(units=LSTM_NODES, return_state=True)

# The first returned tensor is the LSTM output; the remaining two are the
# final states, kept to initialise the decoder.
encoder_outputs, *encoder_states = encoder(x)
h, c = encoder_states

Decoder LSTM

In [None]:
# Decoder: embed the '<sos>'-prefixed target ids and run an LSTM over them,
# starting from the encoder's final states.
decoder_inputs_placeholder = Input(shape=(max_out_len,))

decoder_embedding = Embedding(input_dim=num_words_output, output_dim=LSTM_NODES)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

decoder_lstm = LSTM(units=LSTM_NODES, return_state=True, return_sequences=True)
# Only the per-time-step outputs are needed for training; the returned states
# are discarded here.
decoder_outputs = decoder_lstm(decoder_inputs_x, initial_state=encoder_states)[0]

Pass the decoder outputs through a dense layer with softmax activation

In [None]:
# Project every decoder time step onto the output vocabulary as probabilities.
decoder_dense = Dense(num_words_output, activation='softmax')
# The closing parenthesis was missing here, which made the cell a SyntaxError.
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Training model: encoder ids and decoder input ids in, per-step word
# probabilities out.
model = Model(
    inputs=[encoder_inputs_placeholder, decoder_inputs_placeholder],
    outputs=decoder_outputs,
)
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [None]:
# Train the model; the last 10% of the samples are held out for validation.
r = model.fit(
    x=[encoder_input_sequences, decoder_input_sequences],
    y=decoder_targets_one_hot,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
)

In [None]:
# Inference-time encoder: maps padded input ids to the LSTM's final [h, c] states.
encoder_model = Model(
    inputs=encoder_inputs_placeholder,
    outputs=encoder_states,
)

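Modify the model for inference: the decoder is rebuilt to take a single token plus the previous LSTM states as input, so the output can be generated one word at a time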

In [None]:
# Placeholders through which the previous hidden (h) and cell (c) states are
# fed into the decoder at each inference step.
decoder_states_inputs = [Input(shape=(LSTM_NODES,)) for _ in range(2)]
decoder_state_input_h, decoder_state_input_c = decoder_states_inputs

In [None]:
# Inference-time decoder input: a single token id per step, embedded with the
# same embedding layer object that the training decoder uses.
decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_x = decoder_embedding(decoder_inputs_single)

In [None]:
# One decoding step: run the trained decoder LSTM on the single embedded token,
# starting from the states supplied through the placeholders, then project the
# result onto the vocabulary with the trained dense layer.
# This step was missing: `decoder_states` did not exist, and the training-time
# `decoder_outputs` tensor is not connected to the single-step inputs.
decoder_outputs_single, decoder_state_h, decoder_state_c = decoder_lstm(
    decoder_inputs_single_x,
    initial_state=decoder_states_inputs
)
decoder_states = [decoder_state_h, decoder_state_c]
decoder_outputs_single = decoder_dense(decoder_outputs_single)

# Inference decoder: (token, h, c) -> (word probabilities, new h, new c).
decoder_model = Model(
    [decoder_inputs_single] + decoder_states_inputs,
    [decoder_outputs_single] + decoder_states
)

Prediction

First, build index-to-word lookup tables so that predicted word ids can be turned back into words.
Then define a prediction function.

In [None]:
# Reverse vocabularies: integer id -> word, for the input and output sides.
idx2word_input = dict(zip(word2idx_inputs.values(), word2idx_inputs.keys()))
idx2word_target = dict(zip(word2idx_outputs.values(), word2idx_outputs.keys()))

In [None]:
def translate_sentence(input_seq):
    """Greedily decode one encoded input sentence into an output string.

    The encoder states for ``input_seq`` seed the decoder, which is then run
    one token at a time starting from '<sos>'. At each step the most probable
    word id is taken and fed back as the next input, until '<eos>' is produced
    or ``max_out_len`` steps have run.

    Args:
        input_seq: encoder input for a single sentence; presumably a
            (1, max_input_len) array of padded word ids — TODO confirm
            against the caller.

    Returns:
        The predicted words joined by single spaces. Id 0 (padding) is
        skipped and the '<eos>' marker is not included.
    """
    # verbose=0 keeps Keras from printing a progress bar on every decoding step.
    states_value = encoder_model.predict(input_seq, verbose=0)

    # The decoder consumes one token per step; start with '<sos>'.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx_outputs['<sos>']
    eos = word2idx_outputs['<eos>']
    output_sentence = []

    for _ in range(max_out_len):
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, verbose=0
        )
        idx = np.argmax(output_tokens[0, 0, :])

        if eos == idx:
            break

        if idx > 0:
            output_sentence.append(idx2word_target[idx])

        # Feed the prediction and the updated states into the next step.
        target_seq[0, 0] = idx
        states_value = [h, c]

    return ' '.join(output_sentence)