In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import random
import os
import io
import string
import re
tf.enable_eager_execution()

In [2]:
def preprocess(sentence):
    sent = sentence.lower()
    sent = sent.strip()
    sent = "<start> " + sent + " <end>"
    return sent

In [3]:
X_train = []
X_test = []
with open('./data/train.en','r',encoding="utf8") as f:
    rawData = f.read().strip().split("\n")
for line in rawData:
    line = preprocess(line)
    X_train.append(line)
    

In [4]:
with open('./data/tst2013.en','r',encoding="utf8") as f:
    rawData = f.read().strip().split("\n")
for line in rawData:
    line = preprocess(line)
    X_test.append(line)

In [5]:
Y_train = []
Y_test = []
with open('./data/train.vi','r',encoding="utf8") as f:
    rawData = f.read().strip().split("\n")
for line in rawData:
    line = preprocess(line)
    Y_train.append(line)

In [6]:
with open('./data/tst2013.vi','r',encoding="utf8") as f:
    rawData = f.read().strip().split("\n")
for line in rawData:
    line = preprocess(line)
    Y_test.append(line)

In [7]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [8]:
tokenizer_obj = Tokenizer()
total_reviews = X_train + X_test
tokenizer_obj.fit_on_texts(total_reviews)

In [9]:
max_lenght = max([len(s.split()) for s in total_reviews])

In [10]:
X_vocal_size = len(tokenizer_obj.word_index)+1

In [11]:
X_train_tokens = tokenizer_obj.texts_to_sequences(X_train)
X_test_tokens = tokenizer_obj.texts_to_sequences(X_test)

In [12]:
X_train_pad = pad_sequences(X_train_tokens,maxlen = max_lenght, padding = 'post')
X_test_pad = pad_sequences(X_test_tokens,maxlen = max_lenght, padding = 'post')

In [13]:
tokenizer_obj = Tokenizer()
total_reviews = Y_train + Y_test
tokenizer_obj.fit_on_texts(total_reviews)

In [14]:
max_lenght = max([len(s.split()) for s in total_reviews])
Y_vocal_size = len(tokenizer_obj.word_index)+1

In [15]:
Y_train_tokens = tokenizer_obj.texts_to_sequences(Y_train)
Y_test_tokens = tokenizer_obj.texts_to_sequences(Y_test)

In [16]:
Y_train_pad = pad_sequences(Y_train_tokens,maxlen = max_lenght, padding = 'post')
Y_test_pad = pad_sequences(Y_test_tokens,maxlen = max_lenght, padding = 'post')

In [17]:
BATCH_SIZE = 32
BUFFER_SIZE = X_train_pad.shape[0]
N_BATCH = BUFFER_SIZE//BATCH_SIZE
hidden_unit = 1024
embedding_size = 256
print(N_BATCH)

4166


In [18]:
dataset = tf.data.Dataset.from_tensor_slices((X_train_pad, Y_train_pad))
dataset = dataset.batch(BATCH_SIZE)

In [19]:
tmp_x, tmp_y = next(iter(dataset))
print(tmp_x.shape)
print(tmp_y.shape)

Instructions for updating:
Colocations handled automatically by placer.
(32, 630)
(32, 852)


In [20]:
class Encode(tf.keras.Model):
    def __init__(self, embedding_size, vocab_size, hidden_units):
        super(Encode, self).__init__()
        self.Embedding = tf.keras.layers.Embedding(vocab_size,embedding_size)
        self.GRU = tf.keras.layers.GRU(
            hidden_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform')
        self.hidden_units = hidden_units
        
    def call(self, x, hidden_state):
        try:
            x = self.Embedding(x)
        except:
            print(x, print(inp_lang.vocab_size))          
        outputs, last_state = self.GRU(x, hidden_state)
        return outputs, last_state
    
    def init_hidden_state(self, batch_size):
        return tf.zeros([batch_size, self.hidden_units])

In [21]:
encoder = Encode(embedding_size, X_vocal_size, hidden_unit)
hidden_state = encoder.init_hidden_state(BATCH_SIZE)
tmp_outputs, last_state = encoder(tmp_x, hidden_state)
print(tmp_outputs.shape)
print(last_state.shape)

(32, 630, 1024)
(32, 1024)


In [22]:
class Attention(tf.keras.Model):
    def __init__(self, hidden_units):
        super(Attention, self).__init__()
        self.W_out_encode = tf.keras.layers.Dense(hidden_unit)
        self.W_state = tf.keras.layers.Dense(hidden_unit)
        self.V = tf.keras.layers.Dense(1)
        
    def call(self, encode_outs, pre_state):
        pre_state = tf.expand_dims(pre_state, axis=1)
        pre_state = self.W_state(pre_state)
        encode_outs = self.W_out_encode(encode_outs)
        score = self.V(
            tf.nn.tanh(
                pre_state + encode_outs)
        )
        score = tf.nn.softmax(score, axis=1)
        context_vector = score*encode_outs
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, score

In [23]:
attention = Attention(hidden_unit)
context_vector, attention_weight = attention(tmp_outputs, last_state)
print(context_vector.shape, attention_weight.shape)

(32, 1024) (32, 630, 1)


In [24]:
class Decode(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_units):
        super(Decode, self).__init__()
        self.hidden_units = hidden_units
        self.Embedding = tf.keras.layers.Embedding(vocab_size,embedding_size)
        self.Attention = Attention(hidden_units)
        self.GRU = tf.keras.layers.GRU(
            hidden_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform'
        )
        self.Fc = tf.keras.layers.Dense(vocab_size)
            
    def call(self, x, encode_outs, pre_state):
        x = tf.expand_dims(x, axis=1)
        try:
            x = self.Embedding(x)
        except:
            print(x, print(tar_lang.vocab_size))          
        context_vector, attention_weight = self.Attention(encode_outs, pre_state)
        context_vector = tf.expand_dims(context_vector, axis=1)
        gru_inp = tf.concat([x, context_vector], axis=-1)
        out_gru, state = self.GRU(gru_inp)
        out_gru = tf.reshape(out_gru, (-1, out_gru.shape[2]))
        return self.Fc(out_gru), state
    
    
decode = Decode(Y_vocal_size, embedding_size, hidden_unit)
print(last_state.shape, tmp_outputs.shape, tmp_y[:, 0].shape)
decode_out, state = decode(tmp_y[:, 0], tmp_outputs, last_state)

(32, 1024) (32, 630, 1024) (32,)


In [25]:
def loss_function(real, pred):
    mask = 1 - np.equal(real, 0)
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask
    return tf.reduce_mean(loss_)

In [26]:
EPOCHS = 20
for epoch in range(EPOCHS):
    total_loss = 0
    for batch_id, (x, y) in enumerate(dataset.take(N_BATCH)):
        loss = 0
        with tf.GradientTape() as tape:
            first_state = encoder.init_hidden_state(batch_size=BATCH_SIZE)
            encode_outs, last_state = encoder(x, first_state)
            decode_state = last_state
            decode_input = [tar_lang.word2id["<start>"]]*BATCH_SIZE
            
            for i in range(1, y.shape[1]):
                decode_out, decode_state = decoder(decode_input, encode_outs, decode_state)
                loss += loss_function(y[:, i], decode_out)
                decode_input = y[:, i]
                
            train_vars = encoder.trainable_variables + decoder.trainable_variables
            grads = tape.gradient(loss, train_vars)
            optimizer.apply_gradients(zip(grads, train_vars))
        total_loss += loss
    print(total_loss.numpy())

NameError: name 'EPOCHS' is not defined