# 0. Import libraries

In [1]:
%%capture
!pip install fasttext underthesea nltk 

In [2]:
import re
import json
import string
import random
import warnings

import numpy as np
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
from tqdm.notebook import tqdm

warnings.filterwarnings('ignore')

In [3]:
import os

# 1. Preparing data for training

In [4]:
data_train_vi = []
data_train_en = []
data_test2012_vi = []
data_test2012_en = []
data_test2013_vi = []
data_test2013_en = []

In [5]:
for dirname, _, filenames in os.walk('/kaggle/input/machine-translate-envi/data'):
    for filename in filenames:
        path = os.path.join(dirname, filename)
        if 'train.vi' in filename:
            with open(path, 'r', encoding='utf-8') as f:
                data_train_vi.extend(f.readlines())
        elif 'train.en' in filename:
            with open(path, 'r', encoding='utf-8') as f:
                data_train_en.extend(f.readlines())
        elif 'tst2012.vi' in filename:
            with open(path, 'r', encoding='utf-8') as f:
                data_test2012_vi.extend(f.readlines())
        elif 'tst2012.en' in filename:
            with open(path, 'r', encoding='utf-8') as f:
                data_test2012_en.extend(f.readlines())
        elif 'tst2013.vi' in filename:
            with open(path, 'r', encoding='utf-8') as f:
                data_test2013_vi.extend(f.readlines())
        elif 'tst2013.en' in filename:
            with open(path, 'r', encoding='utf-8') as f:
                data_test2013_en.extend(f.readlines())

In [6]:
def prepare( sentence):
        sentence = re.sub(
        r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", str(sentence))
        sentence = re.sub(r"[ ]+", " ", sentence)
        sentence = re.sub(r"\!+", "!", sentence)
        sentence = re.sub(r"\,+", ",", sentence)
        sentence = re.sub(r"\?+", "?", sentence)
        sentence = sentence.lower()
        return sentence

In [7]:
pattern1 =  r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]"

In [8]:
data_train_vi=[prepare(sentence) for sentence in data_train_vi]
data_train_en=[prepare(sentence) for sentence in data_train_en]
data_test2012_vi=[prepare(sentence) for sentence in data_test2012_vi]

data_test2012_en=[prepare(sentence) for sentence in data_test2012_en]
data_test2013_vi=[prepare(sentence) for sentence in data_test2013_vi]
data_test2013_en=[prepare(sentence) for sentence in data_test2013_en]

In [9]:
print(data_train_vi[10])
print(data_train_en[10])

mỗi năm , hơn 15,000 nhà khoa học đến san francisco để tham dự hội nghị này . 
over 15,000 scientists go to san francisco every year for that . 


# 2. Vectorizing the text data and format our datasets

In [10]:
strip_chars = string.punctuation
strip_chars = strip_chars.replace('[', '')
strip_chars = strip_chars.replace(']', '')

vocab_size = 15000
sequence_length = 100
batch_size = 128

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, '[%s]' % re.escape(strip_chars), '')

en_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length
)
vi_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization
)

en_vectorization.adapt(data_train_en)
vi_vectorization.adapt(data_train_vi)

In [11]:
def format_dataset(en, vi):
    en = en_vectorization(en)
    vi = vi_vectorization(vi)
    return ({
        'encoder_inputs': en,
        'decoder_inputs': vi[:, :-1]
    }, vi[:, 1:])

def make_dataset(en_texts,vi_texts):
    
    dataset = tf.data.Dataset.from_tensor_slices((en_texts, vi_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    
    return dataset.shuffle(2048).prefetch(16).cache()

train_ds = make_dataset(data_train_en,data_train_vi)
val_ds = make_dataset(data_test2012_en,data_test2012_vi)

In [12]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")

inputs["encoder_inputs"].shape: (128, 100)
inputs["decoder_inputs"].shape: (128, 100)
targets.shape: (128, 100)


# 3. Building the model

In [13]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'embed_dim': self.embed_dim,
            'dense_dim': self.dense_dim,
            'num_heads': self.num_heads
        })
        return config

class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super(PositionalEmbedding, self).__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)
    
    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'sequence_length': self.sequence_length,
            'vocab_size': self.vocab_size,
            'embed_dim': self.embed_dim
        })
        return config

class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)
    
    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'embed_dim': self.embed_dim,
            'latent_dim': self.latent_dim,
            'num_heads': self.num_heads
        })
        return config

In [14]:
embed_dim = 256
latent_dim = 2048
num_heads = 8

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

# 4. Training our model

In [16]:
print(tf.__version__)

2.6.2


In [17]:
epochs = 20

transformer.summary()
transformer.compile(
    optimizer="rmsprop", 
    loss="sparse_categorical_crossentropy", 
    metrics=["accuracy"]
)
history = transformer.fit(train_ds, epochs=epochs, validation_data=val_ds)

Model: "transformer"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
positional_embedding (Positiona (None, None, 256)    3865600     encoder_inputs[0][0]             
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
transformer_encoder (Transforme (None, None, 256)    3155456     positional_embedding[0][0]       
________________________________________________________________________________________

In [22]:
json.dump(history.history, open('./history.json', 'w'))

In [29]:
transformer.save('./best_model.h5')


In [24]:
vi_vocab = vi_vectorization.get_vocabulary()
vi_index_lookup = dict(zip(range(len(vi_vocab)), vi_vocab))
max_decoded_sentence_length = 40

def decode_sequence(input_sentence):
    tokenized_input_sentence = en_vectorization([input_sentence])
    decoded_sentence = '[start]'
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = vi_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])
        
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = vi_index_lookup[sampled_token_index]
        decoded_sentence += ' ' + sampled_token
        
        if sampled_token == '[end]':
            break
    return decoded_sentence

In [25]:
test_eng_texts = data_test2013_en[:10]

for i in range(10):
    input_sentence = random.choice(test_eng_texts)
    translated = decode_sequence(input_sentence)
    print(f'{"%02d" % (i + 1)}: {input_sentence} ---> {translated}')

01: although i often wondered about the outside world , i thought i would spend my entire life in north korea , until everything suddenly changed .  ---> [start] mặc dù tôi thường tự hỏi về thế giới bên ngoài tôi đã nghĩ tôi sẽ dành cả cuộc sống của tôi cho tới khi mọi thứ đột nhiên thay thế  mọi bất   nào mọi  
02: my family was not poor , and myself , i had never experienced hunger .  ---> [start] gia đình tôi không phải là người nghèo và tôi đã không bao giờ trải nghiệm                        
03: my family was not poor , and myself , i had never experienced hunger .  ---> [start] gia đình tôi không phải là người nghèo và tôi đã không bao giờ trải nghiệm                        
04: it read , &quot when you read this , all five family members will not exist in this world , because we haven &apos t eaten for the past two weeks .  ---> [start] đọc khi bạn đọc tất cả những tác phẩm này được năm thành viên không tồn tại trong thế giới này bởi vì chúng ta chưa từng ăn trong suốt hai tuần 

In [26]:
test_eng_texts = data_test2013_en[10:30]

for i in range(20):
    input_sentence = random.choice(test_eng_texts)
    translated = decode_sequence(input_sentence)
    print(f'{"%02d" % (i + 1)}: {input_sentence} ---> {translated}')

01: this is the amrok river , which serves as a part of the border between north korea and china .  ---> [start] đây là dòng sông thu âm được một phần giữa bắc israel và trung quốc     giữa và       và       và     
02: sometimes , i saw dead bodies floating down the river .  ---> [start] đôi khi tôi thấy được cơ thể đã bay trên sông                             
03: i always wondered why they had lights but we didn &apos t .  ---> [start] luôn tự hỏi tại sao chúng ta có đèn nhưng chúng ta không                    không      không 
04: as you can see , the river can be very narrow at certain points , allowing north koreans to secretly cross .  ---> [start] như bạn có thể thấy sông ở những điểm rất nhỏ ở một vài điểm cho phép người bắc mỹ bạn đi qua những cuộc hội thoại             
05: i could have never imagined that it would take 14 years to live together .  ---> [start] có thể không bao giờ tưởng tượng rằng sẽ sống được với nhau                           
06: in china , it was hard l