<a href="https://colab.research.google.com/github/lephuocdat2000/DeepLearning-and-Application/blob/main/EngtoVie_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Preparing Data

###1, Import libraries

In [11]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers , activations , models , preprocessing , utils
import pandas as pd



###2, Reading the data

In [2]:
!wget http://www.manythings.org/anki/vie-eng.zip -O viet-eng.zip
!unzip viet-eng.zip

--2021-06-24 02:44:52--  http://www.manythings.org/anki/vie-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.21.55.222, 172.67.173.198, 2606:4700:3036::ac43:adc6, ...
Connecting to www.manythings.org (www.manythings.org)|104.21.55.222|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 298429 (291K) [application/zip]
Saving to: ‘viet-eng.zip’


2021-06-24 02:44:53 (1.14 MB/s) - ‘viet-eng.zip’ saved [298429/298429]

Archive:  viet-eng.zip
  inflating: _about.txt              
  inflating: vie.txt                 


In [12]:
lines = pd.read_table( 'vie.txt' , names=[ 'eng' , 'vie' ] )
lines.reset_index( level=0 , inplace=True)
lines.rename( columns={ 'index' : 'eng' , 'eng' : 'vie' , 'vie' : 'c' } , inplace=True )
lines = lines.drop('c',1)

In [5]:
lines.head()

Unnamed: 0,eng,vie
0,Run!,Chạy!
1,Help!,Giúp tôi với!
2,Go on.,Tiếp tục đi.
3,Hello!,Chào bạn.
4,Hurry!,Nhanh lên nào!


### 3) Preparing input data for the Encoder ( `encoder_input_data` )

In [14]:
eng_lines = list()
for line in lines.eng:
    eng_lines.append( line ) 

tokenizer = preprocessing.text.Tokenizer()  
tokenizer.fit_on_texts( eng_lines ) 
tokenized_eng_lines = tokenizer.texts_to_sequences( eng_lines ) 

length_list = list()
for token_seq in tokenized_eng_lines:
    length_list.append( len( token_seq ))
max_input_length = np.array( length_list ).max()
padded_eng_lines = preprocessing.sequence.pad_sequences( tokenized_eng_lines , maxlen=max_input_length , padding='post' )
encoder_input_data = np.array( padded_eng_lines )

eng_word_dict = tokenizer.word_index
num_eng_tokens = len( eng_word_dict )+1

### 4) Preparing input data for the Decoder ( `decoder_input_data` )

In [6]:
!pip install underthesea

Collecting underthesea
[?25l  Downloading https://files.pythonhosted.org/packages/a8/5f/03ab9091b88e7851aa92da33f8eea6f111423cc1194cf1636c63c1fff3d0/underthesea-1.3.1-py3-none-any.whl (7.5MB)
[K     |████████████████████████████████| 7.5MB 7.0MB/s 
Collecting transformers<=3.5.1,>=3.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 49.1MB/s 
Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/9e/25/723487ca2a52ebcee88a34d7d1f5a4b80b793f179ee0f62d5371938dfa01/Unidecode-1.2.0-py2.py3-none-any.whl (241kB)
[K     |████████████████████████████████| 245kB 48.7MB/s 
[?25hCollecting torch<=1.5.1,>=1.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/a4/cf/007b6de316c9f3d4cb315a60c308342cc299e464167f5ebc369e93b5e23a/torch-1.5.1-cp37-cp37m-manylinux1_x86_64.whl (753.2MB)
[K     

In [15]:
import re
strip_special_chars = re.compile("[^\w0-9 ]+")

def cleanSentences(string):
    string = string.lower().replace("<br />", " ")
    return re.sub(strip_special_chars, "", string.lower())

In [16]:
from underthesea import word_tokenize
tokenized_vie_lines = []
dict_words = np.array([])
dict_numbers = np.array([]).astype('int64')
vie_lines = []
tokenized_vie_lines = []
vie_word_dict = dict()
for line in lines.vie:
    line = word_tokenize(line,format='text')  
    full_line = '<START> ' + line + ' <END>'
    full_line = cleanSentences(full_line)
    words = full_line.split()
    for word in words:
       where_word_arr = np.where(dict_words==word)[0]
       if len(where_word_arr)==0: 
           dict_words=np.append(dict_words,word)
           dict_numbers=np.append(dict_numbers,1)
       else: 
         dict_numbers[where_word_arr[0]]+=1
    vie_lines.append(full_line)

agr_sorted_dict_numbers = np.argsort(dict_numbers)[::-1]
for idx,value in enumerate(agr_sorted_dict_numbers):
    vie_word_dict.update({dict_words[value]:idx})

# tokenizer = preprocessing.text.Tokenizer()
# tokenizer.fit_on_texts(vie_lines) 
# tokenized_vie_lines = tokenizer.texts_to_sequences(vie_lines) 

# length_list = list()
# for token_seq in tokenized_vie_lines:
#     length_list.append( len( token_seq ))

# max_output_length = np.array( length_list ).max()
# print( 'Vietnamese max length is {}'.format( max_output_length ))

# padded_vie_lines = preprocessing.sequence.pad_sequences( tokenized_vie_lines , maxlen=max_output_length, padding='post' )
# decoder_input_data = np.array( padded_vie_lines )
# print( 'Decoder input data shape -> {}'.format( decoder_input_data.shape ))


  


In [17]:
for i,vie_line in enumerate(vie_lines):
    words = vie_line.split()
    words_arr = np.zeros(len(words),dtype=int)
    for i,word in enumerate(words):
       words_arr[i] = vie_word_dict[word]
    tokenized_vie_lines.append(words_arr)

In [18]:
length_list = list()
for token_seq in tokenized_vie_lines:
    length_list.append( len( token_seq ))
max_output_length = np.array( length_list ).max()
padded_vie_lines = preprocessing.sequence.pad_sequences( tokenized_vie_lines , maxlen=max_output_length , padding='post' )
decoder_input_data = np.array( padded_vie_lines )
num_vie_tokens = len(dict_words )+1
print( 'Number of Vietnamese tokens = {}'.format( num_vie_tokens))

Number of Vietnamese tokens = 3542


### 5) Preparing target data for the Decoder ( decoder_target_data ) 

In [19]:
decoder_target_data = list()
for token_seq in tokenized_vie_lines:
    decoder_target_data.append( token_seq[ 1 : ] ) 
    
padded_vie_lines = preprocessing.sequence.pad_sequences( decoder_target_data , maxlen=max_output_length, padding='post' )
onehot_vie_lines = utils.to_categorical( padded_vie_lines , num_vie_tokens)
decoder_target_data = np.array( onehot_vie_lines )
print( 'Decoder target data shape -> {}'.format( decoder_target_data.shape ))

Decoder target data shape -> (7547, 36, 3542)


#Defining and Training the models

###1, Defining the Encoder - Decoder model

In [20]:
encoder_inputs = tf.keras.layers.Input(shape=( None , ))
encoder_embedding = tf.keras.layers.Embedding( num_eng_tokens, 256 , mask_zero=True ) (encoder_inputs)
encoder_outputs , state_h , state_c = tf.keras.layers.LSTM( 128 , return_state=True  )( encoder_embedding )
encoder_states = [ state_h , state_c ]

decoder_inputs = tf.keras.layers.Input(shape=( None ,  ))
decoder_embedding = tf.keras.layers.Embedding( num_vie_tokens, 256 , mask_zero=True) (decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM( 128 , return_state=True , return_sequences=True)
decoder_outputs , _ , _ = decoder_lstm ( decoder_embedding , initial_state=encoder_states )
decoder_dense = tf.keras.layers.Dense( num_vie_tokens , activation=tf.keras.activations.softmax ) 
output = decoder_dense ( decoder_outputs )

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output )
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy')

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    950272      input_3[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 256)    906752      input_4[0][0]                    
______________________________________________________________________________________________

###Training model

In [22]:
model.fit([encoder_input_data , decoder_input_data], decoder_target_data, batch_size=250, epochs=50 ) 
model.save( 'model.h5' ) 

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [23]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [24]:
!cp /content/model_engtovie.h5 /content/drive/MyDrive

In [15]:
def make_inference_models():
    
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)
    
    decoder_state_input_h = tf.keras.layers.Input(shape=( 128 ,))
    decoder_state_input_c = tf.keras.layers.Input(shape=( 128 ,))
    
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding , initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)
    
    return encoder_model , decoder_model

def str_to_tokens( sentence : str ):
    words = sentence.lower().split()
    tokens_list = list()
    for word in words:
        tokens_list.append( eng_word_dict[ word ] ) 
    return preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=max_input_length , padding='post')

In [20]:
enc_model , dec_model = make_inference_models()

<tensorflow.python.keras.engine.functional.Functional object at 0x7f4460298e50>


In [21]:
for epoch in range( encoder_input_data.shape[0] ):
    states_values = enc_model.predict( str_to_tokens( input( 'Enter eng sentence : ' ) ) )
    print(states_values)
    #states_values = enc_model.predict( encoder_input_data[ epoch ] )
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = vie_word_dict['start']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
        sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
        sampled_word = None
        for word , index in vie_word_dict.items() :
            if sampled_word_index == index :
                decoded_translation += ' {}'.format( word )
                sampled_word = word
        
        if sampled_word == 'end' or len(decoded_translation.split()) > max_output_length:
            stop_condition = True
            
        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ] 

    print( decoded_translation )


Enter eng sentence : Hello
[array([[ 0.0540865 , -0.00490825, -0.10520622,  0.28306064,  0.44182664,
        -0.41847327, -0.15631813, -0.27720797, -0.12261759, -0.13239479,
         0.15604481, -0.05106071, -0.14828748,  0.44798028, -0.15015084,
         0.0700992 , -0.17648865,  0.24745317,  0.12648804,  0.23181261,
        -0.4393365 ,  0.11443024,  0.29932436,  0.19135484,  0.11151613,
        -0.23765467,  0.39572933,  0.31172678, -0.13191128, -0.13740106,
         0.2728125 ,  0.02462601, -0.2375038 ,  0.30787456, -0.4800027 ,
         0.09464712, -0.28419167,  0.0991584 , -0.08584496, -0.12324665,
        -0.29007402, -0.13195777,  0.17633331,  0.23387572,  0.32660767,
        -0.21064325,  0.20506108, -0.28932232, -0.10836972,  0.37642172,
        -0.00772683, -0.1843418 , -0.3072076 ,  0.28467813, -0.32326764,
        -0.25114682,  0.18430671, -0.1282141 ,  0.16773863,  0.2402108 ,
         0.25811228,  0.42270294, -0.28615516, -0.28344062, -0.3972286 ,
        -0.13795567, -0

KeyboardInterrupt: ignored

In [19]:
vie_lines

['start chạy  end',
 'start giúp tôi với  end',
 'start tiếp_tục đi  end',
 'start chào bạn  end',
 'start nhanh lên nào  end',
 'start tôi sẽ thử  end',
 'start tôi sẽ thử xem sao  end',
 'start ăn đi  end',
 'start ăn nó đi  end',
 'start hiểu rồi  end',
 'start cứu tôi với  end',
 'start tôi cũng nghĩ như_vậy  end',
 'start hoàn_hảo  end',
 'start chúng_tôi biết  end',
 'start chúng_ta biết  end',
 'start bạn chạy  end',
 'start đừng có rầu_rĩ quá như_thế  end',
 'start hắn thử  end',
 'start anh thử  end',
 'start thoáng cái chân lên  end',
 'start tôi quên mất rồi  end',
 'start tôi sẽ thử  end',
 'start tôi sẽ thử xem sao  end',
 'start tôi bị hói end',
 'start tôi đang bận  end',
 'start muộn quá  end',
 'start lại đây nào  end',
 'start lại đây  end',
 'start lại đây đi  end',
 'start đi ngủ đi  end',
 'start tôi ghét ti vi  end',
 'start tôi đã cười  end',
 'start tôi cười  end',
 'start tôi sẽ đi  end',
 'start nó có tồi không  end',
 'start lạnh  end',
 'start đó là của chún