<a href="https://colab.research.google.com/github/lephuocdat2000/DeepLearning-and-Application/blob/main/EngtoVie_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Preparing Data

###1, Import libraries

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers , activations , models , preprocessing , utils
import pandas as pd



###2, Reading the data

In [2]:
!wget http://www.manythings.org/anki/vie-eng.zip -O viet-eng.zip
!unzip viet-eng.zip

--2021-06-26 02:08:42--  http://www.manythings.org/anki/vie-eng.zip
Resolving www.manythings.org (www.manythings.org)... 172.67.173.198, 104.21.55.222, 2606:4700:3031::6815:37de, ...
Connecting to www.manythings.org (www.manythings.org)|172.67.173.198|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 298429 (291K) [application/zip]
Saving to: ‘viet-eng.zip’


2021-06-26 02:08:42 (1.05 MB/s) - ‘viet-eng.zip’ saved [298429/298429]

Archive:  viet-eng.zip
  inflating: _about.txt              
  inflating: vie.txt                 


In [2]:
lines = pd.read_table( 'vie.txt' , names=[ 'eng' , 'vie' ] )
lines.reset_index( level=0 , inplace=True)
lines.rename( columns={ 'index' : 'eng' , 'eng' : 'vie' , 'vie' : 'c' } , inplace=True )
lines = lines.drop('c',1)

In [3]:
lines.head()

Unnamed: 0,eng,vie
0,Run!,Chạy!
1,Help!,Giúp tôi với!
2,Go on.,Tiếp tục đi.
3,Hello!,Chào bạn.
4,Hurry!,Nhanh lên nào!


### 3) Preparing input data for the Encoder ( `encoder_input_data` )

In [4]:
eng_lines = list()
for line in lines.eng:
    eng_lines.append( line ) 

tokenizer = preprocessing.text.Tokenizer()  
tokenizer.fit_on_texts( eng_lines ) 
tokenized_eng_lines = tokenizer.texts_to_sequences( eng_lines ) 

length_list = list()
for token_seq in tokenized_eng_lines:
    length_list.append( len( token_seq ))
max_input_length = np.array( length_list ).max()
padded_eng_lines = preprocessing.sequence.pad_sequences( tokenized_eng_lines , maxlen=max_input_length , padding='post' )
encoder_input_data = np.array( padded_eng_lines )

eng_word_dict = tokenizer.word_index
num_eng_tokens = len( eng_word_dict )+1

### 4) Preparing input data for the Decoder ( `decoder_input_data` )

In [12]:
!pip install underthesea

Collecting underthesea
[?25l  Downloading https://files.pythonhosted.org/packages/a8/5f/03ab9091b88e7851aa92da33f8eea6f111423cc1194cf1636c63c1fff3d0/underthesea-1.3.1-py3-none-any.whl (7.5MB)
[K     |████████████████████████████████| 7.5MB 9.6MB/s 
Collecting python-crfsuite>=0.9.6
[?25l  Downloading https://files.pythonhosted.org/packages/79/47/58f16c46506139f17de4630dbcfb877ce41a6355a1bbf3c443edb9708429/python_crfsuite-0.9.7-cp37-cp37m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 51.6MB/s 
[?25hCollecting seqeval
[?25l  Downloading https://files.pythonhosted.org/packages/9d/2d/233c79d5b4e5ab1dbf111242299153f3caddddbb691219f363ad55ce783d/seqeval-1.2.2.tar.gz (43kB)
[K     |████████████████████████████████| 51kB 9.1MB/s 
[?25hCollecting transformers<=3.5.1,>=3.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |████

In [5]:
import re
strip_special_chars = re.compile("[^\w0-9 ]+")

def cleanSentences(string):
    string = string.lower().replace("<br />", " ")
    return re.sub(strip_special_chars, "", string.lower())

In [6]:
from underthesea import word_tokenize
tokenized_vie_lines = []
dict_words = np.array([])
dict_numbers = np.array([]).astype('int64')
vie_lines = []
tokenized_vie_lines = []
vie_word_dict = dict()
for line in lines.vie:
    line = word_tokenize(line,format='text')  
    full_line = '<START> ' + line + ' <END>'
    full_line = cleanSentences(full_line)
    words = full_line.split()
    for word in words:
       where_word_arr = np.where(dict_words==word)[0]
       if len(where_word_arr)==0: 
           dict_words=np.append(dict_words,word)
           dict_numbers=np.append(dict_numbers,1)
       else: 
         dict_numbers[where_word_arr[0]]+=1
    vie_lines.append(full_line)

agr_sorted_dict_numbers = np.argsort(dict_numbers)[::-1]
for idx,value in enumerate(agr_sorted_dict_numbers):
    vie_word_dict.update({dict_words[value]:idx})

# tokenizer = preprocessing.text.Tokenizer()
# tokenizer.fit_on_texts(vie_lines) 
# tokenized_vie_lines = tokenizer.texts_to_sequences(vie_lines) 

# length_list = list()
# for token_seq in tokenized_vie_lines:
#     length_list.append( len( token_seq ))

# max_output_length = np.array( length_list ).max()
# print( 'Vietnamese max length is {}'.format( max_output_length ))

# padded_vie_lines = preprocessing.sequence.pad_sequences( tokenized_vie_lines , maxlen=max_output_length, padding='post' )
# decoder_input_data = np.array( padded_vie_lines )
# print( 'Decoder input data shape -> {}'.format( decoder_input_data.shape ))


  


In [7]:
for i,vie_line in enumerate(vie_lines):
    words = vie_line.split()
    words_arr = np.zeros(len(words),dtype=int)
    for i,word in enumerate(words):
       words_arr[i] = vie_word_dict[word]
    tokenized_vie_lines.append(words_arr)

length_list = list()
for token_seq in tokenized_vie_lines:
    length_list.append( len( token_seq ))
max_output_length = np.array(length_list).max()



In [9]:


# padded_vie_lines = preprocessing.sequence.pad_sequences( tokenized_vie_lines , maxlen=max_output_length , padding='post' )
# decoder_input_data = np.array([padded_vie_lines] )
# num_vie_tokens = len(dict_words )+1
# print( 'Number of Vietnamese tokens = {}'.format( num_vie_tokens))

### 5) Preparing target data for the Decoder ( decoder_target_data ) 

In [8]:
decoder_target_data = list()
decoder_input_data = list()
for token_seq in tokenized_vie_lines:
    decoder_target_data.append( token_seq[ 1 : ] ) 
    decoder_input_data.append(token_seq[0:-1])

#decoder_input
padded_vie_lines = preprocessing.sequence.pad_sequences( decoder_input_data , maxlen=max_output_length , padding='post' )
decoder_input_data = np.array([padded_vie_lines])
decoder_input_data = decoder_input_data.reshape(decoder_input_data.shape[1],decoder_input_data.shape[2])
num_vie_tokens = len(dict_words )+1

padded_vie_lines = preprocessing.sequence.pad_sequences( decoder_target_data , maxlen=max_output_length, padding='post' )
onehot_vie_lines = utils.to_categorical( padded_vie_lines , num_vie_tokens)
decoder_target_data = np.array( onehot_vie_lines )
print( 'Decoder target data shape -> {}'.format( decoder_target_data.shape ))

Decoder target data shape -> (7547, 36, 3542)


#Defining and Training the models

###1, Defining the Encoder - Decoder model

In [11]:
encoder_inputs = tf.keras.layers.Input(shape=( None , ))
encoder_embedding = tf.keras.layers.Embedding( num_eng_tokens, 256 , mask_zero=True ) (encoder_inputs)
encoder_outputs , state_h , state_c = tf.keras.layers.LSTM( 128 , return_state=True  )( encoder_embedding )
encoder_states = [ state_h , state_c ]

decoder_inputs = tf.keras.layers.Input(shape=( None ,  ))
decoder_embedding = tf.keras.layers.Embedding( num_vie_tokens, 256 , mask_zero=True) (decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM( 128 , return_state=True , return_sequences=True)
decoder_outputs , _ , _ = decoder_lstm ( decoder_embedding , initial_state=encoder_states )
decoder_dense = tf.keras.layers.Dense( num_vie_tokens , activation=tf.keras.activations.softmax ) 
output = decoder_dense ( decoder_outputs )

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output )
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy')

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 256)    950272      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    906752      input_2[0][0]                    
______________________________________________________________________________________________

###Training model

In [13]:
model.fit([encoder_input_data , decoder_input_data], decoder_target_data, batch_size=250, epochs=50 ) 
model.save( 'model.h5' ) 

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


#Load weights

In [30]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
model.save()

In [49]:
from keras.models import load_model
model = load_model('/content/drive/MyDrive/model_engtovie.h5')

In [14]:
def make_inference_models():
    
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)
    
    decoder_state_input_h = tf.keras.layers.Input(shape=( 128 ,))
    decoder_state_input_c = tf.keras.layers.Input(shape=( 128 ,))
    
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding , initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)
    
    return encoder_model , decoder_model

def str_to_tokens( sentence : str ):
    words = sentence.lower().split()
    tokens_list = list()
    for word in words:
        tokens_list.append( eng_word_dict[ word ] ) 
    return preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=max_input_length , padding='post')

In [16]:
enc_model , dec_model = make_inference_models()



In [21]:
for epoch in range( encoder_input_data.shape[0] ):
    states_values = enc_model.predict( str_to_tokens( input( 'Enter eng sentence : ' ) ) )
    #states_values = enc_model.predict( encoder_input_data[ epoch ] )
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = vie_word_dict['start']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
        sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
        print(dec_outputs,sampled_word_index)
        sampled_word = None
        for word , index in vie_word_dict.items() :
            if sampled_word_index == index :
                decoded_translation += ' {}'.format( word )
                sampled_word = word
        
        if sampled_word == 'end' or len(decoded_translation.split()) > max_output_length:
            stop_condition = True
            
        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ] 

    print( decoded_translation )


Enter eng sentence : What do you do
[[[0.00015568 0.00028951 0.00029488 ... 0.00030471 0.00026156 0.00015696]]] 483
[[[3.2792529e-09 5.4143015e-03 2.9231443e-05 ... 1.7359195e-10
   1.1264391e-06 2.8334193e-09]]] 8
[[[1.3190512e-09 1.9490311e-04 1.7625845e-05 ... 8.9314529e-09
   3.3483272e-07 1.2383141e-09]]] 17
[[[9.4790709e-10 8.0801085e-02 7.3738303e-04 ... 1.3916279e-11
   2.5501043e-08 9.9719100e-10]]] 17
[[[3.5146203e-10 6.3107687e-01 3.4885405e-04 ... 1.4640876e-12
   1.6638680e-08 4.1949763e-10]]] 1
 ốm có gì gì end


KeyboardInterrupt: ignored

In [20]:
vie_word_dict

{'start': 0,
 'end': 1,
 'tôi': 2,
 'không': 3,
 'tom': 4,
 'bạn': 5,
 'đã': 6,
 'là': 7,
 'có': 8,
 'một': 9,
 'ấy': 10,
 'anh': 11,
 'đó': 12,
 'của': 13,
 'sẽ': 14,
 'đi': 15,
 'ở': 16,
 'gì': 17,
 'làm': 18,
 'cho': 19,
 'người': 20,
 'nói': 21,
 'đang': 22,
 'được': 23,
 'biết': 24,
 'muốn': 25,
 'phải': 26,
 'cái': 27,
 'này': 28,
 'cô': 29,
 'đến': 30,
 'với': 31,
 'ta': 32,
 'mary': 33,
 'có_thể': 34,
 'điều': 35,
 'vào': 36,
 'nghĩ': 37,
 'nó': 38,
 'trong': 39,
 'chúng_tôi': 40,
 'đây': 41,
 'rất': 42,
 'để': 43,
 'về': 44,
 'ra': 45,
 'ai': 46,
 'khi': 47,
 'nhà': 48,
 'nào': 49,
 'hơn': 50,
 'bị': 51,
 'nhiều': 52,
 'những': 53,
 'thích': 54,
 'rồi': 55,
 'lại': 56,
 'mình': 57,
 'tiếng': 58,
 'mà': 59,
 'nên': 60,
 'chúng_ta': 61,
 'ăn': 62,
 'và': 63,
 'cậu': 64,
 'không_thể': 65,
 'đâu': 66,
 'thì': 67,
 'cần': 68,
 'hãy': 69,
 'còn': 70,
 'họ': 71,
 'thấy': 72,
 'cũng': 73,
 'ông': 74,
 'cả': 75,
 'lúc': 76,
 'chưa': 77,
 'mày': 78,
 'quá': 79,
 'từ': 80,
 'như': 81,
 '