In [1]:
import os
from pickle import dump
import pandas as pd
from numpy import array
from numpy.random import shuffle
from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Input ,LSTM, Embedding, Dense, TimeDistributed

from tensorflow.keras.layers import RepeatVector, Bidirectional, Dropout

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.callbacks import ModelCheckpoint

#from generate_phoneme import get_phoneme_list
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np

from numpy import argmax
from tensorflow.keras.models import load_model,model_from_json
import time
from pickle import load

In [2]:
# Path to the data txt file on disk.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
class LoadData():
        
    def load_data(self):
        data_path = '/content/drive/My Drive/Projects/NLP-Notebooks/1.4-machine-translation/data/ben-eng/ben.txt'
        file = open(data_path, encoding="utf8")
        lines = file.read().split('\n')
        input_target = list()
        for line in lines:
            try:
                input_text, target_text,_ = line.split('\t')
                target_text = "START "+target_text+" END"
                input_target.append([input_text, target_text])
            except:
                pass
        df = pd.DataFrame(input_target, columns = ["Input","Target"])
        return input_target,df

In [44]:
class PreProcessing():
    def __init__(self,dataset=None):
        self.pairs = None
        self.dataset = dataset
        self.training_data = None
        self.testing_data = None
        
    def tokenization(self,df):
        # Input Vocabulary Size:1875
        # Input Max Length Size:19
        # Target Vocabulary Size: 3312
        # Target Max Length Size:18
        df['Target'] = df['Target'].apply(lambda x : 'START '+ x + ' END')
        self.input_max_length=max(df['Input'].apply(lambda x:len(x.split())))
        self.target_max_length=max(df['Target'].apply(lambda x:len(x.split())))

        all_input_words = list(set([word for sent in df["Input"] for word in sent.split()]))
        all_target_words = list(set([word for sent in df["Target"] for word in sent.split()]))

        self.input_vocab_size = len(all_input_words)
        self.target_vocab_size = len(all_target_words)

        self.input_token_index = dict([(word, i+1) for i, word in enumerate(all_input_words)])
        self.target_token_index = dict([(word, i+1) for i, word in enumerate(all_target_words)])
        self.reverse_input_index = dict((i, word) for word, i in self.input_token_index.items())
        self.reverse_target_index = dict((i, word) for word, i in self.target_token_index.items())
        x_train, x_test, y_train, y_test = train_test_split(df["Input"], df["Target"], test_size = 0.2,random_state=123)
        return x_train, x_test, y_train, y_test
        

    def get_tokenizer(self,lines):
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(lines)
        return tokenizer
        
    def max_length(self,lines):
        return max(len(line.split()) for line in lines)

    def get_length(self):
        correct_data = self.dataset[:, 0]
        faulty_data = self.dataset[:, 1]
        tar_data_length = max(len(line.split()) for line in correct_data)
        src_data_length = max(len(line.split()) for line in faulty_data)
        return tar_data_length,src_data_length
    
    def encode_sequences(self,tokenizer, length, lines):
        X = tokenizer.texts_to_sequences(lines)
        X = pad_sequences(X, maxlen=length, padding='post')
        return X
 
    def encode_output(self,sequences, vocab_size):
        ylist = list()
        for sequence in sequences:
            encoded = to_categorical(sequence, num_classes=vocab_size)
            ylist.append(encoded)
        y = np.array(ylist)
        y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
        return y

    def get_train_test_data(self,data):
        train, test = train_test_split(data, test_size=0.2, random_state = 143)
        return train, test 




In [45]:
class DesignModel():
    
    def __init__(self,X_train,Y_train,X_test,Y_test,epochs,batch_size):
        self.X_train = X_train
        self.Y_train = Y_train
        self.X_test = X_test
        self.Y_test = Y_test
        self.epochs = epochs
        self.batch_size = batch_size
    
    def generate_batch(self, X, y, max_length_src, max_length_tar,encoder_tokens, decoder_tokens,num_decoder_tokens, batch_size ):
        ''' Generate a batch of data '''
        while True:
            for j in range(0, len(X), batch_size):
                encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32')
                decoder_input_data = np.zeros((batch_size, max_length_tar),dtype='float32')
                decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens),dtype='float32')
                for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                    for t, item in enumerate(input_text.split()):
                        try:
                            encoder_input_data[i, t] = encoder_tokens[item]
                        except Exception as e:
                            print("Wrong word:",item)
                            print("Exception:",e)
                    for t, item in enumerate(target_text.split()):
                        if t<len(target_text)-1:
                            decoder_input_data[i, t] = decoder_tokens[item]
                        if t>0:
                            decoder_target_data[i, t - 1, decoder_tokens[item]] = 1.
                yield ([encoder_input_data, decoder_input_data], decoder_target_data)
                
    def enc_dec_model(self, input_vocab_size, target_vocab_size, encoder_tokens, decoder_tokens, src_data_length, tar_data_length, n_units):
        
        #encoder
        encoder_input = Input(shape = (None,))
        encoder_emb =  Embedding(input_vocab_size, n_units, mask_zero = False)(encoder_input)
        encoder_lstm = LSTM(n_units,return_state = True)
        encoder_outputs,encode_h,encoder_c = encoder_lstm(encoder_emb)
        encoder_states = [encode_h,encoder_c]
        
        #decoder
        decoder_input = Input(shape = (None,))
        decoder_emb_layer = Embedding(target_vocab_size, n_units, mask_zero = False)
        decoder_emb = decoder_emb_layer(decoder_input)
        decoder_lstm = LSTM(n_units,return_sequences=True,return_state = True)
        decoder_out,decode_h,decoder_c = decoder_lstm(decoder_emb,initial_state = encoder_states)
        decoder_dense = Dense(target_vocab_size,activation="softmax")
        decoder_out = decoder_dense(decoder_out)
        self.model = Model([encoder_input,decoder_input],decoder_out)
        #compile
        self.model.compile(optimizer="rmsprop",loss="categorical_crossentropy",metrics=['acc'])
        print(self.model.summary())
        #fit
        train_gen = self.generate_batch(self.X_train,self.Y_train,src_data_length,tar_data_length,encoder_tokens, decoder_tokens, target_vocab_size,self.batch_size)
        test_gen = self.generate_batch(self.X_test,self.Y_test,src_data_length,tar_data_length,encoder_tokens, decoder_tokens, target_vocab_size,self.batch_size)
        train_samples_steps = len(self.X_train) / self.batch_size
        val_samples_steps = len(self.X_test) / self.batch_size
                

        self.model.fit(train_gen, steps_per_epoch = train_samples_steps, epochs=self.epochs, validation_data = test_gen, validation_steps = val_samples_steps)
        
        #eocoder setup
        
        self.encoder_model = Model(encoder_input, encoder_states)
        
        # Decoder setup
        decoder_state_input_h = Input(shape=(n_units,))
        decoder_state_input_c = Input(shape=(n_units,))
        decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
        
        dec_emb2 = decoder_emb_layer(decoder_input)
        
        decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
        decoder_states2 = [state_h2, state_c2]
        decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary
        
        # Final decoder model
        self.decoder_model = Model([decoder_input] + decoder_states_inputs,[decoder_outputs2] + decoder_states2)
    
    def save_model(self,model,model_file):
        with open(model_file+'.json', 'w', encoding='utf8') as f:
            f.write(model.to_json())
        model.save_weights(model_file+'.h5')
        
    def save(self):
        encoder_model_name = 'Encoder_Model'
        self.save_model(self.encoder_model,encoder_model_name)
        
        decoder_model_name = 'Decoder_Model'
        self.save_model(self.decoder_model,decoder_model_name)


In [46]:
class Prediction():
    
    def __init__(self,input_vocab_size,target_vocab_size,input_length,target_length):
        self.src_data_length = input_length
        self.tar_data_length = target_length
        self.data_vocab_size = target_vocab_size
    
    def load_weights(self,model_filename, model_weights_filename):
        with open(model_filename, 'r', encoding='utf8') as f:
            model = model_from_json(f.read())
        model.load_weights(model_weights_filename)
        return model
    
    def load_model(self):
        encoder_model_name = 'Encoder_Model'
        decoder_model_name = 'Decoder_Model'
        self.encoder_model = self.load_weights(encoder_model_name+'.json', encoder_model_name+'.h5')
        self.decoder_model = self.load_weights(decoder_model_name+'.json', decoder_model_name+'.h5')
    
    def decode_sequence(self,input_seq,target_token_index,reverse_target_index):
        # Encode the input as state vectors.
        states_value = encoder_model.predict(input_seq)
        # Generate empty target sequence of length 1.
        target_seq = np.zeros((1,1))
        # Populate the first character of target sequence with the start character.
        target_seq[0, 0] = target_token_index['START']

        # Sampling loop for a batch of sequences
        # (to simplify, here we assume a batch of size 1).
        stop_condition = False
        decoded_sentence = ''
        while not stop_condition:
            output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

            # Sample a token
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_char = reverse_target_index[sampled_token_index]
            decoded_sentence += ' '+sampled_char

            # Exit condition: either hit max length
            # or find stop character.
            if (sampled_char == 'END' or len(decoded_sentence) > 50):
                stop_condition = True

            # Update the target sequence (of length 1).
            target_seq = np.zeros((1,1))
            target_seq[0, 0] = sampled_token_index

            # Update states
            states_value = [h, c]

        return decoded_sentence

Loading & preprocessing

In [48]:
batch_size = 32
ld = LoadData()
input_target,df = ld.load_data()

print(df.head()) 
prp = PreProcessing()
x_train, x_test, y_train, y_test = prp.tokenization(df)
print(x_train.shape, x_test.shape)
input_vocab_size = prp.input_vocab_size + 1
target_vocab_size = prp.target_vocab_size + 1
input_token_index = prp.input_token_index
target_token_index = prp.target_token_index
input_max_length = prp.input_max_length
target_max_length = prp.target_max_length
reverse_input_index = prp.reverse_input_index
reverse_target_index = prp.reverse_target_index

# input_texts = np.array(input_target)[:, 0]
# target_texts = np.array(input_target)[:, 1]
# prp = PreProcessing()
# input_tokenizer = prp.get_tokenizer(input_texts)
# input_vocab_size = len(input_tokenizer.word_index) + 1
# input_max_length = prp.max_length(input_texts)
# input_token_index = None
# reverse_input_index = None
# print(f'Input Vocabulary Size:{input_vocab_size}')
# print(f'Input Max Length Size:{input_length}')
            
# target_tokenizer = prp.get_tokenizer(target_texts)
# target_vocab_size = len(target_tokenizer.word_index) + 1
# target_max_length = prp.max_length(target_texts)
# target_token_index
# reverse_target_index

# print(f'Target Vocabulary Size: {target_vocab_size}')
# print(f'Target Max Length Size:{target_length}')

# train , test = prp.get_train_test_data(input_target)
# train = np.array(train) 
# test = np.array(test)
# trainX = prp.encode_sequences(input_tokenizer, input_length, train[:, 1])
# trainY = prp.encode_sequences(target_tokenizer, target_length, train[:, 0])

# testX = prp.encode_sequences(input_tokenizer, input_length, train[:, 1])
# testY = prp.encode_sequences(target_tokenizer, target_length, train[:, 0])

  Input  Target
0   Go.    যাও।
1   Go.    যান।
2   Go.     যা।
3  Run!  পালাও!
4  Run!  পালান!
(3480,) (870,)


Training & Model Saving

In [None]:
model_obj = DesignModel(x_train,y_train,x_test,y_test,100,32)
model_obj.enc_dec_model(input_vocab_size, target_vocab_size, input_token_index, target_token_index, input_max_length,target_max_length,300)
model_obj.save()

Model: "model_15"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_25 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
input_26 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_18 (Embedding)        (None, None, 300)    858300      input_25[0][0]                   
__________________________________________________________________________________________________
embedding_19 (Embedding)        (None, None, 300)    1062900     input_26[0][0]                   
___________________________________________________________________________________________

Prediction

In [12]:
reverse_word_map = dict(map(reversed, target_tokenizer.word_index.items()))
pred = Prediction(input_vocab_size,target_vocab_size,input_length,target_length)
pred.load_model()
for seq_index in range(10):
    # Take one sequence (part of the training test)
    # for trying out decoding.
    input_seq = prp.encode_sequences(input_tokenizer, input_length, input_texts[seq_index])
    decoded_sentence = pred.decode_sequence(input_seq,reverse_word_map,target_vocab_size)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)

NameError: ignored