In [18]:
import tensorflow as tf
import numpy as np
import pandas as pd
from pickle import load
from pickle import dump

Using TensorFlow backend.


In [2]:
class DataPreparation():
    def __init__(self):
        file = 'data/dataset.csv'
        data_frame = pd.read_csv(file)
        self.content_list = data_frame['content'].tolist()
    
    def remove_punctuations(self,content):
        punctuators = '#$%&*+-/<=>@[\\]^_`{|}~\t\n'
        for idx in range(len(content)):
            for punc in punctuators:
                content[idx] = content[idx].replace(punc, '') 
        return content
    
    def removing_sentence(self,content):
        drop_index_list = list()
        for idx in range(len(content)):
            words = content[idx].split()
            if len(words)<=2 or len(words)>100:
                drop_index_list.append(idx)
        content = np.array(content)
        content = np.delete(content,drop_index_list)
        content = content.tolist()
        return content
    
    def split_sentence(self,content):
        new_data_list = list()
        for item in content:
            item = item.replace('\n','')
            item = item.replace('\t','')
        for item in content:
            new_data_list = new_data_list+item.split('.')
        content = new_data_list
        return content
    
    def cleaning_data(self):
        content = self.split_sentence(self.content_list)
        print("Sentences Splited")
        content = self.removing_sentence(content)
        print('Sentences deleted')
        content = self.remove_punctuations(content)
        print('Punctuators Removed')
        content = list(set(content))
        return content

In [3]:
dp_obj = DataPreparation()

In [4]:
content = dp_obj.cleaning_data()

Sentences Splited
Sentences deleted
Punctuators Removed


In [100]:
class Preprocessing():
    def __init__(self):
        self.content_list = content
        
    def make_data(self):
        data = list()
        for sentence in self.content_list:
            for idx in range(1, len(sentence)):
                x = '<start> '+ sentence[:idx+1] + ' <end>'
                y = '<start> '+ sentence[idx+1:] + ' <end>'
                data.append([x,y])
        print("data: ",data[10:20])
        return data
    
    def create_data(self,pairs):
        vocab = set()
        word2idx = dict() 
        for phrase in pairs:
            vocab.update(phrase.split(' '))
        vocab = sorted(vocab)
        word2idx["<pad>"] = 0
        for i,word in enumerate(vocab):
            word2idx[word] = i + 1
        return word2idx,vocab
            
    def get_data(self):
        data = self.make_data()
        
        inputs = list()
        outputs = list()
        for item in data:
            inputs.append(item[0])
            outputs.append(item[1])
        
        in_word2idx,in_vocab = self.create_data(inputs)
        out_word2idx,out_vocab = self.create_data(outputs)
        
        input_data = [[in_word2idx[word] for word in sentence.split(' ')] for sentence in inputs]
        output_data = [[out_word2idx[word] for word in sentence.split(' ')] for sentence in outputs]
        
        in_maxlen = max(len(item) for item in input_data)
        out_maxlen = max(len(item) for item in output_data)
        
        input_data = tf.keras.preprocessing.sequence.pad_sequences(input_data, maxlen=in_maxlen, padding="post")
        output_data = tf.keras.preprocessing.sequence.pad_sequences(output_data, maxlen=out_maxlen, padding="post")
        
        return input_data, output_data, in_word2idx,out_word2idx, in_maxlen, out_maxlen

In [101]:
preprocess_obj = Preprocessing()
input_data, output_data, in_word2idx,out_word2idx, in_maxlen, out_maxlen = preprocess_obj.get_data()

print("in_maxlen: ",in_maxlen)
print("out_maxlen: ",out_maxlen)
target_data = [[output_data[n][i+1] for i in range(len(output_data[n])-1)] for n in range(len(output_data))]
target_data = tf.keras.preprocessing.sequence.pad_sequences(target_data, maxlen=out_maxlen, padding="post")
target_data = target_data.reshape((target_data.shape[0], target_data.shape[1], 1))

p = np.random.permutation(len(input_data))
encoder_data_input = input_data[p]
decoder_data_input = output_data[p]
decoder_data_output = target_data[p]


data:  [['<start> In other wor <end>', '<start> ds would you like us to <end>'], ['<start> In other word <end>', '<start> s would you like us to <end>'], ['<start> In other words <end>', '<start>  would you like us to <end>'], ['<start> In other words  <end>', '<start> would you like us to <end>'], ['<start> In other words w <end>', '<start> ould you like us to <end>'], ['<start> In other words wo <end>', '<start> uld you like us to <end>'], ['<start> In other words wou <end>', '<start> ld you like us to <end>'], ['<start> In other words woul <end>', '<start> d you like us to <end>'], ['<start> In other words would <end>', '<start>  you like us to <end>'], ['<start> In other words would  <end>', '<start> you like us to <end>']]
in_maxlen:  35
out_maxlen:  35


In [102]:
in_vocab_size = len(in_word2idx)
out_vocab_size = len(out_word2idx)

In [136]:
class CreateModel():
    def __init__(self):
        self.model = None
        self.history = None
                
    def build(self,epochs,batch_size):
        units = 64
        embedding_dim = 300
        #encoder
        encoder_inputs = tf.keras.layers.Input(shape=(in_maxlen,))
        encoder_emb = tf.keras.layers.Embedding(input_dim=in_vocab_size, output_dim=embedding_dim)
        encoder_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=units, 
                                                                          return_sequences=True, 
                                                                          return_state=True))
        encoder_out, fstate_h, fstate_c, bstate_h, bstate_c = encoder_lstm(encoder_emb(encoder_inputs))
        state_h = tf.keras.layers.Concatenate()([fstate_h,bstate_h])
        state_c = tf.keras.layers.Concatenate()([bstate_h,bstate_c])
        encoder_states = [state_h, state_c]
        
        #decoder       
        decoder_inputs = tf.keras.layers.Input(shape=(None,))
        decoder_emb = tf.keras.layers.Embedding(input_dim=out_vocab_size, output_dim=embedding_dim)
        decoder_lstm = tf.keras.layers.LSTM(units=units*2, return_sequences=True, return_state=True)
        decoder_lstm_out, _, _ = decoder_lstm(decoder_emb(decoder_inputs), initial_state=encoder_states)
        decoder_d1 = tf.keras.layers.Dense(units, activation="relu")
        decoder_d2 = tf.keras.layers.Dense(out_vocab_size, activation="softmax")
        decoder_out = decoder_d2(tf.keras.layers.Dropout(rate=.2)(decoder_d1(tf.keras.layers.Dropout(rate=.2)(decoder_lstm_out))))
        
        self.model = tf.keras.models.Model(inputs = [encoder_inputs, decoder_inputs], outputs= decoder_out)

        self.model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=['accuracy'])
        self.model.summary()
        
        #run
        self.run(epochs,batch_size)
        
        #eocoder setup
        self.encoder_model = tf.keras.models.Model(encoder_inputs, [encoder_out, state_h, state_c])

        # Decoder setup
        inf_decoder_inputs = tf.keras.layers.Input(shape=(None,), name="inf_decoder_inputs")
        state_input_h = tf.keras.layers.Input(shape=(units*2,), name="state_input_h")
        state_input_c = tf.keras.layers.Input(shape=(units*2,), name="state_input_c")
        decoder_res, decoder_h, decoder_c = decoder_lstm(decoder_emb(inf_decoder_inputs), 
                                                         initial_state=[state_input_h, state_input_c])
        inf_decoder_out = decoder_d2(decoder_d1(decoder_res))
        self.decoder_model = tf.keras.models.Model(inputs=[inf_decoder_inputs, state_input_h, state_input_c], 
                                                   outputs=[inf_decoder_out, decoder_h, decoder_c])

    def run(self,epochs,batch_size):
        self.history = self.model.fit([encoder_data_input,decoder_data_input],decoder_data_output,
                                      batch_size =batch_size, epochs=epochs,validation_split=0.1)
        
    def save_model(self,model,model_name):
        with open(model_name+'.json', 'w', encoding='utf8') as f:
            f.write(model.to_json())
        model.save_weights(model_name+'.h5')
            
    def save(self):
        
        encoder_model_name = 'models/Encoder_base_model-v1.0'
        self.save_model(self.encoder_model,encoder_model_name)
        
        decoder_model_name = 'models/Decoder_base_model-v1.0'
        self.save_model(self.decoder_model,decoder_model_name)
        
        in_word2idx_dict = 'dict/in_word2idx_dict.pkl'
        dump(in_word2idx, open(in_word2idx_dict, 'wb'))
        out_word2idx_dict = 'dict/out_word2idx_dict.pkl'
        dump(out_word2idx, open(out_word2idx_dict, 'wb'))

        

In [None]:
model_obj = CreateModel()
model_obj.build(30,128)
model_obj.save()

Model: "model_23"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_30 (InputLayer)           [(None, 35)]         0                                            
__________________________________________________________________________________________________
embedding_23 (Embedding)        (None, 35, 300)      980400      input_30[0][0]                   
__________________________________________________________________________________________________
input_31 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
bidirectional_11 (Bidirectional [(None, 35, 128), (N 186880      embedding_23[0][0]               
___________________________________________________________________________________________

In [130]:
class Prediction():
    def __init__(self):
        self.input_max_len = 35
        self.target_max_len = 35
        self.load_model()
        
    def load_weights(self,model_filename, model_weights_filename):
        with open(model_filename, 'r', encoding='utf8') as f:
            model = tf.keras.models.model_from_json(f.read())
        model.load_weights(model_weights_filename)
        return model
    
    def load_model(self):
        encoder_model_name = 'models/Encoder_base_model-v1.0'
        decoder_model_name = 'models/Decoder_base_model-v1.0'
        self.encoder_model = self.load_weights(encoder_model_name+'.json', encoder_model_name+'.h5')
        self.decoder_model = self.load_weights(decoder_model_name+'.json', decoder_model_name+'.h5')
        
        in_word2idx_dict = 'dict/in_word2idx_dict.pkl'
        self.in_word2idx = load(open(in_word2idx_dict, 'rb'))
        self.in_idx2word = {v:k for k,v in self.in_word2idx.items()}
        out_word2idx_dict = 'dict/out_word2idx_dict.pkl'
        self.out_word2idx =load(open(out_word2idx_dict, 'rb'))
        self.out_idx2word = {v:k for k,v in self.out_word2idx.items()}
        
    def sentence_to_vector(self, sentence):
        pre = sentence
        vec = np.zeros(self.input_max_len)
        sentence_list = [self.in_word2idx[s] for s in pre.split(' ')]
        for i,w in enumerate(sentence_list):
            vec[i] = w
        return vec

    def predict(self,input_sentence):
        sv = self.sentence_to_vector(input_sentence)
        sv = sv.reshape(1,len(sv))
        [emb_out, sh, sc] = self.encoder_model.predict(x=sv)

        i = 0
        start_vec = self.out_word2idx["<start>"]
        stop_vec = self.out_word2idx["<end>"]

        cur_vec = np.zeros((1,1))
        cur_vec[0,0] = start_vec
        cur_word = "<start>"
        output_sentence = ""
        while cur_word != "<end>" and i < (self.target_max_len-1):
            i += 1
            if cur_word != "<start>":
                output_sentence = output_sentence + " " + cur_word
            x_in = [cur_vec, sh, sc]
            [nvec, sh, sc] = self.decoder_model.predict(x=x_in)
            cur_vec[0,0] = np.argmax(nvec[0,0])
            cur_word = self.out_idx2word[np.argmax(nvec[0,0])]
        return output_sentence

In [131]:
predict_obj = Prediction()

In [135]:
print(predict_obj.predict("Thank you for ta"))

 d a vacation in June
