In [10]:
import tensorflow as tf
import numpy as np
import pandas as pd
import random
from pickle import load
from pickle import dump

In [11]:
class DataPreparation():
    """Load the raw phrase corpus and turn it into a list of unique sentences.

    The corpus is a plain-text file with one entry per line; the first line
    is skipped when loading.
    """

    # Characters stripped from every sentence by ``remove_punctuations``.
    PUNCTUATORS = '#$%&*+-/<=>@[\\]^_`{|}~\t\n'

    def __init__(self, file='data/dataset.txt'):
        """Read ``file`` into ``self.content_list`` and print the first ten lines.

        Args:
            file: path of the corpus file. Defaults to the notebook's
                original hard-coded location.
        """
        with open(file,'r') as fp:
            # [1:] drops the first line of the file (presumably a header row).
            self.content_list = fp.read().splitlines()[1:]
        print(self.content_list[:10])

    def remove_punctuations(self,content):
        """Return a new list with every ``PUNCTUATORS`` character removed.

        The input list is left untouched so that callers keep their raw data.
        """
        table = str.maketrans('', '', self.PUNCTUATORS)
        return [sentence.translate(table) for sentence in content]

    def removing_sentence(self,content):
        """Drop sentences with at most 2 words or more than 100 words.

        Words are whitespace-separated. Not called by ``cleaning_data``.
        """
        return [sentence for sentence in content
                if 2 < len(sentence.split()) <= 100]

    def split_sentence(self,content):
        """Strip newlines/tabs from each entry and split it on ``'.'``.

        Returns a flat list of the resulting pieces, in input order. Empty
        pieces (e.g. after a trailing full stop) are kept.
        """
        sentences = list()
        for item in content:
            # str.replace returns a new string, so the result must be kept.
            cleaned = item.replace('\n','').replace('\t','')
            # extend() avoids rebuilding the whole list for every entry.
            sentences.extend(cleaned.split('.'))
        return sentences

    def cleaning_data(self):
        """Split, strip punctuation and de-duplicate ``self.content_list``.

        Returns:
            list[str]: unique sentences in first-seen order, so repeated runs
            over the same file give the same list.
        """
        content = self.split_sentence(self.content_list)
        print("Sentences Splited")
        content = self.remove_punctuations(content)
        print('Punctuators Removed')
        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(dict.fromkeys(content))

In [12]:
# Load the corpus; the constructor prints the first ten entries as a sanity check.
dp_obj = DataPreparation()

['Dear Sir/Madam', 'Dear', 'Hello', 'Hi', 'Hi Team', 'Good morning Team', 'Good afternoon', 'I recently read about that', 'I recently heard about that', 'Thank you for taking the time to write to us']


In [13]:
# Split into sentences, strip punctuation and de-duplicate.
# `content` is read as a notebook global by Preprocessing.__init__ below.
content = dp_obj.cleaning_data()

Sentences Splited
Punctuators Removed


In [14]:
class Preprocessing():
    """Turn cleaned sentences into (prefix, continuation) training pairs.

    Every sentence is cut at each character position: the prefix becomes the
    encoder input and the remainder the decoder text, both wrapped in
    ``<start>`` / ``<end>`` markers and tokenised by splitting on single
    spaces.
    """

    def __init__(self, content_list=None):
        """Store the sentences to process.

        Args:
            content_list: list of sentences. When omitted, the notebook-level
                ``content`` variable is used, as in the original notebook.
        """
        if content_list is None:
            content_list = content
        self.content_list = content_list

    def make_data(self):
        """Build and shuffle the list of ``[input_text, target_text]`` pairs.

        Prints ``data[10:20]`` as a sample and returns the shuffled list.
        """
        data = list()
        for sentence in self.content_list:
            # Cut after every character from the 2nd on; the final cut leaves
            # an empty continuation.
            for idx in range(1, len(sentence)):
                x = '<start> '+ sentence[:idx+1] + ' <end>'
                y = '<start> '+ sentence[idx+1:] + ' <end>'
                data.append([x,y])
        random.shuffle(data)
        print("data: ",data[10:20])
        return data

    def create_vacab(self,pairs):
        """Map every token found in ``pairs`` to an integer id.

        Tokens are obtained with ``split(' ')``, sorted, and numbered from 1;
        id 0 is reserved for ``<pad>``.
        """
        vocab = set()
        for phrase in pairs:
            vocab.update(phrase.split(' '))
        word2idx = {"<pad>": 0}
        for i,word in enumerate(sorted(vocab)):
            word2idx[word] = i + 1
        return word2idx

    def get_data(self):
        """Populate inputs/outputs, both vocabularies, their sizes and max lengths.

        Sets ``inputs``, ``outputs``, ``in_word2idx``, ``out_word2idx``,
        ``in_vocab_size``, ``out_vocab_size``, ``in_maxlen`` and
        ``out_maxlen`` on the instance.
        """
        data = self.make_data()

        self.inputs = [item[0] for item in data]
        self.outputs = [item[1] for item in data]

        self.in_word2idx = self.create_vacab(self.inputs)
        self.out_word2idx = self.create_vacab(self.outputs)

        # Use the dictionaries just built on this instance; the notebook
        # globals of the same name do not exist yet on a fresh kernel.
        self.in_vocab_size = len(self.in_word2idx)
        self.out_vocab_size = len(self.out_word2idx)

        self.in_maxlen = max(len(item.split(' ')) for item in self.inputs)
        self.out_maxlen = max(len(item.split(' ')) for item in self.outputs)

    def create_data(self,inputs,outputs,in_maxlen,out_maxlen,in_vocab_size,out_vocab_size):
        """Encode and pad the text pairs into arrays for ``model.fit``.

        ``get_data`` must have been called first so the vocabularies exist.
        ``in_vocab_size`` and ``out_vocab_size`` are accepted for interface
        compatibility but not used.

        Returns:
            (input_data, output_data, target_data): post-padded id arrays of
            width ``in_maxlen`` / ``out_maxlen``. ``target_data`` is
            ``output_data`` shifted left by one step, with a trailing axis of
            size 1.
        """
        input_data = [[self.in_word2idx[word] for word in sentence.split(' ')] for sentence in inputs]
        output_data = [[self.out_word2idx[word] for word in sentence.split(' ')] for sentence in outputs]
        input_data = tf.keras.preprocessing.sequence.pad_sequences(input_data, maxlen=in_maxlen, padding="post")
        output_data = tf.keras.preprocessing.sequence.pad_sequences(output_data, maxlen=out_maxlen, padding="post")

        # Target at step t is the decoder input at step t+1.
        target_data = [row[1:] for row in output_data]
        target_data = tf.keras.preprocessing.sequence.pad_sequences(target_data, maxlen=out_maxlen, padding="post")
        target_data = target_data.reshape((target_data.shape[0], target_data.shape[1], 1))
        return input_data, output_data,target_data

    def data_generator(self,x,y,in_maxlen,out_maxlen,in_vocab_size,out_vocab_size,batch_size):
        """Endlessly yield ``([encoder_in, decoder_in], decoder_target)`` batches.

        All arrays are int32 with ``batch_size`` rows; a short final batch is
        left zero-padded. ``decoder_target`` is one-hot over
        ``out_vocab_size`` and shifted one step ahead of ``decoder_in``.
        ``in_vocab_size`` is accepted for interface compatibility only.

        Raises:
            ValueError: if ``x`` is empty (there would be nothing to yield).
        """
        if len(x) == 0:
            raise ValueError("data_generator needs at least one sample")
        while True:
            for j in range(0, len(x), batch_size):
                # int32 is pad_sequences' default dtype, and the rows already
                # have their final width, so no padding pass is needed.
                encoder_input_data = np.zeros((batch_size, in_maxlen),dtype='int32')
                decoder_input_data = np.zeros((batch_size, out_maxlen),dtype='int32')
                decoder_target_data = np.zeros((batch_size, out_maxlen, out_vocab_size),dtype='int32')
                for i, (input_text, target_text) in enumerate(zip(x[j:j+batch_size], y[j:j+batch_size])):
                    for t, word in enumerate(input_text.split(' ')):
                        try:
                            encoder_input_data[i, t] = self.in_word2idx[word]
                        except (KeyError, IndexError) as e:
                            # Best effort: report unknown/overflowing tokens
                            # and leave the slot as padding.
                            print("Wrong word:",word)
                            print("Exception:",e)
                    # Tokenise once with the same rule the vocabulary uses, so
                    # empty tokens are counted consistently.
                    target_tokens = target_text.split(' ')
                    last = len(target_tokens) - 1
                    for t, word in enumerate(target_tokens):
                        if t < last:
                            # Decoder input is every token except the final one.
                            decoder_input_data[i, t] = self.out_word2idx[word]
                        if t > 0:
                            # Target is every token except the first one.
                            decoder_target_data[i, t - 1, self.out_word2idx[word]] = 1
                yield([encoder_input_data, decoder_input_data], decoder_target_data)
                

In [15]:
# Build the (prefix, continuation) pairs and both vocabularies.
preprocess_obj = Preprocessing()

preprocess_obj.get_data()
# Training Params
# Exposed as notebook globals because CreateModel reads the vocabulary sizes
# and the encoded arrays by these names.
in_vocab_size = preprocess_obj.in_vocab_size
out_vocab_size = preprocess_obj.out_vocab_size
in_maxlen = preprocess_obj.in_maxlen
out_maxlen = preprocess_obj.out_maxlen
in_word2idx = preprocess_obj.in_word2idx
out_word2idx = preprocess_obj.out_word2idx
x = preprocess_obj.inputs
y = preprocess_obj.outputs
print("in_maxlen: ",in_maxlen)
print("out_maxlen: ",out_maxlen)
# Encode and pad the text pairs into the arrays passed to model.fit.
encoder_data_input, decoder_data_input,decoder_data_output = preprocess_obj.create_data(x,y,in_maxlen,out_maxlen,in_vocab_size,out_vocab_size)

data:  [['<start> Despite my best  <end>', '<start> efforts <end>'], ["<start> We're glad the issu <end>", '<start> es got sorted out despite the delay <end>'], ["<start> I'm wr <end>", '<start> iting to remind you about <end>'], ['<start> If you could hav <end>', '<start> e it ready <end>'], ["<start> I didn't quite  <end>", '<start> get your point <end>'], ['<start> Could you be more sp <end>', '<start> ecific? <end>'], ['<start> If you need mo <end>', '<start> re information <end>'], ['<start> Would you min <end>', '<start> d if I took the day off <end>'], ['<start> If <end>', "<start>  so I'll book accordingly <end>"], ['<start> We brought together some of the best tutorials w <end>', '<start> hich <end>']]
in_maxlen:  21
out_maxlen:  21


In [42]:
class CreateModel():
    """Build, train and save the encoder/decoder models.

    Reads the notebook-level ``in_vocab_size``, ``out_vocab_size``,
    ``encoder_data_input``, ``decoder_data_input`` and
    ``decoder_data_output`` variables.
    """

    def build(self,epochs,batch_size,embedding_dim=300,n_units=128):
        """Create the training model, fit it, then derive the inference models.

        Args:
            epochs: number of training epochs passed to ``run``.
            batch_size: training batch size passed to ``run``.
            embedding_dim: output size of both embedding layers.
            n_units: units per direction of the bidirectional encoder LSTM;
                the decoder LSTM uses ``2 * n_units`` to match the
                concatenated encoder states.

        Sets ``self.model``, ``self.history`` (via ``run``),
        ``self.encoder_model`` and ``self.decoder_model``.
        """
        # Encoder
        encoder_inputs = tf.keras.layers.Input(shape=(None,))
        encoder_emb = tf.keras.layers.Embedding(input_dim=in_vocab_size, output_dim=embedding_dim)
        encoder_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=n_units, return_sequences=True, return_state=True))

        encoder_lstm_out, fstate_h, fstate_c, bstate_h, bstate_c = encoder_lstm(encoder_emb(encoder_inputs))
        state_h = tf.keras.layers.Concatenate()([fstate_h,bstate_h])
        # The cell state must combine the forward and backward *cell* states;
        # pairing bstate_h with bstate_c would seed the decoder incorrectly.
        state_c = tf.keras.layers.Concatenate()([fstate_c,bstate_c])
        encoder_states = [state_h, state_c]

        # Decoder
        decoder_inputs = tf.keras.layers.Input(shape=(None,))
        decoder_emb = tf.keras.layers.Embedding(input_dim=out_vocab_size, output_dim=embedding_dim)
        decoder_lstm = tf.keras.layers.LSTM(units=n_units*2, return_sequences=True, return_state=True)

        decoder_lstm_out, _, _ = decoder_lstm(decoder_emb(decoder_inputs), initial_state=encoder_states)

        # Dense head applied to each decoder step
        decoder_dense = tf.keras.models.Sequential()
        decoder_dense.add(tf.keras.layers.Dense(n_units, activation="relu"))
        decoder_dense.add(tf.keras.layers.Dropout(rate=.2))
        decoder_dense.add(tf.keras.layers.Dense(out_vocab_size, activation="softmax"))
        decoder_out = decoder_dense(decoder_lstm_out)

        self.model = tf.keras.models.Model(inputs = [encoder_inputs, decoder_inputs], outputs= decoder_out)

        # Compile and run
        self.model.compile(optimizer='adam', loss="sparse_categorical_crossentropy", metrics=['acc'])
        self.model.summary()
        self.run(epochs,batch_size)

        # Inference setup: the same layers are reused, so the weights trained
        # above are shared with these two models.
        self.encoder_model = tf.keras.models.Model(encoder_inputs, [encoder_lstm_out, state_h, state_c])

        decoder_state_input_h = tf.keras.layers.Input(shape=(n_units*2,))
        decoder_state_input_c = tf.keras.layers.Input(shape=(n_units*2,))
        decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
        decoder_outputs, decoder_h, decoder_c = decoder_lstm(decoder_emb(decoder_inputs),
                                                         initial_state=decoder_states_inputs)

        decoder_outputs = decoder_dense(decoder_outputs)

        inputs = [decoder_inputs, decoder_state_input_h, decoder_state_input_c]
        outputs = [decoder_outputs, decoder_h, decoder_c]

        self.decoder_model = tf.keras.models.Model(inputs=inputs, outputs=outputs)

    def run(self,epochs,batch_size):
        """Fit ``self.model`` on the notebook-level arrays with a 20% validation split."""
        self.history = self.model.fit([encoder_data_input, decoder_data_input], decoder_data_output,
                                      batch_size= batch_size, epochs=epochs,validation_split=0.2)

    def save_model(self,model,model_file):
        """Write ``model``'s architecture to ``<model_file>.json`` and its weights to ``<model_file>.h5``."""
        with open(model_file+'.json', 'w', encoding='utf8') as f:
            f.write(model.to_json())
        model.save_weights(model_file+'.h5')

    def save(self):
        """Save the inference encoder and decoder under ``models/``.

        NOTE(review): Prediction also loads ``dict/in_word2idx_dict.pkl`` and
        ``dict/out_word2idx_dict.pkl``; nothing visible in this notebook
        writes them -- confirm where they are produced.
        """
        encoder_model_name = 'models/Encoder_Model1'
        self.save_model(self.encoder_model,encoder_model_name)

        decoder_model_name = 'models/Decoder_Model1'
        self.save_model(self.decoder_model,decoder_model_name)
        


In [None]:
# build() compiles the model and trains it (here: 1 epoch, batch size 32)
# before wiring up the inference encoder/decoder; save() writes both to models/.
model_obj = CreateModel()
model_obj.build(1,32)
model_obj.save()

Model: "model_13"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_25 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_18 (Embedding)        (None, None, 300)    654300      input_25[0][0]                   
__________________________________________________________________________________________________
input_26 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
bidirectional_9 (Bidirectional) [(None, None, 256),  439296      embedding_18[0][0]               
___________________________________________________________________________________________

In [101]:
class Prediction():
    """Load the saved encoder/decoder and complete a partial sentence greedily."""

    def __init__(self, input_max_len=21, target_max_len=21):
        """Load models and vocabularies from disk.

        Args:
            input_max_len: length of the zero-padded encoder input vector.
            target_max_len: bound on decoding; at most ``target_max_len - 1``
                decoder steps are run.
        """
        self.input_max_len = input_max_len
        self.target_max_len = target_max_len
        self.load_model()

    def load_weights(self,model_filename, model_weights_filename):
        """Rebuild a model from its JSON architecture file and load its weights."""
        with open(model_filename, 'r', encoding='utf8') as f:
            model = tf.keras.models.model_from_json(f.read())
        model.load_weights(model_weights_filename)
        return model

    def load_model(self):
        """Load both inference models and both vocabularies, plus inverse lookups."""
        encoder_model_name = 'models/Encoder_Model1'
        decoder_model_name = 'models/Decoder_Model1'
        self.encoder_model = self.load_weights(encoder_model_name+'.json', encoder_model_name+'.h5')
        self.decoder_model = self.load_weights(decoder_model_name+'.json', decoder_model_name+'.h5')

        # SECURITY: pickle can execute arbitrary code while loading; only use
        # dictionary files produced by a trusted run of this project.
        in_word2idx_dict = 'dict/in_word2idx_dict.pkl'
        with open(in_word2idx_dict, 'rb') as fp:
            self.in_word2idx = load(fp)
        self.in_idx2word = {v:k for k,v in self.in_word2idx.items()}
        out_word2idx_dict = 'dict/out_word2idx_dict.pkl'
        with open(out_word2idx_dict, 'rb') as fp:
            self.out_word2idx = load(fp)
        self.out_idx2word = {v:k for k,v in self.out_word2idx.items()}

    def sentence_to_vector(self, sentence):
        """Encode ``sentence`` as a zero-padded vector of input-vocabulary ids.

        The text is tokenised with ``split(' ')`` and wrapped in ``<start>``
        / ``<end>``, which is how every encoder input was built for
        training. Text that already begins with ``<start>`` is used as given.

        Raises:
            KeyError: a token is not in the input vocabulary.
            IndexError: more than ``input_max_len`` tokens.
        """
        tokens = sentence.split(' ')
        if tokens[0] != '<start>':
            tokens = ['<start>'] + tokens + ['<end>']
        vec = np.zeros(self.input_max_len)
        for i, token in enumerate(tokens):
            vec[i] = self.in_word2idx[token]
        return vec

    def predict(self,input_sentence):
        """Greedily decode a continuation for ``input_sentence``.

        Returns:
            str: the predicted tokens, each preceded by a single space;
            empty when ``<end>`` is predicted first.
        """
        sv = self.sentence_to_vector(input_sentence)
        sv = sv.reshape(1,len(sv))
        # The encoder's sequence output is not needed; only its final states
        # seed the decoder.
        [emb_out, sh, sc] = self.encoder_model.predict(x=sv)

        # Feed <start> first, then feed back each predicted token id.
        cur_vec = np.zeros((1,1))
        cur_vec[0,0] = self.out_word2idx["<start>"]
        cur_word = "<start>"
        output_sentence = ""
        i = 0
        while cur_word != "<end>" and i < (self.target_max_len-1):
            i += 1
            if cur_word != "<start>":
                output_sentence = output_sentence + " " + cur_word
            [nvec, sh, sc] = self.decoder_model.predict(x=[cur_vec, sh, sc])
            next_id = np.argmax(nvec[0,0])
            cur_vec[0,0] = next_id
            cur_word = self.out_idx2word[next_id]
        return output_sentence
        
        

In [102]:
# Loads the saved encoder/decoder models and the pickled vocabularies from disk.
predict_obj = Prediction()

In [103]:
# Partial sentences (some cut mid-word) used to spot-check the completions.
test_list = [
    "I would appreci",
    "Please find the",
    "Could you ple",
    "I would be happy",
    "Please let me know",
    "For further d"
]
# Print each prompt followed by the model's predicted continuation.
for item in test_list:
    print("Result: ",item+predict_obj.predict(item))

Result:  I would appreci 
Result:  Please find the 
Result:  Could you ple 
Result:  I would be happy 
Result:  Please let me know 
Result:  For further d 
