In [0]:
import numpy as np , os
import tensorflow as tf
import collections
import unicodedata
import re
import numpy as np
import os
import io
import time

In [0]:
path_to_zip = tf.keras.utils.get_file(
    'dataset.zip', origin='http://ec2-18-232-83-49.compute-1.amazonaws.com/tst2012.zip', 
    extract=True)

path_from = os.path.dirname(path_to_zip)+"/tst2012.from"
path_to = os.path.dirname(path_to_zip)+"/tst2012.to"

In [0]:
def build_dataset(words, n_words):
    count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [31]:
text_from = io.open(path_from, encoding='UTF-8').read().strip().split('\n')
text_to = io.open(path_to, encoding='UTF-8').read().strip().split('\n')
print('len from: %d, len to: %d'%(len(text_from), len(text_to)))

len from: 100, len to: 100


In [32]:
concat_from = ' '.join(text_from).split()
vocabulary_size_from = len(list(set(concat_from)))
data_from, count_from, dictionary_from, rev_dictionary_from = build_dataset(concat_from, vocabulary_size_from)
print('vocab from size: %d'%(vocabulary_size_from))
print('Most common words', count_from[4:10])
print('Sample data', data_from[:10], [rev_dictionary_from[i] for i in data_from[:10]])

vocab from size: 722
Most common words [('.', 89), ("'", 53), ('I', 46), (',', 46), ('the', 44), ('to', 41)]
Sample data [126, 5, 24, 25, 217, 26, 20, 127, 218, 47] ['Aren', "'", 't', 'they', 'streaming', 'it', 'for', 'free', 'online', '...']


In [33]:
concat_to = ' '.join(text_to).split()
vocabulary_size_to = len(list(set(concat_to)))
data_to, count_to, dictionary_to, rev_dictionary_to = build_dataset(concat_to, vocabulary_size_to)
print('vocab to size: %d'%(vocabulary_size_to))
print('Most common words', count_to[4:10])
print('Sample data', data_to[:10], [rev_dictionary_to[i] for i in data_to[:10]])

vocab to size: 645
Most common words [('.', 89), ("'", 48), ('I', 46), ('^', 40), ('the', 38), (',', 37)]
Sample data [106, 9, 171, 28, 50, 4, 107, 108, 172, 4] ['Yes', ',', 'yes', 'they', 'are', '.', 'That', 'poor', 'bastard', '.']


In [0]:
GO = dictionary_from['GO']
PAD = dictionary_from['PAD']
EOS = dictionary_from['EOS']
UNK = dictionary_from['UNK']

In [0]:
#Defining seq2seq model
class Chatbot:
    def __init__(self, size_layer, num_layers, embedded_size,
                 from_dict_size, to_dict_size, learning_rate, batch_size):
        
        def cells(reuse=False):
            return tf.nn.rnn_cell.LSTMCell(size_layer,initializer=tf.orthogonal_initializer(),reuse=reuse)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        self.X_seq_len = tf.placeholder(tf.int32, [None])
        self.Y_seq_len = tf.placeholder(tf.int32, [None])

        with tf.variable_scope("encoder_embeddings"):        
            
            encoder_embeddings = tf.Variable(tf.random_uniform([from_dict_size, embedded_size], -1, 1))
            print(encoder_embeddings)
            encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
            print(encoder_embedded)
            main = tf.strided_slice(self.X, [0, 0], [batch_size, -1], [1, 1])
            
        with tf.variable_scope("decoder_embeddings"):        
            decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
            decoder_embeddings = tf.Variable(tf.random_uniform([to_dict_size, embedded_size], -1, 1))
            decoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, decoder_input)
        
        with tf.variable_scope("encoder"):
            rnn_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
            _, last_state = tf.nn.dynamic_rnn(rnn_cells, encoder_embedded,
                                              dtype = tf.float32)
        with tf.variable_scope("decoder"):
            rnn_cells_dec = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
            outputs, _ = tf.nn.dynamic_rnn(rnn_cells_dec, decoder_embedded, 
                                           initial_state = last_state,
                                           dtype = tf.float32)
        with tf.variable_scope("logits"):            
            self.logits = tf.layers.dense(outputs,to_dict_size)
            print(self.logits)
            masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        with tf.variable_scope("cost"):            
            self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.logits,
                                                         targets = self.Y,
                                                         weights = masks)
        with tf.variable_scope("optimizer"):            
            self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
            

In [36]:
#Hyperparameters         
            
size_layer = 128
num_layers = 2
embedded_size = 128
learning_rate = 0.001
batch_size = 32
epoch = 1

#Training
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Chatbot(size_layer, num_layers, embedded_size, vocabulary_size_from + 4, 
                vocabulary_size_to + 4, learning_rate, batch_size)

sess.run(tf.global_variables_initializer())

saver = tf.train.Saver(tf.global_variables(), max_to_keep=2)
checkpoint_dir = os.path.abspath(os.path.join('./', "checkpoints_chatbot"))
checkpoint_prefix = os.path.join(checkpoint_dir, "model")

<tf.Variable 'encoder_embeddings/Variable:0' shape=(726, 128) dtype=float32_ref>
Tensor("encoder_embeddings/embedding_lookup/Identity:0", shape=(?, ?, 128), dtype=float32)




Tensor("logits/dense/BiasAdd:0", shape=(32, ?, 649), dtype=float32)


In [37]:
!python3 -c 'import tensorflow as tf; print(tf.__version__)'

1.13.1


In [0]:
def str_idx(corpus, dic):
    X = []
    for i in corpus:
        ints = []
        for k in i.split():
            try:
                ints.append(dic[k])
            except Exception as e:
                print(e)
                ints.append(2)
        X.append(ints)
    return X

In [0]:
def pad_sentence_batch(sentence_batch, pad_int):
    padded_seqs = []
    seq_lens = []
    max_sentence_len = 50
    for sentence in sentence_batch:
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(50)
    return padded_seqs, seq_lens

def check_accuracy(logits, Y):
    acc = 0
    for i in range(logits.shape[0]):
        internal_acc = 0
        for k in range(len(Y[i])):
            if Y[i][k] == logits[i][k]:
                internal_acc += 1
        acc += (internal_acc / len(Y[i]))
    return acc / logits.shape[0]

In [40]:
X = str_idx(text_from, dictionary_from)
Y = str_idx(text_to, dictionary_to)


for i in range(epoch):
    total_loss, total_accuracy = 0, 0
    print(len(text_from))
    for k in range(0, (len(text_from) // batch_size) * batch_size, batch_size):
        batch_x, seq_x = pad_sentence_batch(X[k: k+batch_size], PAD)
        batch_y, seq_y = pad_sentence_batch(Y[k: k+batch_size], PAD)
        #print(batch_x)
        #print(type(batch_x))
        #print(np.asarray(batch_x).reshape(32,1))
        arr_x = np.zeros([32,50])
        arr_y = np.zeros([32,50])
        arr_seq_x = np.zeros([32])
        arr_seq_y = np.zeros([32])
        
        # converting list to array to pass through feed_dict 
        for s in range(32):
          for k in range(50):
            arr_x[s][k]=batch_x[s][k]
        for s in range(32):
          for k in range(50):
            arr_y[s][k]=batch_y[s][k]
        for s in range(32):
          arr_seq_x[s]=seq_x[s]
        for s in range(32):
          arr_seq_y[s]=seq_y[s]          
          
        predicted, loss, _ = sess.run([tf.argmax(model.logits,2), model.cost, model.optimizer], 
                                      feed_dict={model.X:arr_x,
                                                model.Y:arr_y,
                                                model.X_seq_len:arr_seq_x,
                                                model.Y_seq_len:arr_seq_y})
        
        total_loss += loss
#        total_accuracy += check_accuracy(predicted,batch_y)
#        print 'output:', [rev_dictionary_to[i] for i in predicted[0]]
#        print 'input:', [rev_dictionary_to[i] for i in batch_x[0]]
        
    total_loss /= (len(text_from) // batch_size)
    total_accuracy /= (len(text_from) // batch_size)
    print('epoch: %d, avg loss: %f, avg accuracy: %f'%(i+1, total_loss, total_accuracy))
    path = saver.save(sess, checkpoint_prefix, global_step=i+1)

'O'
'Knight'
100
epoch: 1, avg loss: 6.314381, avg accuracy: 0.000000


In [41]:
#Evaluation

def predict(sentence):
    X_in = []
    for word in sentence.split():
        try:
            X_in.append(dictionary_from[word])
        except:
            X_in.append(PAD)
            pass
        
    test, seq_x = pad_sentence_batch([X_in], PAD)
    input_batch = np.zeros([batch_size,seq_x[0]])
    input_batch[0] =test[0] 
        
    log = sess.run(tf.argmax(model.logits,2), 
                                      feed_dict={
                                              model.X:input_batch,
                                              model.X_seq_len:seq_x,
                                              model.Y_seq_len:seq_x
                                              }
                                      )
    
    result=' '.join(rev_dictionary_to[i] for i in log[0])
    return result
    
checkpoint_file = tf.train.latest_checkpoint(os.path.join('./', 'checkpoints_chatbot'))
saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
saver.restore(sess, checkpoint_file)
    
print (predict('how are you ?')) 

INFO:tensorflow:Restoring parameters from /content/checkpoints_chatbot/model-1
PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD
