In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import tokenization
import collections
import ast

In [2]:
# Load the conversation index. Every record is a '+++$+++'-separated line
# whose last field is a Python list literal of line ids.
columns = ['character1', 'character2', 'movietitle', 'conversation']

movie_conver = []
with open('data/movie_conversations.txt', 'r') as file:
    for raw_record in file:
        fields = raw_record.split('+++$+++')
        # literal_eval only accepts literals, so file content can never be
        # executed as code (the previous eval() call could run anything).
        # strip() (not just rstrip()) because the field starts with a space.
        line_ids = ast.literal_eval(fields[-1].strip())
        record = [field.strip() for field in fields[:-1]]
        record.append(line_ids)
        movie_conver.append(record)
movie_conver = pd.DataFrame(movie_conver, columns=columns)

In [3]:
# Load the utterance table: one '+++$+++'-separated record per line, with
# undecodable bytes dropped by errors='ignore'.
columns = ['line','character_id','movietitle','character_name','text']

with open('data/movie_lines.txt','r', errors='ignore') as file:
    # Every field is whitespace-stripped, including the trailing newline
    # on the last one.
    movie_lines = [
        [field.strip() for field in record.split('+++$+++')]
        for record in file
    ]
movie_lines = pd.DataFrame(movie_lines, columns=columns)

In [4]:
# Line ids become the index, so they must not repeat.
assert movie_lines['line'].is_unique
movie_lines.set_index(keys='line', inplace=True)

In [5]:
class Example(object):
    """Container pairing one input id sequence with its output id sequence."""

    def __init__(self, input_sentences, output_sentences):
        # Both values are stored as given, without copying or validation.
        self.input_sentences, self.output_sentences = input_sentences, output_sentences
               

In [6]:
def convert_toIndex(tokens, word_dict):
    """Map tokens to integer ids, growing ``word_dict`` in place.

    A token that is not yet in ``word_dict`` is assigned
    ``len(word_dict) + 1`` at the moment it is first seen.

    Args:
        tokens: iterable of hashable tokens.
        word_dict: mutable token -> id mapping; updated as a side effect.

    Returns:
        List with the id of every token, in input order.
    """
    # setdefault evaluates the candidate id before inserting, so a new token
    # gets exactly the id the explicit membership test would have produced.
    return [word_dict.setdefault(tok, len(word_dict) + 1) for tok in tokens]

In [7]:
def truncate_sequence(text_a, text_b, max_len):
    """Trim two token lists in place until they fit within ``max_len``.

    Tokens are popped from the end of the longer list (from ``text_a`` on a
    tie) until ``len(text_a) + len(text_b) + 3 < max_len``. The three spare
    positions are a fixed headroom; the code does not say what they are
    reserved for.

    Returns:
        None. Both lists are modified in place.
    """
    while True:
        if len(text_a) + len(text_b) + 3 < max_len:
            return
        victim = text_b if len(text_b) > len(text_a) else text_a
        victim.pop()

In [8]:
def convert_toExample(num_i, conversation, tokenizer, max_input_seq, max_output_seq, word_dict):
    """Yield padded training examples from sliding windows of three utterances.

    For every position ``i`` with two following utterances, utterances ``i``
    and ``i + 1`` form the input (joined by a ``[SEQ]`` token) and utterance
    ``i + 2`` is the response (wrapped in ``[START]`` / ``[END]``).

    Args:
        num_i: running example counter; incremented once per yielded example.
        conversation: ordered utterance strings (a list, or a pandas Series
            whose values are in conversation order).
        tokenizer: object exposing ``tokenize(text) -> list of tokens``.
        max_input_seq: fixed length of the padded input id list.
        max_output_seq: fixed length of the padded output id list.
        word_dict: token -> id mapping, extended in place by convert_toIndex.

    Yields:
        ``(num_i, Example)`` where both id lists are zero-padded to their
        fixed lengths.
    """
    # Materialise once: integer indexing on a label-indexed pandas Series
    # depends on a deprecated positional fallback, while a plain list is
    # always positional.
    utterances = list(conversation)

    # range() is empty for fewer than three utterances, so no guard is needed.
    for i in range(len(utterances) - 2):
        num_i += 1

        text_q = tokenizer.tokenize(utterances[i])
        text_p = tokenizer.tokenize(utterances[i + 1])
        response = tokenizer.tokenize(utterances[i + 2])

        # Trims both lists in place so that, with the separator, they fit.
        truncate_sequence(text_q, text_p, max_input_seq)

        input_texts = text_q + ['[SEQ]'] + text_p

        # Leave room for the [START] and [END] markers.
        response = ['[START]'] + response[:max_output_seq - 2] + ['[END]']

        input_id = convert_toIndex(input_texts, word_dict)
        output_id = convert_toIndex(response, word_dict)

        # Id 0 is never assigned to a token, so it serves as padding.
        # A non-positive repeat count yields an empty list.
        input_id.extend([0] * (max_input_seq - len(input_id)))
        output_id.extend([0] * (max_output_seq - len(output_id)))

        assert len(input_id) == max_input_seq
        assert len(output_id) == max_output_seq

        # Log a sample every 5000 examples for a visual sanity check.
        if num_i % 5000 == 0:
            tf.logging.info('input text: %s' % ' '.join(input_texts))
            tf.logging.info('input id: %s' % ' '.join([str(x) for x in input_id]))
            tf.logging.info('output text: %s' % ' '.join(response))
            tf.logging.info('output id: %s' % ' '.join([ str(x) for x in output_id]))

        yield num_i, Example(input_id, output_id)
        

In [9]:
def build_dataSet(movie_conver, movie_lines, max_input_seq, max_output_seq, tokenizer,
                  output_file='data/record'):
    """Serialise every conversation window to a TFRecord file.

    Args:
        movie_conver: DataFrame with a ``conversation`` column holding lists
            of line ids.
        movie_lines: DataFrame indexed by line id with a ``text`` column.
        max_input_seq: fixed length of the padded input id list.
        max_output_seq: fixed length of the padded output id list.
        tokenizer: object exposing ``tokenize(text)``.
        output_file: path of the TFRecord file to write.

    Returns:
        The ordered token -> id mapping built while converting the examples.
    """
    word_dict = collections.OrderedDict()
    num_i = 0

    writer = tf.python_io.TFRecordWriter(output_file)
    try:
        # Only the conversation column is needed, so iterate it directly
        # instead of building a Series per row with iterrows().
        for line_index in movie_conver['conversation']:
            conversation = movie_lines.loc[line_index, 'text']

            # A window needs two context utterances plus one response.
            if len(conversation) <= 2:
                continue

            for num_i, example in convert_toExample(num_i, conversation, tokenizer,
                                                    max_input_seq, max_output_seq, word_dict):
                features = collections.OrderedDict()
                features["input_ids"] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=example.input_sentences))
                features["output_ids"] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=example.output_sentences))

                tf_example = tf.train.Example(features=tf.train.Features(feature=features))
                writer.write(tf_example.SerializeToString())
    finally:
        # Close the writer even when tokenisation or lookup fails mid-way,
        # so the file handle is not leaked.
        writer.close()
    return word_dict
    

In [10]:
# Build the tokenizer from the vocabulary file. The logged samples show '##'
# sub-word pieces, so this is presumably a WordPiece tokenizer; confirm
# against the `tokenization` module.
tokenizer = tokenization.FullTokenizer('data/vocab.txt')

In [11]:
# Fixed lengths of the zero-padded input and output id sequences.
max_input_seq = 50
max_output_seq = 25

# Writes the TFRecord file as a side effect and returns the token -> id mapping.
word_dict = build_dataSet(movie_conver, movie_lines, max_input_seq, max_output_seq, tokenizer)

INFO:tensorflow:input text: what do you think ' s gonna go on at the guys ' party ? [SEQ] they ' ll probably get drunk , and watch dirty movies . but don ' t worry about the dirty movies .
INFO:tensorflow:input id: 208 205 42 245 33 40 238 151 157 78 46 278 33 193 6 28 474 33 298 1599 93 751 30 13 1677 3606 712 43 152 130 33 69 1468 221 46 3606 712 43 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:output text: [START] what do you mean ? [END]
INFO:tensorflow:output id: 44 208 205 42 254 6 53 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input text: the indian rope - trick . [SEQ] look , now i ' m pumping you . i ' m sorry . it ' s none of my business . it ' s just that you ' re not like the others .
INFO:tensorflow:input id: 46 2328 8526 26 3735 43 28 642 30 375 31 33 77 7805 42 43 31 33 77 951 43 65 33 40 2809 80 66 1917 43 65 33 40 111 39 42 33 126 45 91 46 1169 43 0 0 0 0 0 0 0 0
INFO:tensorflow:output text: [START] not like matt , you mean . [END]
INFO:tensorflow:output id: 44 45 91

INFO:tensorflow:output id: 44 31 33 77 45 3835 43 53 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input text: jesus . . . jesus ! [SEQ] oh , god , i don ' t even know what i want .
INFO:tensorflow:input id: 1098 43 43 43 1098 217 28 311 30 216 30 31 130 33 69 340 132 208 31 131 43 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:output text: [START] oh . . . [END]
INFO:tensorflow:output id: 44 311 43 43 43 53 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input text: marc ##ee , things are changing around here . you and rod will have my total personal attention . [SEQ] damn right , and you can start by taking rod ' s poster and putting it where people can see it !
INFO:tensorflow:input id: 10872 5643 30 136 16 4036 1131 247 43 42 13 458 411 70 66 373 2437 787 43 28 1345 124 30 13 42 1 35 506 765 458 33 40 10935 13 1517 65 137 410 1 125 65 217 0 0 0 0 0 0 0 0
INFO:tensorflow:output text: [START] damn right . [END]
INFO:tensorflow:output id: 44 1345 1

In [12]:
# Release the raw DataFrames now that the dataset has been written.
del movie_conver
del movie_lines

In [13]:
# Persist the token -> id mapping so a later cell can reload it from disk.
import json
with open('data/vocab.json', 'w') as fp:
    json.dump(word_dict, fp)

In [33]:
def load_DataSet(input_file, input_length, output_length, batch_size = 32):
    """Build a shuffled, batched tf.data pipeline over the TFRecord file.

    Args:
        input_file: path (or list of paths) of TFRecord files to read.
        input_length: number of ids stored under ``input_ids`` per record.
        output_length: number of ids stored under ``output_ids`` per record.
        batch_size: number of examples per batch.

    Returns:
        A ``tf.data.Dataset`` yielding dicts with int64 tensors ``input_ids``
        of shape [batch, input_length] and ``output_ids`` of shape
        [batch, output_length].
    """
    feature_description = {
      "input_ids": tf.FixedLenFeature([input_length], tf.int64),
      "output_ids": tf.FixedLenFeature([output_length], tf.int64),
      }

    def _parse_function(record):
        return tf.parse_single_example(record, feature_description)

    data = tf.data.TFRecordDataset(input_file)
    data = data.map(_parse_function)

    # Cache the parsed examples BEFORE shuffling and batching. Caching after
    # shuffle/batch would freeze the first epoch's order and batch
    # composition, so later epochs would never be reshuffled.
    data = data.cache()
    data = data.shuffle(buffer_size=1000).batch(batch_size)

    return data

In [34]:
def encoder(input_ids, vocab_size, embeding_size):
    """Embed the input ids and run them through a single-layer GRU encoder.

    Creates the ``word_embedding`` variable in the current variable scope.

    Args:
        input_ids: integer id tensor; positions with id 0 are treated as
            padding when computing lengths.
        vocab_size: number of rows of the embedding table.
        embeding_size: width of each embedding vector.

    Returns:
        ``(outputs, state, sequence_length)`` from ``tf.nn.dynamic_rnn``,
        where ``sequence_length`` is the per-example count of ids > 0.
    """
    embedding_table = tf.get_variable(
        name='word_embedding',
        shape=[vocab_size, embeding_size],
        dtype=tf.float32,
        initializer=tf.contrib.layers.xavier_initializer())
    embedded_inputs = tf.nn.embedding_lookup(embedding_table, input_ids)

    # One GRU layer of 128 units, wrapped so the state is a 1-tuple.
    stacked_cell = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.GRUCell(128)])

    # Number of non-padding ids per example.
    is_token = tf.cast(input_ids > 0, dtype=tf.int32)
    sequence_length = tf.reduce_sum(is_token, axis=-1)

    outputs, state = tf.nn.dynamic_rnn(
        stacked_cell,
        embedded_inputs,
        sequence_length=sequence_length,
        dtype=tf.float32)

    return outputs, state, sequence_length
    

In [35]:
def attention(attention_unit, encoder_output, sequence_length):
    """Create a Bahdanau attention mechanism over the encoder outputs.

    Args:
        attention_unit: size passed as the first argument of
            ``BahdanauAttention``.
        encoder_output: tensor used as the attention memory.
        sequence_length: per-example memory lengths.

    Returns:
        The ``tf.contrib.seq2seq.BahdanauAttention`` instance.
    """
    return tf.contrib.seq2seq.BahdanauAttention(
        attention_unit,
        memory=encoder_output,
        memory_sequence_length=sequence_length)

In [42]:
def decoder(output_ids, attention_mechanism, encoder_state, attention_layer_size, reuse = None, training = True,
            maximum_iterations = None):
    """Build the attention GRU decoder for training or greedy inference.

    Reads the embedding table from the default graph by the tensor name
    ``word_embedding:0``. In inference mode it also reads the module-level
    ``word_dict`` for the ``[START]`` / ``[END]`` ids.

    Args:
        output_ids: integer target id tensor; the last position is dropped
            to form the teacher-forcing inputs.
        attention_mechanism: attention mechanism over the encoder memory.
        encoder_state: encoder final state, used as the initial cell state.
        attention_layer_size: size of the AttentionWrapper output layer.
        reuse: passed to the ``decoder`` variable scope.
        training: True for teacher forcing, False for greedy decoding.
        maximum_iterations: optional cap on decoding steps. When None in
            inference mode it defaults to the decoder-input length, because
            greedy decoding has no other stopping guarantee if ``[END]`` is
            never produced.

    Returns:
        ``(decoder_outputs, final_state)`` from ``dynamic_decode``.
    """
    with tf.variable_scope('decoder', reuse = reuse):

        # Teacher-forcing inputs: everything except the last position.
        output_ids = output_ids[:, :-1]

        embeding = tf.get_default_graph().get_tensor_by_name("word_embedding:0")

        input_embeding = tf.nn.embedding_lookup(embeding, output_ids)

        rnn_layers = [tf.nn.rnn_cell.GRUCell(size) for size in [128]]

        multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)

        attention_cell = tf.contrib.seq2seq.AttentionWrapper(multi_rnn_cell, attention_mechanism, attention_layer_size = attention_layer_size)

        batch_size = tf.shape(output_ids)[0]

        # Start from a zero attention state, then seed the cell with the
        # encoder's final state.
        initial_state = attention_cell.zero_state(dtype = tf.float32, batch_size = batch_size)
        initial_state = initial_state.clone(cell_state=encoder_state)

        # Number of non-padding decoder inputs per example.
        decoder_sequence_length = tf.reduce_sum(tf.cast(output_ids > 0, dtype = tf.int32), axis=-1)

        if training:
            helper = tf.contrib.seq2seq.TrainingHelper(
                        inputs = input_embeding, # decoder inputs
                        sequence_length = decoder_sequence_length, # decoder input length
                        name = "decoder_training_helper")
        else:
            helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                    embeding, start_tokens=tf.zeros([batch_size], dtype=tf.int32) + word_dict['[START]'], end_token=word_dict['[END]'])
            if maximum_iterations is None:
                # Bound free-running decoding by the target length.
                maximum_iterations = tf.shape(output_ids)[1]

        basic_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = attention_cell,
                helper = helper,
                initial_state = initial_state)

        decoder_outputs, final_state, _ = tf.contrib.seq2seq.dynamic_decode(
                basic_decoder, maximum_iterations = maximum_iterations)

    return decoder_outputs, final_state
    

In [48]:
# Reload the token -> id mapping saved earlier, so the model cells can run
# without rebuilding the dataset.
import json
with open('data/vocab.json', 'r') as fp:
    word_dict = json.load(fp)

In [49]:
# Start from an empty graph so re-running the model cells does not collide
# with variables created by a previous run.
tf.reset_default_graph()
input_file = 'data/record'
# Must match the max_input_seq / max_output_seq used when the records were
# written (50 / 25), because the features are parsed as fixed-length.
input_length = 50
output_length = 25
batch_size = 64


data = load_DataSet(input_file, input_length, output_length, batch_size)

In [50]:
# Initializable iterator: the training loop re-runs iterator.initializer at
# the start of each epoch.
iterator = data.make_initializable_iterator()
features = iterator.get_next()

# Feature keys match the ones written into the TFRecord file.
input_ids = features['input_ids']
output_ids = features['output_ids']

In [51]:
embeding_size = 300
# Token ids are assigned from 1 upwards (0 is the padding id), so the largest
# id equals len(word_dict). The embedding table and the logits therefore need
# len(word_dict) + 1 entries; without the +1 the last token is out of range.
vocab_size = len(word_dict) + 1
attention_unit = 32

encoder_output, encoder_state, sequence_length = encoder(input_ids, vocab_size, embeding_size)

attention_mechanism = attention(attention_unit, encoder_output, sequence_length)

# The attention layer doubles as the output projection, hence its size is
# the vocabulary size.
decoder_outputs, final_state = decoder(output_ids, attention_mechanism, encoder_state, attention_layer_size = vocab_size)

infer_outputs, _  = decoder(output_ids, attention_mechanism, encoder_state,
                                     attention_layer_size = vocab_size, reuse = True, training = False)

# Targets are the decoder inputs shifted left by one; padding gets weight 0.
targets = output_ids[:, 1:]
sequence_length = tf.cast(targets > 0, dtype = tf.float32)

# dynamic_decode stops once every sequence in the batch has finished, so the
# time dimension of its outputs can be shorter than the fixed target length.
# Clip targets and weights to the logits' length; the clipped positions are
# all padding, so the loss value is unaffected.
logits = decoder_outputs.rnn_output
train_steps = tf.shape(logits)[1]

prob = tf.nn.softmax(logits, dim=-1)

# Bring the greedy predictions to exactly the target length: clip anything
# longer, zero-pad anything shorter (0 never matches a real token id).
max_steps = tf.shape(targets)[1]
prediction = infer_outputs.sample_id[:, :max_steps]
prediction = tf.pad(prediction, [[0, 0], [0, max_steps - tf.shape(prediction)[1]]])

loss  = tf.contrib.seq2seq.sequence_loss(logits, targets[:, :train_steps],
                                         weights = sequence_length[:, :train_steps])
train_op = tf.train.AdamOptimizer().minimize(loss)

accuracy, accu_update = tf.metrics.accuracy(prediction, targets, weights = sequence_length)

In [None]:
epoch = 30
log_every = 500  # steps between progress lines; avoids one print per batch

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(epoch):
        # Local variables hold the streaming-accuracy counters; resetting
        # them makes the reported accuracy per-epoch.
        sess.run(tf.local_variables_initializer())
        sess.run(iterator.initializer)

        # Defined up front so the summary below works for an empty dataset.
        los, accu, step = None, None, 0
        try:
            while True:
                # accu_update returns the metric after folding in this batch.
                # Fetching `accuracy` in the same run has no ordering
                # guarantee relative to the update.
                _, accu, los = sess.run([train_op, accu_update, loss])
                step += 1
                if step % log_every == 0:
                    print('epoch %d step %d: loss=%s accuracy=%s' % (i + 1, step, los, accu))
        except tf.errors.OutOfRangeError:
            # Raised by the iterator when the epoch's data is exhausted.
            pass

        print('epoch %d done (%d steps): loss=%s accuracy=%s' % (i + 1, step, los, accu))