In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import tokenization
import collections
import ast
import time
from nltk.tokenize import TweetTokenizer

In [2]:
columns = ['character1','character2','movietitle','conversation']

movie_conver = []
with open('data/movie_conversations.txt','r') as file:
    for l in file:
        lines = l.split('+++$+++')
        line_content =  eval(lines[-1].rstrip())
        lines = [x.strip() for x in lines[:-1]]
        lines.append(line_content)
        movie_conver.append(lines)
movie_conver = pd.DataFrame(movie_conver, columns=columns)

In [3]:
columns = ['line','character_id','movietitle','character_name','text']

movie_lines = []
with open('data/movie_lines.txt','r', errors='ignore') as file:
    for l in file:
        lines = l.split('+++$+++')
        lines[-1] = lines[-1].rstrip()
        lines = [x.strip() for x in lines]
        movie_lines.append(lines)
movie_lines = pd.DataFrame(movie_lines, columns=columns)

In [4]:
assert len(movie_lines['line'].unique()) == len(movie_lines['line'])
movie_lines.set_index('line',inplace=True)

In [5]:
movie_title = movie_conver['movietitle'].unique()
select_title = movie_title[0:10]

select_movie_conver = movie_conver[np.isin(movie_conver['movietitle'], select_title)]
select_movie_lines = movie_lines[np.isin(movie_lines['movietitle'], select_title)]

In [6]:
class Example():
    
    def __init__(self,input_sentences, output_sentences):
        
        self.input_sentences = input_sentences
        self.output_sentences = output_sentences
               

In [7]:
def convert_toIndex(tokens, word_dict):

    index_features = []
    for token in tokens:
        if not token in word_dict:
            word_dict[token] = len(word_dict) + 1

        index_features.append(word_dict[token])
    return index_features

In [8]:
def truncate_sequence(text_a, text_b, max_len):
    
    while len(text_a) + len(text_b) + 3 >= max_len:
        if len(text_b) > len(text_a):
            text_b.pop()
        else:
            text_a.pop()

In [9]:
def convert_toExample(num_i, conversation, tokenizer, max_input_seq, max_output_seq, word_dict):
#     convert text into examples
    
    num_convers = len(conversation)
    for i, _ in enumerate(conversation):
        
        if i < num_convers - 1:
            num_i += 1
            
            input_texts = tokenizer.tokenize(conversation[i])
            response = tokenizer.tokenize(conversation[i+1])
#             response = tokenizer.tokenize(conversation[i+2])          
#             truncate_sequence(text_q, text_p, max_input_seq)              
#             input_texts = text_q + ['[SEQ]'] + text_p
            if len(input_texts) == 0 or len(response) == 0:
                continue

            if len (input_texts) > max_input_seq:
                input_texts = input_texts[:max_output_seq]
            
            if len (response) > max_output_seq - 2:
                response = response[:max_output_seq - 2]
                
            response = ['[START]'] + response + ['[END]']
        
            input_id = convert_toIndex(input_texts, word_dict)
            output_id = convert_toIndex(response, word_dict)
            
            if len(input_id) < max_input_seq:
                input_id.extend([0] * (max_input_seq - len(input_id)))
                
            if len(output_id) < max_output_seq:
                output_id.extend([0] * (max_output_seq - len(output_id)))
                
            assert len(input_id) == max_input_seq
            assert len(output_id) == max_output_seq
            
            if num_i % 5000 == 0:
                tf.logging.info('input text: %s' % ' '.join(input_texts))
                tf.logging.info('input id: %s' % ' '.join([str(x) for x in input_id]))
                tf.logging.info('output text: %s' % ' '.join(response))
                tf.logging.info('output id: %s' % ' '.join([ str(x) for x in output_id]))
            
            yield num_i, Example(input_id, output_id)
        

In [10]:
def build_dataSet(movie_conver, movie_lines, max_input_seq, max_output_seq, tokenizer):
    
    word_dict = collections.OrderedDict()
    num_i = 0
    
    examples = []
    for index, row in movie_conver.iterrows():
        line_index = row['conversation']
        conversation = movie_lines.loc[line_index,'text']
        
        if len(conversation) > 2:
            for num_i, example in convert_toExample(num_i, conversation, tokenizer, max_input_seq, max_output_seq, word_dict):
                examples.append(example)
    return word_dict, examples
    

In [11]:
def save_data(examples, file_path):
    writer = tf.python_io.TFRecordWriter(file_path)
    
    for example in examples:
        features = collections.OrderedDict()
        features["input_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=example.input_sentences))
        features["output_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=example.output_sentences))

        tf_example = tf.train.Example(features=tf.train.Features(feature=features))
        writer.write(tf_example.SerializeToString())
    writer.close()
    

In [12]:
# tokenizer = tokenization.FullTokenizer('data/vocab.txt')
tokenizer = TweetTokenizer()

In [13]:
max_input_seq = 20
max_output_seq = 20

word_dict, examples = build_dataSet(movie_conver, movie_lines, max_input_seq, max_output_seq, tokenizer)

INFO:tensorflow:input text: Thanks for the help .
INFO:tensorflow:input id: 2355 114 37 1091 34 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:output text: [START] After all you did on our farm ? You miss it , don't you Jesse ? [END]
INFO:tensorflow:output id: 21 378 187 33 185 140 1174 3833 6 181 3564 100 23 116 33 6141 6 35 0 0
INFO:tensorflow:input text: But , I don't understand .
INFO:tensorflow:input id: 141 23 24 116 1031 34 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:output text: [START] And , I'm asking you to trust me without understanding why . [END]
INFO:tensorflow:output id: 21 224 23 70 1224 33 92 396 184 892 2288 307 34 35 0 0 0 0 0 0
INFO:tensorflow:input text: You could be quiet .
INFO:tensorflow:input id: 181 82 93 2028 34 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:output text: [START] Hi . [END]
INFO:tensorflow:output id: 21 290 34 35 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input text: The interest is four hundred and fifty dollars a week on fifteen thousand

INFO:tensorflow:output id: 21 25116 474 9582 23 1121 473 6 2166 37 15249 348 6 35 0 0 0 0 0 0
INFO:tensorflow:input text: Not pure evil . That's correct .
INFO:tensorflow:input id: 36 2703 520 34 103 1810 34 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:output text: [START] And also that ... [END]
INFO:tensorflow:output id: 21 224 1297 120 44 35 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input text: I'd like a little more conviction ..
INFO:tensorflow:input id: 269 80 63 474 206 11171 2289 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:output text: [START] So would I . But it's not mine to give . [END]
INFO:tensorflow:output id: 21 158 741 24 34 141 58 147 1217 92 270 34 35 0 0 0 0 0 0 0
INFO:tensorflow:input text: With Hypnocyl ?
INFO:tensorflow:input id: 1092 47518 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:output text: [START] That's right . [END]
INFO:tensorflow:output id: 21 103 453 34 35 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input text: I can't believe Jim Hopper walked 

In [14]:
from sklearn.model_selection import train_test_split
train_example, val_examples = train_test_split(examples, test_size = 0.2, random_state = 3)
save_data(train_example, 'data/train')
save_data(val_examples, 'data/val')

In [23]:
del movie_conver
del movie_lines
del train_example, val_examples, examples

NameError: name 'movie_conver' is not defined

In [43]:
import json
with open('data/vocab.json', 'w') as fp:
    json.dump(word_dict, fp)

In [44]:
def load_DataSet(input_file, input_length, output_length, batch_size = 32, training = True):
    
    feature_description = {
      "input_ids": tf.FixedLenFeature([input_length], tf.int64),
      "output_ids": tf.FixedLenFeature([output_length], tf.int64),
      }
    
    def _parse_function(record):
        return tf.parse_single_example(record, feature_description)
    
    
    data = tf.data.TFRecordDataset(input_file).map(lambda x: _parse_function(x)).cache()  
    
    if training:
        data = data.shuffle(buffer_size=1000)
        
    data = data.batch(batch_size, drop_remainder=True)
    
    return data

In [45]:
def encoder(input_ids, vocab_size, embeding_size):
    
    embeding = tf.get_variable(name = 'word_embedding', shape = [vocab_size, embeding_size], dtype = tf.float32,
                              initializer=tf.contrib.layers.xavier_initializer())
        
    input_embeding = tf.nn.embedding_lookup(embeding, input_ids)
    
    rnn_layers = [tf.nn.rnn_cell.GRUCell(size) for size in [512]]

    multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)
    
    sequence_length = tf.reduce_sum(tf.cast(input_ids > 0, dtype = tf.int32), axis=-1)

    outputs, state = tf.nn.dynamic_rnn(multi_rnn_cell, input_embeding, dtype=tf.float32, sequence_length = sequence_length)
    
    return outputs, state, sequence_length
    

In [46]:
def attention(attention_unit, encoder_output, sequence_length):
    
    attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(attention_unit, memory = encoder_output,
                                                               memory_sequence_length=sequence_length)
    return attention_mechanism

In [47]:
def decoder(output_ids, encoder_state, vocab_size, reuse = None, training = True):
    
    with tf.variable_scope('decoder', reuse = reuse):

        embeding = tf.get_default_graph().get_tensor_by_name("word_embedding:0")

        input_embeding = tf.nn.embedding_lookup(embeding, output_ids)

        rnn_layers = [tf.nn.rnn_cell.GRUCell(size) for size in [512]]

        multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)
        
        attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(attention_unit, encoder_output, encoder_length)

        attention_cell = tf.contrib.seq2seq.AttentionWrapper(multi_rnn_cell, attention_mechanism, attention_layer_size = None)
        
        attention_cell = tf.contrib.rnn.OutputProjectionWrapper(
                attention_cell, vocab_size, reuse=reuse)
        
        batch_size = tf.shape(output_ids)[0]

        initial_state = attention_cell.zero_state(dtype = tf.float32, batch_size = batch_size)
        initial_state = initial_state.clone(cell_state=encoder_state)

        decoder_sequence_length = tf.reduce_sum(tf.cast(output_ids > 0, dtype = tf.int32), axis=-1)
        
        if training:
            helper = tf.contrib.seq2seq.TrainingHelper(
                        inputs = input_embeding,
                        sequence_length = decoder_sequence_length)
        else:
            helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                    embeding, start_tokens=tf.zeros([batch_size], dtype=tf.int32) + word_dict['[START]'], end_token=word_dict['[END]'])
            

        decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = attention_cell,
                helper = helper, 
                initial_state = initial_state)

        decoder_outputs, final_state, _ = tf.contrib.seq2seq.dynamic_decode(decoder)
    
    return decoder_outputs, final_state
    

In [48]:
def sequence_loss(prediction, label, sequence_length_mask, vocab_size):
    
    one_hot_label =tf.one_hot(label, depth = vocab_size)
    per_example_loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels = one_hot_label, logits = prediction)
    per_example_loss = per_example_loss * sequence_length_mask
    
    per_example_loss = tf.where(sequence_length_mask < 1, tf.stop_gradient(per_example_loss), per_example_loss)
    per_example_loss = tf.reduce_mean(per_example_loss, axis = -1)
    
    return per_example_loss


In [49]:
import json
with open('data/vocab.json', 'r') as fp:
    word_dict = json.load(fp)

In [58]:
tf.reset_default_graph()
train_file = 'data/train'
val_file = 'data/val'
input_length = 20
output_length = 20
batch_size = 32

embeding_size = 300
vocab_size = len(word_dict)
attention_unit = 32

train_data = load_DataSet(train_file, input_length, output_length, batch_size)
# val_data = load_DataSet(val_file, input_length, output_length, batch_size, training = False)

iter_ = tf.data.Iterator.from_structure(train_data.output_types, train_data.output_shapes)

train_init_op = iter_.make_initializer(train_data)
# test_init_op = iter_.make_initializer(val_data)
features = iter_.get_next()

input_ids = features['input_ids']
output_ids = features['output_ids']

encoder_output, encoder_state, encoder_length = encoder(input_ids, vocab_size, embeding_size)

# attention_mechanism = attention(attention_unit, encoder_output, encoder_length)

decoder_outputs, _ = decoder(output_ids, encoder_state, vocab_size = vocab_size)

# infer_outputs, _  = decoder(output_ids, encoder_state, vocab_size = vocab_size, reuse = True, training = False)

decoder_length = tf.cast(output_ids[:,:-1] > 0, dtype = tf.float32)

prob = tf.nn.softmax(decoder_outputs.rnn_output, axis=-1)
# prediction = infer_outputs.sample_id[:,:-1]
prediction = decoder_outputs.sample_id[:,:-1]

# per_example_loss  = tf.contrib.seq2seq.sequence_loss(decoder_outputs.rnn_output[:,:-1,:], output_ids[:,1:], 
#                                          weights = sequence_length, average_across_batch = False)

per_example_loss = sequence_loss(decoder_outputs.rnn_output[:,:-1,:], output_ids[:,1:], decoder_length, vocab_size)

loss, loss_update = tf.metrics.mean(values=per_example_loss)
total_cost = tf.reduce_mean(per_example_loss)

train_op = tf.train.AdamOptimizer().minimize(total_cost)

accuracy, accu_update = tf.metrics.accuracy(prediction, output_ids[:,1:], weights = decoder_length)

In [None]:
epoch = 50
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(epoch):
        begin = time.time()
        
        sess.run(train_init_op)
        sess.run(tf.local_variables_initializer())
        try: 
            while True:
#                 _, _, los, accu = sess.run([train_op, accu_update, loss, accuracy])
                _, _, los= sess.run([train_op, loss_update, loss])
        except tf.errors.OutOfRangeError:
            pass
        
        print('time cost for training % s ' % (time.time() - begin))
#         begin = time.time()
#         sess.run(test_init_op)
#         try: 
#             while True:
                
#                 _, accura= sess.run([accu_update, accuracy])
#         except tf.errors.OutOfRangeError:
#             pass
#         print('loss in traing set at epoch %s is %s, accuracy is %s' % (i, los, accura))
#         print('time cost for infer % s ' % (time.time() - begin))