In [1]:
import tensorflow as tf
import numpy as np
import re
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '2'

# 讀檔

In [2]:
# load lines dictionary 
lines = open('chatbot/movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')

# load conversations
convs_lines = open('chatbot/movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')

print('\n'.join(lines[:3]))
print()
print('\n'.join(convs_lines[:3]))
print()
print("Number of lines: ", len(lines))

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.

u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']

Number of lines:  304714


In [3]:
# build a id-line dictionary
id2line = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        id2line[_line[0]] = _line[4]

print(id2line['L1045'])

They do not!


In [4]:
# build a list of all conversations
convs = []
for line in convs_lines:
    _line = line.split(' +++$+++ ')[-1][1:-1].replace("'","").replace(" ","")
    convs.append(_line.split(','))

print(convs[0])

['L194', 'L195', 'L196', 'L197']


# 將兩兩連續的對話組成Q-A Pair 

In [5]:
# make Q-A pair
questions = []
answers = []

for conv in convs:
    for i in range(len(conv)-1):
        questions.append(id2line[conv[i]])
        answers.append(id2line[conv[i+1]])
        
for i in range(1, 3):
    print("Q: ", questions[i])
    print("A: ", answers[i])
    print()

print("Number of questions: ", len(questions))
print("Number of answers: ", len(answers))

Q:  Well, I thought we'd start with pronunciation, if that's okay with you.
A:  Not the hacking and gagging and spitting part.  Please.

Q:  Not the hacking and gagging and spitting part.  Please.
A:  Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?

Number of questions:  221616
Number of answers:  221616


# Word Embedding
原先有嘗試使用spacy來做word embedding，後來因為出現型別轉換的問題，而直接採用 word-to-index

In [6]:
#import spacy

#nlp = spacy.load('en_core_web_md-2.0.0')

# word_apple = nlp('apple')
# word_banana = nlp('banana')
# word_mac = nlp('mac')

# print('%s vs %s: %.6f'%(word_apple, word_banana, word_apple.similarity(word_banana)))
# print('%s vs %s: %.6f'%(word_apple, word_mac, word_apple.similarity(word_mac)))
# print('%s vs %s: %.6f'%(word_banana, word_mac, word_banana.similarity(word_mac)))

# Preprocessing

1. 全部轉成小寫
2. 濾掉標點符號 (,.!?.'#@ 等等)
3. 還原連音字 (I'm -> I am, we'd -> we would)
4. 只保留長度在2~20之間的對話
5. 每句話前面加上SOS、後面加上EOS
6. 頻率過低的字用UNK取代
7. word2index, index2word

In [7]:
def filter_line(line):
    
    line = line.lower()
    line = re.sub(r"\'d", " would", line)
    line = re.sub(r"i'm", "i am", line)
    line = re.sub(r"he's", "he is", line)
    line = re.sub(r"she's", "she is", line)
    line = re.sub(r"it's", "it is", line)
    line = re.sub(r"that's", "that is", line)
    line = re.sub(r"what's", "what is", line)
    line = re.sub(r"where's", "where is", line)
    line = re.sub(r"how's", "how is", line)
    line = re.sub(r"\'re", " are", line)
    line = re.sub(r"\'ll", " will", line)
    line = re.sub(r"\'d", " would", line)
    line = re.sub(r"n't", " not", line)
    line = re.sub(r"'til", "until", line)
    line = re.sub(r"\'ve", " have", line)
    line = re.sub(r"won't", "will not", line)
    line = re.sub(r"can't", "can not", line)
    line = re.sub(r"'bout", "about", line)
    line = re.sub(r"n'", "and", line)
    line = re.sub(r"[-()\"#/@$;:<>{}`+=~|.!?,]", "", line)
    
    return line

In [8]:
# filter out punctuation
questions_filtered = []
for question in questions:
    questions_filtered.append(filter_line(question))
    
answers_filtered = []
for answer in answers:
    answers_filtered.append(filter_line(answer))

In [9]:
print(questions[5])
print(questions_filtered[5])
print()
print(questions[10])
print(questions_filtered[10])
print()
print(answers[300])
print(answers_filtered[300])
print()
print(answers[500])
print(answers_filtered[500])
print()
print("Number of questions: ", len(questions_filtered))
print("Number of answers: ", len(answers_filtered))

Cameron.
cameron

C'esc ma tete. This is my head
c'esc ma tete this is my head

See that?  Who needs affection when I've got blind hatred?
see that  who needs affection when i have got blind hatred

I don't want you to wait for me.
i do not want you to wait for me

Number of questions:  221616
Number of answers:  221616


In [10]:
#filter out string with length <3 or >20
questions_shorten = []
answers_shorten = []
for question, answer in zip(questions_filtered, answers_filtered):
    q_len = len(question.split())
    a_len = len(answer.split())
    if q_len >= 2 and q_len <= 20 and a_len >= 2 and a_len <= 20:
        questions_shorten.append(question)
        answers_shorten.append(answer)

In [11]:
print("Number of questions: ", len(questions_shorten))
print("Number of questions: ", len(answers_shorten))

Number of questions:  138164
Number of questions:  138164


In [12]:
vocab = {}

for question, answer in zip(questions_shorten, answers_shorten):
    for q in question.split():
        if q not in vocab:
            vocab[q] = 1
        else:
            vocab[q] += 1
            
    for a in answer.split():
        if a not in vocab:
            vocab[a] = 1
        else:
            vocab[a] += 1

In [13]:
print(len(vocab))
print(vocab['hello'])

45911
701


In [14]:
# create word embedding
word2index = {'<SOS>': 0, '<EOS>': 1, '<UNK>': 2, '<PAD>':3}
index2word = {0: '<SOS>', 1: '<EOS>', 2: '<UNK>', 3: '<PAD>'}

for k, v in vocab.items():
    if v >= 10:
        word2index[k] = len(index2word)
        index2word[len(index2word)] = k

In [15]:
vocab_size = len(word2index)
print(len(word2index))
print(len(index2word))

8123
8123


In [16]:
print(index2word[0])
print(index2word[4])

<SOS>
well


In [17]:
# replace words that are not in the Vocab with <UNK>
# add <SOS> and <EOS>

Questions = []
Answers = []

for question, answer in zip(questions_shorten, answers_shorten):
    flag = True
    tmp_q = []
    tmp_a = []
    tmp_q.append(word2index['<SOS>'])
    tmp_a.append(word2index['<SOS>'])

    for q in question.split():
        if q not in word2index:
            tmp_q.append(word2index['<UNK>'])
        else:
            tmp_q.append(word2index[q])
            
    for a in answer.split():
        if a not in word2index:
            tmp_a.append(word2index['<UNK>'])
        else:
            tmp_a.append(word2index[a])
               
    tmp_q.append(word2index['<EOS>'])
    tmp_a.append(word2index['<EOS>'])

    Questions.append(tmp_q)
    Answers.append(tmp_a)

In [18]:
print(len(Questions))
print(len(Answers))

138164
138164


In [19]:
# caculate max length
q_max_len = 0
a_max_len = 0

for i in range(len(Questions)):
    q_max_len = max(q_max_len, len(Questions[i]))
    a_max_len = max(a_max_len, len(Answers[i]))

print(q_max_len, a_max_len)

22 22


In [20]:
class BatchGenerator:
    def __init__(self, en_corpus, ch_corpus, en_pad, ch_pad, en_max_len, ch_max_len, batch_size):
        assert len(en_corpus) == len(ch_corpus)
        
        batch_num = len(en_corpus)//batch_size
        n = batch_num*batch_size
        
        self.xs = [np.zeros(n, dtype=np.int32) for _ in range(en_max_len)] # encoder inputs
        self.ys = [np.zeros(n, dtype=np.int32) for _ in range(ch_max_len)] # decoder inputs
        self.gs = [np.zeros(n, dtype=np.int32) for _ in range(ch_max_len)] # decoder outputs
        self.ws = [np.zeros(n, dtype=np.float32) for _ in range(ch_max_len)] # decoder weight for loss caculation
        
        self.en_max_len = en_max_len
        self.ch_max_len = ch_max_len
        self.batch_size = batch_size
        
        for b in range(batch_num):
            for i in range(b*batch_size, (b+1)*batch_size):
                for j in range(len(en_corpus[i])-2):
                    self.xs[j][i] = en_corpus[i][j+1]
                for j in range(j+1, en_max_len):
                    self.xs[j][i] = en_pad
                
                for j in range(len(ch_corpus[i])-1):
                    self.ys[j][i] = ch_corpus[i][j]
                    self.gs[j][i] = ch_corpus[i][j+1]
                    self.ws[j][i] = 1.0
                for j in range(j+1, ch_max_len): # don't forget padding and let loss weight zero
                    self.ys[j][i] = ch_pad
                    self.gs[j][i] = ch_pad
                    self.ws[j][i] = 0.0
    
    def get(self, batch_id):
        x = [self.xs[i][batch_id*self.batch_size:(batch_id+1)*self.batch_size] for i in range(self.en_max_len)]
        y = [self.ys[i][batch_id*self.batch_size:(batch_id+1)*self.batch_size] for i in range(self.ch_max_len)]
        g = [self.gs[i][batch_id*self.batch_size:(batch_id+1)*self.batch_size] for i in range(self.ch_max_len)]
        w = [self.ws[i][batch_id*self.batch_size:(batch_id+1)*self.batch_size] for i in range(self.ch_max_len)]
        
        return x, y, g, w


batch = BatchGenerator(Questions, Answers, word2index['<PAD>'], word2index['<PAD>'], q_max_len, a_max_len, 4)

In [21]:
class seq2seq:
    def __init__(self, en_max_len, ch_max_len, en_size, ch_size):
        self.en_max_len = en_max_len
        self.ch_max_len = ch_max_len
        
        with tf.variable_scope('seq2seq_intput/output'):
            self.enc_inputs = [tf.placeholder(tf.int32, [None]) for i in range(en_max_len)] # time mojor feed
            self.dec_inputs = [tf.placeholder(tf.int32, [None]) for i in range(ch_max_len)]
            self.groundtruths = [tf.placeholder(tf.int32, [None]) for i in range(ch_max_len)]
            self.weights = [tf.placeholder(tf.float32, [None]) for i in range(ch_max_len)]
            
        with tf.variable_scope('seq2seq_rnn'): # training by teacher forcing
            self.out_cell = tf.contrib.rnn.LSTMCell(512)
            self.outputs, _ = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(self.enc_inputs, self.dec_inputs, 
                                                                                    self.out_cell, 
                                                                                    en_size, ch_size, 300)
        with tf.variable_scope('seq2seq_rnn', reuse=True): # predict by feeding previous
            self.pred_cell = tf.contrib.rnn.LSTMCell(512, reuse=True) # reuse cell for train and test
            self.predictions, _ = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(self.enc_inputs, self.dec_inputs, 
                                                                                        self.pred_cell, 
                                                                                        en_size, ch_size, 300, 
                                                                                        feed_previous=True)
        
        with tf.variable_scope('loss'):
            # caculate weighted loss
            self.loss = tf.reduce_mean(tf.contrib.legacy_seq2seq.sequence_loss_by_example(self.outputs, 
                                                                                          self.groundtruths, 
                                                                                          self.weights))
            self.optimizer = tf.train.AdamOptimizer(0.002).minimize(self.loss)
        
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())
    
    def train(self, x, y, g, w):
        fd = {}
        for i in range(self.en_max_len):
            fd[self.enc_inputs[i]] = x[i] # show how to feed a list
        
        for i in range(self.ch_max_len):
            fd[self.dec_inputs[i]] = y[i]
            fd[self.groundtruths[i]] = g[i]
            fd[self.weights[i]] = w[i]
        
        loss, _ = self.sess.run([self.loss, self.optimizer], fd)
        
        return loss

    def output(self, x, y):
        fd = {}
        for i in range(self.en_max_len):
            fd[self.enc_inputs[i]] = x[i]
        
        for i in range(self.ch_max_len):
            fd[self.dec_inputs[i]] = y[i]
        
        out = self.sess.run(self.outputs, fd)
        
        return out
    
    def predict(self, x, ch_beg):
        fd = {}
        for i in range(self.en_max_len):
            fd[self.enc_inputs[i]] = x[i]
        
        for i in range(self.ch_max_len): # when feed previous, the fist token should be '<BEG>', and others are useless
            if i==0:
                fd[self.dec_inputs[i]] = np.ones(y[i].shape, dtype=np.int32)*ch_beg
            else:
                fd[self.dec_inputs[i]] = np.zeros(y[i].shape, dtype=np.int32)
        
        pd = self.sess.run(self.predictions, fd)
        
        return pd
    
    def save(self, e):
        self.saver.save(self.sess, 'model/seq2seq_%d.ckpt'%(e+1))
    
    def restore(self, e):
        self.saver.restore(self.sess, 'model/seq2seq_%d.ckpt'%(e))

In [22]:
tf.reset_default_graph()
model = seq2seq(q_max_len, a_max_len, len(word2index), len(index2word))

In [23]:
EPOCHS = 50
BATCH_SIZE = 128
batch_num = len(Questions) // BATCH_SIZE

batch = BatchGenerator(Questions, Answers, word2index['<PAD>'], word2index['<PAD>'], q_max_len, a_max_len, BATCH_SIZE)

In [None]:
rec_loss = []
for e in range(EPOCHS):
    train_loss = 0

    for b in range(batch_num):
        if b % 100 == 1: 
            print('Batch: %d loss: %f' % (b, batch_loss))
        x, y, g, w = batch.get(b)
        batch_loss = model.train(x, y, g, w)
        train_loss += batch_loss

    train_loss /= batch_num
    rec_loss.append(train_loss)
    print("epoch %d loss: %f" % (e, train_loss))
    model.save(e)

np.save('./model/rec_loss.npy', rec_loss)

Batch: 1 loss: 9.008163
Batch: 101 loss: 5.568803
Batch: 201 loss: 4.995028
Batch: 301 loss: 4.135249
Batch: 401 loss: 4.420690
Batch: 501 loss: 4.450699
Batch: 601 loss: 4.623874
Batch: 701 loss: 4.597316
Batch: 801 loss: 4.541345
Batch: 901 loss: 4.701380
Batch: 1001 loss: 4.320860
epoch 0 loss: 4.723568
Batch: 1 loss: 4.398059
Batch: 101 loss: 4.577847
Batch: 201 loss: 4.474986
Batch: 301 loss: 3.641847
Batch: 401 loss: 4.001675
Batch: 501 loss: 4.137747
Batch: 601 loss: 4.261307
Batch: 701 loss: 4.332678
Batch: 801 loss: 4.228455
Batch: 901 loss: 4.426671
Batch: 1001 loss: 3.967466
epoch 1 loss: 4.221116
Batch: 1 loss: 4.187247
Batch: 101 loss: 4.155052
Batch: 201 loss: 4.199736
Batch: 301 loss: 3.416176
Batch: 401 loss: 3.791955
Batch: 501 loss: 3.967443
Batch: 601 loss: 3.929487
Batch: 701 loss: 4.098811
Batch: 801 loss: 3.975438
Batch: 901 loss: 4.166307
Batch: 1001 loss: 3.651073
epoch 2 loss: 3.956894
Batch: 1 loss: 3.967913
Batch: 101 loss: 3.623804
Batch: 201 loss: 3.879039


In [25]:
model.restore(18)

INFO:tensorflow:Restoring parameters from model/seq2seq_18.ckpt


In [None]:
rec_loss = []
for e in range(19, EPOCHS):
    train_loss = 0

    for b in range(batch_num):
        if b % 100 == 1: 
            print('Batch: %d loss: %f' % (b, batch_loss))
        x, y, g, w = batch.get(b)
        batch_loss = model.train(x, y, g, w)
        train_loss += batch_loss

    train_loss /= batch_num
    rec_loss.append(train_loss)
    print("epoch %d loss: %f" % (e, train_loss))
    model.save(e)

np.save('./model/rec_loss.npy', rec_loss)

Batch: 1 loss: 1.564755
Batch: 101 loss: 1.563944
Batch: 201 loss: 2.085399
Batch: 301 loss: 2.216991
Batch: 401 loss: 2.133068
Batch: 501 loss: 2.293040
Batch: 601 loss: 1.933262
Batch: 701 loss: 2.236406
Batch: 801 loss: 1.985712
Batch: 901 loss: 2.359137
Batch: 1001 loss: 1.948127
epoch 18 loss: 2.149125
Batch: 1 loss: 2.101591
Batch: 101 loss: 1.475576
Batch: 201 loss: 2.008351
Batch: 301 loss: 2.160506
Batch: 401 loss: 2.115754
Batch: 501 loss: 2.247978
Batch: 601 loss: 1.887059
Batch: 701 loss: 2.193770
Batch: 801 loss: 1.921431
Batch: 901 loss: 2.292850
Batch: 1001 loss: 1.949135
epoch 19 loss: 2.106405
Batch: 1 loss: 2.096280
Batch: 101 loss: 1.504730
Batch: 201 loss: 2.048798
Batch: 301 loss: 2.099875
Batch: 401 loss: 2.097867
Batch: 501 loss: 2.173342
Batch: 601 loss: 1.783840
Batch: 701 loss: 2.152656
Batch: 801 loss: 1.944306
Batch: 901 loss: 2.324838
Batch: 1001 loss: 1.905224
epoch 20 loss: 2.081011
Batch: 1 loss: 2.074074
Batch: 101 loss: 1.471436
Batch: 201 loss: 2.0033

In [24]:
model.restore(43)

INFO:tensorflow:Restoring parameters from model/seq2seq_43.ckpt


In [None]:
rec_loss = []
for e in range(44, EPOCHS):
    train_loss = 0

    for b in range(batch_num):
        if b % 100 == 1: 
            print('Batch: %d loss: %f' % (b, batch_loss))
        x, y, g, w = batch.get(b)
        batch_loss = model.train(x, y, g, w)
        train_loss += batch_loss

    train_loss /= batch_num
    rec_loss.append(train_loss)
    print("epoch %d loss: %f" % (e, train_loss))
    model.save(e)

np.save('./model/rec_loss.npy', rec_loss)

Batch: 1 loss: 1.747747
Batch: 101 loss: 1.158854
Batch: 201 loss: 1.657402
Batch: 301 loss: 1.890517
Batch: 401 loss: 1.798719
Batch: 501 loss: 1.888305
Batch: 601 loss: 1.490829
Batch: 701 loss: 1.820071
Batch: 801 loss: 1.559195
Batch: 901 loss: 1.868362
Batch: 1001 loss: 1.612518
epoch 43 loss: 1.745909
Batch: 1 loss: 1.777535
Batch: 101 loss: 1.150358
Batch: 201 loss: 1.660801
Batch: 301 loss: 1.888675
Batch: 401 loss: 1.744602
Batch: 501 loss: 1.902653
Batch: 601 loss: 1.546380
Batch: 701 loss: 1.840378
Batch: 801 loss: 1.554497
Batch: 901 loss: 1.928380
Batch: 1001 loss: 1.595371
epoch 44 loss: 1.752835
Batch: 1 loss: 1.737133
Batch: 101 loss: 1.155712
Batch: 201 loss: 1.623110
Batch: 301 loss: 1.894664
Batch: 401 loss: 1.742445
Batch: 501 loss: 1.869797
Batch: 601 loss: 1.504197
Batch: 701 loss: 1.830243
Batch: 801 loss: 1.557066
Batch: 901 loss: 1.875733
Batch: 1001 loss: 1.602853
epoch 45 loss: 1.761135


In [24]:
model.restore(45)

INFO:tensorflow:Restoring parameters from model/seq2seq_45.ckpt


In [25]:
rec_loss = []
for e in range(46, EPOCHS):
    train_loss = 0

    for b in range(batch_num):
        if b % 100 == 1: 
            print('Batch: %d loss: %f' % (b, batch_loss))
        x, y, g, w = batch.get(b)
        batch_loss = model.train(x, y, g, w)
        train_loss += batch_loss

    train_loss /= batch_num
    rec_loss.append(train_loss)
    print("epoch %d loss: %f" % (e, train_loss))
    model.save(e)

np.save('./model/rec_loss.npy', rec_loss)

Batch: 1 loss: 1.737133
Batch: 101 loss: 1.155712
Batch: 201 loss: 1.623049
Batch: 301 loss: 1.895772
Batch: 401 loss: 1.714554
Batch: 501 loss: 1.886192
Batch: 601 loss: 1.481389
Batch: 701 loss: 1.788327
Batch: 801 loss: 1.529632
Batch: 901 loss: 1.924595
Batch: 1001 loss: 1.613624
epoch 46 loss: 1.755118
Batch: 1 loss: 1.737347
Batch: 101 loss: 1.154730
Batch: 201 loss: 1.613453
Batch: 301 loss: 1.866787
Batch: 401 loss: 1.759150
Batch: 501 loss: 1.867481
Batch: 601 loss: 1.478531
Batch: 701 loss: 1.759460
Batch: 801 loss: 1.588596
Batch: 901 loss: 1.888155
Batch: 1001 loss: 1.638176
epoch 47 loss: 1.741701


KeyboardInterrupt: 

In [57]:
model.restore(45)

INFO:tensorflow:Restoring parameters from model/seq2seq_45.ckpt


# Testing

Hello. <br>
How are you? <br>
Where are you going? <br>
You look great. <br>
Good night. <br>

1. 將以上五個問題先經過 filter_line() 前處理，將大寫轉換成小寫、過濾掉標點符號等
2. 每句前面加上SOS，句尾加上EOS，沒看過的字用UNK取代
3. 轉換成 index
4. 用前面的 BatchGenerator() 包成 batch
5. 用cherry pick挑出最好的

In [59]:
testing_questions = ['Hi',
                     'How are you?',
                     'Where are you going?',
                     'You look great.',
                     'Good night.']
testing_answers = ['Hi',
                   'fine',
                   'I am going to ',
                   'Thanks',
                   'Good night']

testing_questions_filtered = []
testing_answers_filtered = []
for question, answer in zip(testing_questions, testing_answers):
    tmp = filter_line(question)
    testing_questions_filtered.append(tmp)
    tmp = filter_line(answer)
    testing_answers_filtered.append(tmp)
    
print(testing_questions_filtered[0])
print(testing_answers_filtered[0])
print()
print(testing_questions_filtered[1])
print(testing_answers_filtered[1])
print()

testing_Q = []
testing_A = []

for question, answer in zip(testing_questions_filtered, testing_answers_filtered):
    tmp_q = []
    tmp_a = []
    tmp_q.append(word2index['<SOS>'])
    tmp_a.append(word2index['<SOS>'])
    for q in question.split():
        if q not in word2index:
            tmp_q.append(word2index['<UNK>'])
        else:
            tmp_q.append(word2index[q])
    for a in answer.split():
        if a not in word2index:
            tmp_a.append(word2index['<UNK>'])
        else:
            tmp_a.append(word2index[a])

    tmp_q.append(word2index['<EOS>'])
    tmp_a.append(word2index['<EOS>'])
    testing_Q.append(tmp_q)
    testing_A.append(tmp_a)

print(testing_Q[0])
print(testing_A[0])
print()
print(testing_Q[1])
print(testing_A[1])

batch_test = BatchGenerator(testing_Q, testing_A, word2index['<PAD>'], word2index['<PAD>'], q_max_len, a_max_len, 1)

hi
hi

how are you
fine

[0, 183, 1]
[0, 183, 1]

[0, 23, 31, 15, 1]
[0, 477, 1]


In [31]:
import nltk

def cherry_pick(records, n, upper_bound=1.0):
    bleus = []
    
    for en, ch_gr, ch_pd in records:
        # caculate BLEU by nltk
        bleu = nltk.translate.bleu_score.sentence_bleu([ch_gr], ch_pd)
        bleus.append(bleu)
    
    lst = [i for i in range(len(records)) if bleus[i]<=upper_bound]
    lst = sorted(lst, key=lambda i: bleus[i], reverse=True) # sort by BLEU score
    
    return [records[lst[i]] for i in range(n)]

In [60]:
import random as rd

records = []

for i in range(5):
    x, y, g, w = batch_test.get(i)
    out = model.output(x, y)
    pd = model.predict(x, word2index['<SOS>'])
    xs = [index2word[x[i][0]] for i in range(q_max_len)]
    xs = xs[:xs.index('<PAD>')]
    ys_gr = [index2word[g[i][0]] for i in range(a_max_len)]
    if '<EOS>' in ys_gr:
        ys_gr = ys_gr[:ys_gr.index('<EOS>')]
    ys_pd = [index2word[np.argmax(pd[i][0, :])] for i in range(a_max_len)]
    if '<EOS>' in ys_pd:
        ys_pd = ys_pd[:ys_pd.index('<EOS>')]

    records.append([xs, ys_gr, ys_pd])

n = 5  # how many result we show
rec_cherry = cherry_pick(records, n)

for i in range(n):
    print(' '.join(rec_cherry[i][0]))
    print(' '.join(rec_cherry[i][2]))
    print('')

good night
good night

where are you going
to complete my cure

how are you
fine i will take a minute

hi
i am scared

you look great
i am not sure you



# 分析

loss從最一開始的9點多，經過50個epoch降到1點多，因此應該是有train起來的。從印出的loss中可以看出來它還在繼續收斂，如果有更多時間的話應該有機會訓練出更好的model。我覺得我的 model 跟我平常講話的習慣還滿相似的，以下是我為每段對話想的一些情境

1. 第一組是正常的問答，A：晚安，B：說晚安
2. 第二組也相當符合常理，B可能在去醫院的路上，A：你要去哪？ B：我要去完成療程
3. 第三組是日常對話，A, B在路上偶遇，A先跟B打招呼說「你好嗎？」，但 B 忙著講電話，因此就匆匆說「還行，等我一分鐘」
4. 第四組對話的情境可能是 A 在路上看到 B 就跟他打招呼說「嗨」，但 B 當時在恍神，所以就說「嚇我一跳」
5. 第五組對話常常發生在我們宿舍，A 說「你看起來真美」，B 就會說「我才不相信你哈哈哈」