# KAIST Summer Session 2018

## Simple Chat Bot using Sequence-to-Sequence with Attention (08.23.2018)

- A simple chat bot based on fictional conversations extracted from raw movie scripts.
- The training dataset is obtained from https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html.
- This code is adapted from https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html.

### 0. Data Preparation

In [22]:
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

MAX_LENGTH = 5
hidden_size = 300
learning_rate = 0.01
dropout_p = 0.1
n_layers = 1
n_epochs = 20

In [23]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [24]:
def normalizeString(s):
    s = re.sub('[.,/?!-]', '', s)
    return s.lower()


def lengthfilter(pairs):
    return [pair for pair in pairs if len(pair[0].split(' ')) < MAX_LENGTH and len(pair[1].split(' ')) < MAX_LENGTH]

def readLangs(lang1, lang2):
    print("Reading lines...")
    
    # Load the data# Load  
    lines = open('chat bot\\movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')
    conv_lines = open('chat bot\\movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')


    # Create a dictionary to map each line's id with its text# Creat 
    id2line = {}
    for line in lines:
        _line = line.split(' +++$+++ ')
        if len(_line) == 5:
            id2line[_line[0]] = _line[4]
        

    # Create a list of all of the conversations' lines' ids.
    convs = [ ]
    for line in conv_lines[:-1]:
        _line = line.split(' +++$+++ ')[-1][1:-1].replace("'","").replace(" ","")
        convs.append(_line.split(','))
    

    # Sort the sentences into questions (inputs) and answers (targets)
    questions = []
    answers = []

    for conv in convs:
        for i in range((len(conv)-1)):
            questions.append(id2line[conv[i]])
            answers.append(id2line[conv[i+1]])
     
    # Split every line into pairs and normalize
    pairs = [[normalizeString(x),normalizeString(y)] for x, y in zip(questions, answers)]
    
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

    return input_lang, output_lang, pairs


def prepareData(lang1, lang2, small=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2)
    random.shuffle(pairs)
    pairs = lengthfilter(pairs)
    if small:
        pairs = pairs[: int(len(pairs)/10 * 0.9)]     
        
    print("Read %s sentence pairs" % len(pairs))
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [25]:
input_lang, output_lang, pairs = prepareData('questions', 'answer', small=True) # "Small=True" option will draw 10% random sample from dataset 

training = pairs[: int(len(pairs) * 0.9)]
test = pairs[int(len(pairs) * 0.9):]

training_pairs = [tensorsFromPair(training[i]) for i in range(len(training))]

Reading lines...
Read 2273 sentence pairs
Counted words:
questions 1667
answer 1558


In [26]:
for i in range(10):
    pair = random.choice(pairs)
    print('Question: ', pair[0])
    print('Answer: ', pair[1])

Question:  he killed her
Answer:  what
Question:  it'l be alright
Answer:  it will
Question:  maverick
Answer:  yeah cougar
Question:  my wife
Answer:  what's it like
Question:  you've seen it
Answer:  yes
Question:  fifteen years yeah
Answer:  yeah oh god bless
Question:  no  no
Answer:  no
Question:  no we're not
Answer:  we're not
Question:  that means topsecret cooper
Answer:  i heard it
Question:  good night
Answer:  good night


### 1. Define a Neural Network

In [27]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, n_layers)

    def forward(self, input, hidden, cell):
        embedded = self.embedding(input).view(1, 1, -1)
        output,(hidden, cell) = self.lstm(embedded,(hidden, cell))
        return output, hidden, cell

    def init_hidden_cell(self):
        hidden = torch.zeros(self.n_layers, 1, self.hidden_size, device=device)
        cell = torch.zeros(self.n_layers, 1, self.hidden_size, device=device)
        return hidden,cell
    
    
    
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size,n_layers=1, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size, n_layers)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, cell, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output,(hidden, cell) = self.lstm(output,(hidden, cell))
        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, cell, attn_weights

    def init_hidden_cell(self):
        hidden = torch.zeros(self.n_layers, 1, self.hidden_size, device=device)
        cell = torch.zeros(self.n_layers, 1, self.hidden_size, device=device)
        return hidden, cell

### 2. Define a Loss Function and Optimizer

In [28]:
# Instantiate the seq2seq model
encoder = EncoderRNN(input_lang.n_words, hidden_size, n_layers).to(device)
attn_decoder = AttnDecoderRNN(hidden_size, output_lang.n_words,n_layers, dropout_p=dropout_p).to(device)

# Set loss and optimizer function
encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(attn_decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

### 3. Training

In [29]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder.train()
    decoder.train()    
    
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    
    loss = 0

    # Encoding
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size,  device=device)
    encoder_hidden, encoder_cell = encoder.init_hidden_cell()
    
    for ei in range(input_length):
        encoder_output, encoder_hidden, encoder_cell = encoder(input_tensor[ei], encoder_hidden, encoder_cell)
        encoder_outputs[ei] = encoder_output[0, 0]

        
    # Decoding using the encoded representation          
    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden
    decoder_cell = encoder_cell

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_cell, decoder_attention = decoder(
                decoder_input, decoder_hidden, decoder_cell, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_cell, decoder_attention = decoder(
                decoder_input, decoder_hidden, decoder_cell, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [30]:
import time
import math

def as_minutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def time_since(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (as_minutes(s), as_minutes(rs))

In [31]:
print_loss_total = 0
start = time.time()

for epoch in range(1, n_epochs+1):    
    for i in range(len(training_pairs)):
        training_pair = random.choice(training_pairs)
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]
        loss = train(input_tensor, target_tensor, encoder, attn_decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
    
    print_loss_average = print_loss_total / len(training_pairs)    
    print('Epoch {}'.format(epoch),' {} ({:.0f}%) {:.4f}'.format(time_since(start, epoch / n_epochs), epoch / n_epochs * 100, print_loss_average))
    print_loss_total = 0
    
print("Learning finished!")

Epoch 1  1m 46s (- 33m 41s) (5%) 4.0383
Epoch 2  3m 41s (- 33m 11s) (10%) 3.8309
Epoch 3  5m 26s (- 30m 47s) (15%) 3.7838
Epoch 4  7m 10s (- 28m 42s) (20%) 3.6605
Epoch 5  9m 2s (- 27m 7s) (25%) 3.5835
Epoch 6  10m 48s (- 25m 14s) (30%) 3.4382
Epoch 7  12m 38s (- 23m 27s) (35%) 3.2893
Epoch 8  14m 35s (- 21m 52s) (40%) 3.1549
Epoch 9  16m 26s (- 20m 5s) (45%) 2.9455
Epoch 10  18m 16s (- 18m 16s) (50%) 2.7675
Epoch 11  20m 13s (- 16m 32s) (55%) 2.5604
Epoch 12  22m 6s (- 14m 44s) (60%) 2.3938
Epoch 13  24m 1s (- 12m 55s) (65%) 2.1706
Epoch 14  26m 1s (- 11m 9s) (70%) 1.8884
Epoch 15  27m 57s (- 9m 19s) (75%) 1.7726
Epoch 16  30m 0s (- 7m 30s) (80%) 1.4914
Epoch 17  31m 59s (- 5m 38s) (85%) 1.3481
Epoch 18  33m 57s (- 3m 46s) (90%) 1.1687
Epoch 19  36m 3s (- 1m 53s) (95%) 1.0810
Epoch 20  38m 4s (- 0m 0s) (100%) 0.8820
Learning finished!


### 4. Evaluation

In [32]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    encoder.eval()
    decoder.eval()   
    
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden, encoder_cell = encoder.init_hidden_cell()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden, encoder_cell = encoder(input_tensor[ei], encoder_hidden, encoder_cell)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden
        decoder_cell = encoder_cell

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden,decoder_cell, decoder_attention = decoder(
                decoder_input, decoder_hidden, decoder_cell, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

- Let's see how the model works well

In [33]:
for i in range(10):
    pair = random.choice(test)
    print('Source: ', pair[0])
    print('Truth: ', pair[1])
    output_words, attentions = evaluate(encoder, attn_decoder, pair[0])
    output_sentence = ' '.join(output_words)
    print('Predicted: ', output_sentence)
    print('') 

Source:  hi
Truth:  hi
Predicted:  hi want a drink <EOS>

Source:  you think i'm crazy
Truth:  i think you're different
Predicted:  you got a <EOS>

Source:  not that one
Truth:  not that one
Predicted:  i like like <EOS>

Source:  that's it
Truth:  that's it
Predicted:  i can't <EOS>

Source:  he's dead
Truth:  dead
Predicted:  so <EOS>

Source:  what right now
Truth:  uhhuh
Predicted:  gimme a goddamn <EOS>

Source:  jake
Truth:  do what i say
Predicted:  i'm <EOS>

Source:  drive  drive away
Truth:  what happened
Predicted:  i'm the driver <EOS>

Source:  for keeps
Truth:  for keeps
Predicted:  thank you <EOS>

Source:   hi
Truth:  well c'mere young fella
Predicted:  i knowi know <EOS>



In [34]:
own_sentence = 'What is your name?'
own_sentence = normalizeString(own_sentence)
print('Question: ', own_sentence)
output_words, attentions = evaluate(encoder, attn_decoder, own_sentence)
output_sentence = ' '.join(output_words)
print('Answer: ', output_sentence)
print('')   

Question:  what is your name
Answer:  mulet <EOS>

