# Seq2Seq (Chatbot Building)

Part of **#30DaysOfBasics**, Lets do SEq2Seq (Encoder-Decoder) modelling in pytorch.

Training Data: Cornell Movies dialogs (https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html)

Referencev tutorial: https://pytorch.org/tutorials/beginner/chatbot_tutorial.html

In [1]:
import torch
import random
import numpy as np
import os
import re
import csv 
import codecs
import itertools
import json
import collections
import unicodedata

## Data Preprocessing

In [2]:
corpus_movie_convo = '../data/movie_conversations.txt'
corpus_movie_lines = '../data/movie_lines.txt'

In [3]:
with open(corpus_movie_convo, 'r', encoding='iso-8859-1') as f:
    movie_convo = f.readlines()
    
with open(corpus_movie_lines, 'r', encoding='iso-8859-1') as f:
    movie_lines = f.readlines()

In [4]:
for line in movie_lines[:8]:
    print(line.strip())

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.
L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow
L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.
L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No


In [5]:
line_fields = ['lineID', 'characterID', 'movieID', 'character', 'text']

lines = {}

with open(corpus_movie_lines, 'r', encoding='iso-8859-1') as f:
    for line in f:
        values = line.split('+++$+++')
        tempLineDict = {}
        
        for idx, field in enumerate(line_fields):
            tempLineDict[field] = values[idx].strip()
        lines[tempLineDict['lineID']] = tempLineDict

In [41]:
dict(itertools.islice(lines.items(), 2))

{'L1045': {'lineID': 'L1045',
  'characterID': 'u0',
  'movieID': 'm0',
  'character': 'BIANCA',
  'text': 'They do not!'},
 'L1044': {'lineID': 'L1044',
  'characterID': 'u2',
  'movieID': 'm0',
  'character': 'CAMERON',
  'text': 'They do to!'}}

In [7]:
conv_fields = ['character_1ID', 'character_2ID', 'movieID', 'utteranceIDs']

conversations = []

with open(corpus_movie_convo, 'r', encoding='iso-8859-1') as f:
    for line in f:
        values = line.split('+++$+++')
        
        tempConvDict = {}
        
        for idx, field in enumerate(conv_fields):
            tempConvDict[field] = values[idx].strip()
        
        tempConvDict['lines'] = []
        lineIDs = eval(tempConvDict['utteranceIDs'])
        
        for lineID in lineIDs:
            tempConvDict['lines'].append(lines[lineID])
        conversations.append(tempConvDict)

In [8]:
conversations[0]

{'character_1ID': 'u0',
 'character_2ID': 'u2',
 'movieID': 'm0',
 'utteranceIDs': "['L194', 'L195', 'L196', 'L197']",
 'lines': [{'lineID': 'L194',
   'characterID': 'u0',
   'movieID': 'm0',
   'character': 'BIANCA',
   'text': 'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.'},
  {'lineID': 'L195',
   'characterID': 'u2',
   'movieID': 'm0',
   'character': 'CAMERON',
   'text': "Well, I thought we'd start with pronunciation, if that's okay with you."},
  {'lineID': 'L196',
   'characterID': 'u0',
   'movieID': 'm0',
   'character': 'BIANCA',
   'text': 'Not the hacking and gagging and spitting part.  Please.'},
  {'lineID': 'L197',
   'characterID': 'u2',
   'movieID': 'm0',
   'character': 'CAMERON',
   'text': "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"}]}

In [9]:
qa_pairs = []

for conversation in conversations:
    
    for idx in range(len(conversation['lines']) - 1):
        input_line = conversation['lines'][idx]['text'].strip()
        target_line = conversation['lines'][idx+1]['text'].strip()
        
        if input_line and target_line:
            qa_pairs.append([input_line, target_line])

In [10]:
print(qa_pairs[:2])
print('*'*100)
print(len(qa_pairs))

[['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.', "Well, I thought we'd start with pronunciation, if that's okay with you."], ["Well, I thought we'd start with pronunciation, if that's okay with you.", 'Not the hacking and gagging and spitting part.  Please.']]
****************************************************************************************************
221282


In [11]:
delimeter = str(codecs.decode('\t', 'unicode_escape'))

with open('../data/formatted_movie_convo_pairs.txt', 'w', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter = delimeter)
    
    for pair in qa_pairs:
        writer.writerow(pair)

In [12]:
PAD_TOKEN = 0
SOS_TOKEN = 1
EOS_TOKEN = 2

class CustomVocabulary:
    
    def __init__(self, name):
        self.name = name
        self.word2idx = {}
        self.word2count = {}
        self.idx2word = {PAD_TOKEN:'PAD', SOS_TOKEN:'SOS', EOS_TOKEN:'EOS'}
        self.n_words = 3
        
    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.n_words
            self.word2count[word] = 1
            self.idx2word[self.n_words] = word
            self.n_words += 1
            
        else:
            self.word2count[word] += 1
            
    def add_sentence(self, sentence):
        for word in sentence.split():
            self.add_word(word.strip())
            
    def trim_vocab(self, min_freq_count):
        words_to_keep = []
        
        for k, v in self.word2count.items():
            if v >= min_freq_count:
                words_to_keep.append(k)
        
        #reinitializa the vocab
        self.word2idx = {}
        self.word2count = {}
        self.idx2word = {PAD_TOKEN:'PAD', SOS_TOKEN:'SOS', EOS_TOKEN:'EOS'}
        self.n_words = 3
        
        for word in words_to_keep:
            self.add_word(word)

In [13]:
def unicodeToAscii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')


def normalizeString(s):
    s = s.lower().strip()
    
    s = re.sub(r'([.!?])', r'\1', s)
    s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)
    s = re.sub(r'\s+', r' ', s).strip()
    
    return s

In [14]:
normalized_pairs = [[normalizeString(pair[0]), normalizeString(pair[1])] for pair in qa_pairs]

In [15]:
voc = CustomVocabulary('cornell movie corpus')

In [16]:
MAX_LEN = 10

def filter_pair(p):
    return len(p[0].split()) < MAX_LEN and len(p[1].split()) < MAX_LEN

def filter_all_pairs(all_pairs):
    return [pair for pair in all_pairs if filter_pair(pair)]

In [17]:
print('Before trimming, number of pairs: ', len(qa_pairs))
filtered_qa_pairs = filter_all_pairs(qa_pairs)
print('After trimming, number of pairs: ', len(filtered_qa_pairs))

Before trimming, number of pairs:  221282
After trimming, number of pairs:  91004


In [18]:
#Populating the vocab

for pair in filtered_qa_pairs:
#     print(pair)
    voc.add_sentence(pair[0])
    voc.add_sentence(pair[1])
    
print('Total num of words in vocab: ', voc.n_words)

Total num of words in vocab:  63242


In [19]:
def trim_rare_words(vocabulary, pairs, min_count):
    
#     print(vocabulary.n_words)
    vocabulary.trim_vocab(min_count)
#     print(vocabulary.n_words)   
    
    #keeping pairs who have vocab words
    keep_pairs = []
    
    for pair in pairs:
#         print(pair)
        input_seq = pair[0]
        output_seq = pair[1]
        
        keep_input = True
        keep_output = True
        
        for word in input_seq.split(' '):
            if word not in vocabulary.word2idx:
                keep_input = False
                break
        for word in output_seq.split(' '):
            if word not in vocabulary.word2idx:
                keep_output = False
                break
        if keep_input and keep_output:
            keep_pairs.append(pair)
            
    print('No of pairs before trimming: ', len(pairs))
    print('No of pairs after trimmin: ', len(keep_pairs))
    
    return keep_pairs

In [20]:
MIN_COUNT = 3
trimmed_pairs = trim_rare_words(voc, filtered_qa_pairs, MIN_COUNT)

No of pairs before trimming:  91004
No of pairs after trimmin:  46025


## Data Preparation

In [21]:
def sentence_indexing(vocab, sentence):
    return [vocab.word2idx[word] for word in sentence.split()] + [EOS_TOKEN]

In [22]:
def zero_padding(iterList, fillvalue=0):
    return list(itertools.zip_longest(*iterList, fillvalue=fillvalue))

In [23]:
def binaryMask(list_of_list_of_indexes):
    
    mask = []
    for idx, seq in enumerate(list_of_list_of_indexes):
        mask.append([])
        for token in seq:
            if token == PAD_TOKEN:
                mask[idx].append(0)
            else:
                mask[idx].append(1)
    return mask

In [24]:
def encode_input(list_of_input, vocab):
    indexed_batch = [sentence_indexing(vocab, sentence) for sentence in list_of_input]
    lengths_tensor = torch.tensor([len(indexed_list) for indexed_list in indexed_batch])
    padList = zero_padding(indexed_batch)
    padded_tensor = torch.LongTensor(padList)
    return padded_tensor, lengths_tensor

def encode_output(list_of_output, vocab):
    indexed_batch = [sentence_indexing(vocab, sentence) for sentence in list_of_output]
    max_target_len = max([len(indexed_list) for indexed_list in indexed_batch])
    padList = zero_padding(indexed_batch)
    mask = binaryMask(padList)
    mask_tensor = torch.ByteTensor(mask)
    padded_tensor = torch.LongTensor(padList)
    return padded_tensor, mask_tensor, max_target_len

def batch2TrainData(vocab, pair_batch):
    
    #sorting the questions in descending order
    pair_batch.sort(key= lambda x: len(x[0].split()), reverse=True)
    
    input_batch = []
    output_batch = []
    
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
        
    ip, lenghts = encode_input(input_batch, vocab)
    op, mask, max_tar_len = encode_output(output_batch, vocab)
    
    return ip, lenghts, op, mask, max_tar_len

In [25]:
#validating data preparation steps

small_batch_size = 5

batches = batch2TrainData(voc, [random.choice(trimmed_pairs) for _ in range(small_batch_size)])

print('Input seq: ', batches[0])
print('-'*50)
print('Input seq Shape: ', batches[0].shape)
print('Lengths: ', batches[1])
print('*'*100)
print('Output seq: ', batches[2])
print('-'*50)
print('Output seq Shape: ', batches[2].shape)
print('Mask: ', batches[3])
print('-'*50)
print('Mask Shape: ', batches[3].shape)
print('Max target Length: ', batches[4])

Input seq:  tensor([[ 950,  624,   15,  455,  111],
        [  50,   20, 1008,  119,    2],
        [  95,  191, 2845,  456,    0],
        [ 393, 5248, 9349,    2,    0],
        [   2,    2,    2,    0,    0]])
--------------------------------------------------
Input seq Shape:  torch.Size([5, 5])
Lengths:  tensor([5, 5, 5, 4, 2])
****************************************************************************************************
Output seq:  tensor([[  294,   270,   117, 14763,   205],
        [ 9499,  1357,   745,     2,    67],
        [  656,  5249,    67,     0,   277],
        [ 9500,     2,  4901,     0,   384],
        [    2,     0,     2,     0,     2]])
--------------------------------------------------
Output seq Shape:  torch.Size([5, 5])
Mask:  tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 0, 1],
        [1, 1, 1, 0, 1],
        [1, 0, 1, 0, 1]], dtype=torch.uint8)
--------------------------------------------------
Mask Shape:  torch.Size([5, 5])
M

## Modelling 

In [26]:
import torch.nn as nn

In [27]:
class GRUEncoder(nn.Module):
    
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        
        super(GRUEncoder, self).__init__()
        self.n_layers = n_layers
        self.embedding = embedding
        self.hidden_size = hidden_size
        
        #input_size for our GRU encoder is equal to hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, 
                          n_layers, dropout=(0 if n_layers==1 else dropout), bidirectional=True)
        
        
    def forward(self, input_seq, input_lengths, hidden=None):
        
        #input_seq: (max_length, batch_size)
        #input_lengths:  batch_size
        #hidden_state: (num_layers * num_directions, batch_size, hidden_size)
        
        embedded = self.embedding(input_seq)
        #pack_padded_sequence: takesn input of padded_batch and their corresponding variable lengths
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        
        #output_shape: (seq_len, batch_size, num_directions * hidden_size)
        #hidden_shape: (n_layers * num_directions, batch_size, hidden_size)
        outputs, hidden = self.gru(packed, hidden)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        
        #only if bidirectional==True then summing the outputs of both gru
        outputs = outputs[:,:,:self.hidden_size] + outputs[:,:,self.hidden_size:]
        
        return outputs, hidden

In [28]:
class Attention(nn.Module):
    
    def __init__(self, hidden_size, method):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        self.method = method
        
    def dot_product(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)
    
    def forward(self, hidden, encoder_outputs):
        atten_weights = self.dot_product(hidden, encoder_outputs)
        atten_weights = atten_weights.t()
        
        #return (batch_size, 1, max_length)
        return nn.functional.softmax(atten_weights, dim=1).unsqueeze(1)

In [29]:
class GRUDecoderAttn(nn.Module):
    
    def __init__(self, attention_method, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(GRUDecoderAttn, self).__init__()
        self.attention_method = attention_method
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        
        #layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers==1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        
        self.atten = Attention(attention_method, hidden_size)
        
        
    def forward(self, input_seq, last_hidden, encoder_outputs):
        
        #input_seq: one time step of input seq batch, (1, batch_size)
        #last_hidden: final hidden states of GRU, (n_layers * n_directions, batch_size, hidden_size)
        
        embedded = self.embedding(input_seq)
        embedded = self.embedding_dropout(embedded)
        
        #gru_output: (1, batch_size, n_directions * hidden_size)
        gru_output, hidden = self.gru(embedded, last_hidden)
        
        attn_weights = self.atten(gru_output, encoder_outputs)
        
        context = attn_weights.bmm(encoder_outputs.transpose(0,1))
        
        gru_output = gru_output.squeeze(0)
        
        context = context.squeeze(1)
        
        concat_input = torch.cat((gru_output, context), 1)
        
        concat_output = torch.tanh(self.concat(concat_input))
        
        out = self.out(concat_output)
        
        out = nn.functional.softmax(out, dim=1)
        
        return out, hidden

In [30]:
def maskNLLLoss(decoder_output, target, mask):
    nTotal = mask.sum()
    target = target.view(-1,1)
    
    gathered_tensor = torch.gather(decoder_output, 1, target)
    crossEntropy = -torch.log(gathered_tensor)
    loss = crossEntropy.masked_select(mask)
    
    return loss.mean(), nTotal.item()

In [31]:
#Training demo for 1 time step

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

small_batch_size = 5

batches = batch2TrainData(voc, [random.choice(trimmed_pairs) for _ in range(small_batch_size)])

input_variable, lengths, target_variable, mask, max_target_len = batches


#Parameters defining
hidden_size = 512
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
atten_method = 'dot'
embedding_maxtrix = nn.Embedding(voc.n_words, hidden_size)


#Architecture defining
encoder = GRUEncoder(hidden_size, embedding_maxtrix, encoder_n_layers, dropout)
decoder = GRUDecoderAttn(atten_method, embedding_maxtrix, hidden_size, voc.n_words, decoder_n_layers, dropout)

encoder.to(device)
decoder.to(device)

#Ensure dropout layers are in train mode
encoder.train()
decoder.train()

#Initializing optimizers
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=0.0001)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.0001)
#clearing the gradient buffer on each iteration
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()


input_variable.to(device)
lengths.to(device)
target_variable.to(device)
mask.to(device)

loss = 0
print_losses = []
n_totals = 0

encoder_output, encoder_hidden = encoder(input_variable, lengths)
# print(encode_output)
print('Encoder output is of {} and Encoder hidden is of {} shapes'.format(
    encoder_output.shape, encoder_hidden.shape))
print('*'*100)

decoder_input = torch.LongTensor([[SOS_TOKEN for _ in range(small_batch_size)]])
decoder_input = decoder_input.to(device)
print('Initial Decoder input is of {} shapes'.format(decoder_input.shape))
# print('*'*100)

#setting the initial decoder hidden state to encoder's final hidden state
decoder_hidden = encoder_hidden[:decoder.n_layers]
print('Initial Decoder hidden state is of {} shape'.format(decoder_hidden.shape))

print('-'*100)
print('<--Inside the one time step of GRU-->')


#if you using teacher forcing
for timestep in range(max_target_len):
    
    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_output)
    print('Decoder output: {} and Decoder Hidden: {}'.format(decoder_output.shape, decoder_hidden.shape))
    
    #In Teacher forcing: Next input is current target
    decoder_input = target_variable[timestep].view(1,-1)
    
    print('Target variable at timestep[{}]: {} and Decoder Input: {}'.format(
        timestep, target_variable[timestep].shape, decoder_input.shape))
    
    print('Mask at timestep[{}]: {} with shape {}'.format(timestep, mask, mask.shape))
    maskLoss, nTotal = maskNLLLoss(decoder_output, target_variable[timestep], mask[timestep])
    
    print('Loss: {} and nTotal: {}'.format(maskLoss, nTotal))
    
    loss += maskLoss
    
    print_losses.append(maskLoss.item() * nTotal)
    
    print('print_losses: {}'.format(print_losses))
    
    n_totals += nTotal
    
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    returned_loss = sum(print_losses) / n_totals
    
    print('Returned loss: ', returned_loss)
    
    print('<--- DONE WITH ONE TIMESTEP --->')

Encoder output is of torch.Size([8, 5, 512]) and Encoder hidden is of torch.Size([4, 5, 512]) shapes
****************************************************************************************************
Initial Decoder input is of torch.Size([1, 5]) shapes
Initial Decoder hidden state is of torch.Size([2, 5, 512]) shape
----------------------------------------------------------------------------------------------------
<--Inside the one time step of GRU-->
Decoder output: torch.Size([5, 18784]) and Decoder Hidden: torch.Size([2, 5, 512])
Target variable at timestep[0]: torch.Size([5]) and Decoder Input: torch.Size([1, 5])
Mask at timestep[0]: tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 0, 1, 1, 1],
        [0, 0, 0, 1, 1],
        [0, 0, 0, 1, 1],
        [0, 0, 0, 0, 1],
        [0, 0, 0, 0, 1]], dtype=torch.uint8) with shape torch.Size([8, 5])
Loss: 9.859118461608887 and nTotal: 5
print_losses: [49.295592308044434]
Returned loss:  9.859118461

  loss = crossEntropy.masked_select(mask)


In [32]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding_matrix,
         encoder_optimizer, decoder_optimizer, batch_size, clip, max_len=MAX_LEN):
    
    #zero_gradient
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    #set device option
    input_variable = input_variable.to(device)
    lengths = lengths.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    
    #variable initialize
    loss = 0
    print_losses = []
    n_totals = 0
    
    #forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)
#     print(encoder_outputs.shape)
    
    #create initial decoder input (start the each sentence with SOS token)
    decoder_input = torch.LongTensor([[SOS_TOKEN for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)
    
    #setting the initial decoder hidden state to encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]
#     print(decoder_hidden.shape)
    
    #determine if we're using Teacher Forcing for this iteration
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    if use_teacher_forcing:
        
        for timestep in range(max_target_len):
            
#             print(decoder_input.shape)
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
            #In Teacher forcing: Next input is current target
            decoder_input = target_variable[timestep].view(1,-1)
            maskLoss, nTotal = maskNLLLoss(decoder_output, target_variable[timestep], mask[timestep])
            loss += maskLoss
            print_losses.append(maskLoss.item() * nTotal)
    #         print('print_losses: {}'.format(print_losses))
            n_totals += nTotal
    
    else:
        
        for timestep in range(max_target_len):
            
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_output)
            #No Teacher forcing therefore, next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[item][0] for item in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            #calculate and accumlate loss
            maskLoss, nTotal = maskNLLLoss(decoder_output, target_variable[timestep], mask[timestep])
            loss += maskLoss
            print_losses.append(maskLoss.item() * nTotal)
    #         print('print_losses: {}'.format(print_losses))
            n_totals += nTotal
        
    #perform backpropogation
    loss.backward()
    
    #Clip gradient: Gradients are modified in-place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)
    
    #adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return sum(print_losses) / n_totals

In [33]:
def trainIter(model_name, vocab, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
             embedding_matrix, encoder_n_layers, decoder_n_layers, save_dir, n_iterations, batch_size,
             print_every, save_every, clip, corpus_name, loadFileName):
    
    #loading batches for each iteration
    training_batches = [batch2TrainData(vocab, [random.choice(pairs) for _ in range(batch_size)])
                        for _ in range(n_iterations)]
    
    
    print('Initializing....')
    start_iteration = 1
    print_loss = 0
    if loadFileName:
        start_iteration = checkpoint['iteration'] + 1
        
    #Training loop
    print('Training....')
    for iteration in range(start_iteration, n_iterations - 1):
        
        training_batch = training_batches[iteration - 1]
        input_variable, lengths, target_variable, mask, max_target_len = training_batch
        
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder,
                    embedding_matrix, encoder_optimizer, decoder_optimizer, batch_size, clip)
        
        print_loss += loss
        
        
        #print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print('Iteration: {}, Avg loss: {:4f}'.format(iteration, print_loss_avg))
            print_loss = 0
            
            
        #save checkpoint
        if (iteration % save_every) == 0:
            
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(
                encoder_n_layers, decoder_n_layers, hidden_size))
        
            if not os.path.exists(directory):
                os.makedirs(directory)

            torch.save({
                'iteration': iteration,
                'encoder': encoder.state_dict(),
                'decoder': decoder.state_dict(),
                'en_optimizer': encoder_optimizer.state_dict(),
                'de_optimizer': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding_matrix.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))   

In [34]:
#Initializing Encoder-Decoder

# Configure models
model_name = 'chatbot_model'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 512
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64
save_dir = '../models/'
corpus_name = 'cornell_movies'


# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
checkpoint_iter = 3500
loadFilename = os.path.join(save_dir, model_name, corpus_name,
                           '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
                           '{}_checkpoint.tar'.format(checkpoint_iter))


print(loadFilename)
# Load model if a loadFilename is provided
if loadFilename:
    # If loading on same machine the model was trained on
    checkpoint = torch.load(loadFilename)
    # If loading a model trained on GPU to CPU
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint['encoder']
    decoder_sd = checkpoint['decoder']
    encoder_optimizer_sd = checkpoint['en_optimizer']
    decoder_optimizer_sd = checkpoint['de_optimizer']
    embedding_sd = checkpoint['embedding']
    voc.__dict__ = checkpoint['voc_dict']

print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(voc.n_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)

# Initialize encoder & decoder models
encoder = GRUEncoder(hidden_size, embedding, encoder_n_layers, dropout)
decoder = GRUDecoderAttn(attn_model, embedding, hidden_size, voc.n_words, decoder_n_layers, dropout)

if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
    
encoder = encoder.to(device)
decoder = decoder.to(device)

print(encoder, decoder)
if loadFilename:
    print('Models built and ready to go, In eval mode!')
    
else: print('Models built and ready to go!')

../models/chatbot_model/cornell_movies/2-2_512/3500_checkpoint.tar
Building encoder and decoder ...
GRUEncoder(
  (embedding): Embedding(18784, 512)
  (gru): GRU(512, 512, num_layers=2, dropout=0.1, bidirectional=True)
) GRUDecoderAttn(
  (embedding): Embedding(18784, 512)
  (embedding_dropout): Dropout(p=0.1, inplace=False)
  (gru): GRU(512, 512, num_layers=2, dropout=0.1)
  (concat): Linear(in_features=1024, out_features=512, bias=True)
  (out): Linear(in_features=512, out_features=18784, bias=True)
  (atten): Attention()
)
Models built and ready to go, In eval mode!


In [35]:
# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 4000
print_every = 1
save_every = 500
save_dir = '../models/'
corpus_name = 'cornell_movies'

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)
    
# Run training iterations
print("Starting Training!")
trainIter(model_name, voc, trimmed_pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)

## Inference

In [36]:
class GreedySearchDecoder(nn.Module):
    
    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, input_seq, input_length, max_length):
        
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        decoder_hidden = encoder_hidden[:decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_TOKEN
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        
        for _ in range(max_length):
            
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
            
        return all_tokens, all_scores

In [37]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LEN):
    
    indexes_batch = [sentence_indexing(voc, sentence)]
    
    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)

    # Use appropriate device
    input_batch = input_batch.to(device)
    lengths = lengths.to("cpu")
    # Decode sentence with searcher
    tokens, scores = searcher(input_batch, lengths, max_length)

    decoded_words = [voc.idx2word[token.item()] for token in tokens]
    
    return decoded_words


def evalInput(encoder, decoder, searcher, voc):
    
    input_sentence = ''
    
    while(1):
        
        try:
            input_sequence = input('> ')
            if input_sequence == 'q' or input_sequence == 'quit':
                break
            input_sequence = normalizeString(input_sequence)
            output_words = evaluate(encoder, decoder, searcher, voc, input_sequence)
            output_words[:] = [item for item in output_words if not (item == 'PAD' or item == 'EOS')]
            print('Bot: ', ' '.join(output_words))
        except KeyError:
            print("Error: Encountered unknown word.")

In [38]:
# Set dropout layers to eval mode
encoder.eval()
decoder.eval()

# Initialize search module
searcher = GreedySearchDecoder(encoder, decoder)

# Begin chatting (uncomment and run the following line to begin)
# evaluateInput(encoder, decoder, searcher, voc)

In [39]:
evalInput(encoder, decoder, searcher, voc)

> hey
Bot:  You know what I mean.
> how are you
Bot:  I'm not sure.
> why
Bot:  Because I'm a little while, of them.
> cool. when would you come here?
Bot:  I don't know.
> okay. How is weather at your end?
Bot:  My wife.
> lol
Error: Encountered unknown word.
> q
