# Chapter 8: Building a Chatbot Using Attention-Based Neural Networks (Notes)

- chatbot logic is similar to seq2seq modeling, but here we are training the model to respond based on the input sequence.
- adding attention
- attention gives the model the ability to learn where in the input to look to obtain the information it needs, rather than using the whole sequence

## Theory of Attention with Neural Networks

- in prev seq2seq modeling, we have an input then an encoder which obtains the hidden state and then a decoder that interprets the hidden state to produce an output
- however, decoding the HS (hidden state) is not efficient, as the HS contains all of the sentence information (excessive)
- only parts of the sentence are relevant to a prediction
- attention makes the model look at only relevant parts to make prediction, more efficient and accurate

## Comparing local and global attention
- 2 main types of attention mechanism: local and global

### local:
- model only knows a few HS from encoder
- basically local weights calculated from a few states based on whats needed
- need aligned position Pt from final HS which tells which HS to be looking at for pred
- we then calculate local weight and apply that to our HS to find context vector
- this weight tells us to pay attention to the more relevant features and less so to the others
- this is passed into the decoder to make preds (before, we sent in the final HS, which has a lot of information)

### global:
- similar but now global weights calculated from all states unlike local
- allows our model to look at any given part of the input sentence that it considers relevant
- the global attention framework is essentially learning a mask that only lets through the hidden states that are relevant to our prediction

# Building a ChatBot

## Notes:

- dialog data is good for chatbots, as it trains the model to respond like how humans would
- here we are using movie script data instead 
- transform a script of n lines into n-1 pairs of input/output

In [1]:
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math

In [4]:
# Pick the compute device once: the GPU when CUDA is available, else the CPU.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

In [2]:
# Dataset label (stored twice under two names) and the path to the
# pre-formatted file of tab-separated call/response pairs.
corpus = "movie_corpus"
corpus_name = "movie_corpus"
datafile = os.path.join("Data", "formatted_movie_lines.txt")

In [3]:
# Read in binary mode so that printing a line shows its raw bytes, which
# makes the tab and newline delimiters visible in the output.
with open(datafile, 'rb') as file:
    lines = file.readlines()

# Preview the first three lines of the file.
for line in lines[:3]:
    print(str(line) + '\n')
    
'''
The call and response halves of each line are separated by a tab character
(backslash-t), and each line ends with a newline character (backslash-n).
'''

b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\n"

b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\n"

b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"



In [5]:
# Indices reserved for the special tokens; real words are numbered from 3.
PAD_token = 0  # padding
SOS_token = 1  # start-of-sentence marker
EOS_token = 2  # end-of-sentence marker

class Vocabulary:
    """Two-way mapping between words and integer indices, with word counts.

    Attributes:
        name: Label given to this vocabulary.
        trimmed: True once ``trim`` has run; later calls are no-ops.
        word2index: Maps each word to its index.
        word2count: Maps each word to the number of times it was added.
        index2word: Maps each index (including PAD/SOS/EOS) to its word.
        num_words: Number of entries in ``index2word``, special tokens
            included.
    """

    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self._reset()

    def _reset(self):
        """Empty all mappings, keeping only the three special tokens."""
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3

    def addWord(self, w):
        """Register one occurrence of ``w``, assigning the next index if new."""
        if w not in self.word2index:
            self.word2index[w] = self.num_words
            self.word2count[w] = 1
            self.index2word[self.num_words] = w
            self.num_words += 1
        else:
            self.word2count[w] += 1

    def addSentence(self, sent):
        """Add every token of ``sent``, splitting on single spaces."""
        for word in sent.split(' '):
            self.addWord(word)

    def trim(self, min_cnt):
        """Drop every word seen fewer than ``min_cnt`` times and re-index.

        Surviving words are renumbered from 3 in their existing order and
        keep their true occurrence counts. Only the first call has any
        effect; later calls return immediately.
        """
        if self.trimmed:
            return
        self.trimmed = True

        # A list of (word, count) tuples keeps the iteration order of
        # word2count, so the new indices follow the old relative order.
        kept = [(w, c) for w, c in self.word2count.items() if c >= min_cnt]

        total = len(self.word2index)
        # Guard the ratio: an empty vocabulary would divide by zero.
        ratio = len(kept) / total if total else 0.0
        print('Words to Keep: {} / {} = {:.2%}'.format(
            len(kept), total, ratio
        ))

        self._reset()
        for w, c in kept:
            self.addWord(w)
            # addWord starts the count at 1; restore the real frequency.
            self.word2count[w] = c

In [7]:
def unicodeToAscii(s):
    """Return ``s`` with accents and other combining marks removed.

    The string is NFD-normalized so that accented characters split into a
    base character plus combining marks, and every character in Unicode
    category ``Mn`` (nonspacing mark) is then dropped. Other non-ASCII
    characters pass through unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

def cleanString(s):
    """Normalize a raw sentence into lowercase, space-separated tokens.

    Steps, in order: lowercase and strip, remove accents, put a space
    before each of ``.``, ``!`` and ``?``, replace every run of other
    non-letter characters with one space, collapse whitespace, and strip.
    """
    text = unicodeToAscii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),        # detach sentence punctuation
        (r"[^a-zA-Z.!?]+", r" "),    # anything else non-alphabetic -> space
        (r"\s+", r" "),              # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def readVocs(datafile, corpus_name):
    """Read the tab-separated pair file and create an empty vocabulary.

    Args:
        datafile: Path to a UTF-8 text file with one pair per line, the
            fields separated by tab characters.
        corpus_name: Name given to the new ``Vocabulary``.

    Returns:
        A ``(voc, pairs)`` tuple. ``voc`` is a new ``Vocabulary`` with no
        words added yet. ``pairs`` is a list with one entry per line, each
        a list of that line's tab-separated fields after ``cleanString``.
    """
    # `with` guarantees the file handle is closed, even if reading fails.
    with open(datafile, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    pairs = [[cleanString(field) for field in line.split('\t')] for line in lines]
    voc = Vocabulary(corpus_name)
    return voc, pairs

def filterPair(p, max_length):
    """Return True when both sentences of ``p`` have fewer than ``max_length`` tokens.

    Tokens are counted by splitting on single spaces. ``p[0]`` is checked
    first; ``p[1]`` is only inspected when ``p[0]`` passes.
    """
    if len(p[0].split(' ')) >= max_length:
        return False
    return len(p[1].split(' ')) < max_length

def filterPairs(pairs, max_length):
    """Return the pairs that ``filterPair`` accepts, in their original order."""
    kept = []
    for candidate in pairs:
        if filterPair(candidate, max_length):
            kept.append(candidate)
    return kept

def loadData(corpus, corpus_name, datafile, max_length):
    """Load sentence pairs, drop the long ones, and build the vocabulary.

    Args:
        corpus: Not used by this function.
        corpus_name: Name given to the vocabulary.
        datafile: Path to the tab-separated pair file.
        max_length: Pairs are kept only when both sentences have fewer
            than this many tokens.

    Returns:
        A ``(voc, pairs)`` tuple: the populated vocabulary and the pairs
        that passed the length filter. Progress counts are printed.
    """
    voc, all_pairs = readVocs(datafile, corpus_name)
    print(str(len(all_pairs)) + " Sentence pairs")

    kept_pairs = filterPairs(all_pairs, max_length)
    print(str(len(kept_pairs)) + " Sentence pairs after trimming")

    # Register the words of both the call and the response.
    for pair in kept_pairs:
        for position in (0, 1):
            voc.addSentence(pair[position])

    print(str(voc.num_words) + " Distinct words in vocabulary")
    return voc, kept_pairs

In [10]:
%%time 
# Keep only pairs in which both sentences have fewer than max_length tokens,
# then build the vocabulary from the pairs that remain.
max_length = 15
voc, pairs = loadData(corpus, corpus_name, datafile, max_length)

221282 Sentence pairs
111344 Sentence pairs after trimming
26856 Distinct words in vocabulary
CPU times: user 7.76 s, sys: 22.4 ms, total: 7.78 s
Wall time: 7.78 s


In [11]:
print("Example Pairs:")
for pair in pairs[-20:]:
    print(pair)

Example Pairs:
['do you have a reservation ?', 'food ! !']
['food ! !', 'i m sorry sir . we only seat by reservation .']
['grrrhmmnnnjkjmmmnn !', 'franz ! help ! lunatic !']
['an historical moment gentlemen .', 'excuse me my lord . norris newman of the standard my lord .']
['excuse me my lord . norris newman of the standard my lord .', 'saw you lead our cavalry sir']
['saw you lead our cavalry sir', 'indeedldid mylord . itwas one ofthe first to cross .']
['indeedldid mylord . itwas one ofthe first to cross .', 'were they in good heart as they entered enemy territory ?']
['what o clock is it mr noggs ?', 'eleven o clock my lorj']
['splendid horsemanship who are they ?', 'sikali horse my lord . christians all i know each one by name .']
['sikali horse my lord . christians all i know each one by name .', 'they come well recommended do they ? durnford']
['are you dictating the strategy of this war sir ?', 'i m explaining my reasons .']
['splendid site crealock splendil i want to establish 

In [None]:
def removeRareWords(voc, all_pairs, minimum):
    """Trim rare words from ``voc`` and drop every pair that uses one.

    Args:
        voc: Vocabulary to trim in place via ``voc.trim(minimum)``.
        all_pairs: Sequence of pairs; ``p[0]`` and ``p[1]`` are the
            space-separated sentences of a pair.
        minimum: Count threshold passed to ``voc.trim``.

    Returns:
        A new list holding the pairs whose words are all still in
        ``voc.word2index``, in their original order. A summary line is
        printed.
    """
    voc.trim(minimum)

    # Look the mapping up after trimming, because trim may rebuild it.
    known_words = voc.word2index
    pairs_to_keep = [
        p for p in all_pairs
        if all(
            word in known_words
            for sentence in (p[0], p[1])
            for word in sentence.split(' ')
        )
    ]

    total = len(all_pairs)
    # Guard the ratio: an empty input would divide by zero.
    kept_ratio = len(pairs_to_keep) / total if total else 0.0
    print("Trimmed from {} pairs to {}, {:.2%} of total".format(
        total, len(pairs_to_keep), kept_ratio))
    return pairs_to_keep


# Drop words seen fewer than minimum_count times, and every pair that
# contains one of them.
minimum_count = 3
pairs = removeRareWords(voc, pairs, minimum_count)

In [None]:
def indexFromSentence(voc, sent):
    """Encode ``sent`` as a list of word indices ending with ``EOS_token``.

    The sentence is split on single spaces. A word that is missing from
    ``voc.word2index`` raises ``KeyError``.
    """
    token_ids = []
    for word in sent.split(' '):
        token_ids.append(voc.word2index[word])
    token_ids.append(EOS_token)
    return token_ids

def zeroPad(l, fillvalue=PAD_token):
    """Transpose a batch of index lists, padding short ones with ``fillvalue``.

    Returns a list of tuples. Tuple ``t`` holds position ``t`` of every
    sequence in ``l``, so the result is laid out as (longest length, batch).
    """
    columns = itertools.zip_longest(*l, fillvalue=fillvalue)
    return [step for step in columns]

def inputVar(l, voc):
    """Turn a batch of input sentences into a padded tensor plus lengths.

    Returns:
        A ``(padTensor, lengths)`` tuple. ``padTensor`` is a LongTensor
        built from the ``zeroPad`` output, laid out as (longest length,
        batch). ``lengths`` is a tensor with the unpadded length of each
        sentence, EOS included.
    """
    encoded = [indexFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor(list(map(len, encoded)))
    padTensor = torch.LongTensor(zeroPad(encoded))
    return padTensor, lengths

def getMask(l, value=PAD_token):
    """Build a 0/1 mask marking the non-padding entries of ``l``.

    Args:
        l: Iterable of token sequences.
        value: Token to treat as padding (defaults to ``PAD_token``).

    Returns:
        A list of lists with the same layout as ``l``, holding 0 where the
        token equals ``value`` and 1 elsewhere.
    """
    # Compare against `value`, not the global, so the parameter is honored.
    return [[0 if token == value else 1 for token in seq] for seq in l]

def outputVar(l, voc):
    """Turn a batch of target sentences into a padded tensor, mask and max length.

    Returns:
        A ``(padTensor, mask, max_target_len)`` tuple. ``padTensor`` is a
        LongTensor built from the ``zeroPad`` output. ``mask`` is a
        BoolTensor of the same layout, built from ``getMask``.
        ``max_target_len`` is the longest encoded target, EOS included.
    """
    encoded = [indexFromSentence(voc, sentence) for sentence in l]
    max_target_len = max(map(len, encoded))
    padded = zeroPad(encoded)
    mask = torch.BoolTensor(getMask(padded))
    padTensor = torch.LongTensor(padded)
    return padTensor, mask, max_target_len

def batch2Train(voc, batch):
    """Convert a batch of sentence pairs into training tensors.

    Pairs are ordered by input length, longest first (token count from
    splitting on single spaces).
    NOTE(review): this ordering is presumably needed by downstream
    packing of padded sequences - confirm against the model code.

    Args:
        voc: Vocabulary used to encode the words.
        batch: Iterable of pairs; ``pair[0]`` is the input sentence and
            ``pair[1]`` the target. It is not modified.

    Returns:
        ``(inp, lengths, output, mask, max_target_len)``: the results of
        ``inputVar`` on the inputs followed by those of ``outputVar`` on
        the targets.
    """
    # sorted() returns a new list, so the caller's batch keeps its order.
    ordered = sorted(batch, key=lambda pair: len(pair[0].split(" ")), reverse=True)

    input_batch = [pair[0] for pair in ordered]
    output_batch = [pair[1] for pair in ordered]

    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)

    return inp, lengths, output, mask, max_target_len