# Chatbot Tutorial

## 1. Preparations

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math

# Pick the compute device once: CUDA when PyTorch reports it as available, otherwise the CPU.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

## 2. Load & Preprocess Data

In [2]:
# Corpus name and the (relative) directory under `data/` that holds its files.
corpus_name = 'cornell_movie_dialogs_corpus'
corpus = os.path.join('data', corpus_name)

def printLines(file, n=10):
    """Print the first ``n`` lines of ``file`` as raw bytes objects.

    The file is opened in binary mode, so every line is shown in its
    ``b'...'`` form, including the trailing newline byte.
    """
    with open(file, 'rb') as stream:
        head = stream.readlines()[:n]
    for raw_line in head:
        print(raw_line)

In [3]:
# Show the first raw lines of movie_lines.txt to see how the fields are laid out.
printLines(os.path.join(corpus, 'movie_lines.txt'))

b'L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!\n'
b'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!\n'
b'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.\n'
b'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?\n'
b"L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.\n"
b'L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow\n'
b"L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.\n"
b'L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No\n'
b'L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?\n'
b'L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?\n'


### Create formatted data file

Formatted data file: each line contains one ***tab-separated pair*** made of ***a query sentence*** and ***a response sentence***.

In [4]:
def loadLines(file, cols):
    """Parse the movie-lines file into a dictionary keyed by line ID.

    Each line of ``file`` is split on the ``' +++$+++ '`` separator and the
    resulting fields are paired positionally with the names in ``cols``.

    Args:
        file: Path of the file to read (decoded as ISO-8859-1).
        cols: Field names, in file order. Must include ``'lineID'``.

    Returns:
        dict mapping each ``lineID`` value to its record, itself a dict of
        ``{col: field}``. Fields are stored as read, so the last field keeps
        its trailing newline.

    Raises:
        IndexError: if a line has fewer fields than ``cols`` has names.
    """
    records = {}
    with open(file, 'r', encoding='iso-8859-1') as fr:
        for raw in fr:
            fields = raw.split(' +++$+++ ')
            record = {}
            for position, col in enumerate(cols):
                record[col] = fields[position]
            records[record['lineID']] = record
    return records

In [5]:
def loadConversations(file, lines, cols):
    """Group the records produced by ``loadLines`` into conversations.

    Each line of ``file`` is split on ``' +++$+++ '`` and paired positionally
    with ``cols``. The ``'utteranceIDs'`` field holds the textual form of a
    Python list of line IDs (e.g. ``"['L194', 'L195']"``); every ID is looked
    up in ``lines`` and the resulting records are stored under ``'lines'``.

    Args:
        file: Path of the conversations file (decoded as ISO-8859-1).
        lines: dict mapping line ID -> line record, as returned by loadLines.
        cols: Field names, in file order. Must include ``'utteranceIDs'``.

    Returns:
        list of conversation dicts ``{col: field, ..., 'lines': [record, ...]}``.

    Raises:
        ValueError / SyntaxError: if the utteranceIDs field is not a literal
            list (or tuple) of IDs.
        KeyError: if an utterance ID is missing from ``lines``.
        IndexError: if a line has fewer fields than ``cols`` has names.
    """
    import ast  # local import: only needed for parsing the ID list

    conversations = []
    with open(file, 'r', encoding='iso-8859-1') as fr:
        for line in fr:
            values = line.split(' +++$+++ ')
            convObj = {col: values[i] for i, col in enumerate(cols)}
            # SECURITY: this used to be eval(), which executes arbitrary code
            # found in the data file. literal_eval only accepts Python literals.
            utterance_ids = ast.literal_eval(convObj['utteranceIDs'].strip())
            if not isinstance(utterance_ids, (list, tuple)):
                raise ValueError(
                    'utteranceIDs must be a list of line IDs, got: {!r}'.format(utterance_ids))
            convObj['lines'] = [lines[lineId] for lineId in utterance_ids]
            conversations.append(convObj)
    return conversations

> 刘尧：训练数据是每个conversation中所有sentence生成的sentence对: <前一句话, 后一句话\>

In [6]:
def extractSentencePairs(conversations):
    """Build <query, response> pairs from consecutive lines of each conversation.

    For a conversation with lines ``t1, t2, ..., tm`` the candidate pairs are
    ``[t1, t2], [t2, t3], ..., [t(m-1), tm]``; the last line has no response
    and therefore never appears as a query. Text is stripped of surrounding
    whitespace, and a pair is dropped when either side is empty after stripping.

    Args:
        conversations: iterable of dicts, each with a ``'lines'`` list whose
            items are dicts with a ``'text'`` entry.

    Returns:
        list of ``[query_text, response_text]`` lists, in conversation order.
    """
    qa_pair = []
    for conv in conversations:
        utterances = conv['lines']
        # Pair each line with the one that follows it
        for current, following in zip(utterances, utterances[1:]):
            query = current['text'].strip()
            response = following['text'].strip()
            if not (query and response):
                continue
            qa_pair.append([query, response])
    return qa_pair

In [7]:
# Path of the reformatted corpus file that later cells write and then read back.
datafile = os.path.join(corpus, 'formatted_movie_lines.txt')
delimiter = '\t'
# Decode escape sequences in the delimiter string.
# NOTE(review): '\t' is already a real tab character here, so this presumably leaves it unchanged — confirm.
delimiter = str(codecs.decode(delimiter, 'unicode_escape'))  # TODO: study this line in more detail when time permits

# Empty containers plus the column names of the two raw corpus files.
lines = {}
conversations = []
MOVIE_LINES_COLS = ['lineID', 'characterID', 'movieID', 'character', 'text']
MOVIE_CONVERSATIONS_COLS = ['character1ID', 'character2ID', 'movieID', 'utteranceIDs']

In [8]:
# Parse the raw corpus: `lines` maps lineID -> line record, `conversations` is a list of conversation records.
print('\nProcessing corpus and loading conversations...')
lines = loadLines(os.path.join(corpus, 'movie_lines.txt'), MOVIE_LINES_COLS)
conversations = loadConversations(os.path.join(corpus, 'movie_conversations.txt'), lines, MOVIE_CONVERSATIONS_COLS)


Processing corpus and loading conversations...


In [9]:
# Write one delimiter-separated <query, response> pair per row to `datafile`,
# then print the first rows of the written file as a sanity check.
print('\nWriting newly formatted file ...')
with open(datafile, 'w', encoding='utf8') as fw:
    writer = csv.writer(fw, delimiter=delimiter, lineterminator='\n')
    for pair in extractSentencePairs(conversations):
        writer.writerow(pair)
print('\nSample lines from file: ')
printLines(datafile)


Writing newly formatted file ...

Sample lines from file: 
b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\n"
b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\n"
b"Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\n"
b"The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\tSeems like she could get a 

### Load and trim data

> 刘尧：把vocabulary及其附属或衍生变量以及相关method封装抽象成Class！这样既保护数据又方便使用！好好好！！！

Because we are dealing with sequences of **words**, we need to create a **vocabulary**: a mapping from each unique word that we encounter in our dataset to an index value.

For this we define a ***Vocabulary*** class, which has 5 attributes and 3 methods:

- 5 attributes

    - **word2index**: A mapping from each word to index

    - **index2word**: A reverse mapping from index to each word

    - **word2count**: A mapping from each word to its count

    - num_words: A total word count
    
    - trimmed: If infrequently seen words are trimmed

- 3 methods

    - **addWord**: Adding a word to the vocabulary

    - addSentence: Adding all words in a sentence

    - trim: Trimming infrequently seen words

In [10]:
# Default word tokens: reserved vocabulary indexes for the special symbols
PAD_TOKEN = 0  # Padding for sequences shorter than the longest one in a batch
SOS_TOKEN = 1  # Start of sentence
EOS_TOKEN = 2  # End of sentence 

# Sentence-length threshold (in space-separated words) used by filterPair
MAX_LENGTH = 10

In [11]:
class Vocabulary(object):
    """Word <-> index mapping with per-word occurrence counts.

    Attributes:
        name: Label given at construction time.
        trimmed: True once ``trim`` has removed infrequent words.
        word2index: dict word -> index. The special tokens PAD/SOS/EOS are
            deliberately NOT present here; they only exist in ``index2word``.
        word2count: dict word -> number of times the word was added.
        index2word: dict index -> word, pre-populated with PAD, SOS and EOS.
        num_words: Next free index, i.e. number of entries in ``index2word``
            (starts at 3 because of the three special tokens).
    """

    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self._reset()

    def _reset(self):
        """(Re)initialise all mappings so that only the special tokens remain."""
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_TOKEN: 'PAD', SOS_TOKEN: 'SOS', EOS_TOKEN: 'EOS'}
        self.num_words = 3  # SOS, EOS, PAD

    def addWord(self, word):
        """Register one occurrence of ``word``; new words get the next free index."""
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    def addSentence(self, sentence):
        """Register every space-separated word of ``sentence``."""
        for word in sentence.split(' '):
            self.addWord(word)

    def trim(self, min_count):
        """Remove words that occur fewer than ``min_count`` times.

        Surviving words are re-indexed contiguously (in their original
        insertion order) starting right after the special tokens, and keep
        their occurrence counts. Only the first call has an effect; later
        calls return immediately because ``trimmed`` is already set.
        """
        if self.trimmed:
            return

        kept_counts = {word: count for word, count in self.word2count.items() if count >= min_count}
        total = len(self.word2index)
        # Guard the ratio: an empty vocabulary used to raise ZeroDivisionError here
        ratio = len(kept_counts) / total if total else 0.0
        print(f'keep_words {len(kept_counts)} / {total} = {ratio: .4f}')

        # Rebuild the dictionaries from scratch with the surviving words only
        self._reset()
        for word, count in kept_counts.items():
            self.addWord(word)
            # addWord restarts the count at 1; restore the real frequency so
            # word2count keeps meaning "occurrences" after trimming
            self.word2count[word] = count

        self.trimmed = True

Some data preprocessing:

- **unicodeToAscii**: Convert the Unicode strings to ASCII

- **normalizeString**: Convert all letters to lowercase and trim all non-letter characters except for basic punctuation

- **filterPairs**: Keep only the pairs in which both sentences have fewer than *MAX_LENGTH* words

> 刘尧：这些常规的预处理，最好封装成一个个function，以方便使用！可以放在**Coding通用工具脚本**里！

In [12]:
def unicodeToAscii(s):
    """Strip combining marks from a Unicode string.

    The string is NFD-normalised, which splits accented characters into a base
    character plus combining marks, and every character of category ``Mn``
    (nonspacing mark) is then dropped. Characters without such a decomposition
    are returned unchanged.

    Thanks to https://stackoverflow.com/a/518232/2809427
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

In [13]:
def normalizeString(s):
    """Lowercase, strip accents, and reduce ``s`` to letters plus ``. ! ?``.

    Steps, in order: lowercase and strip the input, remove combining marks via
    ``unicodeToAscii``, then apply three regex substitutions (see below) and
    strip the result.
    """
    text = unicodeToAscii(s.lower().strip())
    substitutions = (
        (r'([.!?])', r' \1'),      # insert a space before each '.', '!' or '?'
        (r'[^a-zA-Z.!?]+', r' '),  # replace every run of other non-letter characters with one space
        (r'\s+', r' '),            # collapse runs of whitespace into a single space
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

In [14]:
def readVocs(datafile, corpus_name):
    """Read <query, response> pairs and create an empty Vocabulary.

    Args:
        datafile: Path of the UTF-8 file with one tab-separated pair per line.
        corpus_name: Name passed to the new ``Vocabulary``.

    Returns:
        ``(voc, pairs)`` where ``voc`` is a fresh ``Vocabulary(corpus_name)``
        (no words added yet) and ``pairs`` is a list with one entry per line,
        each entry being the list of normalised tab-separated fields.
    """
    # Use a context manager so the file handle is closed deterministically
    # (the previous open(...).read() left it to the garbage collector).
    with open(datafile, encoding='utf8') as fr:
        lines = fr.read().strip().split('\n')
    pairs = [[normalizeString(s) for s in line.split('\t')] for line in lines]
    voc = Vocabulary(corpus_name)
    return voc, pairs

In [15]:
def filterPair(pair):
    """Return True iff both sentences of ``pair`` have fewer than MAX_LENGTH space-separated words."""
    query_words = pair[0].split(' ')
    if len(query_words) >= MAX_LENGTH:
        return False
    response_words = pair[1].split(' ')
    return len(response_words) < MAX_LENGTH

def filterPairs(pairs):
    """Return the sub-list of ``pairs`` accepted by ``filterPair``, order preserved."""
    return list(filter(filterPair, pairs))

In [16]:
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
    """Read the formatted data file and return a populated Vocabulary plus the kept pairs.

    Pairs are read with ``readVocs``, filtered with ``filterPairs``, and the
    two sentences of every remaining pair are added to the vocabulary.
    Progress is reported with ``print``.

    NOTE: ``corpus`` and ``save_dir`` are accepted but not used by this function.

    Returns:
        ``(vocabulary, pairs)``
    """
    print('Start preparing training data ...')
    vocabulary, all_pairs = readVocs(datafile, corpus_name)
    print('Read {!s} sentence pairs'.format(len(all_pairs)))

    kept_pairs = filterPairs(all_pairs)
    print('Trimmed to {!s} sentence pairs'.format(len(kept_pairs)))

    print('Counting words ...')
    for kept in kept_pairs:
        query, response = kept[0], kept[1]
        vocabulary.addSentence(query)
        vocabulary.addSentence(response)
    print('Counted words: ', vocabulary.num_words)
    return vocabulary, kept_pairs

In [17]:
# Load/Assemble vocabulary and pairs
# `save_dir` is only passed through to loadPrepareData here.
save_dir = os.path.join('data', 'save')
vocabulary, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)

Start preparing training data ...
Read 221282 sentence pairs
Trimmed to 64271 sentence pairs
Counting words ...
Counted words:  18008


In [18]:
# Preview the first 10 normalised <query, response> pairs
for pair in pairs[:10]:
    print(pair)

['there .', 'where ?']
['you have my word . as a gentleman', 'you re sweet .']
['hi .', 'looks like things worked out tonight huh ?']
['you know chastity ?', 'i believe we share an art instructor']
['have fun tonight ?', 'tons']
['well no . . .', 'then that s all you had to say .']
['then that s all you had to say .', 'but']
['but', 'you always been this selfish ?']
['do you listen to this crap ?', 'what crap ?']
['what good stuff ?', 'the real you .']


Another tactic that is beneficial to achieving faster convergence during training is **trimming rarely used words out of our vocabulary**. 

Decreasing the feature space will also soften the difficulty of the function that the model must learn to approximate.

We will do this as a two-stage process:
    
- Trim words used under *MIN_COUNT* threshold using the *Vocabulary.trim* function

- Filter out pairs with trimmed words

> 刘尧：事先从Vocabulary中定义并删除不常见的word，即**OOV的word**，随后从训练数据中删除这些OOV的word！ 

> 刘尧：疑问：模型应用时遇到OOV的word咋办？？？跟训练一样，应用前也先使用trimRareWords来处理一下！？

In [19]:
MIN_COUNT = 3  # Words seen fewer than this many times are trimmed from the vocabulary


def trimRareWords(vocabulary, pairs, MIN_COUNT):
    """Trim rare words from ``vocabulary`` and drop pairs that contain them.

    Args:
        vocabulary: Object with a ``trim(min_count)`` method and a
            ``word2index`` mapping (a ``Vocabulary``).
        pairs: list of ``[query, response]`` sentence pairs.
        MIN_COUNT: Threshold passed to ``vocabulary.trim``.

    Returns:
        New list with the pairs whose two sentences consist solely of words
        still present in ``vocabulary.word2index`` after trimming. The input
        list itself is not modified.
    """
    vocabulary.trim(MIN_COUNT)
    # Read the mapping only after trim(): trimming rebinds word2index
    known_words = vocabulary.word2index

    def _all_known(sentence):
        """True when every space-separated word of ``sentence`` is in the vocabulary."""
        return all(word in known_words for word in sentence.split(' '))

    # A pair is dropped as soon as either sentence contains an out-of-vocabulary word
    keep_pairs = [pair for pair in pairs if _all_known(pair[0]) and _all_known(pair[1])]

    total = len(pairs)
    # Guard the ratio: an empty `pairs` list used to raise ZeroDivisionError here
    ratio = len(keep_pairs) / total if total else 0.0
    print(f'Trimmed from {total} pairs to {len(keep_pairs)}, {ratio: .4f} of total')
    return keep_pairs

In [20]:
# Trim the vocabulary and rebind `pairs` to the filtered list (the unfiltered list is no longer referenced by this name)
pairs = trimRareWords(vocabulary, pairs, MIN_COUNT)

keep_words 7823 / 18005 =  0.4345
Trimmed from 64271 pairs to 53165,  0.8272 of total


## 3. Prepare Data for Models

Already done: preparing and massaging data into a nice vocabulary object and list of sentence pairs.

Not done yet: preparing numerical torch tensors as model inputs.

When batch_size=1, all we have to do is convert words in sentence pairs to their corresponding indexes from vocabulary and feed this to the models

When batch_size>1, which can **speed up training and/or leverage GPU parallelization capabilities**, we must be mindful of the variation of sentence length in our batches. 

To accommodate sentences of different sizes in the same batch, we will make our batched input tensor of shape *(max_length, batch_size)*.

If we simply convert sentences to tensors by converting words to their indexes (*indexesFromSentence*) and zero-pad, our tensor would have shape (*batch_size, max_length*). However, we need to be able to **index our batch along time(为啥子？)**, and **across all sequences in the batch**. 

Therefore, we transpose our input batch shape to (*max_length, batch_size*), so that indexing across the 1st dimension returns **a time step across all sentences** in the batch (*zeroPadding*).

> Shape Transpose: (*batch_size, max_length*) --> (*max_length, batch_size*)

![001](./image/seq2seq_batches.png)

> 刘尧：这么一大段到底要讲个啥子玩意儿？！为什么一定要Transpose一下？！

In [21]:
def sentenceToIndexes(vocabulary, sentence):
    """Convert ``sentence`` into a list of word indexes terminated by EOS_TOKEN.

    Raises:
        KeyError: if a space-separated word is not in ``vocabulary.word2index``.
    """
    lookup = vocabulary.word2index
    indexes = [lookup[token] for token in sentence.split(' ')]
    indexes.append(EOS_TOKEN)
    return indexes

> 刘尧：疑惑：zeroPadding函数中，取seq_batch中的max_length_batch进行padding，那么不同batch的max_length_batch应该无法保证一样吧？

> 刘尧：疑惑：或者本来就不必一样了，因为经Transpose之后，max_length_batch为第1维，**batch_size变成第2维，所有batch的batch_size必然一样**，所以可直接输入到network中！？

> 刘尧：疑惑：以上是不是就是进行Transpose的重要原因！？！貌似与Keras或其他案例中不一样哎？

In [22]:
def zeroPadding(seq_batch, fillvalue=PAD_TOKEN):
    """Pad every sequence to the longest one in the batch and transpose the result.

    ``zip_longest`` walks all sequences in parallel, so element ``t`` of the
    returned list is a tuple holding position ``t`` of every sequence
    (``fillvalue`` where a sequence has already ended). The transpose from
    (batch, time) to (time, batch) is therefore implicit.
    """
    time_major_rows = itertools.zip_longest(*seq_batch, fillvalue=fillvalue)
    return [row for row in time_major_rows]

In [23]:
def seqsToMask2D(seqs, value=PAD_TOKEN):
    """Build a 2-D 0/1 mask with the same layout as ``seqs``.

    A position is 0 where the token equals ``value`` and 1 everywhere else.
    """
    mask = []
    for seq in seqs:
        row = []
        for token in seq:
            row.append(int(token != value))
        mask.append(row)
    return mask

以下三个函数 *sentencesToTensor, inputToTensor, targetToTensor* 用于转化 sentence 数据：最普通的 sentences (list of sentence) --> Tensor of shape (max_length_batch, batch_size)（zeroPadding 内部已隐含 Transpose）

In [24]:
def sentencesToTensor(sentences, vocabulary):
    """Index, pad and tensorise a batch of sentences.

    Returns:
        ``(indexes_batch, padList, padVar)``: the unpadded index lists (one
        per sentence), the padded and transposed list from ``zeroPadding``,
        and that padded list as a ``torch.LongTensor`` (int64, on the CPU).
    """
    indexes_batch = []
    for sentence in sentences:
        indexes_batch.append(sentenceToIndexes(vocabulary, sentence))
    padList = zeroPadding(indexes_batch)
    return indexes_batch, padList, torch.LongTensor(padList)

def inputToTensor(sentences, vocabulary):
    """Return the padded input tensor and a tensor with each sentence's length.

    Lengths are measured on the unpadded index lists, which already include
    the EOS token appended by ``sentenceToIndexes``.
    """
    indexes_batch, _pad_list, padVar = sentencesToTensor(sentences, vocabulary)
    lengths = torch.tensor(list(map(len, indexes_batch)))
    return padVar, lengths

def targetToTensor(sentences, vocabulary):
    """Return the padded target tensor, the longest target length, and the padding mask.

    The mask has the same layout as the padded tensor, with 1 at real tokens
    and 0 at PAD positions.
    """
    indexes_batch, padList, padVar = sentencesToTensor(sentences, vocabulary)
    max_length = max(len(indexes) for indexes in indexes_batch)
    # NOTE(review): newer PyTorch releases expect boolean masks (torch.BoolTensor)
    # for masked operations — confirm against the installed version before changing.
    mask2d = torch.ByteTensor(seqsToMask2D(padList))
    return padVar, max_length, mask2d

以下函数 *pairBatch2trainData* 用于转化 sentence pair 数据：pair_batch --> input sentences 和 target sentences --> 相应的 Tensor

In [25]:
def pairBatch2trainData(vocabulary, pair_batch):
    """Convert a batch of <query, response> pairs into model-ready tensors.

    Args:
        vocabulary: ``Vocabulary`` used to index the words.
        pair_batch: Iterable of ``[query, response]`` pairs. It is NOT
            modified; a sorted copy is used internally.

    Returns:
        ``(inp, lengths, target, max_length, mask)``: padded input tensor,
        input lengths, padded target tensor, longest target length and the
        target padding mask. Batch entries are ordered by decreasing query
        word count.
    """
    # Order pairs by the query's word count, longest first (presumably because
    # pack_padded_sequence in the encoder expects decreasing lengths).
    # sorted() builds a new list, so the caller's batch is left untouched;
    # the previous in-place list.sort() reordered the caller's list as a side effect.
    ordered_batch = sorted(pair_batch, key=lambda x: len(x[0].split(' ')), reverse=True)
    input_batch, target_batch = zip(*ordered_batch)  # split pairs into queries and responses
    inp, lengths = inputToTensor(input_batch, vocabulary)
    target, max_length, mask = targetToTensor(target_batch, vocabulary)
    return inp, lengths, target, max_length, mask

> 刘尧：感悟：数据前期操作处理如向量化、padding等可在转化为Tensor之前开展，以方便操作，待所有前期操作结束后，在输入模型进行训练前，再转化为Tensor！

> 刘尧：疑惑：这些前期操作，是不是在Tensor中有相应的便捷API？那么，需要先转化为Tensor？

In [26]:
# Example for validation: build the tensors for 5 randomly chosen pairs and print them.
# No random seed is set in the visible cells, so the printed values differ from run to run.
inp, lengths, target, max_length, mask = pairBatch2trainData(vocabulary, [random.choice(pairs) for _ in range(5)])
print("input:", inp)
print("input_lengths:", lengths)
print("target:", target)
print("max_target_len:", max_length)
print("mask:", mask)

input: tensor([[  25,   25,  318,  360,   33],
        [ 197,  200,  883,  361,    6],
        [ 117,   12,    4,    6,    2],
        [  74,  306, 5466,    2,    0],
        [  36, 2558,    4,    0,    0],
        [  37,    4,    2,    0,    0],
        [ 325,    2,    0,    0,    0],
        [   4,    0,    0,    0,    0],
        [   2,    0,    0,    0,    0]])
input_lengths: tensor([9, 7, 6, 4, 3])
target: tensor([[  76,   34,   67,  167,   25],
        [ 115,    4,    9,    4,  200],
        [ 325,   34,  862,  329, 1633],
        [   4,    4,   67,   53,    4],
        [   2,    2, 1800,  361,    4],
        [   0,    0,   56, 1034,    4],
        [   0,    0,  134,    4,    2],
        [   0,    0,    4,    2,    0],
        [   0,    0,    2,    0,    0]])
max_target_len: 9
mask: tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [0, 0, 1, 1, 1],
        [0, 0, 1, 1, 1],
        [0, 0, 1, 1, 0],


## 4. Define Models

### Seq2Seq Model

The goal of a seq2seq model is to take **a variable-length sequence** as an input, and return **a variable-length sequence** as an output using a fixed-sized model. By using 2 separate RNN together, we can accomplish this task. 

The first RNN is an **encoder**, which encodes a variable-length input sequence to a fixed-length context vector. In theory, this context vector (the final hidden layer of the RNN) will contain semantic information about the input sequence. 

> **encoder**: \[X1,X2,...,Xn\](A variable-length sequence) --> A fixed-length context vector

The second RNN is a **decoder**, which takes an input word and the context vector, and returns a guess for the next word in the sequence and a hidden state to use in the next iteration.

> **decoder**: Xn(An input word) + A fixed-length context vector --> Y1(A guess for the next word) + A hidden state

![001](./image/seq2seq_ts.png)

### Encoder

The encoder RNN iterates through the input sequence one token ***Xi*** (e.g. word) at a time, at each time step ***i*** outputting an "output" vector ***Yi*** and a "hidden state" vector ***Si***. The hidden state vector ***Si*** is then passed to the next time step ***i+1***, while the output vector ***Yi*** is recorded.

> At each time step ***i***: ***Xi*** --> ***Yi*** (recorded) + ***Si*** (passed to step ***i+1***)

The encoder transforms the context it saw at each point in the sequence into **a set of points in a high-dimensional space**, which the decoder will use to generate a meaningful output for the given task.

The heart of encoder is a multi-layered GRU. We will use a bidirectional variant of the GRU, meaning that there are essentially 2 independent RNNs: one is fed the input sequence in normal sequential order, and the other is fed in reverse order.

The outputs of each network are summed at each time step ***i***. Using a bidirectional GRU will give us the advantage of encoding both past and future context.

Bidirectional RNN:

![](./image/RNN-bidirectional.png)

Note that:

- An ***embedding layer*** is used to encode our word indices in an arbitrarily sized feature space. For our models, this layer will map each word to a feature space of size *hidden_size*.

- If passing a padded batch of sequences to an RNN module, we must **pack and unpack padding** around the RNN pass using *torch.nn.utils.rnn.pack_padded_sequence* and *torch.nn.utils.rnn.pad_packed_sequence* respectively.

**Computation Graph:**

- Convert word indexes to embeddings

- Pack padded batch of sequences for RNN module

- Forward pass through GRU

- Unpack padding

- Sum bidirectional GRU outputs

- Return output and final hidden state

**Inputs:**

- *input_seq*: Batch of input sentences; shape=***(max_length, batch_size)***

- *input_lengths*: List of sentence lengths corresponding to each sentence in the batch; shape=***(batch_size)***

- *hidden*: Hidden state; shape=***(n_layers x num_directions, batch_size, hidden_size)***

**Outputs:**

- *outputs*: Output features from the last hidden layer of the GRU (sum of bidirectional outputs); shape=***(max_length, batch_size, hidden_size)***

- *hidden*: Updated hidden state from GRU; shape=***(n_layers x num_directions, batch_size, hidden_size)*** (same shape with *hidden* above in Inputs)

**几个注意点：**

- embedding 位于 EncoderRNN 之外额外处理

- nn.GRUCell 和 nn.GRU 有区别：前者只是单个时间步的 GRU 单元（每次只处理一步），后者才是 Encoder 中展示的那样，把多个时间步的 GRU 串联在一起（还可堆叠多层），可设置单向或双向，是 Encoder 的核心和主体

- pack_padded_sequence 和 pad_packed_sequence 这2个函数有意思，要好好研究一下到底在干啥子

- 输入中的 hidden 是所谓的 ***S0*** ? Output 是 ***a1,a2,...,ai*** !?! 那输出中的 hidden 是啥？

In [27]:
class EncoderRNN(nn.Module):
    """Bidirectional (optionally multi-layer) GRU encoder.

    The embedding layer is created outside and passed in. Both the GRU input
    size and hidden size are ``hidden_size``, so the embedding is expected to
    produce ``hidden_size`` features per word.
    """

    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Dropout is forced to 0 for a single-layer GRU and passed through otherwise.
        # Note this is nn.GRU (whole sequence), not nn.GRUCell (single step).
        gru_dropout = dropout if n_layers != 1 else 0
        self.gru = nn.GRU(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=n_layers,
            dropout=gru_dropout,
            bidirectional=True,
        )

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a padded batch of index sequences.

        Args:
            input_seq: Word indexes, shape (max_length, batch_size).
            input_lengths: Length of each sequence in the batch, shape (batch_size).
            hidden: Optional initial hidden state,
                shape (n_layers x num_directions, batch_size, hidden_size).

        Returns:
            ``(outputs, hidden)``: ``outputs`` is the sum of the forward and
            backward GRU outputs, shape (max_length, batch_size, hidden_size);
            ``hidden`` is the GRU's final hidden state, returned as is (the
            two directions are neither summed nor concatenated).
        """
        rnn_utils = torch.nn.utils.rnn
        embedded = self.embedding(input_seq)                              # word indexes -> embeddings
        packed = rnn_utils.pack_padded_sequence(embedded, input_lengths)  # pack the padded batch for the RNN
        packed_outputs, hidden = self.gru(packed, hidden)                 # forward pass through the GRU
        outputs, _ = rnn_utils.pad_packed_sequence(packed_outputs)        # unpack back to a padded tensor
        # The last dimension holds [forward | backward] features; add the two halves
        forward_part = outputs[:, :, :self.hidden_size]
        backward_part = outputs[:, :, self.hidden_size:]
        return forward_part + backward_part, hidden

**Encoder** (来自Coursera)

已知：a<0\>,  a<Tx+1\>, x<1\>, x<2\>, ..., x<tx\>, ...x<Tx\> 即输入sequence
    
流程：x<tx\> + a<tx-1\> --(BiLSTM/BiGRU)--> a<tx\>, y^<tx\>
    
结果：a<tx\> = (a<tx\>->, a<tx\><-) 即原始单词x<tx\>激活值，用于计算Attention和c<ty\>

### Decoder

The decoder RNN generates the response sentence in a **token-by-token** fashion. It uses the encoder's context vectors and its own internal hidden states to generate the next word in the sequence. It continues generating words until it outputs an EOS_TOKEN.

> Token-by-token: context vectors + hidden states --> next word until EOS_TOKEN

### Attention Layer

A common problem with a vanilla seq2seq decoder is that if we **rely solely on the context vector** to encode the entire input sequence's meaning, it is likely that we will have information loss. This is especially the case when dealing with long input sequences, greatly limiting the capability of our decoder.

Attention mechanism can combat this, which allows the decoder to **pay attention to certain parts of the input sequence**, rather than using the entire fixed context at every step.

At a high level, attention is calculated using the decoder's current hidden state and the encoder's outputs (图片中是decoder's inputs ?). The output attention weights have the same shape as the input sequence, allowing us to multiply them by the encoder outputs, giving us **a weighted sum which indicates the parts of encoder output to pay attention to**.

> decoder's current hidden state + encoder's outputs --> attention weights

> attention weights x encoder outputs --> A weighted sum (attended encoder outputs)

![](./image/attn2.png)

An improvement is creating "Global attention". The key difference is that with "Global attention", we consider **all of the encoder's hidden states**, as opposed to "Local attention", which only considers **the encoder's hidden state from the current time step**. Another difference is that with "Global attention", we calculate attention weights, or energies, using the hidden state of the decoder from the current time step only. ...

Overall, the "Global attention mechanism" can be summarized by the following figure. Note that we will implement the "Attention Layer" as a separate *nn.Module* called *Attn*. The output of this module is a softmax normalized weights tensor of shape *(batch_size, 1, max_length)*.

![](./image/global_attn.png)

In [28]:
class Attn(nn.Module):
    """Attention layer supporting the 'dot', 'general' and 'concat' score functions.

    Given the decoder's current output and all encoder outputs, it returns
    softmax-normalised attention weights of shape (batch_size, 1, max_length).
    """

    def __init__(self, method, hidden_size):
        """
        Args:
            method: One of ``'dot'``, ``'general'``, ``'concat'``.
            hidden_size: Feature size of both the decoder and encoder outputs.

        Raises:
            ValueError: if ``method`` is not a supported score function.
        """
        super(Attn, self).__init__()
        if method not in ('dot', 'general', 'concat'):
            # Single formatted message (was a tuple of args, which rendered as "('x', '...')")
            raise ValueError('{!r} is not an appropriate attention method.'.format(method))
        self.method = method
        self.hidden_size = hidden_size
        if self.method == 'general':
            # Linear map applied to the encoder outputs before the dot product
            self.attn_nn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            # Small dense layer over [decoder state ; encoder output]
            self.attn_nn = nn.Linear(self.hidden_size * 2, hidden_size)
            # Learnable vector that reduces the tanh output to one score per position.
            # torch.FloatTensor(n) returns UNINITIALISED memory (possibly NaN/inf),
            # so allocate explicitly and initialise with a small uniform range.
            self.v = nn.Parameter(torch.empty(hidden_size))
            bound = 1.0 / (max(hidden_size, 1) ** 0.5)
            nn.init.uniform_(self.v, -bound, bound)

    def forward(self, hidden, encoder_outputs):
        """Compute attention weights.

        Args:
            hidden: Current decoder output, shape (1, batch_size, hidden_size).
            encoder_outputs: Encoder outputs, shape (max_length, batch_size, hidden_size).

        Returns:
            Tensor of shape (batch_size, 1, max_length) whose last dimension sums to 1.
        """
        if self.method == 'dot':
            # score = <hidden, encoder_output>; hidden broadcasts over the time dimension
            attn_energies = torch.sum(hidden * encoder_outputs, dim=2)
        elif self.method == 'general':
            energy = self.attn_nn(encoder_outputs)
            attn_energies = torch.sum(hidden * energy, dim=2)
        elif self.method == 'concat':
            # score = v . tanh(W [hidden ; encoder_output])
            cated = torch.cat((hidden.expand(encoder_outputs.size(0), -1, -1), encoder_outputs), 2)
            energy = self.attn_nn(cated).tanh()
            attn_energies = torch.sum(self.v * energy, dim=2)
        # (max_length, batch_size) -> (batch_size, max_length)
        attn_energies = attn_energies.t()
        # Normalise the scores over the input positions and add a singleton
        # dimension so the weights can be used directly with bmm
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

**Attention** (来自coursera) 与coursera中学的有一些差别！

已知：s<0\>

流程：s<ty-1\> + a<tx\> --(Dense)--> e<ty,tx\>  --(Softmax)--> alpha<ty,tx\> (即Attention)
    
流程：alpha<ty,tx\> + a<tx\> --(加权之和)-->  c<ty\>

结果：c<ty\> 即Context，用于Decoder中输入给LSTM

注意：a<tx\>两次使用：与s<ty-1\>拼接，与alpha<ty,tx\>加权求和

注意：c<ty\>是承上启下，承Encoder的结果，启Decoder的输入

### AttentionDecoder

For the decoder, we will manually feed our batch **one time step at a time**. This means that our embedded word tensor and GRU output will both have shape ***(1, batch_size, hidden_size)***.

**Computational Graph:**

- Get embedding of current input word

- Forward through unidirectional GRU

- Calculate attention weights from the current GRU output from the step above

- Multiply attention weights to encoder outputs to get new "weighted sum" context vector

- Concatenate weighted context vector and GRU output using Luong eq. 5

- Predict next word using Luong eq. 6, followed by a softmax that turns the scores into probabilities

- Return output and final hidden state

**Inputs:**

- *input_step*: **one time step (one word)** of input sequence batch; shape=***(1, batch_size)***

- *last_hidden*: final hidden layer of GRU; shape=*(n_layers x num_directions, batch_size, hidden_size)*

- *encoder_outputs*: encoder model's output; shape=*(max_length, batch_size, hidden_size)*

**Outputs:**

- *output*: softmax normalized tensor giving probabilities of each word being the correct next word in the decoded sequence; shape=*(batch_size, vocabulary.num_words)*

- *hidden*: final hidden state of GRU; shape=*(n_layers x num_directions, batch_size, hidden_size)*

In [29]:
class LuongAttnDecoderRNN(nn.Module):
    """Unidirectional GRU decoder with Luong-style attention, run one time step at a time."""

    def __init__(self, attn_method, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        """
        Args:
            attn_method: Score function name passed to ``Attn``
                (``'dot'``, ``'general'`` or ``'concat'``).
            embedding: Embedding layer mapping word indexes to ``hidden_size`` features.
            hidden_size: Feature size of the GRU and of the attention layer.
            output_size: Number of output classes (vocabulary size).
            n_layers: Number of stacked GRU layers.
            dropout: Dropout probability for the embedded input and, when
                ``n_layers > 1``, for the GRU.
        """
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference.
        # The attribute keeps its `attn_model` name, but must be assigned from the
        # `attn_method` parameter: `attn_model` is not defined in this scope and
        # raised NameError on construction.
        self.attn_model = attn_method
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)          # regularises the embedded input word
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.attn = Attn(attn_method, hidden_size)            # attention over the encoder outputs
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    # Note: we run this one step (word) at a time
    def forward(self, input_step, last_hidden, encoder_outputs):
        """Decode a single time step for the whole batch.

        Args:
            input_step: Current input word indexes, shape (1, batch_size).
            last_hidden: Previous GRU hidden state,
                shape (n_layers x num_directions, batch_size, hidden_size).
            encoder_outputs: Encoder outputs, shape (max_length, batch_size, hidden_size).

        Returns:
            ``(output, hidden)``: ``output`` holds softmax-normalised
            probabilities over the vocabulary, shape (batch_size, output_size);
            ``hidden`` is the updated GRU hidden state.
        """
        # Get embedding of current input word
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)

        rnn_output, hidden = self.gru(embedded, last_hidden)         # (1, batch_size, hidden_size)
        attn_weights = self.attn(rnn_output, encoder_outputs)        # (batch_size, 1, max_length)

        # Weighted sum of the encoder outputs: (B, 1, L) x (B, L, H) -> (B, 1, H)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))

        # Concatenate weighted context vector and GRU output using Luong eq. 5.
        # rnn_output carries its singleton in dim 0, context in dim 1, so squeeze
        # each accordingly. Squeezing dim 0 of context only worked for batch size 1
        # and made torch.cat fail for larger batches.
        concat_input = torch.cat((rnn_output.squeeze(0), context.squeeze(1)), 1)
        concat_output = torch.tanh(self.concat(concat_input))

        # Predict next word using Luong eq. 6, then normalise to probabilities
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        return output, hidden

## 5. Define Training Procedure

### Masked loss

### Single training iteration

### Training iteration

## 6. Define Evaluation

### Greedy decoding

### Evaluate my text

## 7. Run Model

### Run Training

### Run Evaluation

## 8. Conclusion