# 现代循环网络

In [2]:
import collections
import math
import torch
from torch import nn
from d2l import torch as d2l
import os

## 数据加载

In [2]:
d2l.DATA_HUB['fra-eng'] = (d2l.DATA_URL + 'fra-eng.zip', '94646ad1522d915e7b0f9296181140edcf86a4f5')
#@save
def read_data_nmt():
    """Load the English-French dataset and return its raw text as one string."""
    dataset_dir = d2l.download_extract('fra-eng')
    corpus_path = os.path.join(dataset_dir, 'fra.txt')
    with open(corpus_path, 'r', encoding='utf-8') as corpus_file:
        return corpus_file.read()
raw_text = read_data_nmt()
print(raw_text[:75])

Go.	Va !
Hi.	Salut !
Run!	Cours !
Run!	Courez !
Who?	Qui ?
Wow!	Ça alors !



In [3]:
def preprocess_nmt(text):
    """Normalize raw parallel text before tokenization.

    Non-breaking spaces are replaced by ordinary spaces, the text is
    lower-cased, and a space is inserted in front of every ``,``, ``.``,
    ``!`` or ``?`` that is not already preceded by a space.
    """
    punctuation = frozenset(',.!?')

    def no_space(char, prev_char):
        # True when `char` is punctuation glued directly to the previous character.
        return prev_char != ' ' and char in punctuation

    # Replace non-breaking spaces with plain spaces, then lower-case everything.
    text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()

    # Insert a space between a word and the punctuation mark that follows it.
    pieces = []
    for i, char in enumerate(text):
        if i > 0 and no_space(char, text[i - 1]):
            pieces.append(' ' + char)
        else:
            pieces.append(char)
    return ''.join(pieces)
text = preprocess_nmt(raw_text)
print(text[:80])

go .	va !
hi .	salut !
run !	cours !
run !	courez !
who ?	qui ?
wow !	ça alors !


In [4]:
def tokenize_nmt(text,num_examples=None):
    """Split preprocessed parallel text into source and target token lists.

    Args:
        text: Corpus with one example per line; the source and target
            sentences on a line are separated by a single tab.
        num_examples: Maximum number of leading lines to read. ``None``
            (or 0) means the whole text is read.

    Returns:
        A ``(source, target)`` pair of parallel lists; each element is the
        list of space-separated tokens of one sentence. Lines that do not
        contain exactly one tab are skipped.
    """
    source, target = [], []
    for i, line in enumerate(text.split('\n')):
        # `>=` (not `>`) so that exactly `num_examples` lines are consumed;
        # the previous comparison let one extra line through.
        if num_examples and i >= num_examples:
            break
        parts = line.split('\t')
        if len(parts) == 2:
            source.append(parts[0].split(' '))
            target.append(parts[1].split(' '))
    return source, target
source,target=tokenize_nmt(text)
source[:3]

[['go', '.'], ['hi', '.'], ['run', '!']]

In [5]:
src_vocab = d2l.Vocab(source, min_freq=2,
reserved_tokens=['<pad>', '<bos>', '<eos>'])
len(src_vocab)

10012

为了提高计算效率，我们仍然可以通过截断（truncation）和填充（padding）方式实现一次只处理一个小批
量的文本序列。假设同一个小批量中的每个序列都应该具有相同的长度num_steps，那么如果文本序列的词
元数目少于num_steps，我们将继续在其末尾添加特定的“<pad>”词元，直到其长度达到num_steps；反
之，我们将截断文本序列，只取其前num_steps个词元，并且丢弃剩余的词元。这样，每个文本序列将具
有相同的长度，以便以相同形状的小批量进行加载。

In [6]:
def truncate_pad(line,num_steps,padding_token):
    """Return `line` cut or right-padded to exactly `num_steps` items.

    Longer sequences keep only their first `num_steps` items; shorter (or
    equal-length) ones get `padding_token` appended until the length matches.
    """
    shortfall = num_steps - len(line)
    if shortfall < 0:
        # Too long: truncate.
        return line[:num_steps]
    # Too short (or exact): pad on the right.
    return line + [padding_token] * shortfall
truncate_pad(src_vocab[source[0]],10,src_vocab['<pad>'])

[47, 4, 1, 1, 1, 1, 1, 1, 1, 1]

现在我们定义一个函数，可以将文本序列转换成小批量数据集用于训练。我们将特定的“<eos>”词元添加
到所有序列的末尾，用于表示序列的结束。当模型通过一个词元接一个词元地生成序列进行预测时，生成的
“<eos>”词元说明完成了序列输出工作。此外，我们还记录了每个文本序列的长度，统计长度时排除了填充
词元，稍后将要介绍的一些模型会需要这个长度信息。

In [7]:
def build_array_nmt(lines,vocab,num_steps):
    """Convert tokenized sentences into a padded index tensor.

    Args:
        lines: List of token lists.
        vocab: Vocabulary used to index tokens; must contain the
            ``'<pad>'`` and ``'<eos>'`` entries.
        num_steps: Length every sequence is truncated or padded to.

    Returns:
        ``(array, valid_len)`` where ``array`` has one row of ``num_steps``
        token ids per sentence and ``valid_len`` counts the non-padding
        ids in each row.
    """
    # Take both special ids from the vocabulary that was passed in. The
    # padding id used to come from the global `src_vocab`, which is wrong
    # (or undefined) when this is called with any other vocabulary.
    pad_id = vocab['<pad>']
    eos_id = vocab['<eos>']
    # Index every sentence and mark its end with <eos>.
    lines = [vocab[l] + [eos_id] for l in lines]
    array = torch.tensor([truncate_pad(l, num_steps, pad_id) for l in lines])
    # Shows the first encoded row; kept so the notebook's recorded output still matches.
    print(array[0])
    valid_len = (array != pad_id).type(torch.int32).sum(1)
    return array, valid_len


In [8]:
array,valid_len=build_array_nmt(source,src_vocab,10)


tensor([47,  4,  3,  1,  1,  1,  1,  1,  1,  1])


## 编解码架构

## 编码器

In [9]:
class Seq2SeqEncoder(d2l.Encoder):
    """GRU encoder for sequence-to-sequence learning."""

    def __init__(self,vocab_size,embed_size,num_hiddens,num_layers,drop_out=0,**kwargs):
        super().__init__(**kwargs)
        # Token ids -> dense vectors.
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Multi-layer GRU that consumes the embedded sequence.
        self.rnn = nn.GRU(embed_size, num_hiddens, num_layers, dropout=drop_out)

    def forward(self,X,*args):
        """Encode a batch of token ids and return ``(output, state)`` from the GRU."""
        # Embed, then swap the first two axes so the time-step axis comes first
        # (nn.GRU is constructed without batch_first).
        embedded = self.embedding(X).permute(1, 0, 2)
        # No initial state is supplied, so the GRU starts from its default.
        return self.rnn(embedded)

In [10]:
# Smoke test: run an all-zero batch of shape (4, 7) through a small 2-layer encoder.
encoder = Seq2SeqEncoder(vocab_size=10, embed_size=8, num_hiddens=16,
num_layers=2)
# eval() puts the module in inference mode for this check.
encoder.eval()
X = torch.zeros((4, 7), dtype=torch.long)
output, state = encoder(X)
# Expected: output is (7, 4, 16) and state is (2, 4, 16), as recorded below.
print(output.shape)
print(state.shape)

torch.Size([7, 4, 16])
torch.Size([2, 4, 16])


In [11]:
# NOTE: despite the name, this binds the whole (output, state) tuple returned by
# the encoder's forward(), not just the hidden state — hence `tuple` below.
state=encoder(X)
type(state)

tuple

## 解码器

In [12]:
class Seq2SeqDecoder(d2l.Decoder):
    """GRU decoder for sequence-to-sequence learning."""

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
        dropout=0, **kwargs):
        super().__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # The GRU input is the token embedding concatenated with the context
        # vector, hence the `num_hiddens + embed_size` input width.
        self.rnn = nn.GRU(num_hiddens + embed_size, num_hiddens, num_layers,
                          dropout=dropout)
        # Projects every GRU output onto vocabulary scores.
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self,state,*args):
        """Pick the initial decoder state out of the encoder's return value.

        `state` is indexed with ``[1]``, i.e. the second element of the
        ``(output, state)`` pair produced by the encoder.
        """
        return state[1]

    def forward(self,X,state):
        """Decode `X` from `state`; returns ``(scores, new_state)``."""
        # Embed and move the time-step axis to the front.
        embedded = self.embedding(X).permute(1, 0, 2)
        num_steps = embedded.shape[0]
        # Use the last layer's state as context and repeat it for every time step.
        top_layer_state = state[-1]
        context = top_layer_state.repeat(num_steps, 1, 1)
        rnn_input = torch.cat((embedded, context), 2)
        rnn_output, state = self.rnn(rnn_input, state)
        # Swap the first two axes back so the result lines up with X again.
        scores = self.dense(rnn_output).permute(1, 0, 2)
        return scores, state


In [13]:
decoder = Seq2SeqDecoder(vocab_size=10, embed_size=8, num_hiddens=16,
num_layers=2)
decoder.eval()
state = decoder.init_state(encoder(X))
output, state = decoder(X, state)
output.shape, state.shape

(torch.Size([4, 7, 10]), torch.Size([2, 4, 16]))

In [14]:
encoder(X)

(tensor([[[-2.2833e-02,  1.1956e-01, -6.3467e-03,  8.5144e-02, -1.0668e-01,
           -5.5747e-02, -1.3948e-02, -9.6089e-03,  1.1452e-01,  1.4833e-01,
            5.1862e-02,  8.6366e-02,  9.2546e-03, -6.4532e-02, -2.6648e-02,
           -2.1060e-02],
          [-2.2833e-02,  1.1956e-01, -6.3467e-03,  8.5144e-02, -1.0668e-01,
           -5.5747e-02, -1.3948e-02, -9.6089e-03,  1.1452e-01,  1.4833e-01,
            5.1862e-02,  8.6366e-02,  9.2546e-03, -6.4532e-02, -2.6648e-02,
           -2.1060e-02],
          [-2.2833e-02,  1.1956e-01, -6.3467e-03,  8.5144e-02, -1.0668e-01,
           -5.5747e-02, -1.3948e-02, -9.6089e-03,  1.1452e-01,  1.4833e-01,
            5.1862e-02,  8.6366e-02,  9.2546e-03, -6.4532e-02, -2.6648e-02,
           -2.1060e-02],
          [-2.2833e-02,  1.1956e-01, -6.3467e-03,  8.5144e-02, -1.0668e-01,
           -5.5747e-02, -1.3948e-02, -9.6089e-03,  1.1452e-01,  1.4833e-01,
            5.1862e-02,  8.6366e-02,  9.2546e-03, -6.4532e-02, -2.6648e-02,
           -2

## 生成掩码 - 使得pad不参与计算

In [15]:
def sequence_mask(X,valid_len,value=0):
    """Overwrite the entries of `X` that lie beyond each row's valid length.

    For row ``i`` every position ``j >= valid_len[i]`` along axis 1 is set
    to `value`. `X` is modified in place and also returned.
    """
    num_positions = X.size(1)
    positions = torch.arange(num_positions, dtype=torch.float32, device=X.device)
    # Broadcasting a row vector against a column vector gives one boolean row
    # per sequence, e.g.
    #   a = torch.tensor([[1, 2, 3]])
    #   b = torch.tensor([[1], [2]])
    #   a < b
    keep = positions.unsqueeze(0) < valid_len.unsqueeze(1)
    X[~keep] = value
    return X


In [16]:
X = torch.tensor([[1, 2, 3], [4, 5, 6]])
sequence_mask(X, torch.tensor([1, 2]))

tensor([[1, 0, 0],
        [4, 5, 0]])

In [1]:
# Scratch cell: its execution count is 1 and the recorded result is a NameError,
# i.e. it ran before `torch` was imported. Neither tensor is read by any other
# visible cell.
a=torch.ones((8,4,2))
b=torch.rand(2)

NameError: name 'torch' is not defined

## 损失函数

In [17]:
class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    """Softmax cross-entropy loss that ignores padded positions."""

    # pred shape: (batch_size, num_steps, vocab_size)
    # label shape: (batch_size, num_steps)
    # valid_len shape: (batch_size,)
    def forward(self,pred,label,valid_len):
        """Return one loss value per sequence, with padded steps contributing 0."""
        # Keep the per-token losses so they can be masked individually.
        self.reduction = 'none'
        # Per-token weights (not learnable): 1 inside the valid length, 0 beyond it.
        token_weights = sequence_mask(torch.ones_like(label), valid_len)
        # The parent loss expects the class axis at position 1.
        per_token_loss = super().forward(pred.permute(0, 2, 1), label)
        # Average over all time steps; masked steps count as zero.
        return (per_token_loss * token_weights).mean(dim=1)


In [18]:
# Sanity check: identical scores over 10 classes give a per-token loss of ln(10) ≈ 2.3026.
# The masked loss is averaged over all 4 time steps, so valid lengths 4, 2 and 0
# yield the full value, half of it, and zero respectively.
loss = MaskedSoftmaxCELoss()
loss(torch.ones(3, 4, 10), torch.ones((3, 4), dtype=torch.long),
torch.tensor([4, 2, 0]))

tensor([2.3026, 1.1513, 0.0000])

## 训练

In [19]:
def train_seq2seq(net,device,lr,num_epochs,data_iter,tgt_vocab):
    """Train a sequence-to-sequence model with teacher forcing.

    Args:
        net: Model called as ``net(X, dec_input, X_valid_len)``; the first
            element of its return value is used as the prediction.
        device: Device the model and every batch are moved to.
        lr: Learning rate for the Adam optimizer.
        num_epochs: Number of passes over `data_iter`.
        data_iter: Iterable of batches ``(X, X_valid_len, Y, Y_valid_len)``.
        tgt_vocab: Target vocabulary; only its ``'<bos>'`` index is read.
    """
    def xavier_init_weights(m):
        # Xavier-initialize the weight matrices of Linear and GRU layers.
        if type(m) == nn.Linear:
            nn.init.xavier_uniform_(m.weight)
        if type(m) == nn.GRU:
            for param in m._flat_weights_names:
                if "weight" in param:
                    nn.init.xavier_uniform_(m._parameters[param])

    net.apply(xavier_init_weights)
    net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    net.train()
    animator = d2l.Animator(xlabel='epoch', ylabel='loss',
                            xlim=[10, num_epochs])
    for epoch in range(num_epochs):
        timer = d2l.Timer()
        metric = d2l.Accumulator(2)  # (sum of training loss, number of tokens)
        for batch in data_iter:
            optimizer.zero_grad()
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            # Teacher forcing: the decoder input is the target shifted right
            # by one position, with <bos> in front.
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                               device=device).reshape(-1, 1)
            dec_input = torch.cat([bos, Y[:, :-1]], 1)
            Y_hat, _ = net(X, dec_input, X_valid_len)
            l = loss(Y_hat, Y, Y_valid_len)
            l.sum().backward()  # back-propagate from the scalar sum of the losses
            d2l.grad_clipping(net, 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                metric.add(l.sum(), num_tokens)
        # Must stay inside the epoch loop: when dedented it ran only once,
        # after training, so the loss curve received at most a single point.
        if (epoch + 1) % 10 == 0:
            animator.add(epoch + 1, (metric[0] / metric[1],))
    # Summary for the final epoch.
    print(f'loss {metric[0] / metric[1]:.3f}, {metric[1] / timer.stop():.1f} '
          f'tokens/sec on {str(device)}')



In [21]:
A=torch.rand((2,3,4))
for a in A:
    print(a.shape)

torch.Size([3, 4])
torch.Size([3, 4])
