In [1]:
import spacy
import torch
import torch.nn as nn
 
from torchtext.datasets import Multi30k
from torchtext.data  import Field,BucketIterator

In [2]:
#创建分词器器
spacy_en=spacy.load("en_core_web_sm")#英语分词器
spacy_de=spacy.load("de_core_news_sm")#德语分词器
 
def en_seq(text):
    return [word.text for word in spacy_en.tokenizer(text)]
 
def de_seq(text):
    return [word.text for word in spacy_de.tokenizer(text)][::-1]#源端倒序
 
#源端的处理手段
SRC=Field(tokenize=de_seq,
         init_token="<sos>",
         eos_token="<eos>",
         lower=True)
 
#目标端的处理手段
TRG=Field(tokenize=en_seq,
         init_token="<sos>",
         eos_token="<eos>",
         lower=True)

In [3]:
#定义dataset数据集，这里将其数据经过fiels处理
train_data,valid_data,test_data=Multi30k.splits(exts=(".de",".en"),
                                               fields=(SRC,TRG))
 
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


In [4]:
SRC.build_vocab(train_data,min_freq=2)
TRG.build_vocab(train_data,min_freq=2)

In [5]:
BATCH_SIZE = 128
cuda=torch.cuda.is_available()
 
train_iterator, valid_iterator, test_iterator=BucketIterator.splits(
    (train_data,valid_data,test_data),
    batch_size=BATCH_SIZE,
    device=torch.device('cuda' if cuda else 'cpu')
)

In [6]:
class LuongAttention(nn.Module):
    def __init__(self, hidden_size, method="dot"):
        super(LuongAttention, self).__init__()
        self.method = method
        if method == "general":
            self.W = nn.Linear(hidden_size, hidden_size)
        
    def forward(self, h, encoder_outputs):
        # h: [batch_size, hidden_size]
        # encoder_outputs: [batch_size, seq_len, hidden_size]
        if self.method == "dot":
            score = torch.bmm(encoder_outputs, h.unsqueeze(2))  # [batch_size, seq_len, 1]
        elif self.method == "general":
            score = self.W(encoder_outputs)  # [batch_size, seq_len, hidden_size]
            score = torch.bmm(score, h.unsqueeze(2))  # [batch_size, seq_len, 1]
        
        attention_weights = torch.softmax(score, dim=1)
        context_vector = torch.bmm(encoder_outputs.transpose(1, 2), attention_weights).squeeze(2)
        return context_vector, attention_weights

In [7]:
class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.W1 = nn.Linear(hidden_size, hidden_size)
        self.W2 = nn.Linear(hidden_size, hidden_size)
        self.V = nn.Linear(hidden_size, 1)

    def forward(self, h, encoder_outputs):
        # h: [batch_size, hidden_size], decoder 的当前隐藏状态
        # encoder_outputs: [batch_size, seq_len, hidden_size], 编码器的所有输出
        h = h.unsqueeze(1)  # [batch_size, 1, hidden_size]
        score = self.V(torch.tanh(self.W1(encoder_outputs) + self.W2(h)))  # [batch_size, seq_len, 1]
        attention_weights = torch.softmax(score, dim=1)  # [batch_size, seq_len, 1]
        
        context_vector = attention_weights * encoder_outputs  # [batch_size, seq_len, hidden_size]
        context_vector = context_vector.sum(1)  # [batch_size, hidden_size]
        
        return context_vector, attention_weights

In [8]:
import torch.nn as nn

class  Encoder(nn.Module):
    #src_vocab_size德语词汇表大小，emb_model词向量维度，hidden_size隐藏向量维度，n_layers lstm深度
    def __init__(self,src_vocab_size,emb_model,hidden_size,n_layers,dropout):
        super(Encoder,self).__init__()
        
        self.embed=nn.Embedding(src_vocab_size,emb_model,padding_idx=1)
        self.lstm=nn.LSTM(input_size=emb_model,hidden_size=hidden_size,num_layers=n_layers,batch_first=True,dropout=dropout)
    
    def forward(self,src):
        #src[batch_size,seq_len]
        src=self.embed(src)
        #src[batch_size,seq_len,emb_model]
        output,(h_n,c_n)=self.lstm(src)
        #output[batch_size,seq_len,hidden_size]  最后一层每个时间步的隐状态h_t
        
        #h_n[batch_size,n_layers,hidden_size] 最后一个时间步每层的隐状态(实际上并非这样，Pytorch机制原因)
        #c_n[batch_size,n_layers,hidden_size] 最后一个时间步每层的记忆c（实际上并非这样，Pytorch机制原因）
        
        return output,(h_n,c_n)#output的意义不大，主要是(h_n,c_n)，其作为上下文向量

In [9]:
#测试，参数
emb_model=256
hidden_size=512
n_layers=4
dropout=0.5
src_vocab_size=len(SRC.vocab)

enModel=Encoder(src_vocab_size,emb_model,hidden_size,n_layers,dropout)
if(cuda):
    enModel=enModel.cuda()

In [21]:
class Decoder(nn.Module):
    #trg_vocab_size 目标端的词汇表大小
    #emb_dim为词向量维度（我们将其设置与源端一样大小）
    #hidden_size 为目标端隐层维度（将其设置为与源端一样大小）
    #n_layers 网络层数（将其设置为一样大小）
    def __init__(self,trg_vocab_size,emb_dim,hidden_size,n_layers,dropout,attention):
        super(Decoder,self).__init__()
        
        self.emb=nn.Embedding(trg_vocab_size,emb_dim,padding_idx=1)
        self.lstm=nn.LSTM(emb_dim*3,hidden_size,num_layers=n_layers,batch_first=True,dropout=dropout)
        self.classify=nn.Linear(hidden_size,trg_vocab_size)
        self.attention = attention
    
    def forward(self,trg,h_n,c_n,encoder_outputs):
        #trg为应该为[batch,seq_len]，不过实际训练中是一个一个传入（要考虑是否采用强制教学），所以seq_len为1
        #trg真正的输入维度为[batch]
        #h_n与c_n是源端的上下文向量（若计算不指定，则默认为0（若Encoder编码中））
        #维度均为：[n_layers,batch_size,hidden_size]
        trg=trg.unsqueeze(1)
        #trg[batch,1]
        embedded=self.emb(trg)
        #trg[batch,1,emb]

        # 计算注意力权重和上下文向量
        context_vector, attention_weights = self.attention(h_n[-1], encoder_outputs)
        # 将嵌入层输出和上下文向量结合起来
        lstm_input = torch.cat((embedded, context_vector.unsqueeze(1)), dim=2)


        output,(h_n,c_n)=self.lstm(lstm_input,(h_n,c_n))#这里的lstm指定了h，c，因此其内部不会自己创建一个全为0的h，c
        #output[batch,1,emb]
        #h_i[1，batch,emb]
        #c_i[1,batch,emb]
        output=self.classify(output.squeeze())#output.squeeze()使得output[batch 1 emb]->[batch emb]
        #output[batch trg_vocab_size]
        return output,(h_n,c_n) #返回(h_n,c_n)是为了下一解码器继续使用

In [22]:
#测试，参数
trg_vocab_size=len(TRG.vocab)

attention = LuongAttention(hidden_size)

Demodel=Decoder(trg_vocab_size,emb_model,hidden_size,n_layers,dropout, attention)
if cuda:
    Demodel=Demodel.cuda()

In [23]:
import random

class Seq2Seq(nn.Module):
    def __init__(self,encoder,decoder):
        super(Seq2Seq,self).__init__()
        self.encoder=encoder
        self.decoder=decoder
    
    def forward(self,src,trg,teach_rate=0.5):
        #src [bacth,seq_len]
        #trg  [bacth,seq_len]
        #teach_radio 强制教学的阈值
        batch_size=trg.shape[0]
        trg_seqlen=trg.shape[1]
        
        #保存每次输出的结果
        outputs_save=torch.zeros(batch_size,trg_seqlen,trg_vocab_size)
        if(cuda):
            outputs_save=outputs_save.cuda()
        #对源端进行编码
        encoder_outputs,(h_n,c_n)=self.encoder(src)
        
        #第一个输入到解码器中为<sos>
        trg_i=trg[:,0]
        #trg_i [batch]
        for i in range(1,trg_seqlen):
            output,(h_n,c_n)=self.decoder(trg_i,h_n,c_n,encoder_outputs)
            #output[batch trg_vocab_size]
            outputs_save[:,i,:]=output
            #产生一个随机概率(即是否强制教学)
            probability=random.random()
            
            #获取时间步预测的结果
            top=output.argmax(1)
            #top[batch]
            #下一时间步的输入
            trg_i=trg[:,i] if probability>teach_rate else top
        return outputs_save

In [24]:
model=Seq2Seq(enModel,Demodel)
if(cuda):
    model=model.cuda()

In [25]:
from torch.optim import Adam

epochs=10
optim=Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index = 1)#pad不参与损失函数的计算

In [26]:
#参数初始化
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embed): Embedding(7853, 256, padding_idx=1)
    (lstm): LSTM(256, 512, num_layers=4, batch_first=True, dropout=0.5)
  )
  (decoder): Decoder(
    (emb): Embedding(5893, 256, padding_idx=1)
    (lstm): LSTM(768, 512, num_layers=4, batch_first=True, dropout=0.5)
    (classify): Linear(in_features=512, out_features=5893, bias=True)
    (attention): LuongAttention()
  )
)

In [27]:
def train(model,train_iter,optim,criterion):
    model.train()#即dropout产生作用
    epoch_loss=0
    for i,example in enumerate(train_iter):
        src=example.src.permute(1,0)
        trg=example.trg.permute(1,0)
        #src[batch seqlen]
        #trg[batch seqlen]
 
        optim.zero_grad()
        output=model(src,trg)#output[batch trg_seqlen trq_vocab_size]
        #<sos>不参与运算，pad也不参与运算（criterion已经设置了ignore）
        output=output[:,1:,:].reshape(-1,trg_vocab_size)
        trg=trg[:,1:].reshape(-1)
        #output[batch*(trg_len-1),trg_vocab_size]
        #trg[batch*(trg_len-1)]
        loss=criterion(output,trg)
        loss.backward()
        optim.step()
        epoch_loss+=loss.item()
    return epoch_loss/len(train_iter)

In [28]:
def evaluate(model,test_iter,criterion):
    model.eval()#即dropout产生作用
    epoch_loss=0
    
    with torch.no_grad():
        for i,example in enumerate(test_iter):
            
            src=example.src.permute(1,0)
            trg=example.trg.permute(1,0)
            #src[batch seqlen]
            #trg[batch seqlen]
 
 
            #即无法在进行强制教学
            output=model(src,trg,0)#output[batch trg_seqlen trq_vocab_size]
            
            #<sos>不参与运算，pad也不参与运算（criterion已经设置了ignore）
            output=output[:,1:].reshape(-1,trg_vocab_size)
            trg=trg[:,1:].reshape(-1)
            #output[batch*(trg_len-1),trg_vocab_size]
            #trg[batch*(trg_len-1)]
            loss=criterion(output,trg)
            epoch_loss+=loss.item()
 
    return epoch_loss/len(test_iter)

In [29]:
import time,math

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

## LuongAttention

In [30]:
for epoch in range(epochs):
    start_time=time.time()
    
    train_loss=train(model,train_iterator,optim,criterion)
    valid_loss=evaluate(model,valid_iterator,criterion)
    
    end_time=time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 33s
	Train Loss: 5.150 | Train PPL: 172.405
	 Val. Loss: 4.874 |  Val. PPL: 130.809
Epoch: 02 | Time: 0m 34s
	Train Loss: 4.884 | Train PPL: 132.124
	 Val. Loss: 4.295 |  Val. PPL:  73.357
Epoch: 03 | Time: 0m 34s
	Train Loss: 4.637 | Train PPL: 103.183
	 Val. Loss: 4.059 |  Val. PPL:  57.926
Epoch: 04 | Time: 0m 35s
	Train Loss: 4.535 | Train PPL:  93.189
	 Val. Loss: 4.006 |  Val. PPL:  54.935
Epoch: 05 | Time: 0m 38s
	Train Loss: 4.439 | Train PPL:  84.725
	 Val. Loss: 3.920 |  Val. PPL:  50.423
Epoch: 06 | Time: 0m 38s
	Train Loss: 4.151 | Train PPL:  63.473
	 Val. Loss: 3.575 |  Val. PPL:  35.694
Epoch: 07 | Time: 0m 35s
	Train Loss: 3.924 | Train PPL:  50.624
	 Val. Loss: 3.355 |  Val. PPL:  28.653
Epoch: 08 | Time: 0m 35s
	Train Loss: 3.719 | Train PPL:  41.240
	 Val. Loss: 3.177 |  Val. PPL:  23.981
Epoch: 09 | Time: 0m 35s
	Train Loss: 3.541 | Train PPL:  34.502
	 Val. Loss: 3.044 |  Val. PPL:  20.999
Epoch: 10 | Time: 0m 35s
	Train Loss: 3.389 | Train PPL

## BahdanauAttention

In [34]:
attention = BahdanauAttention(hidden_size)

Demodel=Decoder(trg_vocab_size,emb_model,hidden_size,n_layers,dropout, attention)
if cuda:
    Demodel=Demodel.cuda()

model=Seq2Seq(enModel,Demodel)
if(cuda):
    model=model.cuda()

epochs=10
optim=Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index = 1)#pad不参与损失函数的计算

#参数初始化
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

for epoch in range(epochs):
    start_time=time.time()
    
    train_loss=train(model,train_iterator,optim,criterion)
    valid_loss=evaluate(model,valid_iterator,criterion)
    
    end_time=time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 42s
	Train Loss: 5.144 | Train PPL: 171.460
	 Val. Loss: 4.881 |  Val. PPL: 131.774
Epoch: 02 | Time: 0m 42s
	Train Loss: 4.834 | Train PPL: 125.772
	 Val. Loss: 4.297 |  Val. PPL:  73.499
Epoch: 03 | Time: 0m 43s
	Train Loss: 4.437 | Train PPL:  84.535
	 Val. Loss: 3.865 |  Val. PPL:  47.704
Epoch: 04 | Time: 0m 43s
	Train Loss: 4.200 | Train PPL:  66.672
	 Val. Loss: 3.655 |  Val. PPL:  38.673
Epoch: 05 | Time: 0m 44s
	Train Loss: 4.036 | Train PPL:  56.574
	 Val. Loss: 3.532 |  Val. PPL:  34.182
Epoch: 06 | Time: 0m 44s
	Train Loss: 3.922 | Train PPL:  50.493
	 Val. Loss: 3.429 |  Val. PPL:  30.843
Epoch: 07 | Time: 0m 45s
	Train Loss: 3.811 | Train PPL:  45.182
	 Val. Loss: 3.331 |  Val. PPL:  27.970
Epoch: 08 | Time: 0m 44s
	Train Loss: 3.706 | Train PPL:  40.675
	 Val. Loss: 3.251 |  Val. PPL:  25.815
Epoch: 09 | Time: 0m 44s
	Train Loss: 3.607 | Train PPL:  36.860
	 Val. Loss: 3.178 |  Val. PPL:  23.995
Epoch: 10 | Time: 0m 43s
	Train Loss: 3.522 | Train PPL

In [2]:
def translate(model, src_sentence, SRC, TRG, device="cuda"):
    model.eval()
    with torch.no_grad():
        # Tokenize sentence if not already.
        if isinstance(src_sentence, str):
            spacy_de = spacy.load("de_core_news_sm")
            tokens = [tok.text.lower() for tok in spacy_de(src_sentence)]
        else:
            tokens = [tok.lower() for tok in src_sentence]

        # Add <sos> and <eos> tokens
        tokens = [SRC.init_token] + tokens + [TRG.eos_token]
        
        # Convert to indices
        src_indexes = [SRC.vocab.stoi[token] for token in tokens]
        src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
        
        # Forward pass through the encoder
        encoder_outputs, hidden = model.encoder(src_tensor)
        
        # First input to the decoder is the <sos> token
        trg_indexes = [TRG.vocab.stoi[TRG.init_token]]
        
        # Initial context is the last hidden state from the encoder
        context=hidden.permute(1,0,2)

        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
        input=trg_tensor[:,0]

        # Begin decoding
        for i in range(100):  # assuming maximum length of the translated sentence is 100 tokens
            
            
            # Compute the attention and output from decoder
            output, hidden = model.decoder(input, context, hidden, encoder_outputs)
            
            # Select the token with the highest probability
            pred_token = output.argmax(1).item()
            trg_indexes.append(pred_token)

            # If <eos> token is generated, stop
            if pred_token == TRG.vocab.stoi[TRG.eos_token]:
                break

        # Convert indices to words
        trg_tokens = [TRG.vocab.itos[i] for i in trg_indexes]

        # Return the generated tokens, removing the initial <sos>
        return trg_tokens[1:]

In [3]:
from torchtext.data.metrics import bleu_score

def calculate_bleu(data, src_field, trg_field, model, device, max_len = 50):

    trgs = []
    pred_trgs = []

    for datum in data:

        src = vars(datum)['src']
        trg = vars(datum)['trg']

        pred_trg, _ = translate(model, src, src_field, trg_field, device)

        #cut off <eos> token
        pred_trg = pred_trg[:-1]

        pred_trgs.append(pred_trg)
        trgs.append([trg])

    return bleu_score(pred_trgs, trgs)
bleu_score = calculate_bleu(test_data, SRC, TRG, model, 'cuda')
print(f"BLEU score: {bleu_score:.2f}")

BLEU score = 37.75
