In [1]:
import torch
import spacy
 
from torchtext.data import Field,BucketIterator
from torchtext.datasets import Multi30k

In [2]:
cuda=torch.cuda.is_available()
device=torch.device("cuda" if cuda else "cpu")

In [3]:
spacy_de=spacy.load("de_core_news_sm")
spacy_en=spacy.load("en_core_web_sm")
 
def de_seq(text):
    #return [word.text for word in spacy_de.tokenizer(text)][::-1]这里并未进行倒序
    return [word.text for word in spacy_de.tokenizer(text)]
 
def en_seq(text):
    return [word.text for word in spacy_en.tokenizer(text)]
 
SRC=Field(tokenize=de_seq,
         init_token="<sos>",
         eos_token="<eos>",
         lower=True)
TRG=Field(tokenize=en_seq,
         init_token="<sos>",
         eos_token="<eos>",
         lower=True)

In [4]:
train_data,val_data,test_data=Multi30k.splits(exts=(".de",".en"),
                                             fields=(SRC,TRG))

In [5]:
SRC.build_vocab(train_data,min_freq=2)
TRG.build_vocab(train_data,min_freq=2)

In [6]:
batch_size=128
 
train_iter,val_iter,test_iter=BucketIterator.splits(
    (train_data,val_data,test_data),
    batch_size=batch_size,
    device=device
)

In [7]:
import torch.nn as nn
import random

In [8]:
class LuongAttention(nn.Module):
    def __init__(self, hidden_size, method="dot"):
        super(LuongAttention, self).__init__()
        self.method = method
        if method == "general":
            self.W = nn.Linear(hidden_size, hidden_size)
        
    def forward(self, h, encoder_outputs):
        # h: [batch_size, hidden_size]
        # encoder_outputs: [batch_size, seq_len, hidden_size]
        if self.method == "dot":
            score = torch.bmm(encoder_outputs, h.unsqueeze(2))  # [batch_size, seq_len, 1]
        elif self.method == "general":
            score = self.W(encoder_outputs)  # [batch_size, seq_len, hidden_size]
            score = torch.bmm(score, h.unsqueeze(2))  # [batch_size, seq_len, 1]
        
        attention_weights = torch.softmax(score, dim=1)
        context_vector = torch.bmm(encoder_outputs.transpose(1, 2), attention_weights).squeeze(2)
        return context_vector, attention_weights

In [9]:
class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.W1 = nn.Linear(hidden_size, hidden_size)
        self.W2 = nn.Linear(hidden_size, hidden_size)
        self.V = nn.Linear(hidden_size, 1)

    def forward(self, h, encoder_outputs):
        # h: [batch_size, hidden_size], decoder 的当前隐藏状态
        # encoder_outputs: [batch_size, seq_len, hidden_size], 编码器的所有输出
        h = h.unsqueeze(1)  # [batch_size, 1, hidden_size]
        score = self.V(torch.tanh(self.W1(encoder_outputs) + self.W2(h)))  # [batch_size, seq_len, 1]
        attention_weights = torch.softmax(score, dim=1)  # [batch_size, seq_len, 1]
        
        context_vector = attention_weights * encoder_outputs  # [batch_size, seq_len, hidden_size]
        context_vector = context_vector.sum(1)  # [batch_size, hidden_size]
        
        return context_vector, attention_weights

In [10]:
class Encoder(nn.Module):
    def __init__(self,src_vocab_size,embed_size,hidden_size,dropout=0.5):
        #src_vocab_size 为德语词库的大小 enbed_size,词向量大小
        #hidden_size 隐藏状态的维度
        
        super(Encoder,self).__init__()
        self.embedding=nn.Embedding(src_vocab_size,embed_size,padding_idx=1)
        self.rnn=nn.GRU(embed_size,hidden_size,batch_first=True)
        self.dropout=nn.Dropout(dropout)
    
    def forward(self,src):
        #src [batch seq_len]
        x_embeding=self.dropout(self.embedding(src))
        #x_embeding [batch seq_len embed_size]
        encoder_outputs,h_n=self.rnn(x_embeding)
        #h_n也是我们要的上下文向量，其为最后一个时间步各层的输出，即[n_layers,batch_size,hidden_size]
        #这里就是[1,batch_size,hidden_size]
        return encoder_outputs, h_n

In [11]:
#测试，也是最终模型的参数
src_vocab_size=len(SRC.vocab)
trg_vocab_size=len(TRG.vocab)
embed_size=256
hidden_size=512

enModel=Encoder(src_vocab_size,embed_size,hidden_size)
if cuda:
    enModel=enModel.cuda()

In [12]:
class Decoder(nn.Module):
    def __init__(self,trg_vocab_size,embed_size,hidden_size,dropout=0.5,attention=None):
        super(Decoder,self).__init__()
        #trg_vocab_size为英语词库大小 embed_size词向量维度
        #hidden_size隐状态大小
        self.embedding=nn.Embedding(trg_vocab_size,embed_size,padding_idx=1)
        #输入多了原始上下文向量，这里我们将源端和目标端的隐状态维度都默认为hidden_size
        self.rnn=nn.GRU(embed_size+hidden_size,hidden_size,batch_first=True)
        self.classify=nn.Linear(embed_size+hidden_size*2,trg_vocab_size)
        self.dropout=nn.Dropout(dropout)
        self.attention=attention
        
    def forward(self,trg_i,context,h_n,encoder_outputs):
        #trg_i为某一时间步词的输入，[bacth_size]
        #context为原始上下文向量，[bacth_size,1,hidden_size]
        #h_n为上一时间布的隐状态[1，batch_size，hidden_size]
        trg_i=trg_i.unsqueeze(1)
        #trg_i[bacth_size,1]
        trg_i_embed=self.dropout(self.embedding(trg_i))
        #trg_i_embed [bacth_size,1,embed_size]
        
        context_vector, _ = self.attention(h_n[-1], encoder_outputs)
        rnn_input = torch.cat((trg_i_embed, context_vector.unsqueeze(1)), dim=2)


        #输入rnn模块的不仅仅只有词嵌入和上一时间步的隐状态，还有原始上下向量
        input=torch.cat((trg_i_embed,context),dim=2)
        #input[bacth_size,1,embed_size+hidden_size]
        output,h_n=self.rnn(rnn_input,h_n)
        #output[batch_size,1,hidden_size]
        #h_n[1,batch_size,hidden_size]
        
        
        #原本rnn模型的输入直接带入线性分类层映射到英语空间中，这里新添原始词嵌入和原始上下文向量，即上面的input
        input=rnn_input.squeeze()
        output=output.squeeze()
        #input[bacth_size embed_size+hidden_size]
        #output[batch_szie hidden_size]
        input=torch.cat((input,output),dim=1)
        output=self.classify(input)
        #output[bacth trg_vocab_size]
        return output,h_n

In [14]:
attention = LuongAttention(hidden_size)

deModel=Decoder(trg_vocab_size,embed_size,hidden_size,attention=attention)
if cuda:
    deModel=deModel.cuda()

In [15]:
#这一步其实就整合上面我们的测试
class Seq2Seq(nn.Module):
    def __init__(self,encoder,decoder):
        super(Seq2Seq,self).__init__()
        self.encoder=encoder
        self.decoder=decoder
        
    def forward(self,src,trg,teach_threshold=0.5):
        #src[batch seq_len]
        #trg[bacth seq_len]
        #teacher_rate 进行forcing teaching 的阈值
        trg_seq_len=trg.shape[1]
        #这一步很重要，最后一个batch大小没有batch_size大小
        batch=trg.shape[0]
        
        #设置一个tensor保存所有预测结果
        outputs_save=torch.zeros(batch,trg_seq_len,trg_vocab_size)
        if cuda:
            outputs_save=outputs_save.cuda()
        
        #获取编码层的输出
        encoder_outputs, h_n=self.encoder(src)
        #h_n[1,batch_size,hidden_size]
        context=h_n.permute(1,0,2)
        #context[batch_size,1,hidden_size]
        input=trg[:,0]
        
        #遍历每个英语的输入，代入翻译
        for t in range(1,trg_seq_len):
            #将每个时间步词典代入计算
            output,h_n=self.decoder(input,context,h_n,encoder_outputs)
            #保存rnn模型经过线性分类层的输出
            outputs_save[:,t,:]=output
            probability=random.random()
            #是否采用强制教学
            input=trg[:,t] if probability<teach_threshold else output.argmax(1)
        return outputs_save

In [16]:
model=Seq2Seq(enModel,deModel)
if cuda:
    model=model.cuda()

In [17]:
import time,math
from torch.optim import Adam

In [18]:
epochs=10
criterion=nn.CrossEntropyLoss(ignore_index=1)
model = Seq2Seq(enModel, deModel)
optim=Adam(model.parameters())

In [19]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.01)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256, padding_idx=1)
    (rnn): GRU(256, 512, batch_first=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256, padding_idx=1)
    (rnn): GRU(768, 512, batch_first=True)
    (classify): Linear(in_features=1280, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
    (attention): LuongAttention()
  )
)

In [20]:
def train(model,train_iter,criterion,optim):
    model.train()
    lossAll=0
    for example in train_iter:
        src=example.src.permute(1,0)
        trg=example.trg.permute(1,0)
        #src[batch seq_len]
        #trg[batch seq_len]
        
        optim.zero_grad()
        output=model(src,trg)
        #output[batch seq_len trg_vocab_size]
        trg_vocab_size=output.shape[-1]
        output=output[:,1:,:].reshape(-1,trg_vocab_size)
        #output[bacth*(seq_len-1),trg_vocab_size]
        trg=trg[:,1:].reshape(-1)
        #trg[bacth*(seq_len-1)]
        
        loss=criterion(output,trg)
        loss.backward()
        optim.step()
        
        lossAll+=loss.item()
    return lossAll/len(train_iter)

In [21]:
def evaluate(model,val_iter,criterion):
    model.eval()
    lossAll=0
    
    with torch.no_grad():
        for example in val_iter:
            src=example.src.permute(1,0)
            trg=example.trg.permute(1,0)
            #src[batch seq_len]
            #trg[batch seq_len]
            
            output=model(src,trg)
            #output[batch seq_len trg_vocab_size]
            trg_vocab_size=output.shape[-1]
            output=output[:,1:,:].reshape(-1,trg_vocab_size)
            #output[bacth*(seq_len-1),trg_vocab_size]
            trg=trg[:,1:].reshape(-1)
            #trg[bacth*(seq_len-1)]
            loss=criterion(output,trg)
            lossAll+=loss.item()
 
    return lossAll/len(val_iter)

In [22]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

## LuongAttention

In [61]:
for epoch in range(epochs):
    start_time = time.time()
    train_loss = train(model,train_iter,criterion,optim)
    valid_loss = evaluate(model,val_iter,criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 33s
	Train Loss: 1.597 | Train PPL:   4.937
	 Val. Loss: 2.380 |  Val. PPL:  10.803
Epoch: 02 | Time: 0m 33s
	Train Loss: 1.500 | Train PPL:   4.481
	 Val. Loss: 2.307 |  Val. PPL:  10.046
Epoch: 03 | Time: 0m 33s
	Train Loss: 1.399 | Train PPL:   4.050
	 Val. Loss: 2.438 |  Val. PPL:  11.449
Epoch: 04 | Time: 0m 34s
	Train Loss: 1.336 | Train PPL:   3.805
	 Val. Loss: 2.449 |  Val. PPL:  11.581
Epoch: 05 | Time: 0m 35s
	Train Loss: 1.261 | Train PPL:   3.529
	 Val. Loss: 2.445 |  Val. PPL:  11.531
Epoch: 06 | Time: 0m 35s
	Train Loss: 1.195 | Train PPL:   3.304
	 Val. Loss: 2.516 |  Val. PPL:  12.379
Epoch: 07 | Time: 0m 35s
	Train Loss: 1.095 | Train PPL:   2.990
	 Val. Loss: 2.695 |  Val. PPL:  14.813
Epoch: 08 | Time: 0m 35s
	Train Loss: 1.057 | Train PPL:   2.878
	 Val. Loss: 2.505 |  Val. PPL:  12.246
Epoch: 09 | Time: 0m 36s
	Train Loss: 1.033 | Train PPL:   2.810
	 Val. Loss: 2.633 |  Val. PPL:  13.909
Epoch: 10 | Time: 0m 36s
	Train Loss: 0.978 | Train PPL

## BahdanauAttention

In [62]:
attention = BahdanauAttention(hidden_size)

deModel=Decoder(trg_vocab_size,embed_size,hidden_size,attention=attention)
if cuda:
    deModel=deModel.cuda()

model=Seq2Seq(enModel,deModel)
if cuda:
    model=model.cuda()

epochs=10
criterion=nn.CrossEntropyLoss(ignore_index=1)
model = Seq2Seq(enModel, deModel)
optim=Adam(model.parameters())

def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.01)
        
model.apply(init_weights)

for epoch in range(epochs):
    start_time = time.time()
    train_loss = train(model,train_iter,criterion,optim)
    valid_loss = evaluate(model,val_iter,criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 35s
	Train Loss: 5.047 | Train PPL: 155.493
	 Val. Loss: 4.466 |  Val. PPL:  87.014
Epoch: 02 | Time: 0m 36s
	Train Loss: 4.377 | Train PPL:  79.565
	 Val. Loss: 4.079 |  Val. PPL:  59.076
Epoch: 03 | Time: 0m 36s
	Train Loss: 4.029 | Train PPL:  56.193
	 Val. Loss: 3.834 |  Val. PPL:  46.255
Epoch: 04 | Time: 0m 36s
	Train Loss: 3.585 | Train PPL:  36.061
	 Val. Loss: 3.180 |  Val. PPL:  24.035
Epoch: 05 | Time: 0m 36s
	Train Loss: 3.096 | Train PPL:  22.118
	 Val. Loss: 2.796 |  Val. PPL:  16.376
Epoch: 06 | Time: 0m 35s
	Train Loss: 2.706 | Train PPL:  14.962
	 Val. Loss: 2.629 |  Val. PPL:  13.861
Epoch: 07 | Time: 0m 36s
	Train Loss: 2.437 | Train PPL:  11.438
	 Val. Loss: 2.516 |  Val. PPL:  12.379
Epoch: 08 | Time: 0m 36s
	Train Loss: 2.204 | Train PPL:   9.064
	 Val. Loss: 2.533 |  Val. PPL:  12.586
Epoch: 09 | Time: 0m 36s
	Train Loss: 1.988 | Train PPL:   7.300
	 Val. Loss: 2.313 |  Val. PPL:  10.105
Epoch: 10 | Time: 0m 36s
	Train Loss: 1.841 | Train PPL

In [55]:
def translate(model, src_sentence, SRC, TRG, device="cuda"):
    model.eval()
    with torch.no_grad():
        # Tokenize sentence if not already.
        if isinstance(src_sentence, str):
            spacy_de = spacy.load("de_core_news_sm")
            tokens = [tok.text.lower() for tok in spacy_de(src_sentence)]
        else:
            tokens = [tok.lower() for tok in src_sentence]

        # Add <sos> and <eos> tokens
        tokens = [SRC.init_token] + tokens + [TRG.eos_token]
        
        # Convert to indices
        src_indexes = [SRC.vocab.stoi[token] for token in tokens]
        src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
        
        # Forward pass through the encoder
        encoder_outputs, hidden = model.encoder(src_tensor)
        
        # First input to the decoder is the <sos> token
        trg_indexes = [TRG.vocab.stoi[TRG.init_token]]
        
        # Initial context is the last hidden state from the encoder
        context=hidden.permute(1,0,2)

        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
        input=trg_tensor[:,0]

        # Begin decoding
        for i in range(100):  # assuming maximum length of the translated sentence is 100 tokens
            
            
            # Compute the attention and output from decoder
            output, hidden = model.decoder(input, context, hidden, encoder_outputs)
            
            # Select the token with the highest probability
            pred_token = output.argmax(1).item()
            trg_indexes.append(pred_token)

            # If <eos> token is generated, stop
            if pred_token == TRG.vocab.stoi[TRG.eos_token]:
                break

        # Convert indices to words
        trg_tokens = [TRG.vocab.itos[i] for i in trg_indexes]

        # Return the generated tokens, removing the initial <sos>
        return trg_tokens[1:]


In [60]:
from torchtext.data.metrics import bleu_score

def calculate_bleu(data, src_field, trg_field, model, device, max_len = 50):

    trgs = []
    pred_trgs = []

    for datum in data:

        src = vars(datum)['src']
        trg = vars(datum)['trg']

        pred_trg, _ = translate(model, src, src_field, trg_field, device)

        #cut off <eos> token
        pred_trg = pred_trg[:-1]

        pred_trgs.append(pred_trg)
        trgs.append([trg])

    return bleu_score(pred_trgs, trgs)
bleu_score = calculate_bleu(test_data, SRC, TRG, model, device)
print(f"BLEU score: {bleu_score:.2f}")

BLEU score = 35.25
