In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import io
import unicodedata
import string
import re
import random
import os
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from torch.utils.data import Dataset
from torch.optim import lr_scheduler
import itertools
import glob
plt.switch_backend('agg')
import matplotlib.ticker as ticker
from sacrebleu import corpus_bleu
import sacrebleu

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
import pdb

In [3]:
batch_size = 32
words_to_load = 100000
SOS_token = 0
EOS_token = 1
PAD_token = 2
UNK_token = 3
LR_RATE = 0.001
MAX_LENGTH = 40
hidden_size = 300
teacher_forcing_ratio = 0.5

__Preprocess Data__

In [4]:
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS", 2: "<pad>", 3: "<unk>"}
        self.n_words = 4  # Count SOS, EOS, pad and unk

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [5]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def normalizeZh(s):
    s = s.strip()
    s = re.sub("\s+", " ", s)
    return s

In [6]:
def filterPair(p):
    filtered = []
    for i in p:
        filtered.append(' '.join(i.split()[:MAX_LENGTH-1]))
    return filtered

def filterPairs(pairs):
    return [filterPair(pair) for pair in pairs]

In [7]:
def readLangs(dataset, lang1, lang2):
    chinese = os.getcwd()+'/iwslt-zh-en/{}.tok.{}'.format(dataset, lang1)
    english = os.getcwd()+'/iwslt-zh-en/{}.tok.{}'.format(dataset, lang2)

    chinese_lines = open(chinese, encoding='utf-8').read().strip().split('\n')
    english_lines = open(english, encoding='utf-8').read().strip().split('\n')
    length = len(chinese_lines)

    pairs = [[normalizeZh(chinese_lines[i]), normalizeString(english_lines[i])] for i in range(length)]
    pairs = filterPairs(pairs)
    
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])

    return input_lang, output_lang, pairs

In [8]:
train_input_lang, train_output_lang, train_pairs = readLangs('train', 'zh', 'en')
val_input_lang, val_output_lang, val_pairs = readLangs('dev', 'zh', 'en')
test_input_lang, test_output_lang, test_pairs = readLangs('test', 'zh', 'en')

In [9]:
train_input_lang.n_words

86784

In [10]:
train_output_lang.n_words

49924

In [11]:
len(train_pairs)  # 6621 batch

213376

In [12]:
len(val_pairs)  # 39 batch

1261

In [13]:
len(test_pairs)

1397

__Embedding__

In [10]:
def load_embedding(ft_path, words_to_load):
    fin = io.open(ft_path, 'r', encoding='utf-8', newline='\n', errors='ignore')

    n, d = map(int, fin.readline().split())
    vocab_size = words_to_load + 4
    embedding_dim = d

    embedding_mat = np.zeros((vocab_size, embedding_dim))
    token2id = {}
    id2token = {}
    all_tokens = ['SOS', 'EOS', '<unk>', '<pad>']

    for i, line in enumerate(fin):
        if i >= words_to_load:
            break
        s = line.rstrip().split(' ')
        embedding_mat[i+4, :] = np.asarray(s[1:])
        token2id[s[0]] = i+4
        id2token[i+4] = s[0]
        all_tokens.append(s[0])

    token2id['<pad>'] = PAD_token 
    token2id['<unk>'] = UNK_token
    token2id['SOS'] = SOS_token
    token2id['EOS'] = EOS_token
    id2token[PAD_token] = '<pad>'
    id2token[UNK_token] = '<unk>'
    id2token[SOS_token] = 'SOS'
    id2token[EOS_token] = 'EOS'
    embedding_mat[PAD_token, :] = np.zeros((1,d))
    #generate normal dist 1d array for UNK, SOS, EOS token
    embedding_mat[UNK_token, :] = np.random.normal(size=d)
    embedding_mat[SOS_token, :] = np.random.normal(size=d)
    embedding_mat[EOS_token, :] = np.random.normal(size=d)
        
    return embedding_mat, all_tokens, token2id, id2token

In [11]:
fname_zh = os.getcwd()+'/wiki.zh.vec'
fname_eng = '/'.join(os.getcwd().split('/')[:-1])+'/hw2/wiki-news-300d-1M.vec'
embedding_mat_zh, all_tokens_zh, token2id_zh, id2token_zh = load_embedding(fname_zh, words_to_load)
embedding_mat_en, all_tokens_en, token2id_en, id2token_en = load_embedding(fname_eng, words_to_load)

In [14]:
embedding_mat_zh.shape

(100004, 300)

In [15]:
embedding_mat_en.shape

(100004, 300)

__Data Loader__

In [12]:
class NMTDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, input_lang, output_lang, pairs):
        """
        @param data_list_1: list of sentence 1 tokens 
        @param data_list_2: list of sentence 2 tokens
        @param target_list: list of review targets 

        """
        self.input_w2i = input_lang
        self.output_w2i = output_lang
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        input_sentence = self.pairs[key][0]
        input_indexes = [self.input_w2i[word] if word in self.input_w2i else UNK_token for word in input_sentence.split(' ')]
        input_indexes.append(EOS_token)
        input_length = len(input_indexes)

        output_sentence = self.pairs[key][1]
        output_indexes = [self.output_w2i[word] if word in self.output_w2i else UNK_token for word in output_sentence.split(' ')]
        output_indexes.append(EOS_token)
        output_length = len(output_indexes)
        return [input_indexes, input_length, output_indexes, output_length]

    
def NMTDataset_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    input_ls = []
    output_ls = []
    input_length_ls = []
    output_length_ls = []
    
    for datum in batch:
        input_length_ls.append(datum[1])
        output_length_ls.append(datum[3])
    
    #find max length in each batch
    max_input = sorted(input_length_ls)[-1]
    max_output = sorted(output_length_ls)[-1]
    
    # padding
    for datum in batch:
        padded_vec_input = np.pad(np.array(datum[0]), 
                                  pad_width=((0,MAX_LENGTH-datum[1])), 
                                  mode="constant", constant_values=2).tolist()
        padded_vec_output = np.pad(np.array(datum[2]), 
                                   pad_width=((0,MAX_LENGTH-datum[3])), 
                                   mode="constant", constant_values=2).tolist()
        input_ls.append(padded_vec_input)
        output_ls.append(padded_vec_output)
    return [torch.tensor(torch.from_numpy(np.array(input_ls)), device=device), 
            torch.tensor(input_length_ls, device=device), 
            torch.tensor(torch.from_numpy(np.array(output_ls)), device=device), 
            torch.tensor(output_length_ls, device=device)]

In [13]:
# create pytorch dataloader
train_dataset = NMTDataset(token2id_zh, token2id_en, train_pairs)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size,
                                           collate_fn=NMTDataset_collate_func,
                                           shuffle=True,
                                           drop_last=True)

val_dataset = NMTDataset(token2id_zh, token2id_en, val_pairs)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                         batch_size=batch_size,
                                         collate_fn=NMTDataset_collate_func,
                                         shuffle=False,
                                         drop_last=True)

In [70]:
next(iter(val_loader))[0].shape

torch.Size([32, 40])

In [26]:
for i in val_loader:
#     print(i)
    if i[0].shape[1] != 40:
        print(True)
#     print(i[0].shape)
#     for ind in i[2]:
#         for token in ind:
#             print(token)
#             print(train_output_lang.index2word[token.item()])
#     for ind in i[2]:
#         print(' '.join(train_output_lang.index2word[token.item()] for token in ind))
#     print([train_output_lang.index2word[token.item()] for ind in i[2] for token in ind])
#     break

__Encoder__

In [14]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        
        embed_mat = torch.from_numpy(embedding_mat_zh).float()
        n, embed_dim = embed_mat.shape
#         mask = np.zeros((n,1))
#         mask[0] = 1
#         mask[1] = 1
#         mask[2] = 1
#         mask[3] = 1
#         mask = torch.from_numpy(mask).float()
#         self.mask_embedding = nn.Embedding.from_pretrained(mask, freeze = False)
        self.embedding = nn.Embedding.from_pretrained(embed_mat, freeze = True)
        
        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True)

    def forward(self, input, input_len, hidden):
        # Compute sorted sequence lengths
#         _, idx_sort = torch.sort(input_len, dim=0, descending=True)
#         _, idx_unsort = torch.sort(idx_sort, dim=0)
        
        # get embedding of characters
        embed = self.embedding(input)
#         mask = self.mask_embedding(input)
        
#         embedded = mask*embed + (1-mask)*embed.clone().detach()
        embedded = embed
        output, hidden = self.gru(embedded, hidden)
        
        # Sort embedding and length
#         embedded = embedded.index_select(0, idx_sort)
#         input_len = input_len.index_select(0, idx_sort)
        
#         packed_emb = nn.utils.rnn.pack_padded_sequence(embedded, input_len.cpu().numpy(), batch_first=True)
#         packed_output, hidden = self.gru(packed_emb, hidden)
#         output, output_lens =  nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        
        # Unsort output and last hidden unit
#         output = output.index_select(0, idx_unsort)
        
        return output, hidden

    def initHidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [15]:
class EncoderCNN(nn.Module):
    def __init__(self, hidden_size, kernel_dim, batch_size):
        super(EncoderCNN, self).__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        embed_mat = torch.from_numpy(embedding_mat_zh).float()
        n, embed_dim = embed_mat.shape
#         mask = np.zeros((n,1))
#         mask[0] = 1
#         mask[1] = 1
#         mask[2] = 1
#         mask[3] = 1
#         mask = torch.from_numpy(mask).float()
#         self.mask_embedding = nn.Embedding.from_pretrained(mask, freeze = False)
        self.embedding = nn.Embedding.from_pretrained(embed_mat, freeze = True)

        self.conv1 = nn.Conv1d(embed_dim, hidden_size*2, kernel_size=kernel_dim, padding=1)
        self.conv2 = nn.Conv1d(hidden_size*2, hidden_size*2, kernel_size=kernel_dim, padding=1)
        self.linear1 = nn.Linear(hidden_size*2, hidden_size)

    def forward(self, input):
        # get embedding of words
        embed = self.embedding(input)
#         mask = self.mask_embedding(input)
        
#         embedded = mask*embed + (1-mask)*embed.clone().detach()
        embedded = embed
    
        # perform convolution 1
        hidden = self.conv1(embedded.transpose(1,2)).transpose(1,2)
        hidden = F.relu(hidden)
#         hidden = F.relu(hidden.contiguous().view(-1, hidden.size(-1))).view(batch_size, hidden.size(1), hidden.size(-1))

        # perform convolution 2
        hidden = self.conv2(hidden.transpose(1,2)).transpose(1,2)
        hidden = F.relu(hidden)
#         hidden = F.relu(hidden.contiguous().view(-1, hidden.size(-1))).view(batch_size, hidden.size(1), hidden.size(-1))

        hidden,_ = hidden.max(dim=1)
        out = self.linear1(hidden)
        out = out.view(1,out.size(0),out.size(1))
        
        return out

__Decoder Without Attention__

In [35]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        
        embed_mat = torch.from_numpy(embedding_mat_en).float()
        n, embed_dim = embed_mat.shape
#         mask = np.zeros((n,1))
#         mask[0] = 1
#         mask[1] = 1
#         mask[2] = 1
#         mask[3] = 1
#         mask = torch.from_numpy(mask).float()
#         self.mask_embedding = nn.Embedding.from_pretrained(mask, freeze = False)
        self.embedding = nn.Embedding.from_pretrained(embed_mat, freeze = True)
        
        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, word_input, hidden):
        # get embedding of words
        embed = self.embedding(word_input)
#         mask = self.mask_embedding(word_input)
        
#         embedded = mask*embed + (1-mask)*embed.clone().detach()
        embedded = embed
#         print(embedded.size())
    
        output, hidden = self.gru(embedded, hidden)
        
        # Final output layer
        output = output.squeeze(1) # B x N
        output = self.linear(output)
        output = self.softmax(output)

        return output, hidden

#### Decoder with Attention

In [15]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, num_layers=1, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers 
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        embed_mat = torch.from_numpy(embedding_mat_zh).float()
        n, embed_dim = embed_mat.shape
        
        self.embedding = nn.Embedding.from_pretrained(embed_mat, freeze=True)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)

        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden, encoder_outputs):
        
        ### embed: [batch size * 1 * emb_dim = 300 ] ###
        ### hidden: [batch size * 1 * hidden_size = 300 ] ###
        ### encoder_outputs: [batch size * max_sentence_length_zh * hidden_size = 300 ] ###
        ### 因为这里concat之后，attn layer 他给的是 hidden size *2 
        ### 所以我这儿的hidden size就只能写300了 
        
        embed = self.embedding(input)
        embed = self.dropout(embed)   
        
        
        ### torch.cat((embed, hidden), 2)  
        ### [batch size * 1 * (emb_dim + hidden_size) ]
        
        ### attn_weights: [batch size * 1 * max_sentence_length_zh ]###
        
        ### softmax dim=2 因为最后一个dimension是 词组什么的，不能是1，1的话就是
        ### 不同batch间这样比较了？
        
        attn_weights = F.softmax(self.attn(torch.cat((embed, hidden.squeeze(0).unsqueeze(1)), 2)), dim=2)
        

        ### torch.bmm(attn_weights[0].unsqueeze(1),encoder_outputs).squeeze(1) :
        ### [batch size * 1 * hidden_size ]###

        ### attn_applied: [batch size * hidden_size (= 300) ] ###
#         pdb.set_trace()
#         print(attn_weights.size())
#         print(encoder_outputs.size())
        attn_applied = torch.bmm(attn_weights, encoder_outputs).squeeze(1)
        
        ### output: [batch size * hidden_size (= 300) ] ###
        ### embed[0]: [batch size * hidden_size (= 300) ] ###

        output = torch.cat((embed.squeeze(1), attn_applied), 1)
 
        ### output: [batch size * 1 * hidden_size (= 300) ] ###
        output = self.attn_combine(output).unsqueeze(1)
        
        ### output: [batch size * 1 * hidden_size (= 300) ] ###
        output = F.relu(output)
        
        ### output: [batch size * 1 * hidden_size (= 300) ] ###
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output.squeeze(1)))
        
        return output, hidden, attn_weights

    def initHidden(self,batch_size):
        return torch.randn(self.num_layers, batch_size, self.hidden_size,device=device)

In [46]:
# class Attn(nn.Module):
#     def __init__(self, method, hidden_size):
#         super(Attn, self).__init__()
#         self.method = method
#         self.hidden_size = hidden_size
#         self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
#         self.v = nn.Parameter(torch.rand(hidden_size))
#         stdv = 1. / math.sqrt(self.v.size(0))
#         self.v.data.normal_(mean=0, std=stdv)

#     def forward(self, hidden, encoder_outputs, src_len=None):
#         '''
#         :param hidden: 
#             previous hidden state of the decoder, in shape (layers*directions,B,H)
#         :param encoder_outputs:
#             encoder outputs from Encoder, in shape (T,B,H)
#         :param src_len:
#             used for masking. NoneType or tensor in shape (B) indicating sequence length
#         :return
#             attention energies in shape (B,T)
#         '''
#         max_len = encoder_outputs.size(1)
#         this_batch_size = encoder_outputs.size(1)
#         H = hidden.repeat(max_len,1,1).transpose(0,1)
# #         encoder_outputs = encoder_outputs.transpose(0,1) # [B*T*H]
#         attn_energies = self.score(H,encoder_outputs) # compute attention score
        
#         if src_len is not None:
#             mask = []
#             for b in range(src_len.size(0)):
#                 mask.append([0] * src_len[b].item() + [1] * (encoder_outputs.size(1) - src_len[b].item()))
#             mask = cuda_(torch.ByteTensor(mask).unsqueeze(1)) # [B,1,T]
#             attn_energies = attn_energies.masked_fill(mask, -1e18)
        
#         return F.softmax(attn_energies, dim = 1).unsqueeze(1) # normalize with softmax

#     def score(self, hidden, encoder_outputs):
#         energy = torch.tanh(self.attn(torch.cat([hidden, encoder_outputs], 2))) # [B*T*2H]->[B*T*H]
#         energy = energy.transpose(2,1) # [B*H*T]
#         v = self.v.repeat(encoder_outputs.data.shape[0],1).unsqueeze(1) #[B*1*H]
#         energy = torch.bmm(v,energy) # [B*1*T]
#         return energy.squeeze(1) #[B*T]

# class BahdanauAttnDecoderRNN(nn.Module):
#     def __init__(self, weights_matrix, hidden_size, embed_size, output_size, dropout_p=0.5):
#         super(BahdanauAttnDecoderRNN, self).__init__()
#         # Define parameters
#         self.hidden_size = hidden_size
#         self.embed_size = embed_size
#         self.output_size = output_size
#         self.dropout_p = dropout_p
#         # Define layers
        
#         embed_mat = torch.from_numpy(weights_matrix).float()
#         self.num_embeddings, self.embedding_dim = embed_mat.shape
#         mask = np.zeros((self.num_embeddings,1))
#         mask[0] = 1
#         mask[1] = 1
#         mask[2] = 1
#         mask[3] = 1
#         mask = torch.from_numpy(mask).float()
#         self.mask_embedding = nn.Embedding.from_pretrained(mask, freeze = False)
#         self.embedding = nn.Embedding.from_pretrained(embed_mat, freeze = True)
#         self.dropout = nn.Dropout(dropout_p)
#         self.attn = Attn('concat', hidden_size)
#         self.gru = nn.GRU(hidden_size + self.embedding_dim, hidden_size, dropout=dropout_p)
#         #self.attn_combine = nn.Linear(hidden_size + embed_size, hidden_size)
#         self.out = nn.Linear(hidden_size, self.output_size)

#     def forward(self, word_input, last_hidden, encoder_outputs):
#         '''
#         :param word_input:
#             word input for current time step, in shape (B)
#         :param last_hidden:
#             last hidden stat of the decoder, in shape (layers*direction*B*H)
#         :param encoder_outputs:
#             encoder outputs in shape (T*B*H)
#         :return
#             decoder output
#         Note: we run this one step at a time i.e. you should use a outer loop 
#             to process the whole sequence
#         Tip(update):
#         EncoderRNN may be bidirectional or have multiple layers, so the shape of hidden states can be 
#         different from that of DecoderRNN
#         You may have to manually guarantee that they have the same dimension outside this function,
#         e.g, select the encoder hidden state of the foward/backward pass.
#         '''
#         # Get the embedding of the current input word (last output word)
#         embed = self.embedding(word_input)
#         mask = self.mask_embedding(word_input)
        
#         word_embedded = mask*embed + (1-mask)*embed.clone().detach()
#         word_embedded = self.embedding(word_input).view(word_input.size(0), 1, -1) # (1,B,V)
#         word_embedded = self.dropout(word_embedded)
#         # Calculate attention weights and apply to encoder outputs
#         attn_weights = self.attn(last_hidden[-1], encoder_outputs)
#         context = attn_weights.bmm(encoder_outputs)  # (B,1,V)
# #         context = context.transpose(0, 1)  # (1,B,V)
#         # Combine embedded input word and attended context, run through RNN
#         rnn_input = torch.cat((word_embedded, context), 2)
#         #rnn_input = self.attn_combine(rnn_input) # use it in case your size of rnn_input is different
#         output, hidden = self.gru(rnn_input.transpose(0, 1), last_hidden)
#         output = output.squeeze(0)  # (1,B,V)->(B,V)
#         # context = context.squeeze(0)
#         # update: "context" input before final layer can be problematic.
#         # output = F.log_softmax(self.out(torch.cat((output, context), 1)))
#         output = F.log_softmax(self.out(output), dim = 1)
#         # Return final output, hidden state
#         return output, hidden, attn_weights

__Training__

In [16]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [17]:
def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [18]:
def train(input, target, input_len, target_len, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, teach_forcing_ratio=0.5, encoder_cnn = False):
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    max_input_len = max(input_len)
    max_target_len = max(target_len)

    loss = 0
    
    if not encoder_cnn:
        encoder_hidden = encoder.initHidden(batch_size)
        encoder_output, encoder_hidden = encoder(input, input_len, encoder_hidden)
#         print(encoder_output.size())
    else:
        encoder_hidden = encoder(input)
        
    decoder_input = torch.tensor([[SOS_token]]*batch_size, device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(max_target_len):
#             decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            decoder_output, decoder_hidden, attn_weights = decoder(decoder_input, decoder_hidden, encoder_output)
            loss += criterion(decoder_output, target[:,di])
            decoder_input = target[:,di].unsqueeze(1)  # Teacher forcing (batch_size, 1)

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(max_target_len):
#             decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            decoder_output, decoder_hidden, attn_weights = decoder(decoder_input, decoder_hidden, encoder_output)
            loss += criterion(decoder_output, target[:,di])
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach().unsqueeze(1)  # detach from history as input
    #         if decoder_input.item() == EOS_token:
    #             break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / float(max_target_len)

In [19]:
def trainIters(loader, encoder, decoder, n_iters, encoder_cnn, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    criterion = nn.NLLLoss()
    
    best_bleu = None
    save_path = os.getcwd() + '/saved_model/En-RNN-De-Attn-Nomask.pt'
            
    for iter in range(1, n_iters + 1):
        for i, (input, input_len, target, target_len) in enumerate(train_loader):
            loss = train(input, target, input_len, target_len, encoder, decoder, 
                         encoder_optimizer, decoder_optimizer, criterion, 
                         teach_forcing_ratio=teacher_forcing_ratio, encoder_cnn = encoder_cnn)
            print_loss_total += loss
            plot_loss_total += loss
            
            
            if i % print_every == 0:
                current_bleu = test(encoder, decoder, val_loader, encoder_cnn)
                if not best_bleu or current_bleu > best_bleu:
                    torch.save({
                                'epoch': iter,
                                'encoder_state_dict': encoder.state_dict(),
                                'decoder_state_dict': decoder.state_dict(),
                                'encoder_optimizer_state_dict': encoder_optimizer.state_dict(),
                                'decoder_optimizer_state_dict': decoder_optimizer.state_dict(),
                                'train_loss': loss,
                                'best_BLEU': best_bleu
                                }, save_path)
                    best_bleu = current_bleu
                
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('%s (Epoch: %d %d%%) | Train Loss: %.4f | Best Bleu: %.4f | Current Bleu: %.4f' 
                      % (timeSince(start, iter / n_iters), iter, iter / n_iters * 100, print_loss_avg, best_bleu, current_bleu))

            if i % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0
#     showPlot(plot_losses)

In [20]:
def evaluate(encoder, decoder, input, input_len, encoder_cnn):
    with torch.no_grad():
#         for i, (input, input_len, target, target_len) in enumerate(data_loader):
        max_input_len = max(input_len)

        if not encoder_cnn:
            encoder_hidden = encoder.initHidden(batch_size)
            encoder_output, encoder_hidden = encoder(input, input_len, encoder_hidden)
        else:
            encoder_hidden = encoder(input)

        decoder_input = torch.tensor([[SOS_token]]*batch_size, device=device)
        decoder_hidden = encoder_hidden # decoder starts from the last encoding sentence

        # output of this function
        decoded_words = []

        for di in range(max_input_len):
            # for each time step, the decoder network takes two inputs: previous outputs and the previous hidden states
#             decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            
#             pdb.set_trace()

            decoder_output, decoder_hidden, attn_weights = decoder(decoder_input, decoder_hidden, encoder_output)
            topv, topi = decoder_output.topk(1)
            decoded_words.append(topi.cpu().numpy())
            decoder_input = topi.squeeze().detach().unsqueeze(1)  # detach from history as input

            
            
            
        return np.asarray(decoded_words).T#, decoder_attentions[:di + 1]

In [21]:
def test(encoder, decoder, data_loader, encoder_cnn):
    total_score = 0
    count = 0
    check = 0
    
    candidate_corpus = []
    reference_corpus = []

    for i, (input, input_len, target, target_len) in enumerate(data_loader):
        decoded_words = evaluate(encoder, decoder, input, input_len, encoder_cnn)
        candidate_sentences = []
        for ind in range(decoded_words.shape[1]):
            sent_words = []
            for token in decoded_words[0][ind]:
                if token != PAD_token and token != EOS_token:
                    sent_words.append(id2token_en[token])
                else:
                    break
            sent_words = ' '.join(sent_words)
            if check == 0:
                print('predict: '+sent_words)
                check += 1
            candidate_sentences.append(sent_words)
#         candidate_corpus.extend(candidate_sentences)

        reference_sentences = []
        for sent in target:
            sent_words = []
            for token in sent:
                if token.item() != EOS_token:
                    sent_words.append(id2token_en[token.item()])
                else:
                    break
            sent_words = ' '.join(sent_words)
            if check == 1:
                print('target: '+sent_words)
                check += 1
            reference_sentences.append(sent_words)
#         reference_corpus.extend(reference_sentences)
        count += 1
        score = corpus_bleu(candidate_sentences, [reference_sentences], smooth='exp', smooth_floor=0.0, force=False).score
        total_score += score
    return total_score / float(count)

#     score = corpus_bleu(candidate_sentences, [reference_sentences], smooth='exp', smooth_floor=0.0, force=False).score
#     return score

In [None]:
encoder = EncoderRNN(hidden_size=hidden_size).to(device)
# encoder = EncoderCNN(hidden_size,kernel_dim=3,batch_size=batch_size).to(device)
# noattn_decoder = DecoderRNN(hidden_size, embedding_mat_zh.shape[0]).to(device)
attn_decoder = AttnDecoderRNN(hidden_size, embedding_mat_zh.shape[0]).to(device)
# attn_decoder1 = BahdanauAttnDecoderRNN(hidden_size, output_lang.n_words, n_layers=1, dropout_p=0.1).to(device)

#UNCOMMENT TO TRAIN THE MODEL
trainIters(train_loader, encoder, attn_decoder, n_iters=20, encoder_cnn=False, print_every=1000, plot_every=1, learning_rate=LR_RATE)
# trainIters(encoder1, attn_decoder1, 75000, print_every=5000)

# encoder.load_state_dict(torch.load("encoder.pth"))
# attn_decoder1.load_state_dict(torch.load("attn_decoder.pth"))

predict: 
target: when i was i remember waking up one morning to the sound of joy in my house .
0m 13s (- 4m 17s) (Epoch: 1 5%) | Train Loss: 0.0115 | Best Bleu: 0.0000 | Current Bleu: 0.0000
predict: and <unk> s <unk> s the <unk> the the the the the <unk> the the
target: when i was i remember waking up one morning to the sound of joy in my house .
11m 29s (- 218m 13s) (Epoch: 1 5%) | Train Loss: 3.2231 | Best Bleu: 4.6632 | Current Bleu: 4.6632
predict: i i <unk> i i i to the i .
target: when i was i remember waking up one morning to the sound of joy in my house .
22m 41s (- 431m 13s) (Epoch: 1 5%) | Train Loss: 2.9468 | Best Bleu: 4.6632 | Current Bleu: 3.6162
predict: i i <unk> t i i <unk> <unk> <unk> <unk> <unk> <unk> <unk> .
target: when i was i remember waking up one morning to the sound of joy in my house .
33m 57s (- 645m 19s) (Epoch: 1 5%) | Train Loss: 2.8501 | Best Bleu: 5.0300 | Current Bleu: 5.0300
predict: i <unk> m <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <u

predict: i remember remember remember night and remember night and remember the .
target: when i was i remember waking up one morning to the sound of joy in my house .
333m 39s (- 1000m 59s) (Epoch: 5 25%) | Train Loss: 2.3107 | Best Bleu: 6.3529 | Current Bleu: 6.3138
predict: remember remember days days days days and remember and the and .
target: when i was i remember waking up one morning to the sound of joy in my house .
344m 55s (- 1034m 45s) (Epoch: 5 25%) | Train Loss: 2.3320 | Best Bleu: 6.3529 | Current Bleu: 5.5061
predict: i remember remember i remember heard and i remember the night and i heard the . .
target: when i was i remember waking up one morning to the sound of joy in my house .
356m 8s (- 1068m 25s) (Epoch: 5 25%) | Train Loss: 2.3256 | Best Bleu: 6.3529 | Current Bleu: 5.7908
predict: i remember remember remember remember remember and remember the day and i remember the . .
target: when i was i remember waking up one morning to the sound of joy in my house .
367m

In [None]:
corpus_bleu(sys_stream, ref_streams, smooth='exp', smooth_floor=0.0, force=False, lowercase=False,
                tokenize=DEFAULT_TOKENIZER, use_effective_order=False) 

In [119]:
ref = [['this is   test']]
candidates = ['this is a test']
# score = sacrebleu.corpus_bleu(ref,candidates)
score = sacrebleu.corpus_bleu(candidates,ref)
print(score.score)

35.35533905932737


#### load saved model

In [43]:
save_path = os.getcwd() + '/saved_model/En-RNN-De-NoAttn.pt'
device = torch.device("cuda")
checkpoint = torch.load(save_path)
en = EncoderRNN(train_input_lang.n_words, hidden_size=128).to(device)
de = DecoderRNN(hidden_size=128, output_size=train_output_lang.n_words).to(device)
en.load_state_dict(checkpoint['encoder_state_dict'])
de.load_state_dict(checkpoint['decoder_state_dict'])
epoch = checkpoint['epoch']
bleu = checkpoint['best_BLEU']

In [43]:
save_path = os.getcwd() + '/saved_model/En-RNN-De-Attn-Nomask.pt'
device = torch.device("cuda")
checkpoint = torch.load(save_path)
encoder = EncoderRNN(hidden_size=hidden_size).to(device)
attn_decoder = AttnDecoderRNN(hidden_size, embedding_mat_zh.shape[0]).to(device)
encoder.load_state_dict(checkpoint['encoder_state_dict'])
attn_decoder.load_state_dict(checkpoint['decoder_state_dict'])
epoch = checkpoint['epoch']
bleu = checkpoint['best_BLEU']

In [32]:
save_path = os.getcwd() + '/saved_model/En-CNN-De-NoAttn.pt'
device = torch.device("cuda")
checkpoint = torch.load(save_path)
cnn_en = EncoderCNN(hidden_size=128,kernel_dim=3,batch_size=batch_size).to(device)
cnn_de = DecoderRNN(hidden_size=128,output_size=42228).to(device)
cnn_en.load_state_dict(checkpoint['encoder_state_dict'])
cnn_de.load_state_dict(checkpoint['decoder_state_dict'])
epoch = checkpoint['epoch']
bleu = checkpoint['best_BLEU']

In [35]:
def test_val(encoder, decoder, data_loader, encoder_cnn):
    total_score = 0
    count = 0
    check = 0
    
    candidate_corpus = []
    reference_corpus = []

    for i, (input, input_len, target, target_len) in enumerate(data_loader):
        candidate_sentences = []
        decoded_words = evaluate(encoder, decoder, input, input_len, encoder_cnn)
        for ind in range(decoded_words.shape[1]):
            sent_words = []
            for token in decoded_words[0][ind]:
                if token != PAD_token and token != EOS_token:
                    sent_words.append(train_output_lang.index2word[token])
                else:
                    break
            sent_words = ' '.join(sent_words)
            candidate_sentences.append(sent_words)
#         candidate_corpus.extend(candidate_sentences)
#             if i % 20 == 0:
#                 input_sent = ' '.join([train_input_lang.index2word[token.item()] for token in input[ind]])
#                 print('input: '+input_sent)
            if i % 20 == 0:
                print('predict: '+sent_words)
                check += 1

        reference_sentences = []
        for sent in target:
            sent_words = []
            for token in sent:
                if token.item() != EOS_token:
                    sent_words.append(train_output_lang.index2word[token.item()])
                else:
                    break
            sent_words = ' '.join(sent_words)
            reference_sentences.append(sent_words)
#         reference_corpus.extend(reference_sentences)
            if i % 20 == 0:
                print('target: '+sent_words)
                check += 1
        
        count += 1
        score = corpus_bleu(candidate_sentences, [reference_sentences], smooth='exp', smooth_floor=0.0, force=False).score
        print('batch {}: bleu: {}'.format(i+1,score))
        total_score += score

    return total_score / float(count)

#     score = corpus_bleu(candidate_sentences, [reference_sentences], smooth='exp', smooth_floor=0.0, force=False).score
#     return score

In [44]:
test_val(cnn_en,cnn_de,val_loader,encoder_cnn=True)

predict: m submarines bizarre lange big earth captured captured captured
predict: the submarines bizarre the the the the the the captured
predict: us the the the the the the the
predict: us submarines bizarre the the the captured captured
predict: the submarines bizarre the the the the the the captured captured
predict: the submarines bizarre the the the volcanic captured captured
predict: m submarines bizarre big the captured captured captured
predict: get m submarines bizarre captured
predict: the submarines bizarre the the the the the captured captured
predict: the submarines bizarre the the the the the the the the captured
predict: to submarines some lange the earth the volcanic captured captured captured
predict: to submarines some of earth earth earth the volcanic captured captured
predict: to submarines habitats valleys earth earth earth the volcanic the captured captured captured
predict: get to submarines some animal the volcanic earth the to captured captured
predict: hardwir

batch 22: bleu: 0.12591854302765146
batch 23: bleu: 0.10199104658234201
batch 24: bleu: 0.08889222036908966
batch 25: bleu: 0.10508610322666333
batch 26: bleu: 0.12327139014581195
batch 27: bleu: 0.1129226399275569
batch 28: bleu: 0.11091220140580191
batch 29: bleu: 0.11342733507495625
batch 30: bleu: 0.1200610555414891
batch 31: bleu: 0.12259024916341915
batch 32: bleu: 0.17780114766473706
batch 33: bleu: 0.10454691228489726
batch 34: bleu: 0.08041177840733403
batch 35: bleu: 0.10092450598329217
batch 36: bleu: 0.11306688865727357
batch 37: bleu: 0.08009273840678621
batch 38: bleu: 0.11597295158093461
batch 39: bleu: 0.09587407112735168


0.1194049677133052

In [45]:
test_val(en,de,val_loader,encoder_cnn=False)

predict: and i apos m going to be a lot of the world .
predict: and i apos m going to be a lot of the world .
predict: and i apos m going to be a lot of the world .
predict: and i apos m going to be a lot of the world .
predict: and i apos m going to be a lot of the world .
predict: and i apos m going to be a lot of the world .
predict: and i apos m going to be a lot of the world .
predict: and i apos m going to be a lot of the world .
predict: and i apos m going to be a lot of the world .
predict: and i apos m going to be a lot of the world .
predict: and i apos m going to be a lot of the world .
predict: and i apos m going to be a lot of the world .
predict: and i apos m going to be a lot of the world .
predict: and i apos m going to be a lot of the world .
predict: and i apos m going to be a lot of the world .
predict: and i apos m going to be a lot of the world .
predict: and i apos m going to be a lot of the world .
predict: and i apos m going to be a lot of the world .
predict: a

batch 21: bleu: 0.7090420408048385
batch 22: bleu: 0.47676412889722
batch 23: bleu: 2.0270913893564497
batch 24: bleu: 0.371090549424509
batch 25: bleu: 1.067189592769331
batch 26: bleu: 1.1906195417586753
batch 27: bleu: 0.7621629074750933
batch 28: bleu: 1.0430932182527446
batch 29: bleu: 1.3534049587438952
batch 30: bleu: 1.1687269055231078
batch 31: bleu: 1.0057984051085882
batch 32: bleu: 0.9683688958798589
batch 33: bleu: 1.045485361533528
batch 34: bleu: 0.41718143946528957
batch 35: bleu: 0.4990749788577744
batch 36: bleu: 0.7737583152500009
batch 37: bleu: 0.5667263889375768
batch 38: bleu: 0.4541978565349855
batch 39: bleu: 0.936236091090201


0.9903099438709904