In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import io
import unicodedata
import string
import re
import random
import os
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from torch.utils.data import Dataset
from torch.optim import lr_scheduler
import itertools
import glob
plt.switch_backend('agg')
import matplotlib.ticker as ticker
from sacrebleu import corpus_bleu
import sacrebleu
import math
import time
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
import pdb

In [3]:
batch_size = 32
words_to_load = 100000
SOS_token = 0
EOS_token = 1
PAD_token = 2
UNK_token = 3
LR_RATE = 0.0005
MAX_LENGTH = 40
hidden_size_decoder = 256
hidden_size_encoder = 256
teacher_forcing_ratio = 0.5
EMBED_DIM = 300

In [4]:
add = '/scratch/wz1218'

__Preprocess Data__

In [5]:
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS", 2: "<pad>", 3: "<unk>"}
        self.n_words = 4  # Count SOS, EOS, pad and unk

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [6]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def normalizeZh(s):
    s = s.strip()
    s = re.sub("\s+", " ", s)
    return s

In [7]:
# def filterPair(p):
#     return len(p[0].split(' ')) < MAX_LENGTH and \
#         len(p[1].split(' ')) < MAX_LENGTH

# def filterPairs(pairs):
#     return [pair for pair in pairs if filterPair(pair)]

In [8]:
def filterPair(p):
    filtered = []
    for i in p:
        filtered.append(' '.join(i.split()[:MAX_LENGTH]))
#         filtered.append(' '.join(i.split()[:]))
    return filtered

def filterPairs(pairs):
    return [filterPair(pair) for pair in pairs]

In [9]:
def readLangs(dataset, lang1, lang2):
    chinese = add+'/iwslt-zh-en/{}.tok.{}'.format(dataset, lang1)
    english = add+'/iwslt-zh-en/{}.tok.{}'.format(dataset, lang2)

    chinese_lines = open(chinese, encoding='utf-8').read().strip().split('\n')
    english_lines = open(english, encoding='utf-8').read().strip().split('\n')
    length = len(chinese_lines)

    pairs = [[normalizeZh(chinese_lines[i]), normalizeString(english_lines[i])] for i in range(length)]
    pairs = filterPairs(pairs)
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])

    return input_lang, output_lang, pairs

In [10]:
train_input_lang, train_output_lang, train_pairs = readLangs('train', 'zh', 'en')
val_input_lang, val_output_lang, val_pairs = readLangs('dev', 'zh', 'en')
test_input_lang, test_output_lang, test_pairs = readLangs('test', 'zh', 'en')

In [None]:
len(train_pairs)  # 6621 batch

In [None]:
len(val_pairs)  # 39 batch

In [None]:
len(test_pairs)

__Data Loader__

In [11]:
class NMTDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, input_lang, output_lang, pairs):
        """
        @param data_list_1: list of sentence 1 tokens 
        @param data_list_2: list of sentence 2 tokens
        @param target_list: list of review targets 

        """
        self.input_lang = input_lang
        self.output_lang = output_lang
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        input_sentence = self.pairs[key][0]
        input_indexes = [self.input_lang.word2index[word] if word in self.input_lang.word2index else UNK_token for word in input_sentence.split(' ')]
        input_indexes.append(EOS_token)
        input_length = len(input_indexes)

        output_sentence = self.pairs[key][1]
        output_indexes = [self.output_lang.word2index[word] if word in self.output_lang.word2index else UNK_token for word in output_sentence.split(' ')]
        output_indexes.append(EOS_token)
        output_length = len(output_indexes)
        return [input_indexes, input_length, output_indexes, output_length]

    
def NMTDataset_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    input_ls = []
    output_ls = []
    input_length_ls = []
    output_length_ls = []
    
    for datum in batch:
        input_length_ls.append(datum[1])
        output_length_ls.append(datum[3])
    
    #find max length in each batch
    max_input = sorted(input_length_ls)[-1]
    max_output = sorted(output_length_ls)[-1]
    
    # padding
    for datum in batch:
        padded_vec_input = np.pad(np.array(datum[0]), 
                                  pad_width=((0,max_input-datum[1])), 
                                  mode="constant", constant_values=2).tolist()
        padded_vec_output = np.pad(np.array(datum[2]), 
                                   pad_width=((0,max_output-datum[3])), 
                                   mode="constant", constant_values=2).tolist()
        input_ls.append(padded_vec_input)
        output_ls.append(padded_vec_output)
    return [torch.tensor(torch.from_numpy(np.array(input_ls)), device=device), 
            torch.tensor(input_length_ls, device=device), 
            torch.tensor(torch.from_numpy(np.array(output_ls)), device=device), 
            torch.tensor(output_length_ls, device=device)]

In [12]:
# create pytorch dataloader
train_dataset = NMTDataset(train_input_lang, train_output_lang, train_pairs)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size,
                                           collate_fn=NMTDataset_collate_func,
                                           shuffle=True,
                                           drop_last=True)

val_dataset = NMTDataset(train_input_lang, train_output_lang, val_pairs)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                         batch_size=batch_size,
                                         collate_fn=NMTDataset_collate_func,
                                         shuffle=False,
                                         drop_last=True)

In [53]:
next(iter(train_loader))[0].size()

torch.Size([32, 41])

In [52]:
for i in train_loader:
#     print(i)
#     print(i[2])
#     for ind in i[2]:
#         for token in ind:
#             print(token)
#             print(train_output_lang.index2word[token.item()])
#     for ind in i[2]:
#         print(' '.join(train_output_lang.index2word[token.item()] for token in ind))
#     print([train_output_lang.index2word[token.item()] for ind in i[2] for token in ind])
    break

__Embedding__

In [13]:
def load_embedding(ft_path, words_to_load):
    fin = io.open(ft_path, 'r', encoding='utf-8', newline='\n', errors='ignore')

    n, d = map(int, fin.readline().split())
    vocab_size = words_to_load + 4
    embedding_dim = d

    embedding_mat = np.zeros((vocab_size, embedding_dim))
    token2id = {}
    id2token = {}
    all_tokens = ['SOS', 'EOS', '<unk>', '<pad>']

    for i, line in enumerate(fin):
        if i >= words_to_load:
            break
        s = line.rstrip().split(' ')
        embedding_mat[i+4, :] = np.asarray(s[1:])
        token2id[s[0]] = i+4
        id2token[i+4] = s[0]
        all_tokens.append(s[0])

    token2id['<pad>'] = PAD_token 
    token2id['<unk>'] = UNK_token
    token2id['SOS'] = SOS_token
    token2id['EOS'] = EOS_token
    id2token[PAD_token] = '<pad>'
    id2token[UNK_token] = '<unk>'
    id2token[SOS_token] = 'SOS'
    id2token[EOS_token] = 'EOS'
    embedding_mat[PAD_token, :] = np.zeros((1,d))
    #generate normal dist 1d array for UNK, SOS, EOS token
    embedding_mat[UNK_token, :] = np.random.normal(size=d)
    embedding_mat[SOS_token, :] = np.random.normal(size=d)
    embedding_mat[EOS_token, :] = np.random.normal(size=d)
        
    return embedding_mat, all_tokens, token2id, id2token

In [14]:
fname_zh = add + '/zh/zh.vec'
fname_eng = add + '/zh/fasttext300d.vec'
embedding_mat_zh, all_tokens_zh, token2id_zh, id2token_zh = load_embedding(fname_zh, words_to_load)
embedding_mat_en, all_tokens_en, token2id_en, id2token_en = load_embedding(fname_eng, words_to_load)

In [None]:
embedding_mat_zh.shape

In [None]:
embedding_mat_en.shape

__Encoder__

In [15]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        
        embed_mat = torch.from_numpy(embedding_mat_zh).float()
        n, embed_dim = embed_mat.shape
        mask = np.zeros((n,1))
        mask[0] = 1
        mask[1] = 1
        mask[2] = 1
        mask[3] = 1
        mask = torch.from_numpy(mask).float()
        self.mask_embedding = nn.Embedding.from_pretrained(mask, freeze = False)
        self.embedding = nn.Embedding.from_pretrained(embed_mat, freeze = True)
        
        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True)

    def forward(self, input, input_len, hidden):
        # Compute sorted sequence lengths
        _, idx_sort = torch.sort(input_len, dim=0, descending=True)
        _, idx_unsort = torch.sort(idx_sort, dim=0)
        
        # get embedding of characters
        embed = self.embedding(input)
        mask = self.mask_embedding(input)
        
        embedded = mask*embed + (1-mask)*embed.clone().detach()
        
        # Sort embedding and length
        embedded = embedded.index_select(0, idx_sort)
        input_len = input_len.index_select(0, idx_sort)
        
        packed_emb = nn.utils.rnn.pack_padded_sequence(embedded, input_len.cpu().numpy(), batch_first=True)
        packed_output, hidden = self.gru(packed_emb, hidden)
        output, output_lens =  nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        
        # Unsort output and last hidden unit
        output = output.index_select(0, idx_unsort)
        
        return output, hidden

    def initHidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [None]:
class EncoderCNN(nn.Module):
    def __init__(self, hidden_size, kernel_dim, batch_size):
        super(EncoderCNN, self).__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        embed_mat = torch.from_numpy(embedding_mat_zh).float()
        n, embed_dim = embed_mat.shape
        mask = np.zeros((n,1))
        mask[0] = 1
        mask[1] = 1
        mask[2] = 1
        mask[3] = 1
        mask = torch.from_numpy(mask).float()
        self.mask_embedding = nn.Embedding.from_pretrained(mask, freeze = False)
        self.embedding = nn.Embedding.from_pretrained(embed_mat, freeze = True)

        self.conv1 = nn.Conv1d(embed_dim, hidden_size*2, kernel_size=kernel_dim, padding=1)
        self.conv2 = nn.Conv1d(hidden_size*2, hidden_size*2, kernel_size=kernel_dim, padding=1)
        self.linear1 = nn.Linear(hidden_size*2, hidden_size)

    def forward(self, input):
        # get embedding of words
        embed = self.embedding(input)
        mask = self.mask_embedding(input)
        
        embedded = mask*embed + (1-mask)*embed.clone().detach()
        
        # perform convolution 1
        hidden = self.conv1(embedded.transpose(1,2)).transpose(1,2)
        hidden = F.relu(hidden)
#         hidden = F.relu(hidden.contiguous().view(-1, hidden.size(-1))).view(batch_size, hidden.size(1), hidden.size(-1))

        # perform convolution 2
        hidden = self.conv2(hidden.transpose(1,2)).transpose(1,2)
        hidden = F.relu(hidden)
#         hidden = F.relu(hidden.contiguous().view(-1, hidden.size(-1))).view(batch_size, hidden.size(1), hidden.size(-1))

        hidden,_ = hidden.max(dim=1)
        out = self.linear1(hidden)
        out = out.view(1,out.size(0),out.size(1))
        
        return out

__Decoder Without Attention__

In [None]:
# class DecoderRNN(nn.Module):
#     def __init__(self, hidden_size, output_size):
#         super(DecoderRNN, self).__init__()
#         self.hidden_size = hidden_size
        
#         embed_mat = torch.from_numpy(embedding_mat_en).float()
#         n, embed_dim = embed_mat.shape
#         mask = np.zeros((n,1))
#         mask[0] = 1
#         mask[1] = 1
#         mask[2] = 1
#         mask[3] = 1
#         mask = torch.from_numpy(mask).float()
#         self.mask_embedding = nn.Embedding.from_pretrained(mask, freeze = False)
#         self.embedding = nn.Embedding.from_pretrained(embed_mat, freeze = True)
        
#         self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True)
#         self.linear = nn.Linear(hidden_size, output_size)
#         self.softmax = nn.LogSoftmax(dim=1)

#     def forward(self, word_input, hidden):
#         # get embedding of words
#         embed = self.embedding(word_input)
#         mask = self.mask_embedding(word_input)
        
#         embedded = mask*embed + (1-mask)*embed.clone().detach()

#         output, hidden = self.gru(embedded, hidden)
        
#         # Final output layer
#         output = output.squeeze(1) # B x N
#         output = self.linear(output)
#         output = self.softmax(output)

#         return output, hidden

#### Decoder with Attention

In [156]:
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.linear_out = nn.Linear(hidden_size*2, hidden_size)
        self.mask = None

    def set_mask(self, mask):
        """
        Sets indices to be masked
        Args:
            mask (torch.Tensor): tensor containing indices to be masked
        """
        self.mask = mask

    def forward(self, hidden, encoder_outputs):
        batch_size = encoder_outputs.size(0)
        hidden_size = encoder_outputs.size(2)
        hidden = hidden.unsqueeze(1)
        #input_size = hidden.size(1)
        # (batch, out_len, dim) * (batch, in_len, dim) -> (batch, out_len, in_len)
        attn = torch.bmm(encoder_outputs, hidden.transpose(1, 2))
        # (batch, out_len, in_len) * (batch, in_len, dim) -> (batch, out_len, dim)
        mix = torch.bmm(attn, hidden)
         # concat -> (batch, out_len, 2*dim)
        combined = torch.cat((mix, encoder_outputs), dim=2)
        # output -> (batch, out_len, dim)
        output = F.tanh(self.linear_out(combined.view(-1, 2 * hidden_size))).view(batch_size, -1, hidden_size)
        return output, attn

class BahdanauAttnDecoderRNN(nn.Module):
    def __init__(self, weights_matrix, hidden_size, embed_size, output_size, dropout_p=0.5):
        super(BahdanauAttnDecoderRNN, self).__init__()
        # Define parameters
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        # Define layers
        
        embed_mat = torch.from_numpy(weights_matrix).float()
        self.num_embeddings, self.embedding_dim = embed_mat.shape
        mask = np.zeros((self.num_embeddings,1))
        mask[0] = 1
        mask[1] = 1
        mask[2] = 1
        mask[3] = 1
        mask = torch.from_numpy(mask).float()
        self.mask_embedding = nn.Embedding.from_pretrained(mask, freeze = False)
        self.embedding = nn.Embedding.from_pretrained(embed_mat, freeze = True)
        self.dropout = nn.Dropout(dropout_p)
        self.attn = Attn('concat', hidden_size)
        self.gru = nn.GRU(hidden_size + self.embedding_dim, hidden_size, dropout=dropout_p)
        #self.attn_combine = nn.Linear(hidden_size + embed_size, hidden_size)
        self.out = nn.Linear(hidden_size, self.output_size)

    def forward(self, word_input, last_hidden, encoder_outputs):
        '''
        :param word_input:
            word input for current time step, in shape (B)
        :param last_hidden:
            last hidden stat of the decoder, in shape (layers*direction*B*H)
        :param encoder_outputs:
            encoder outputs in shape (T*B*H)
        :return
            decoder output
        Note: we run this one step at a time i.e. you should use a outer loop 
            to process the whole sequence
        Tip(update):
        EncoderRNN may be bidirectional or have multiple layers, so the shape of hidden states can be 
        different from that of DecoderRNN
        You may have to manually guarantee that they have the same dimension outside this function,
        e.g, select the encoder hidden state of the foward/backward pass.
        '''
        # Get the embedding of the current input word (last output word)
        embed = self.embedding(word_input)
        mask = self.mask_embedding(word_input)
        
        word_embedded = mask*embed + (1-mask)*embed.clone().detach()
        word_embedded = self.embedding(word_input).view(word_input.size(0), 1, -1) # (1,B,V)
        word_embedded = self.dropout(word_embedded)
        # Calculate attention weights and apply to encoder outputs
        #attn_weights = self.attn(last_hidden[-1], encoder_outputs)
        #context = attn_weights.bmm(encoder_outputs)  # (B,1,V)
#         context = context.transpose(0, 1)  # (1,B,V)
        # Combine embedded input word and attended context, run through RNN
        rnn_input, attn = self.attn(last_hidden[-1], encoder_outputs)
        #         rnn_input = torch.cat((word_embedded, context), 2)
        print(rnn_input.size(), last_hidden.size())
        #rnn_input = self.attn_combine(rnn_input) # use it in case your size of rnn_input is different
        output, hidden = self.gru(rnn_input, last_hidden)
        output = output.squeeze(0)  # (1,B,V)->(B,V)
        # context = context.squeeze(0)
        # update: "context" input before final layer can be problematic.
        # output = F.log_softmax(self.out(torch.cat((output, context), 1)))
        output = F.log_softmax(self.out(output), dim = 1)
        # Return final output, hidden state
        return output, hidden, attn_weights

In [None]:
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        self.hidden_size = hidden_size
        self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        stdv = 1. / math.sqrt(self.v.size(0))
        self.v.data.normal_(mean=0, std=stdv)

    def forward(self, hidden, encoder_outputs, src_len=None):
        '''
        :param hidden: 
            previous hidden state of the decoder, in shape (layers*directions,B,H)
        :param encoder_outputs:
            encoder outputs from Encoder, in shape (T,B,H)
        :param src_len:
            used for masking. NoneType or tensor in shape (B) indicating sequence length
        :return
            attention energies in shape (B,T)
        '''
        max_len = encoder_outputs.size(1)
        this_batch_size = encoder_outputs.size(1)
        H = hidden.repeat(max_len,1,1).transpose(0,1)
#         encoder_outputs = encoder_outputs.transpose(0,1) # [B*T*H]
        attn_energies = self.score(H,encoder_outputs) # compute attention score
        
        if src_len is not None:
            mask = []
            for b in range(src_len.size(0)):
                mask.append([0] * src_len[b].item() + [1] * (encoder_outputs.size(1) - src_len[b].item()))
            mask = cuda_(torch.ByteTensor(mask).unsqueeze(1)) # [B,1,T]
            attn_energies = attn_energies.masked_fill(mask, -1e18)
        result = F.softmax(attn_energies, dim = 1).unsqueeze(1)
        return result # normalize with softmax

    def score(self, hidden, encoder_outputs):
        energy = torch.tanh(self.attn(torch.cat([hidden, encoder_outputs], 2))) # [B*T*2H]->[B*T*H]
        energy = energy.transpose(2,1) # [B*H*T]
        v = self.v.repeat(encoder_outputs.data.shape[0],1).unsqueeze(1) #[B*1*H]
        energy = torch.bmm(v,energy) # [B*1*T]
        return energy.squeeze(1) #[B*T]

class BahdanauAttnDecoderRNN(nn.Module):
    def __init__(self, weights_matrix, hidden_size, embed_size, output_size, dropout_p=0.5):
        super(BahdanauAttnDecoderRNN, self).__init__()
        # Define parameters
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        # Define layers
        
        embed_mat = torch.from_numpy(weights_matrix).float()
        self.num_embeddings, self.embedding_dim = embed_mat.shape
        mask = np.zeros((self.num_embeddings,1))
        mask[0] = 1
        mask[1] = 1
        mask[2] = 1
        mask[3] = 1
        mask = torch.from_numpy(mask).float()
        self.mask_embedding = nn.Embedding.from_pretrained(mask, freeze = False)
        self.embedding = nn.Embedding.from_pretrained(embed_mat, freeze = True)
        self.dropout = nn.Dropout(dropout_p)
        self.attn = Attn('concat', hidden_size)
        self.gru = nn.GRU(hidden_size + self.embedding_dim, hidden_size, dropout=dropout_p)
        #self.attn_combine = nn.Linear(hidden_size + embed_size, hidden_size)
        self.out = nn.Linear(hidden_size, self.output_size)

    def forward(self, word_input, last_hidden, encoder_outputs):
        '''
        :param word_input:
            word input for current time step, in shape (B)
        :param last_hidden:
            last hidden stat of the decoder, in shape (layers*direction*B*H)
        :param encoder_outputs:
            encoder outputs in shape (T*B*H)
        :return
            decoder output
        Note: we run this one step at a time i.e. you should use a outer loop 
            to process the whole sequence
        Tip(update):
        EncoderRNN may be bidirectional or have multiple layers, so the shape of hidden states can be 
        different from that of DecoderRNN
        You may have to manually guarantee that they have the same dimension outside this function,
        e.g, select the encoder hidden state of the foward/backward pass.
        '''
        # Get the embedding of the current input word (last output word)
        embed = self.embedding(word_input)
        mask = self.mask_embedding(word_input)
        
        word_embedded = mask*embed + (1-mask)*embed.clone().detach()
        word_embedded = self.embedding(word_input).view(word_input.size(0), 1, -1) # (1,B,V)
        word_embedded = self.dropout(word_embedded)
        # Calculate attention weights and apply to encoder outputs
        attn_weights = self.attn(last_hidden[-1], encoder_outputs)
        context = attn_weights.bmm(encoder_outputs)  # (B,1,V)
#         context = context.transpose(0, 1)  # (1,B,V)
        # Combine embedded input word and attended context, run through RNN
        rnn_input = torch.cat((word_embedded, context), 2)
        #rnn_input = self.attn_combine(rnn_input) # use it in case your size of rnn_input is different
        print(rnn_input.transpose(0, 1).size(), last_hidden.size())
        output, hidden = self.gru(rnn_input.transpose(0, 1), last_hidden)
        output = output.squeeze(0)  # (1,B,V)->(B,V)
        # context = context.squeeze(0)
        # update: "context" input before final layer can be problematic.
        # output = F.log_softmax(self.out(torch.cat((output, context), 1)))
        output = F.log_softmax(self.out(output), dim = 1)
        # Return final output, hidden state
        return output, hidden, attn_weights

In [None]:
encoder = EncoderRNN(train_input_lang.n_words, hidden_size_encoder).to(device)
attn_decoder = BahdanauAttnDecoderRNN(embedding_mat_en, hidden_size_decoder, EMBED_DIM, train_output_lang.n_words, dropout_p=0.4).to(device)
trainIters(train_loader, encoder, attn_decoder, n_iters=10, encoder_cnn=False, print_every=100, plot_every=1, learning_rate=LR_RATE)

__Training__

In [59]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [60]:
def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [61]:
def train(input, target, input_len, target_len, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH, teach_forcing_ratio=0.5, encoder_cnn = False):
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    max_input_len = max(input_len)
    max_target_len = max(target_len)

    loss = 0
    
    if not encoder_cnn:
        encoder_hidden = encoder.initHidden(batch_size)
        encoder_output, encoder_hidden = encoder(input, input_len, encoder_hidden)
    else:
        encoder_hidden = encoder(input)
        
    decoder_input = torch.tensor([[SOS_token]]*batch_size, device=device)
    decoder_hidden = encoder_hidden
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(max_target_len):
            decoder_output, decoder_hidden, attn_weights = decoder(decoder_input, decoder_hidden, encoder_output)
            loss += criterion(decoder_output, target[:,di])
            decoder_input = target[:,di].unsqueeze(1)  # Teacher forcing (batch_size, 1)

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(max_target_len):
            decoder_output, decoder_hidden, attn_weights= decoder(decoder_input, decoder_hidden, encoder_output)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach().unsqueeze(1)  # detach from history as input
            loss += criterion(decoder_output, target[:,di])
    #         if decoder_input.item() == EOS_token:
    #             break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / float(max_target_len)

In [62]:
def trainIters(loader, encoder, decoder, n_iters, encoder_cnn, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    criterion = nn.NLLLoss()
    
    best_bleu = None
    save_path = os.getcwd() + '/saved_model/news2s_att.pt'
            
    for iter in range(1, n_iters + 1):
        for i, (input, input_len, target, target_len) in enumerate(train_loader):
            loss = train(input, target, input_len, target_len, encoder, decoder, 
                         encoder_optimizer, decoder_optimizer, criterion, 
                         max_length=MAX_LENGTH, teach_forcing_ratio=teacher_forcing_ratio, encoder_cnn = encoder_cnn)
            print_loss_total += loss
            plot_loss_total += loss
            
            
            if i % print_every == 0:
                current_bleu = test(encoder, decoder, val_loader, encoder_cnn)
                if not best_bleu or current_bleu > best_bleu:
                    torch.save({
                                'epoch': iter,
                                'encoder_state_dict': encoder.state_dict(),
                                'decoder_state_dict': decoder.state_dict(),
                                'encoder_optimizer_state_dict': encoder_optimizer.state_dict(),
                                'decoder_optimizer_state_dict': decoder_optimizer.state_dict(),
                                'train_loss': loss,
                                'best_BLEU': best_bleu
                                }, save_path)
                    best_bleu = current_bleu
                
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('%s (Epoch: %d %d%%) | Train Loss: %.4f | Best Bleu: %.4f | Current Blue: %.4f' 
                      % (timeSince(start, iter / n_iters), iter, iter / n_iters * 100, print_loss_avg, best_bleu, current_bleu))

            if i % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0
            torch.cuda.empty_cache()
#     showPlot(plot_losses)

In [63]:
def evaluate(encoder, decoder, input, input_len, encoder_cnn, max_length=MAX_LENGTH):
    """
    Function that generate translation.
    First, feed the source sentence into the encoder and obtain the hidden states from encoder.
    Secondly, feed the hidden states into the decoder and unfold the outputs from the decoder.
    Lastly, for each outputs from the decoder, collect the corresponding words in the target language's vocabulary.
    And collect the attention for each output words.
    @param encoder: the encoder network
    @param decoder: the decoder network
    @param input: string, input sentence in source language to be translated
    @param max_length: the max # of words that the decoder can return
    @output decoded_words: a list of words in target language
    @output decoder_attentions: a list of vector, each of which sums up to 1.0
    """    
    # process input sentence
    with torch.no_grad():
        
        max_input_len = max(input_len)
        
        if not encoder_cnn:
            encoder_hidden = encoder.initHidden(batch_size)
            encoder_output, encoder_hidden = encoder(input, input_len, encoder_hidden)
        else:
            encoder_hidden = encoder(input)

        decoder_input = torch.tensor([[SOS_token]]*batch_size, device=device)
        # decode the context vector
        decoder_hidden = encoder_hidden # decoder starts from the last encoding sentence
        
        # output of this function
        decoded_words = []
#         decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            # for each time step, the decoder network takes two inputs: previous outputs and the previous hidden states
            decoder_output, decoder_hidden, attention_weights = decoder(decoder_input, decoder_hidden, encoder_output)    
            topv, topi = decoder_output.topk(1)
            decoded_words.append(topi.cpu().numpy())
            decoder_input = topi.squeeze().detach().unsqueeze(1)  # detach from history as input
        return np.asarray(decoded_words).T#, decoder_attentions[:di + 1]

In [34]:
def beam_search_decoder(data, k):
    sequences = [[list(), 1.0]]
    # walk over each step in sequence
    for row in data:
        all_candidates = list()
        # expand each current candidate
        for i in range(len(sequences)):
            seq, score = sequences[i]
            for j in range(len(row)):
                candidate = [seq + [j], score * - np.log(row[j])]
                all_candidates.append(candidate)
            print(all_candidates)
        # order all candidates by score
        ordered = sorted(all_candidates, key=lambda tup:tup[1])
        # select k best
        sequences = ordered[:k]
    return sequences

In [36]:
# def evaluate(encoder, decoder, input, input_len, encoder_cnn, max_length=MAX_LENGTH):
#     """
#     Function that generate translation.
#     First, feed the source sentence into the encoder and obtain the hidden states from encoder.
#     Secondly, feed the hidden states into the decoder and unfold the outputs from the decoder.
#     Lastly, for each outputs from the decoder, collect the corresponding words in the target language's vocabulary.
#     And collect the attention for each output words.
#     @param encoder: the encoder network
#     @param decoder: the decoder network
#     @param input: string, input sentence in source language to be translated
#     @param max_length: the max # of words that the decoder can return
#     @output decoded_words: a list of words in target language
#     @output decoder_attentions: a list of vector, each of which sums up to 1.0
#     """    
#     # process input sentence
#     with torch.no_grad():
        
#         max_input_len = max(input_len)
        
#         if not encoder_cnn:
#             encoder_hidden = encoder.initHidden(batch_size)
#             encoder_output, encoder_hidden = encoder(input, input_len, encoder_hidden)
#         else:
#             encoder_hidden = encoder(input)

#         decoder_input = torch.tensor([[SOS_token]]*batch_size, device=device)
#         # decode the context vector
#         decoder_hidden = encoder_hidden # decoder starts from the last encoding sentence
        
#         # output of this function
#         decoder_outputs = torch.zeros([batch_size, max_length, train_output_lang.n_words], device=device)
# #         decoder_attentions = torch.zeros(max_length, max_length)
#         decoded_words = []
#         collections = []
#         for di in range(max_length):
#             # for each time step, the decoder network takes two inputs: previous outputs and the previous hidden states
#             decoder_output, decoder_hidden, attention_weights = decoder(decoder_input, decoder_hidden, encoder_output)    
#             decoder_outputs[: , di, :] = np.exp(decoder_output)
#             topv, topi = decoder_output.topk(1)
#             decoded_words.append(topi.cpu().numpy())
#             decoder_input = topi.squeeze().detach().unsqueeze(1)  # detach from history as input
        
#         for i in range(batch_size):
#             data = decoder_outputs[i, : :]
#             collections.append(beam_search_decoder(data, 1))
#             print(collections)
#         return (np.asarray(collections).T)
#         #return np.asarray(decoded_words).T#, decoder_attentions[:di + 1]

In [121]:
test(encoder, attn_decoder, val_loader, encoder_cnn = False)

predict: i was a the i i the the the the the . . .
target: when i was i remember waking up one morning to the sound of joy in my house .


2.550712031994356

In [None]:
# def test(encoder, decoder, data_loader, encoder_cnn):
#     total_score = 0
#     count = 0
    
#     candidate_corpus = []
#     reference_corpus = []

#     for i, (input, input_len, target, target_len) in enumerate(data_loader):
#         decoded_words = evaluate(encoder, decoder, input, input_len, encoder_cnn)
#         candidate_sentences = []
#         for ind in range(decoded_words.shape[1]):
#             sent_words = []
#             for token in decoded_words[0][ind]:
#                 if token != PAD_token and token != EOS_token:
# #                     pdb.set_trace()
#                     sent_words.append(train_output_lang.index2word[token])
#                 else:
#                     break
#             sent_words = ' '.join(sent_words)
#             if count == 0:
#                 print('predict: '+sent_words)
#                 count += 1
#     #             sent_words = ' '.join([train_output_lang.index2word[token] for token in decoded_words[0][ind]])
#             candidate_sentences.append(sent_words)
#         candidate_corpus.extend(candidate_sentences)

#         reference_sentences = []
#         for sent in target:
#             sent_words = []
#             for token in sent:
#                 if token.item() != EOS_token:
#                     sent_words.append(train_output_lang.index2word[token.item()])
#                 else:
#                     break
#             sent_words = ' '.join(sent_words)
#             if count == 1:
#                 print('target: '+sent_words)
#                 count += 1
#     #             sent_words = ' '.join([train_output_lang.index2word[token.item()] for token in sent])
#             reference_sentences.append(sent_words)
#         reference_corpus.extend(reference_sentences)
    
#     score = corpus_bleu(candidate_corpus, [reference_corpus], smooth='exp', smooth_floor=0.0, force=False).score
#     return score

In [64]:
def test(encoder, decoder, data_loader, encoder_cnn):
    total_score = 0
    count = 0
    check = 0
    
    candidate_corpus = []
    reference_corpus = []

    for i, (input, input_len, target, target_len) in enumerate(data_loader):
        decoded_words = evaluate(encoder, decoder, input, input_len, encoder_cnn)
        candidate_sentences = []
        for ind in range(decoded_words.shape[1]):
            sent_words = []
            for token in decoded_words[0][ind]:
                if token != PAD_token and token != EOS_token:
                    sent_words.append(train_output_lang.index2word[token])
                else:
                    break
            sent_words = ' '.join(sent_words)
            if check == 0:
                print('predict: '+sent_words)
                check += 1
            candidate_sentences.append(sent_words)

        reference_sentences = []
        for sent in target:
            sent_words = []
            for token in sent:
                if token.item() != EOS_token:
                    sent_words.append(train_output_lang.index2word[token.item()])
                else:
                    break
            sent_words = ' '.join(sent_words)
            if check == 1:
                print('target: '+sent_words)
                check += 1
            reference_sentences.append(sent_words)
        count += 1
        score = corpus_bleu(candidate_sentences, [reference_sentences], smooth='exp', smooth_floor=0.0, force=False).score
        total_score += score
    return total_score / float(count)

In [119]:
encoder = EncoderRNN(train_input_lang.n_words, hidden_size_encoder).to(device)
attn_decoder = BahdanauAttnDecoderRNN(embedding_mat_en, hidden_size_decoder, EMBED_DIM, train_output_lang.n_words, dropout_p=0.4).to(device)

In [120]:
cpt = os.getcwd() + '/saved_model/news2s_att.pt'
m_dict = torch.load(cpt)
encoder.load_state_dict(m_dict['encoder_state_dict'])
attn_decoder.load_state_dict(m_dict['decoder_state_dict'])