In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import io
import unicodedata
import string
import re
import random
import os
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from torch.utils.data import Dataset
from torch.optim import lr_scheduler
import itertools
import glob
plt.switch_backend('agg')
import matplotlib.ticker as ticker
from sacrebleu import corpus_bleu
import sacrebleu
import pdb
import pickle

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
batch_size = 32
words_to_load = 100000
SOS_token = 0
EOS_token = 1
PAD_token = 2
UNK_token = 3
LR_RATE = 0.001
MAX_LENGTH = 40
hidden_size = 300
teacher_forcing_ratio = 0.5
save_path = os.getcwd() + '/saved_model/VI En-RNN-De-Attn_LSTM|Hidden-{}|LR-{}|TF-{}|MaxLen-{}'.format(hidden_size,
                                                                                                 LR_RATE,
                                                                                                 teacher_forcing_ratio,
                                                                                                 MAX_LENGTH)
train_loss_save_path = os.getcwd() + '/train_loss/VI TrainLoss|En-RNN-De-Attn_LSTM|Hidden-{}|LR-{}|TF-{}|MaxLen-{}'.format(hidden_size,
                                                                                                 LR_RATE,
                                                                                                 teacher_forcing_ratio,
                                                                                                 MAX_LENGTH)
bleu_save_path = os.getcwd() + '/bleu/VI BLEU|En-RNN-De-Attn_LSTM|Hidden-{}|LR-{}|TF-{}|MaxLen-{}'.format(hidden_size,
                                                                                                 LR_RATE,
                                                                                                 teacher_forcing_ratio,
                                                                                                 MAX_LENGTH)

__Preprocess Data__

In [3]:
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS", 2: "<pad>", 3: "<unk>"}
        self.n_words = 4  # Count SOS, EOS, pad and unk

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [4]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

# def normalizeZh(s):
#     s = s.strip()
#     s = re.sub("\s+", " ", s)
#     return s

In [5]:
# def filterPair(p):
#     return len(p[0].split(' ')) < MAX_LENGTH and \
#         len(p[1].split(' ')) < MAX_LENGTH

# def filterPairs(pairs):
#     return [pair for pair in pairs if filterPair(pair)]

In [6]:
def filterPair(p):
    filtered = []
    for i in p:
        filtered.append(' '.join(i.split(' ')[:MAX_LENGTH-1]))
    return filtered

def filterPairs(pairs):
    return [filterPair(pair) for pair in pairs]

In [7]:
def readLangs(dataset, lang1, lang2):
    vietnamese = os.getcwd()+'/iwslt-vi-en/{}.tok.{}'.format(dataset, lang1)
    english = os.getcwd()+'/iwslt-vi-en/{}.tok.{}'.format(dataset, lang2)

    vietnamese_lines = open(vietnamese, encoding='utf-8').read().strip().split('\n')
    english_lines = open(english, encoding='utf-8').read().strip().split('\n')
    length = len(vietnamese_lines)

    pairs = [[vietnamese_lines[i], normalizeString(english_lines[i])] for i in range(length)]
    pairs = filterPairs(pairs)
    
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])

    return input_lang, output_lang, pairs

In [8]:
train_input_lang, train_output_lang, train_pairs = readLangs('train', 'vi', 'en')
val_input_lang, val_output_lang, val_pairs = readLangs('dev', 'vi', 'en')
test_input_lang, test_output_lang, test_pairs = readLangs('test', 'vi', 'en')

__Embedding__

In [9]:
def load_embedding(ft_path, words_to_load):
    fin = io.open(ft_path, 'r', encoding='utf-8', newline='\n', errors='ignore')

    n, d = map(int, fin.readline().split())
    vocab_size = words_to_load + 4
    embedding_dim = d

    embedding_mat = np.zeros((vocab_size, embedding_dim))
    token2id = {}
    id2token = {}
    all_tokens = ['SOS', 'EOS', '<unk>', '<pad>']

    for i, line in enumerate(fin):
        if i >= words_to_load:
            break
        s = line.rstrip().split(' ')
        embedding_mat[i+4, :] = np.asarray(s[1:])
        token2id[s[0]] = i+4
        id2token[i+4] = s[0]
        all_tokens.append(s[0])

    token2id['<pad>'] = PAD_token 
    token2id['<unk>'] = UNK_token
    token2id['SOS'] = SOS_token
    token2id['EOS'] = EOS_token
    id2token[PAD_token] = '<pad>'
    id2token[UNK_token] = '<unk>'
    id2token[SOS_token] = 'SOS'
    id2token[EOS_token] = 'EOS'
    embedding_mat[PAD_token, :] = np.zeros((1,d))
    #generate normal dist 1d array for UNK, SOS, EOS token
    embedding_mat[UNK_token, :] = np.random.normal(size=d)
    embedding_mat[SOS_token, :] = np.random.normal(size=d)
    embedding_mat[EOS_token, :] = np.random.normal(size=d)
        
    return embedding_mat, all_tokens, token2id, id2token

In [10]:
fname_vi = os.getcwd()+'/Embedding/wiki.vi.vec'
fname_eng = os.getcwd()+'/Embedding/wiki-news-300d-1M.vec'
embedding_mat_vi, all_tokens_vi, token2id_vi, id2token_vi = load_embedding(fname_vi, words_to_load)
embedding_mat_en, all_tokens_en, token2id_en, id2token_en = load_embedding(fname_eng, words_to_load)

__Data Loader__

In [11]:
class NMTDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, input_lang, output_lang, pairs):
        """
        @param data_list_1: list of sentence 1 tokens 
        @param data_list_2: list of sentence 2 tokens
        @param target_list: list of review targets 

        """
        self.input_w2i = input_lang
        self.output_w2i = output_lang
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        input_sentence = self.pairs[key][0]
        input_indexes = [self.input_w2i[word] if word in self.input_w2i else UNK_token for word in input_sentence.split(' ')]
        input_indexes.append(EOS_token)
        input_length = len(input_indexes)

        output_sentence = self.pairs[key][1]
        output_indexes = [self.output_w2i[word] if word in self.output_w2i else UNK_token for word in output_sentence.split(' ')]
        output_indexes.append(EOS_token)
        output_length = len(output_indexes)
        return [input_indexes, input_length, output_indexes, output_length]

    
def NMTDataset_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    input_ls = []
    output_ls = []
    input_length_ls = []
    output_length_ls = []
    
    for datum in batch:
        input_length_ls.append(datum[1])
        output_length_ls.append(datum[3])
    
    #find max length in each batch
    max_input = sorted(input_length_ls)[-1]
    max_output = sorted(output_length_ls)[-1]
    
    # padding
    for datum in batch:
        padded_vec_input = np.pad(np.array(datum[0]), 
                                  pad_width=((0,MAX_LENGTH-datum[1])), 
                                  mode="constant", constant_values=2).tolist()
        padded_vec_output = np.pad(np.array(datum[2]), 
                                   pad_width=((0,MAX_LENGTH-datum[3])), 
                                   mode="constant", constant_values=2).tolist()
        input_ls.append(padded_vec_input)
        output_ls.append(padded_vec_output)
    return [torch.tensor(torch.from_numpy(np.array(input_ls)), device=device), 
            torch.tensor(input_length_ls, device=device), 
            torch.tensor(torch.from_numpy(np.array(output_ls)), device=device), 
            torch.tensor(output_length_ls, device=device)]

In [12]:
# create pytorch dataloader
train_dataset = NMTDataset(token2id_vi, token2id_en, train_pairs)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size,
                                           collate_fn=NMTDataset_collate_func,
                                           shuffle=True,
                                           drop_last=True)

val_dataset = NMTDataset(token2id_vi, token2id_en, val_pairs)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                         batch_size=batch_size,
                                         collate_fn=NMTDataset_collate_func,
                                         shuffle=True,
                                         drop_last=True)

__Encoder__

In [15]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        
        embed_mat = torch.from_numpy(embedding_mat_vi).float()
        n, embed_dim = embed_mat.shape
#         mask = np.zeros((n,1))
#         mask[0] = 1
#         mask[1] = 1
#         mask[2] = 1
#         mask[3] = 1
#         mask = torch.from_numpy(mask).float()
#         self.mask_embedding = nn.Embedding.from_pretrained(mask, freeze = False)
        self.embedding = nn.Embedding.from_pretrained(embed_mat, freeze = True)
        
#         self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True)
        self.lstm = nn.LSTM(embed_dim, hidden_size, batch_first=True, bidirectional=True)

    def forward(self, input, input_len, hidden):        
        # get embedding of characters
        embed = self.embedding(input)
#         mask = self.mask_embedding(input)
        
#         embedded = mask*embed + (1-mask)*embed.clone().detach()
        embedded = embed
#         output, hidden = self.gru(embedded, hidden)
        output, hidden = self.lstm(embedded, hidden)
        
        return output, hidden

    def initHidden(self, batch_size):
        return (torch.zeros(2, batch_size, self.hidden_size, device=device),
                torch.zeros(2, batch_size, self.hidden_size, device=device))

__Decoder With Attention__

In [16]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, num_layers=1, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers 
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        embed_mat = torch.from_numpy(embedding_mat_vi).float()
        n, embed_dim = embed_mat.shape
        
        self.embedding = nn.Embedding.from_pretrained(embed_mat, freeze=True)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)

#         self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True)
        self.lstm = nn.LSTM(embed_dim, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden, encoder_outputs):
        embed = self.embedding(input)
        embed = self.dropout(embed)   
        
        hidden_reshaped = [h.view(hidden[0].size()[1],1,-1) for h in hidden]
        attn_weights = F.softmax(self.attn(torch.cat((embed, hidden_reshaped[0]), 2)), dim=2)
        
#         pdb.set_trace()
#         print(attn_weights.size())
#         print(encoder_outputs.size())
        attn_applied = torch.bmm(attn_weights, encoder_outputs).squeeze(1)
        
        output = torch.cat((embed.squeeze(1), attn_applied), 1)
 
        output = self.attn_combine(output).unsqueeze(1)
        
        output = F.relu(output)
        
#         pdb.set_trace()
        hidden_reshaped = [h.view(1,h.size()[0],-1) for h in hidden_reshaped]
        output, hidden = self.lstm(output, hidden_reshaped)
        output = self.softmax(self.out(output.squeeze(1)))
        
        return output, hidden, attn_weights

    def initHidden(self,batch_size):
        return torch.randn(self.num_layers, batch_size, self.hidden_size,device=device)

__Training__

In [17]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [18]:
def train(input, target, input_len, target_len, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, teach_forcing_ratio=0.5, encoder_cnn = False):
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    max_input_len = max(input_len)
    max_target_len = max(target_len)

    loss = 0
    
    if not encoder_cnn:
        encoder_hidden = encoder.initHidden(batch_size)
        encoder_output, encoder_hidden = encoder(input, input_len, encoder_hidden)
#         print(encoder_output.size())
    else:
        encoder_hidden = encoder(input)
        
    decoder_input = torch.tensor([[SOS_token]]*batch_size, device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(max_target_len):
#             decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            decoder_output, decoder_hidden, attn_weights = decoder(decoder_input, decoder_hidden, encoder_output)
            loss += criterion(decoder_output, target[:,di])
            decoder_input = target[:,di].unsqueeze(1)  # Teacher forcing (batch_size, 1)

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(max_target_len):
#             decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            decoder_output, decoder_hidden, attn_weights = decoder(decoder_input, decoder_hidden, encoder_output)
            loss += criterion(decoder_output, target[:,di])
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach().unsqueeze(1)  # detach from history as input
    #         if decoder_input.item() == EOS_token:
    #             break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / float(max_target_len)

In [19]:
def trainIters(loader, encoder, decoder, n_iters, encoder_cnn, save_path, print_every=1000, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    criterion = nn.NLLLoss()
    
    best_bleu = None
#     save_path = os.getcwd() + '/saved_model/En-CNN-De-NoAttn|Hidden-{}|LR-{}|TF-{}|MaxLen-{}.pt'.format(hidden_size, 
#                                                                                                         learning_rate,
#                                                                                                         teacher_forcing_ratio,
#                                                                                                         max_length)
    save_path = save_path + '.pt'
    
    train_loss_hist = []
    bleu_hist = []
    
    for iter in range(1, n_iters + 1):
        for i, (input, input_len, target, target_len) in enumerate(train_loader):
            loss = train(input, target, input_len, target_len, encoder, decoder, 
                         encoder_optimizer, decoder_optimizer, criterion, 
                         teach_forcing_ratio=teacher_forcing_ratio, encoder_cnn = encoder_cnn)
            print_loss_total += loss
            plot_loss_total += loss
            
            
            if i % print_every == 0:
                current_bleu = test(encoder, decoder, val_loader, encoder_cnn)
                if not best_bleu or current_bleu > best_bleu:
                    torch.save({
                                'epoch': iter,
                                'encoder_state_dict': encoder.state_dict(),
                                'decoder_state_dict': decoder.state_dict(),
                                'encoder_optimizer_state_dict': encoder_optimizer.state_dict(),
                                'decoder_optimizer_state_dict': decoder_optimizer.state_dict(),
                                'train_loss': loss,
                                'best_BLEU': best_bleu
                                }, save_path)
                    best_bleu = current_bleu
                
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                train_loss_hist.append(print_loss_avg)
                bleu_hist.append(current_bleu)
                print('%s (Epoch: %d %d%%) | Train Loss: %.4f | Best Bleu: %.4f | Current Blue: %.4f' 
                      % (timeSince(start, iter / n_iters), iter, iter / n_iters * 100, print_loss_avg, best_bleu, current_bleu))

#             if i % plot_every == 0:
#                 plot_loss_avg = plot_loss_total / plot_every
#                 plot_losses.append(plot_loss_avg)
#                 plot_loss_total = 0
#     showPlot(plot_losses)
    return train_loss_hist, bleu_hist

__Test__

In [20]:
def evaluate(encoder, decoder, input, input_len, encoder_cnn, max_length=MAX_LENGTH):
    """
    Function that generate translation.
    First, feed the source sentence into the encoder and obtain the hidden states from encoder.
    Secondly, feed the hidden states into the decoder and unfold the outputs from the decoder.
    Lastly, for each outputs from the decoder, collect the corresponding words in the target language's vocabulary.
    And collect the attention for each output words.
    @param encoder: the encoder network
    @param decoder: the decoder network
    @param input: string, input sentence in source language to be translated
    @param max_length: the max # of words that the decoder can return
    @output decoded_words: a list of words in target language
    @output decoder_attentions: a list of vector, each of which sums up to 1.0
    """    
    # process input sentence
    with torch.no_grad():
        
        max_input_len = max(input_len)
        
        if not encoder_cnn:
            encoder_hidden = encoder.initHidden(batch_size)
            encoder_output, encoder_hidden = encoder(input, input_len, encoder_hidden)
        else:
            encoder_hidden = encoder(input)

        decoder_input = torch.tensor([[SOS_token]]*batch_size, device=device)
        # decode the context vector
        decoder_hidden = encoder_hidden # decoder starts from the last encoding sentence
        
        # output of this function
        decoded_words = []
#         decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            # for each time step, the decoder network takes two inputs: previous outputs and the previous hidden states
            decoder_output, decoder_hidden, attn_weights = decoder(decoder_input, decoder_hidden, encoder_output)
            topv, topi = decoder_output.topk(1)
            decoded_words.append(topi.cpu().numpy())
            decoder_input = topi.squeeze().detach().unsqueeze(1)  # detach from history as input

        return np.asarray(decoded_words).T#, decoder_attentions[:di + 1]

In [21]:
def test(encoder, decoder, data_loader, encoder_cnn):
    total_score = 0
    count = 0
    
    candidate_corpus = []
    reference_corpus = []

    for i, (input, input_len, target, target_len) in enumerate(data_loader):
        decoded_words = evaluate(encoder, decoder, input, input_len, encoder_cnn)
        candidate_sentences = []
        for ind in range(decoded_words.shape[1]):
            sent_words = []
            for token in decoded_words[0][ind]:
                if token != PAD_token and token != EOS_token:
#                     pdb.set_trace()
                    sent_words.append(id2token_en[token.item()])
#                     sent_words.append(train_output_lang.index2word[token])
                else:
                    break
            sent_words = ' '.join(sent_words)
            if count == 0:
                print('predict: '+sent_words)
                count += 1
    #             sent_words = ' '.join([train_output_lang.index2word[token] for token in decoded_words[0][ind]])
            candidate_sentences.append(sent_words)
        candidate_corpus.extend(candidate_sentences)

        reference_sentences = []
        for sent in target:
            sent_words = []
            for token in sent:
                if token.item() != EOS_token:
                    sent_words.append(id2token_en[token.item()])
#                     sent_words.append(train_output_lang.index2word[token.item()])
                else:
                    break
            sent_words = ' '.join(sent_words)
            if count == 1:
                print('target: '+sent_words)
                count += 1
    #             sent_words = ' '.join([train_output_lang.index2word[token.item()] for token in sent])
            reference_sentences.append(sent_words)
        reference_corpus.extend(reference_sentences)
    
    score = corpus_bleu(candidate_corpus, [reference_corpus], smooth='exp', smooth_floor=0.0, force=False).score
    return score

__Run__

In [None]:
encoder_hidden_size = int(hidden_size/2)
encoder = EncoderRNN(hidden_size = encoder_hidden_size).to(device)
# encoder = EncoderCNN(hidden_size,kernel_dim=3,batch_size=batch_size).to(device)
# noattn_decoder = DecoderRNN(hidden_size, train_output_lang.n_words).to(device)
# noattn_decoder = DecoderRNN(hidden_size, embedding_mat_en.shape[0]).to(device)
attn_decoder = AttnDecoderRNN(hidden_size, embedding_mat_vi.shape[0]).to(device)

#UNCOMMENT TO TRAIN THE MODEL
train_loss_hist, bleu_hist = trainIters(train_loader, encoder, attn_decoder, n_iters=20, encoder_cnn=False, save_path = save_path, print_every=1000, learning_rate=LR_RATE)

with open(train_loss_save_path, 'wb') as f:
     pickle.dump(train_loss_hist, f)
with open(bleu_save_path, 'wb') as f:
     pickle.dump(bleu_hist, f)
# trainIters(encoder1, attn_decoder1, 75000, print_every=5000)

# encoder.load_state_dict(torch.load("encoder.pth"))
# attn_decoder1.load_state_dict(torch.load("attn_decoder.pth"))

predict: indica proficiency accessory accessory
target: i joined forces with many other <unk> inside and outside <unk> to call for a day of rage and to initiate a revolution against the tyrannical regime of <unk> .
0m 14s (- 4m 26s) (Epoch: 1 5%) | Train Loss: 0.0115 | Best Bleu: 0.0013 | Current Blue: 0.0013
predict: and <unk> s <unk> s the <unk> s the <unk> s the <unk> <unk> .
target: we have never met a single human being in the world who can make it sell it and look after the money .




12m 1s (- 228m 35s) (Epoch: 1 5%) | Train Loss: 3.2727 | Best Bleu: 5.1695 | Current Blue: 5.1695
predict: and <unk> <unk> <unk> . . .
target: some i even considered like my second home .




23m 57s (- 455m 4s) (Epoch: 1 5%) | Train Loss: 2.9352 | Best Bleu: 5.3165 | Current Blue: 5.3165
predict: and <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> . <unk> <unk>
target: so so the government says <unk> do it again . <unk> <unk>




35m 47s (- 680m 2s) (Epoch: 1 5%) | Train Loss: 2.8111 | Best Bleu: 5.9015 | Current Blue: 5.9015
predict: and we <unk> <unk> to to to to the the the the the <unk> <unk> <unk> .
target: there was no question that his children would receive an education including his daughters despite the taliban despite the risks .




47m 39s (- 905m 36s) (Epoch: 1 5%) | Train Loss: 2.7478 | Best Bleu: 6.2177 | Current Blue: 6.2177
predict: it <unk> s a . . . .
target: thanks so much . max little everybody .




49m 46s (- 447m 59s) (Epoch: 2 10%) | Train Loss: 0.4475 | Best Bleu: 6.2177 | Current Blue: 6.1929
predict: and the it <unk> the to to to to to to . .
target: <unk> this world will be much poorer without these wonderful species .




61m 39s (- 554m 52s) (Epoch: 2 10%) | Train Loss: 2.6364 | Best Bleu: 7.0160 | Current Blue: 7.0160
predict: and is the the and and and and and and <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>
target: they were lawyers journalists priests they all said <unk> we don <unk> t want this . <unk> <unk>




73m 29s (- 661m 21s) (Epoch: 2 10%) | Train Loss: 2.6091 | Best Bleu: 7.0206 | Current Blue: 7.0206
predict: we <unk> to to to . . .
target: we have helped to start businesses .




85m 16s (- 767m 31s) (Epoch: 2 10%) | Train Loss: 2.5838 | Best Bleu: 7.0206 | Current Blue: 6.8971
predict: and i i to to to people the of of the . . .
target: patronizing i treat everybody from another culture as if they were my servants .




97m 4s (- 873m 39s) (Epoch: 2 10%) | Train Loss: 2.5304 | Best Bleu: 7.6110 | Current Blue: 7.6110
predict: this is s in the in i i i a a a a a . .
target: this is six months of my life into this file .




99m 9s (- 561m 56s) (Epoch: 3 15%) | Train Loss: 0.4175 | Best Bleu: 7.6110 | Current Blue: 7.4320
predict: and if i <unk> m a a to to and and and and and and you you to you
target: and if i could communicate just one thing to <unk> and to sam and to you it would be that you don <unk> t have to be normal .




110m 55s (- 628m 37s) (Epoch: 3 15%) | Train Loss: 2.4545 | Best Bleu: 8.4655 | Current Blue: 8.4655
predict: and i have a . .
target: so i had an idea .




122m 41s (- 695m 16s) (Epoch: 3 15%) | Train Loss: 2.4387 | Best Bleu: 8.4655 | Current Blue: 7.7597
predict: and <unk> you <unk> re going to it it <unk> <unk> <unk> .
target: and now you <unk> re of course curious if it also worked .




134m 29s (- 762m 7s) (Epoch: 3 15%) | Train Loss: 2.4298 | Best Bleu: 8.4655 | Current Blue: 8.0117
predict: and the the i i i i i i <unk> a a of the the the the the the the the the
target: in the <unk> i found children carrying stone for miles down mountainous terrain to trucks waiting at roads below .




146m 15s (- 828m 46s) (Epoch: 3 15%) | Train Loss: 2.4148 | Best Bleu: 8.4655 | Current Blue: 8.0003


In [None]:
with open(train_loss_save_path, 'rb') as f:
     train_loss_hist_pk = pickle.load(f)
with open(bleu_save_path, 'rb') as f:
     bleu_hist_pk = pickle.load(f)

In [None]:
def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)