In [0]:
# http://pytorch.org/
# Install the PyTorch 0.4.1 wheel that matches this interpreter and the local CUDA runtime.
from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Wheel tag assembled from the implementation abbreviation, implementation version and ABI tag.
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
# Turn the last two numeric components of the cudart library name listed by ldconfig into a `cuXY` tag.
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
# Use the CPU wheel when no NVIDIA device node is present.
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

# torch is pinned to 0.4.1 here; torchvision is installed unpinned.
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision
import torch

In [18]:
# Mount Google Drive at /content/gdrive; the data paths used later in this notebook live under 'gdrive/My Drive'.
from google.colab import drive
drive.mount('/content/gdrive') 

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import pickle

from torch.autograd import Variable

plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np
%matplotlib inline
from torch.utils.data import Dataset

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.nn import functional

import time
import math

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
USE_CUDA = torch.cuda.is_available()

# Sentence length cap in tokens: longer pairs are filtered out, encoded sentences are
# truncated so that EOS still fits, and every batch is padded to exactly this length.
MAX_LENGTH = 40 #temp

# Only referenced by commented-out vocabulary-building code in the visible part of this notebook.
MAX_VOCAB_SIZE = 50000

# Reserved vocabulary indices for the special tokens; pretrained words start at index 4.
PAD_IDX = 0 
SOS_token = 1
EOS_token = 2
UNK_IDX = 3


In [0]:
# Folder holding the IWSLT vi-en files and every artifact this notebook writes (pickles, checkpoints).
# The path is relative, so it resolves against the current working directory.
en_loc = 'gdrive/My Drive/iwslt-vi-en'

In [0]:
def load_emb_matrix(language, words_to_load=50000):
  """Load pretrained 300-dimensional word vectors and build the vocabulary maps.

  Indices 0-3 are reserved for the special tokens (PAD_IDX, SOS_token,
  EOS_token, UNK_IDX); their embedding rows are left as zeros. Pretrained
  words are numbered from 4 in file order.

  Args:
    language: 'english' or 'vietnamese'; selects which .vec file is read
      from 'gdrive/My Drive/'.
    words_to_load: maximum number of word vectors to keep.

  Returns:
    (words2id, idx2words, loaded_embeddings) where loaded_embeddings is a
    numpy array of shape (words_to_load + 4, 300).

  Raises:
    ValueError: if `language` is not one of the supported values.
  """
  emb_dim = 300
  vec_files = {
    'english': 'wiki-news-300d-1M.vec',
    'vietnamese': 'wiki.vi.vec',
  }
  if language not in vec_files:
    raise ValueError("Unsupported language %r; expected one of %s"
                     % (language, sorted(vec_files)))

  words2id = {}
  idx2words = {}
  loaded_embeddings = np.zeros((words_to_load + 4, emb_dim))
  count = 0
  with open('gdrive/My Drive/' + vec_files[language], encoding='utf-8') as f:
    # The first line of a .vec file is a header, not a word vector.
    f.readline()
    for line in f:
      if count >= words_to_load:
        break
      s = line.split()
      # Skip rows that do not split into exactly one token plus emb_dim values
      # (for example tokens that themselves contain whitespace).
      if len(s[1:]) == emb_dim:
        loaded_embeddings[count + 4, :] = np.asarray(s[1:], dtype=float)
        words2id[s[0]] = count + 4
        idx2words[count + 4] = s[0]
        count = count + 1

  # Register the special tokens last so they win over any identical file token.
  words2id['<SOS>'] = SOS_token
  words2id['<EOS>'] = EOS_token
  words2id['<pad>'] = PAD_IDX
  words2id['<unk>'] = UNK_IDX
  idx2words[SOS_token] = '<SOS>'
  idx2words[EOS_token] = '<EOS>'
  idx2words[PAD_IDX] = '<pad>'
  idx2words[UNK_IDX] = '<unk>'
  return words2id,idx2words,loaded_embeddings

def generate_weights_matrix(idx2words,loaded_embeddings):
  """Build the embedding weight matrix for every index in `idx2words`.

  Args:
    idx2words: dict mapping integer vocabulary index -> word.
    loaded_embeddings: array (or mapping) indexed by vocabulary index that
      yields the pretrained vector for that index.

  Returns:
    numpy array of shape (len(idx2words), emb_dim). Indices without a
    pretrained vector get a random normal vector (scale 0.6).
  """
  # Take the width from the embeddings when possible; 300 is the fallback
  # used elsewhere in this notebook.
  emb_dim = loaded_embeddings.shape[1] if hasattr(loaded_embeddings, 'shape') else 300
  weights_matrix = np.zeros((len(idx2words), emb_dim))
  for key in idx2words.keys():
    try:
      vector = loaded_embeddings[key]
    except (KeyError, IndexError):
      # numpy raises IndexError for an out-of-range row, a dict raises KeyError.
      vector = np.random.normal(scale=0.6, size=(emb_dim, ))
    weights_matrix[key] = vector
  return weights_matrix

In [0]:
import pickle as pkl
words2id_eng,idx2words_eng,loaded_embeddings_eng = load_emb_matrix('english')
words2id_vi,idx2words_vi,loaded_embeddings_vi = load_emb_matrix('vietnamese')

# Persist the vocabulary maps and embedding matrices next to the dataset.
# Each file is written inside a `with` block so the handle is flushed and closed.
for artifact_name, artifact in [
    ('words2id_eng', words2id_eng),
    ('idx2words_eng', idx2words_eng),
    ('embedding_matrix_eng', loaded_embeddings_eng),
    ('words2id_vi', words2id_vi),
    ('idx2words_vi', idx2words_vi),
    ('embedding_matrix_vi', loaded_embeddings_vi),
]:
  with open(en_loc + '/' + artifact_name + '.pkl', 'wb') as fp:
    pkl.dump(artifact, fp)

In [0]:
# Build the weight matrices, save the numpy versions, then move tensors to the device.
# Files are written inside `with` blocks so the handles are flushed and closed.
weights_matrix_eng = generate_weights_matrix(idx2words_eng,loaded_embeddings_eng)
with open(en_loc + '/weights_matrix_eng.pkl', 'wb') as fp:
  pkl.dump(weights_matrix_eng, fp)
weights_matrix_eng = torch.from_numpy(weights_matrix_eng).to(device)

weights_matrix_vi = generate_weights_matrix(idx2words_vi,loaded_embeddings_vi)
with open(en_loc + '/weights_matrix_vi.pkl', 'wb') as fp:
  pkl.dump(weights_matrix_vi, fp)
weights_matrix_vi = torch.from_numpy(weights_matrix_vi).to(device)

In [0]:
import torch
from torch.nn import functional
from torch.autograd import Variable

def sequence_mask(sequence_length, max_len=None):
    """Build a (batch, max_len) mask that is true for positions before each length.

    Args:
        sequence_length: 1-D integer tensor of size (batch,) with the valid
            length of every sequence.
        max_len: number of mask columns; defaults to the largest length.

    Returns:
        Tensor of shape (batch, max_len) where entry [b, t] is true when
        t < sequence_length[b].
    """
    if max_len is None:
        max_len = int(sequence_length.max())
    batch_size = sequence_length.size(0)
    # torch.arange excludes its end point (unlike the deprecated torch.range)
    # and is created directly on the same device as the lengths.
    seq_range = torch.arange(max_len, dtype=torch.long, device=sequence_length.device)
    seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
    seq_length_expand = sequence_length.unsqueeze(1).expand_as(seq_range_expand)
    return seq_range_expand < seq_length_expand


def masked_cross_entropy(logits, target, length):
    """Average negative log-likelihood over the non-padded target positions.

    Args:
        logits: FloatTensor of size (batch, max_len, num_classes) with the
            unnormalized score of every class at every step.
        target: LongTensor of size (batch, max_len) with the index of the
            true class at every step.
        length: sequence of size (batch,) with the valid length of every
            target sequence.

    Returns:
        loss: scalar tensor, the summed loss of the valid positions divided
            by the total number of valid positions.
    """
    length = torch.LongTensor(length)

    # logits_flat: (batch * max_len, num_classes)
    logits_flat = logits.view(-1, logits.size(-1)).to(device)
    # log_probs_flat: (batch * max_len, num_classes); normalize over the class dimension
    log_probs_flat = functional.log_softmax(logits_flat, dim=1)
    # target_flat: (batch * max_len, 1), on the same device as the log-probabilities
    target_flat = target.view(-1, 1).to(device)
    # losses_flat: (batch * max_len, 1)
    losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat)
    # losses: (batch, max_len)
    losses = losses_flat.view(*target.size())
    # mask: (batch, max_len), zero for every position at or beyond the sequence length
    mask = sequence_mask(sequence_length=length, max_len=target.size(1)).to(device)
    losses = losses * mask.float()
    loss = losses.sum() / length.float().sum().to(device)
    return loss


# Remove punctuation
def removePunctuation(s):
    """Strip sentence punctuation and leftover HTML entities from a line.

    The characters . ! ? : , are deleted first, then the entity strings are
    removed in a fixed order, and finally surrounding whitespace is trimmed.
    """
    cleaned = s.translate(str.maketrans('', '', '.!?:,'))
    for entity in ('&lt;', '&gt;', '&amp;', '&apos;', '&quot;'):
        cleaned = cleaned.replace(entity, '')
    return cleaned.strip()


from typing import List
from collections import Counter, namedtuple
from itertools import zip_longest

def tokenize_13a(line):
    """
    Minimal tokenization intended to be equivalent to mteval-v13a (used by WMT).
    :param line: a segment to tokenize
    :return: the tokenized line, single-spaced and without surrounding whitespace
    """

    # language-independent part: plain substring replacements, applied in order
    literal_replacements = (
        ('<skipped>', ''),
        ('-\n', ''),
        ('\n', ' '),
        ('&quot;', '"'),
        ('&amp;', '&'),
        ('&lt;', '<'),
        ('&gt;', '>'),
    )
    # language-dependent part (assuming Western languages), applied in order
    regex_rewrites = (
        (r'([\{-\~\[-\` -\&\(-\+\:-\@\/])', ' \\1 '),
        (r'([^0-9])([\.,])', '\\1 \\2 '),  # period and comma unless preceded by a digit
        (r'([\.,])([^0-9])', ' \\1 \\2'),  # period and comma unless followed by a digit
        (r'([0-9])(-)', '\\1 \\2 '),  # dash when preceded by a digit
        (r'\s+', ' '),  # one space only between words
        (r'^\s+', ''),  # no leading space
        (r'\s+$', ''),  # no trailing space
    )

    text = line
    for old, new in literal_replacements:
        text = text.replace(old, new)

    text = " {} ".format(text)
    for pattern, replacement in regex_rewrites:
        text = re.sub(pattern, replacement, text)

    return text

def corpus_bleu(sys_stream, ref_streams, smooth='exp', smooth_floor=0.0, force=False, lowercase=False,
                 use_effective_order=False):
    """Compute corpus-level BLEU of a system output against one or more references.

    Every segment is tokenized with tokenize_13a before n-grams are counted.

    :param sys_stream: The system stream (a sequence of segments), or a single string
    :param ref_streams: A list of one or more reference streams (each a sequence of segments),
        or a single string
    :param smooth: The smoothing method to use
    :param smooth_floor: For 'floor' smoothing, the floor to use
    :param force: Do not warn about data that looks already tokenized
    :param lowercase: Lowercase the data
    :param use_effective_order: Passed through to compute_bleu
    :return: the value returned by compute_bleu for the accumulated statistics
    :raises EOFError: if the system and reference streams have different lengths
    """
    # Local import: nothing in the visible notebook cells brings `logging` into scope,
    # so the warning branch below used to fail with a NameError.
    import logging

    # Add some robustness to the input arguments
    if isinstance(sys_stream, str):
        sys_stream = [sys_stream]
    if isinstance(ref_streams, str):
        ref_streams = [[ref_streams]]

    sys_len = 0
    ref_len = 0

    correct = [0] * NGRAM_ORDER
    total = [0] * NGRAM_ORDER

    # look for already-tokenized sentences
    tokenized_count = 0

    for lines in zip_longest(sys_stream, *ref_streams):
        if None in lines:
            raise EOFError("Source and reference streams have different lengths!")

        if lowercase:
            lines = [x.lower() for x in lines]

        if not force and lines[0].rstrip().endswith(' .'):
            tokenized_count += 1

            if tokenized_count == 100:
                logging.warning('That\'s 100 lines that end in a tokenized period (\'.\')')
                logging.warning('It looks like you forgot to detokenize your test data, which may hurt your score.')
                logging.warning('If you insist your data is detokenized, or don\'t care, you can suppress this message with \'--force\'.')

        output, *refs = [tokenize_13a(x.rstrip()) for x in lines]

        ref_ngrams, _closest_diff, closest_len = ref_stats(output, refs)

        sys_len += len(output.split())
        ref_len += closest_len

        sys_ngrams = extract_ngrams(output)
        for ngram, sys_count in sys_ngrams.items():
            n = len(ngram.split())
            # Clip each system n-gram count by its maximum count in the references.
            correct[n-1] += min(sys_count, ref_ngrams.get(ngram, 0))
            total[n-1] += sys_count

    return compute_bleu(correct, total, sys_len, ref_len, smooth, smooth_floor, use_effective_order)
  
  
# Maximum n-gram order used by the BLEU helpers in this cell (counts cover 1..NGRAM_ORDER). Don't change this.
NGRAM_ORDER = 4
  
def compute_bleu(correct: List[int], total: List[int], sys_len: int, ref_len: int, smooth = 'none', smooth_floor = 0.01,
                 use_effective_order = False):
    """Turn BLEU sufficient statistics into a score, with optional smoothing.

    :param correct: List of counts of correct ngrams, 1 <= n <= NGRAM_ORDER
    :param total: List of counts of total ngrams, 1 <= n <= NGRAM_ORDER
    :param sys_len: The cumulative system length
    :param ref_len: The cumulative reference length
    :param smooth: The smoothing method to use ('exp', 'floor', anything else means none)
    :param smooth_floor: The smoothing value added, if smooth method 'floor' is used
    :param use_effective_order: Average only over the orders that have at least one
        system n-gram, instead of always over NGRAM_ORDER. This helps short segments
        that cannot contain the highest-order n-grams.
    :return: The BLEU score as a float (precisions are 100-based).
    """
    precisions = [0] * NGRAM_ORDER
    effective_order = NGRAM_ORDER
    exp_divisor = 1.

    for order in range(NGRAM_ORDER):
        attempts = total[order]
        if attempts == 0:
            break

        if use_effective_order:
            effective_order = order + 1

        hits = correct[order]
        if hits != 0:
            precisions[order] = 100. * hits / attempts
        elif smooth == 'exp':
            # Each zero-match order doubles the divisor.
            exp_divisor *= 2
            precisions[order] = 100. / (exp_divisor * attempts)
        elif smooth == 'floor':
            precisions[order] = 100. * smooth_floor / attempts

    # Penalize system output that is shorter than the reference.
    if sys_len >= ref_len:
        brevity_penalty = 1.0
    elif sys_len > 0:
        brevity_penalty = math.exp(1 - ref_len / sys_len)
    else:
        brevity_penalty = 0.0

    log_precision_sum = sum(my_log(p) for p in precisions[:effective_order])
    return brevity_penalty * math.exp(log_precision_sum / effective_order)
  
  
def ref_stats(output, refs):
    """Collect reference n-gram counts and the reference length closest to the output.

    :param output: the tokenized system segment
    :param refs: the tokenized reference segments for that segment
    :return: (ngrams, closest_diff, closest_len) where ngrams holds, per n-gram, its
        maximum count over the references; ties on length difference pick the shorter
        reference
    """
    output_len = len(output.split())
    merged_counts = Counter()
    best_diff = None
    best_len = None

    for reference in refs:
        ref_len = len(reference.split())
        gap = abs(output_len - ref_len)
        if best_diff is None or gap < best_diff:
            best_diff, best_len = gap, ref_len
        elif gap == best_diff and ref_len < best_len:
            best_len = ref_len

        for ngram, count in extract_ngrams(reference).items():
            merged_counts[ngram] = max(merged_counts[ngram], count)

    return merged_counts, best_diff, best_len
  
  
def extract_ngrams(line, min_order=1, max_order=NGRAM_ORDER) -> Counter:
    """Count every n-gram of a whitespace-tokenized segment.

    :param line: a segment containing a sequence of words
    :param min_order: smallest n-gram size to collect
    :param max_order: largest n-gram size to collect
    :return: a Counter keyed by the space-joined n-gram
    """
    tokens = line.split()
    return Counter(
        ' '.join(tokens[start:start + size])
        for size in range(min_order, max_order + 1)
        for start in range(len(tokens) - size + 1)
    )

def my_log(num):
    """
    Natural logarithm with a floor for zero.
    :param num: the number
    :return: math.log(num), or a very low constant when num is zero
    """
    return -9999999999 if num == 0.0 else math.log(num)
  

class Lang:
    """Holds a language name together with its prebuilt vocabulary maps.

    The vocabulary is supplied by the caller; nothing is counted or built here.
    """

    def __init__(self, name,word2index,index2word):
        self.name = name
        self.word2index, self.index2word = word2index, index2word
        # Vocabulary size is the number of distinct words in the forward map.
        self.n_words = len(self.word2index)
            
def remove_blanks(pair):
    '''Filter predicate: False only when both sides of the pair are empty.'''
    return not (len(pair[0]) == 0 and len(pair[1]) == 0)
    
def set_max_length(pair, max_length=MAX_LENGTH):
    '''Filter predicate: True when neither side has more than max_length space-separated words.'''
    return (len(pair[0].split(' ')) <= max_length
            and len(pair[1].split(' ')) <= max_length)
    
def readLangs(filename1, filename2):
    """Read two parallel text files and return the language objects and filtered pairs.

    filename1 is paired with the "vi" vocabulary and filename2 with the "en"
    vocabulary. Lines are stripped of punctuation, zipped into pairs, and pairs
    that are blank on both sides or longer than MAX_LENGTH words are dropped.
    """
    print("Reading lines...")

    def _read_clean_lines(path):
        # One entry per line of the file, with punctuation removed.
        with open(path, encoding='utf-8') as handle:
            raw_lines = handle.read().strip().split('\n')
        return [removePunctuation(raw) for raw in raw_lines]

    lines1 = _read_clean_lines(filename1)
    lines2 = _read_clean_lines(filename2)

    input_lang = Lang("vi",words2id_vi,idx2words_vi)
    output_lang = Lang("en",words2id_eng,idx2words_eng)

    pairs = [pair for pair in zip(lines1, lines2)
             if remove_blanks(pair) and set_max_length(pair)]

    return input_lang, output_lang, pairs 


def prepareData(lang1, lang2, num_sent=None):
    """Load the sentence pairs from two files and report the vocabulary sizes.

    lang1 and lang2 are file paths handed to readLangs; num_sent keeps only the
    first num_sent pairs (None keeps all of them).
    """
    input_lang, output_lang, all_pairs = readLangs(lang1, lang2)

    pairs = all_pairs[:num_sent]
    print("Read %s sentence pairs" % len(pairs))

    # The vocabularies are prebuilt, so there is nothing to count here.
    print("Counting words...")
    print("Counted words:")
    for lang in (input_lang, output_lang):
        print(lang.name, lang.n_words)

    return input_lang, output_lang, pairs
  
class VocabDataset(Dataset):
    """
    PyTorch dataset over parallel sentence pairs.
    Each item is the pair encoded as vocabulary indices, plus both lengths.
    """

    def __init__(self, data_tuple, word2id_lang1, word2id_lang2):
        """
        @param data_tuple: iterable of (sentence1, sentence2) string pairs
        @param word2id_lang1: word -> index map for the first sentence of each pair
        @param word2id_lang2: word -> index map for the second sentence of each pair
        """
        self.data_list1, self.data_list2 = zip(*data_tuple)
        assert (len(self.data_list1) == len(self.data_list2))
        self.word2id1 = word2id_lang1
        self.word2id2 = word2id_lang2

    def __len__(self):
        return len(self.data_list1)

    @staticmethod
    def _encode(sentence, word2id):
        """Map whitespace tokens to indices, truncate to leave room for EOS, append EOS."""
        token_ids = [word2id.get(token, UNK_IDX) for token in sentence.split()][:MAX_LENGTH-1]
        token_ids.append(EOS_token)
        return token_ids

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        input_sentence = self._encode(self.data_list1[key], self.word2id1)
        output_sentence = self._encode(self.data_list2[key], self.word2id2)

        return [input_sentence, output_sentence, len(input_sentence), len(output_sentence)]

def vocab_collate_func(batch):
    """
    Collate function for DataLoader: pads every sentence of the batch with
    PAD_IDX up to the fixed length MAX_LENGTH.

    @param batch: list of [input_ids, output_ids, input_len, output_len] items
    @return: [inputs, outputs, input_lengths, output_lengths]; inputs and outputs
        have MAX_LENGTH columns, one row per item
    """
    def _pad(token_ids, length):
        # Right-pad one index list to MAX_LENGTH.
        return np.pad(np.array(token_ids),
                      pad_width=(0, MAX_LENGTH - length),
                      mode="constant", constant_values=PAD_IDX)

    data_list1 = []
    data_list2 = []
    length_list1 = []
    length_list2 = []

    for datum in batch:
        x1, x2, len1, len2 = datum[0], datum[1], datum[2], datum[3]
        data_list1.append(_pad(x1, len1))
        data_list2.append(_pad(x2, len2))
        length_list1.append(len1)
        length_list2.append(len2)

    return [torch.from_numpy(np.array(data_list1)),
            torch.from_numpy(np.array(data_list2)),
            torch.LongTensor(length_list1),
            torch.LongTensor(length_list2)]




class EncoderRNN(nn.Module):
    """Unidirectional GRU encoder on top of frozen pretrained embeddings."""

    def __init__(self,weights_matrix, hidden_size, vocab_size, dropout=0):
        '''
        weights_matrix: 2-D tensor (num_embeddings x embedding_dim) copied into the embedding layer
        hidden_size: size of the GRU hidden state
        vocab_size: accepted for interface compatibility; not used by this class
        dropout: forwarded to nn.GRU
        '''
        super().__init__()

        self.hidden_size = hidden_size
        self.dropout = dropout

        n_rows, n_cols = weights_matrix.size()
        self.num_embeddings, self.embedding_dim = n_rows, n_cols

        # Embedding input: max_length x batch_size
        # Embedding output: max_length x batch_size x embedding_dim
        # The pretrained weights are copied in and kept frozen.
        self.embedding = nn.Embedding(n_rows, n_cols, padding_idx=0)
        self.embedding.weight.data.copy_(weights_matrix)
        self.embedding.weight.requires_grad = False

        self.gru = nn.GRU(n_cols, hidden_size, dropout=dropout, bidirectional=False)

    def forward(self, input_seqs, input_lengths, hidden=None):
        '''
        Encode a whole padded batch at once.
        Returns the GRU output for the packed batch (still packed) and the final hidden state.
        NOTE(review): depending on the installed PyTorch version, pack_padded_sequence may
        require input_lengths sorted in decreasing order - confirm the loader guarantees this.
        '''
        packed = torch.nn.utils.rnn.pack_padded_sequence(self.embedding(input_seqs), input_lengths)
        return self.gru(packed, hidden)

# class EncoderRNN(nn.Module):
#     def __init__(self, weights_matrix, input_size, hidden_size,n_layers=1):
#         super(EncoderRNN, self).__init__()
     
        
#         self.hidden_size = hidden_size
#         self.input_size = input_size
#         self.n_layers = n_layers
#         self.batch_size = BATCH_SIZE
#         self.num_embeddings, self.embedding_dim = weights_matrix.size()
        
#         self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)
#         self.embedding.weight.data.copy_(weights_matrix)
#         self.embedding.weight.requires_grad = False

        
#         self.gru = nn.GRU(self.embedding_dim, hidden_size, n_layers, bidirectional=True)
        

#     def forward(self, input_seqs, input_len, hidden=None):

       
#         embedded = self.embedding(input_seqs)
#         packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_len)
#         output, hidden = self.gru(packed, hidden)

#         output, output_len = torch.nn.utils.rnn.pad_packed_sequence(output)
#         output = output[:, :, :self.hidden_size] + output[:, : ,self.hidden_size:]
        
#         return output,hidden


# class DecoderRNN(nn.Module):
#     def __init__(self, weights_matrix, hidden_size, output_size,n_layers=1):
#         super(DecoderRNN, self).__init__()
#         self.hidden_size = hidden_size
#         self.output_size = output_size
#         self.n_layers = n_layers
#         self.batch_size = BATCH_SIZE
#         self.num_embeddings, self.embedding_dim = weights_matrix.size()
        
#         #self.embedding = nn.Embedding(output_size, hidden_size)
#         self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)
#         self.embedding.weight.data.copy_(weights_matrix)
#         self.embedding.weight.requires_grad = False
        
#         self.gru1 = nn.GRU(self.embedding_dim, hidden_size,n_layers)
#         self.gru2 = nn.GRU(hidden_size, hidden_size,n_layers)
        
#         self.out = nn.Linear(hidden_size, output_size)
#         self.softmax = nn.LogSoftmax(dim=1)

#     def forward(self, input_seq, hidden):
        
#         embedded = self.embedding(input_seq) # dim = Batch_Size x embedding_dim
#         embedded = embedded.view(1, self.batch_size, self.embedding_dim) # S=1 x Batch_Size x embedding_dim
        
#         rnn_output, hidden = self.gru1(embedded, hidden)
#         output = F.relu(rnn_output)
        
#         output, hidden = self.gru2(output, hidden)
#         output = self.softmax(self.out(output[0]))
        
#         return output,hidden



#     def initHidden(self):
#         return torch.zeros(1, 1, self.hidden_size).to(device)
    
class DecoderRNN(nn.Module):
    """Single-step GRU decoder on top of frozen pretrained embeddings."""

    def __init__(self, weights_matrix, hidden_size, vocab_size):
        '''
        weights_matrix: 2-D tensor (num_embeddings x embedding_dim) copied into the embedding layer
        hidden_size: size of the GRU hidden state
        vocab_size: number of output classes of the projection layer
        '''
        super().__init__()

        self.hidden_size = hidden_size
        self.num_embeddings, self.embedding_dim = weights_matrix.size()

        # The pretrained weights are copied in and kept frozen.
        self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)
        self.embedding.weight.data.copy_(weights_matrix)
        self.embedding.weight.requires_grad = False

        self.gru = nn.GRU(self.embedding_dim, hidden_size)
        self.out = nn.Linear(hidden_size, vocab_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, inp, hidden):
        '''
        Run one decoding step for a batch of token indices.
        Returns the log-probabilities over the vocabulary (batch x vocab_size)
        and the updated hidden state.
        '''
        # unsqueeze so that the GRU sees 1 x batch x embedding_dim
        step_input = F.relu(self.embedding(inp).unsqueeze(0))
        gru_output, hidden = self.gru(step_input, hidden)
        log_probs = self.softmax(self.out(gru_output[0]))
        return log_probs, hidden
    
    
def train(inputs, input_lengths, targets, target_lengths, 
          encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH,
         teacher_forcing_ratio=0.5):
    """Run one optimization step on a single batch and return its loss.

    Args:
        inputs: source index tensor; the batch size is read from dimension 1.
        input_lengths: lengths of the source sequences, passed to the encoder.
        targets: target index tensor, indexed by time step on dimension 0.
        target_lengths: lengths of the target sequences, used to mask the loss.
        encoder, decoder: the two networks.
        encoder_optimizer, decoder_optimizer: their optimizers.
        criterion: unused; kept for interface compatibility (the loss is
            always masked_cross_entropy).
        max_length: number of decoding steps to unroll.
        teacher_forcing_ratio: probability of feeding the ground-truth token
            as the next decoder input for this whole batch.

    Returns:
        The batch loss as a Python float.
    """
    # Zero gradients of both optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    batch_size = inputs.size(1)

    # Run words through encoder
    _, encoder_hidden = encoder(inputs, input_lengths, None)

    # The decoder starts from SOS and from the encoder's final hidden state
    decoder_input = torch.LongTensor([SOS_token] * batch_size).to(device)
    decoder_hidden = encoder_hidden

    # Teacher forcing is decided once per batch
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    step_outputs = []
    for t in range(max_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        step_outputs.append(decoder_output)

        if use_teacher_forcing:
            decoder_input = targets[t]
        else:
            # Feed back the model's own prediction. squeeze(1) keeps the batch
            # dimension even when the batch holds a single sentence.
            _, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze(1).detach()

    # (max_length, batch, vocab), kept on the device the decoder produced it on
    all_decoder_outputs = torch.stack(step_outputs)

    loss = masked_cross_entropy(
        all_decoder_outputs.transpose(0, 1).contiguous(),
        targets.transpose(0, 1).contiguous(),
        target_lengths)

    loss.backward()

    # Update parameters with optimizers
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item()

def trainIters(loader, encoder, decoder, n_iters, print_every=1000, plot_every=100, validate_every=1,
               learning_rate=0.01,
              teacher_forcing_ratio=0.5):
    """Train the encoder/decoder for n_iters passes over `loader`.

    Args:
        loader: iterable yielding (source, target, lengths1, lengths2) batches.
        encoder, decoder: the networks to train (optimized with SGD).
        n_iters: number of passes over the loader.
        print_every: print the average loss every this many batches.
        plot_every: record the average loss and save checkpoints, the loss
            history and the BLEU history every this many batches.
        validate_every: compute validation BLEU every this many batches. The
            default of 1 validates after every batch.
        learning_rate: SGD learning rate for both optimizers.
        teacher_forcing_ratio: forwarded to train().

    Returns:
        (plot_losses, val_bleu): the recorded average losses and BLEU scores.

    Validation reads the module-level `val_loader`.
    """
    start = time.time()
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    print_loss_total = 0
    plot_loss_total = 0
    plot_losses = []
    val_bleu = []

    counter = 0
    epoch = 0

    while epoch < n_iters:
        epoch += 1

        # Get training data for this cycle
        for source, target, lengths1, lengths2 in loader:

            counter += 1

            # Run the train function (it expects time-major tensors)
            loss = train(
                source.long().transpose(0,1).to(device), lengths1, target.long().transpose(0,1).to(device), lengths2,
                encoder, decoder,
                encoder_optimizer, decoder_optimizer, criterion, teacher_forcing_ratio=teacher_forcing_ratio
            )

            # Keep track of loss
            print_loss_total += loss
            plot_loss_total += loss

            if counter % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print_summary = '%s (%d %d%%) %.4f' % (time_since(start, epoch / n_iters), epoch, 
                                                       epoch / n_iters * 100, print_loss_avg)
                print(print_summary)

            if counter % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0

                torch.save(encoder.state_dict(), en_loc + '/encoder.pt')
                torch.save(decoder.state_dict(), en_loc + '/decoder.pt')

                with open(en_loc + '/loss.p', 'wb') as fp:
                    pickle.dump(plot_losses, fp)

                with open(en_loc + '/bleu.p', 'wb') as fp:
                    pickle.dump(val_bleu, fp)

            if counter % validate_every == 0:
                bleu = validate(encoder, decoder, val_loader)
                # Record the score so the returned and pickled history is not empty.
                val_bleu.append(bleu)
                print('Bleu ', bleu)

    showPlot(plot_losses)
    return plot_losses, val_bleu

def evaluate(encoder, decoder, sentence, input_lengths, translated, search='greedy', max_length=MAX_LENGTH):
    """
    Greedily translate one encoded sentence and decode its reference.
    The source is run through the encoder, the decoder is unrolled from SOS with the
    encoder's hidden state, and at each step the highest-scoring token is kept until
    EOS or max_length steps.
    @param encoder: the encoder network
    @param decoder: the decoder network
    @param sentence: index tensor of the source sentence; it is transposed before encoding
    @param input_lengths: length information passed to the encoder
    @param translated: iterable of index tensors for the expected translation
    @param search: unused; decoding is always greedy
    @param max_length: the max # of words that the decoder can return
    @output decoded_words: predicted words as one string, each word preceded by a space
    @output translation: reference words as one string, each word preceded by a space
    """
    special_ids = [SOS_token, EOS_token, UNK_IDX, PAD_IDX]

    with torch.no_grad():
        input_tensor = sentence.transpose(0,1).to(device)

        # encode the source language
        _, encoder_hidden = encoder(input_tensor, input_lengths, None)

        decoder_input = torch.tensor([SOS_token], device=device)  # SOS
        decoder_hidden = encoder_hidden[:1]

        decoded_words = ''
        for _ in range(max_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)

            # GREEDY: keep the single best token
            _, topi = decoder_output.data.topk(1)
            token_id = topi.item()

            if token_id == EOS_token:
                break
            if token_id not in special_ids:
                decoded_words = decoded_words + ' ' + output_lang.index2word[token_id]

            decoder_input = topi[0].detach()

        # Special tokens are skipped in the reference as well.
        translation = ''.join(
            ' ' + output_lang_v.index2word[token.item()]
            for token in translated
            if token.item() not in special_ids
        )

    return decoded_words, translation
    
    
def evaluate_batch(loader, encoder, decoder):
    """
    Run evaluate() on every sentence served by a data loader, one sentence at a time.

    @param loader: iterable yielding (source, target, lengths1, lengths2) batches
    @param encoder: encoder handed unchanged to evaluate()
    @param decoder: decoder handed unchanged to evaluate()
    @output decoded_sentences: first value returned by evaluate() for each sentence, in loader order
    @output actual_sentences: second value returned by evaluate() for each sentence, in loader order
    """
    hypotheses, references = [], []

    # The target lengths (4th batch element) are not needed for decoding.
    for source, target, lengths1, _ in loader:
        for row in range(len(source)):
            # evaluate() receives a single sentence with an extra leading dimension.
            hypothesis, reference = evaluate(
                encoder,
                decoder,
                source[row].unsqueeze(0),
                lengths1[row],
                target[row],
            )
            hypotheses.append(hypothesis)
            references.append(reference)

    return hypotheses, references


def validate(encoder, decoder, val_loader):
    """
    Decode everything in val_loader and score the result.

    @param encoder: encoder handed to evaluate_batch()
    @param decoder: decoder handed to evaluate_batch()
    @param val_loader: loader handed to evaluate_batch()
    @output bleu: the value returned by evaluate_bleu() for the decoded/reference lists
    """
    hypotheses, references = evaluate_batch(val_loader, encoder, decoder)
    return evaluate_bleu(hypotheses, references)


def evaluate_bleu(translation_list, reference_list):
    """
    Score a list of translations against one list of references with corpus_bleu.

    @param translation_list: list of system translations
    @param reference_list: list of reference translations, parallel to translation_list
    @output: whatever corpus_bleu returns

    NOTE(review): corpus_bleu is imported elsewhere in the notebook; the
    (hypotheses, [references]) argument order looks like sacrebleu's — confirm.
    """
    # A single reference set, wrapped in an outer list as the second argument.
    reference_streams = [reference_list]
    return corpus_bleu(translation_list, reference_streams)

#Plot results
def showPlot(points):
    """
    Draw a line plot of the given values with y-axis ticks every 0.2.

    @param points: sequence of numeric y-values, plotted against their index
    """
    # plt.subplots() already creates the figure; an additional plt.figure()
    # call beforehand would leave an extra, empty figure open on every call.
    _fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    # Plot on the axes we just configured rather than on pyplot's implicit "current" axes.
    ax.plot(points)
    

def as_minutes(s):
    """
    Format a duration given in seconds as '<minutes>m <seconds>s'.

    @param s: duration in seconds (int or float)
    @output: string such as '2m 5s'; both parts are rendered with %d
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

def time_since(since, percent):
    """
    Report elapsed time and a linear estimate of the time remaining.

    @param since: start timestamp, in the same units as time.time()
    @param percent: fraction of the work completed so far (must be non-zero)
    @output: string '<elapsed> (- <remaining>)', both parts formatted by as_minutes()
    """
    elapsed = time.time() - since
    # Extrapolate total duration from the completed fraction.
    projected_total = elapsed / percent
    remaining = projected_total - elapsed
    return '%s (- %s)' % (as_minutes(elapsed), as_minutes(remaining))



In [25]:
# Load the training split (Vietnamese source file, English target file).
# The third positional argument is None; the dev-split call passes the same
# value as num_sent=None, so presumably it means "no cap on the number of
# sentences" — confirm against prepareData.
input_lang, output_lang, pairs = prepareData(en_loc+'/train.tok.vi', en_loc+'/train.tok.en', None)

Reading lines...
Read 121083 sentence pairs
Counting words...
Counted words:
vi 49955
en 50004


In [26]:
# Load the dev split into its own Lang objects and pair list.
# NOTE(review): the recorded output reports the same vocabulary sizes as for the
# training split (vi 49955 / en 50004) — confirm that input_lang_v/output_lang_v
# map words to the same indices as input_lang/output_lang, since the networks
# are sized from the training Lang objects.
input_lang_v, output_lang_v, pairs_v = prepareData(en_loc+'/dev.tok.vi', en_loc+'/dev.tok.en', num_sent=None)

Reading lines...
Read 1143 sentence pairs
Counting words...
Counted words:
vi 49955
en 50004


In [27]:
# Peek at the first five training pairs; each is a (Vietnamese, English) tuple of strings.
pairs[0:5]

[('Khoa_học đằng_sau một tiêu_đề về khí_hậu',
  'Rachel Pike  The science behind a climate headline'),
 ('Tôi muốn cho các bạn biết về sự to_lớn của những nỗ_lực khoa_học đã góp_phần làm_nên các dòng tít bạn thường thấy trên báo',
  'I d like to talk to you today about the scale of the scientific effort that goes into making the headlines you see in the paper'),
 ('Có những dòng trông như thế_này khi bàn về biến_đổi khí_hậu  và như thế_này khi nói về chất_lượng không_khí hay khói bụi',
  'Headlines that look like this when they have to do with climate change  and headlines that look like this when they have to do with air quality or smog'),
 ('Cả hai đều là một nhánh của cùng một lĩnh_vực trong ngành khoa_học khí_quyển',
  'They are both two branches of the same field of atmospheric science'),
 ('Các tiêu_đề gần_đây trông như thế_này khi Ban Điều_hành Biến_đổi khí_hậu Liên_chính_phủ  gọi tắt là IPCC đưa ra_bài nghiên_cứu của họ về hệ_thống khí_quyển',
  'Recently the headlines looked l

In [28]:
# Peek at the first five dev pairs; each is a (Vietnamese, English) tuple of strings.
pairs_v[0:5]

[('Khi tôi còn nhỏ  Tôi nghĩ rằng BắcTriều Tiên là đất_nước tốt nhất trên thế_giới và tôi thường hát bài " Chúng_ta chẳng có gì phải ghen_tị  "',
  'When I was little  I thought my country was the best on the planet  and I grew up singing a song called  Nothing To Envy'),
 ('Tôi đã rất tự_hào về đất_nước tôi', 'And I was very proud'),
 ('Ở trường  chúng_tôi dành rất nhiều thời_gian để học về cuộc_đời của chủ_tịch Kim II - Sung  nhưng lại không học nhiều về thế_giới bên_ngoài  ngoại_trừ việc Hoa_Kỳ  Hàn_Quốc và Nhật_Bản là kẻ_thù của chúng_tôi',
  'In school  we spent a lot of time studying the history of Kim Il-Sung  but we never learned much about the outside world  except that America  South Korea  Japan are the enemies'),
 ('Mặc_dù tôi đã từng tự_hỏi không biết thế_giới bên_ngoài kia như thế_nào  nhưng tôi vẫn nghĩ rằng mình sẽ sống cả cuộc_đời ở BắcTriều Tiên  cho tới khi tất_cả mọi thứ đột_nhiên thay_đổi',
  'Although I often wondered about the outside world  I thought I would spe

In [0]:

# Batch size used by both DataLoaders.
BATCH_SIZE=32
# Hidden size shared by the encoder and the decoder.
# NOTE(review): 265 may be a typo for 256 — confirm before relying on saved checkpoints.
hidden_size=265

# Encoder for the source (Vietnamese) side, built from weights_matrix_vi and
# sized by the training source vocabulary.
encoder = EncoderRNN(weights_matrix_vi,hidden_size = hidden_size, vocab_size = input_lang.n_words).to(device)

# Disabled alternative: 2-layer encoder, and restoring encoder weights from a checkpoint.
#encoder = EncoderRNN(weights_matrix_vi, input_lang.n_words, hidden_size,n_layers = 2).to(device)
#encoder.load_state_dict(torch.load(en_loc + '/encoder.pt'))
# Decoder for the target (English) side, built from weights_matrix_eng and
# sized by the training target vocabulary.
decoder = DecoderRNN(weights_matrix_eng,hidden_size = hidden_size, vocab_size = output_lang.n_words).to(device)

# Disabled alternative: 2-layer decoder.
#decoder = DecoderRNN(weights_matrix_eng, hidden_size, output_lang.n_words,n_layers = 2).to(device)

# Disabled: restoring decoder weights from a checkpoint.
#decoder.load_state_dict(torch.load(en_loc + '/decoder.pt'))

In [30]:
# Wrap the training pairs in a Dataset, indexing words with the training vocabularies.
train_dataset = VocabDataset(pairs,input_lang.word2index, output_lang.word2index)


# Shuffled training mini-batches; vocab_collate_func assembles each batch.
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)

# NOTE(review): the dev set is indexed with the dev Lang objects (input_lang_v /
# output_lang_v) rather than the training ones — confirm both assign the same
# index to the same word, otherwise the encoder sees mismatched ids.
val_dataset = VocabDataset(pairs_v, input_lang_v.word2index, output_lang_v.word2index)
# 1 batch input dimension: num_sentences x max sentence length
# 1 batch: source_sentences, target_sentences, source_lengths, target_lengths
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=False)




# Train the model; the call is expected to return a (plot_losses, bleu) pair.
# NOTE(review): val_loader is not passed in, so trainIters presumably reads it
# as a global — verify.
# NOTE(review): the recorded run of this cell ended with "ValueError: ignored";
# the cause is not visible from this cell and should be resolved before sharing.
plot_losses, bleu = trainIters(train_loader, encoder, decoder, n_iters=5, 
                         print_every=100
                         , plot_every=100, validate_every = 500, learning_rate=0.001, teacher_forcing_ratio=0.5)

ValueError: ignored

In [0]:
# Rebuild an encoder the same way the training encoder is constructed (embedding
# matrix first, same hidden size and vocabulary size) and restore saved weights.
# The original cell assigned the model to `senc` but then used `enc`, which
# raised a NameError.
enc = EncoderRNN(weights_matrix_vi, hidden_size=hidden_size, vocab_size=input_lang.n_words).to(device)
# en_loc has no trailing slash, so the '/' separator is required here.
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
enc.load_state_dict(torch.load(en_loc + '/encoder.pt', map_location=device))

In [0]:
# Decode every sentence served by train_loader; evaluate_batch returns the model
# outputs and the reference translations as two parallel lists.
decoded, actual = evaluate_batch(train_loader, encoder, decoder)

# Print only the first 10 pairs. The original loop compared the zipped tuple
# itself to 10, which is never true, so it printed every sentence.
for hypothesis, reference in zip(decoded[:10], actual[:10]):
    print('\n')
    print('Expected:', reference)
    print('Actual:', hypothesis)

In [0]:
# Corpus-level BLEU of the sentences decoded in the previous cell against their references.
evaluate_bleu(decoded, actual)