## Install torch && Scarbleu

Reference: 
https://github.com/spro/practical-pytorch/tree/master/seq2seq-translation

In [0]:
!pip install -q torch

In [0]:
!pip3 install sacrebleu

Collecting sacrebleu
  Downloading https://files.pythonhosted.org/packages/37/51/bffea2b666d59d77be0413d35220022040a1f308c39009e5b023bc4eb8ab/sacrebleu-1.2.12.tar.gz
Collecting typing (from sacrebleu)
  Downloading https://files.pythonhosted.org/packages/4a/bd/eee1157fc2d8514970b345d69cb9975dcd1e42cd7e61146ed841f6e68309/typing-3.6.6-py3-none-any.whl
Building wheels for collected packages: sacrebleu
  Running setup.py bdist_wheel for sacrebleu ... [?25l- done
[?25h  Stored in directory: /root/.cache/pip/wheels/ea/0a/7d/ddcbdcd15a04b72de1b3f78e7e754aab415aff81c423376385
Successfully built sacrebleu
Installing collected packages: typing, sacrebleu
Successfully installed sacrebleu-1.2.12 typing-3.6.6


## Load Pacgages

In [0]:
import numpy as np
from sacrebleu import corpus_bleu
from collections import Counter
import pickle as pkl
import random
import pdb
import pandas as pd
import string
import re
import unicodedata
import os
import time
import math

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset
from torch.autograd import Variable
from torch.nn import functional

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
%matplotlib inline

#specify SOS() and EOS(end of sentence)
#specify maximum vocabulary size = 50000
PAD_IDX = 2
UNK_IDX = 3
SOS_token = 0
EOS_token = 1
MAX_VOCAB_SIZE = 500000
MAX_LENGTH = 30

train_en = 'data/train.tok.en'
train_zh = 'data/train.tok.zh'
val_en = 'data/dev.tok.en'
val_zh = 'data/dev.tok.zh'

Token_list = ["<SOS>", "<EOS>", "<PAD>", "<UNK>"]

In [0]:
#user GPU if possible
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
  print("Currently using GPU")

Currently using GPU


In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


## Load Embedding

In [0]:
folder_path = os.getcwd() + '/gdrive/My Drive/NLP_Project/'

In [0]:
import unicodedata
import re

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {"<SOS>":0, "<EOS>":1, "<PAD>":2, "<UNK>":3}
        #self.word2count = {"<SOS>":0, "<EOS>":0, "<PAD>":0, "<UNK>":0}
        self.word2count = {}
        #self.word2count = {"<SOS>":0, "<EOS>":0, "<PAD>":0, "<UNK>":0}
        self.index2word = {0: "<SOS>", 1: "<EOS>", 2:"<PAD>", 3:"<UNK>"}
        self.n_words = 4  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            if (word!="") and (word !=" "):
                self.addWord(word)

    def addWord(self, word):
      if (word not in Token_list):
        if word not in self.word2index:
            
          self.word2index[word] = self.n_words
          self.word2count[word] = 1
          self.index2word[self.n_words] = word
          self.n_words += 1
        else:
          self.word2count[word] += 1
    def delWord(self, word):
        if (word in self.word2index.keys()):
            index = self.word2index[word]
            del self.word2index[word]
            del self.index2word[index] 
            self.n_words = self.n_words-1
        else:
          print ("Word Not Existed")
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def readLangs(address_lang1, address_lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines_lang1 = open(folder_path+address_lang1, encoding='utf-8').\
        read().strip().split('\n')
    lines_lang2 = open(folder_path+address_lang2, encoding='utf-8').\
        read().strip().split('\n')
    
    assert (len(lines_lang1)==len(lines_lang2))
    # Split every line into pairs and normalize
    
    pairs = [[lines_lang1[i], normalizeString(lines_lang2[i])] for i in range (len(lines_lang1))]
    #print (pairs[-1])
    # Reverse pairs, make Lang instances
    lang1=address_lang1[-2:]
    lang2=address_lang2[-2:]
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

def prepareData(address_lang1, address_lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(address_lang1, address_lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

In [0]:
def load_emb_matrix(language):
    #load fasttext word vectors
    words_to_load = MAX_VOCAB_SIZE
    if language == 'english':
      file = 'wiki-news-300d-1M-subword.vec'
    if language == 'chinese':
      file = 'cc.zh.300.vec'
    

    with open(folder_path + 'data/' + file) as f:
        #remove the first line
        firstLine = f.readline()
        loaded_embeddings = np.zeros((words_to_load + 4, 300))
        words2id = {}
        idx2words = {}
        ordered_words = []

        for i, line in enumerate(f):
            if i >= words_to_load: 
                break
            s = line.split()
            loaded_embeddings[i , :] = np.asarray(s[1:])           
            words2id[s[0]] = i 
            idx2words[i] = s[0]
    words2id['<SOS>'] = SOS_token
    words2id['<EOS>'] = EOS_token
    words2id['<PAD>'] = PAD_IDX
    words2id['<UNK>'] = UNK_IDX
    idx2words[0] = '<SOS>'
    idx2words[1] = '<EOS>'
    idx2words[2] = '<PAD>'
    idx2words[3] = '<UNK>'
    return words2id,idx2words,loaded_embeddings

In [0]:
def generate_weights_matrix(index2word_lang, word2index_lang, index2word_embed, word2index_embed, loaded_embeddings):
    emb_dim=300
    missing_count=0
    matrix_len = len(index2word_lang)
    weights_matrix = np.zeros((matrix_len, 300))
    
    for key in index2word_lang.keys():
        word=index2word_lang[key]
        if (word in word2index_embed.keys()):
          weights_matrix[key] = loaded_embeddings[word2index_embed[word]]
        else:
          missing_count=missing_count+1
          weights_matrix[key] = np.random.normal(scale=0.6, size=(emb_dim, ))
    print ("missing count: ", missing_count)
    return weights_matrix

## Load data 

In [0]:
train_input_lang, train_output_lang, train_pairs = prepareData(train_zh, train_en, reverse=False)

Reading lines...
Read 213376 sentence pairs
Trimmed to 150216 sentence pairs
Counting words...
Counted words:
zh 64772
en 37140


In [0]:
for word, count in train_input_lang.word2count.items():
  if word not in Token_list:
    if count<4:
      train_input_lang.delWord(word)
print ("train_input_lang lenght: ", len(train_input_lang.word2index))

train_input_lang lenght:  24622


In [0]:
for word, count in train_output_lang.word2count.items():
  if word not in Token_list:
    if count<4:
      train_output_lang.delWord(word)
print ("train_output_lang lenght: ", len(train_output_lang.word2index))

train_output_lang lenght:  14663


In [0]:
train_input_lang_new=Lang("zh")
for word in train_input_lang.word2index.keys():
  train_input_lang_new.addWord(word)
print ("train_input_lang lenght: ", len(train_input_lang_new.word2index))

train_input_lang lenght:  24622


In [0]:
train_output_lang_new=Lang("en")
for word in train_output_lang.word2index.keys():
  train_output_lang_new.addWord(word)
print ("train_input_lang lenght: ", len(train_output_lang_new.word2index))

train_input_lang lenght:  14663


In [0]:
# words2id_eng,idx2words_eng,loaded_embeddings_eng = load_emb_matrix('english')
# words2id_zh,idx2words_zh,loaded_embeddings_zh = load_emb_matrix('chinese')

In [0]:
# pkl.dump(words2id_eng, open(folder_path + 'data/words2id_eng_0.5M.pkl', 'wb'))
# pkl.dump(idx2words_eng, open(folder_path +'data/idx2words_eng_0.5M.pkl', 'wb'))
# pkl.dump(loaded_embeddings_eng, open(folder_path +'data/embedding_matrix_eng_0.5M.pkl', 'wb'))

# pkl.dump(words2id_zh, open(folder_path + 'data/words2id_zh_0.5M.pkl', 'wb'))
# pkl.dump(idx2words_zh, open(folder_path + 'data/idx2words_zh_0.5M.pkl', 'wb'))
# pkl.dump(loaded_embeddings_zh, open(folder_path +'data/embedding_matrix_zh_0.5M.pkl', 'wb'))

In [0]:
words2id_eng=pkl.load(open(folder_path + 'data/words2id_eng_0.5M.pkl', 'rb'))
idx2words_eng=pkl.load(open(folder_path +'data/idx2words_eng_0.5M.pkl', 'rb'))
loaded_embeddings_eng=pkl.load(open(folder_path +'data/embedding_matrix_eng_0.5M.pkl', 'rb'))

words2id_zh=pkl.load(open(folder_path + 'data/words2id_zh_0.5M.pkl', 'rb'))
idx2words_zh=pkl.load(open(folder_path + 'data/idx2words_zh_0.5M.pkl', 'rb'))
loaded_embeddings_zh=pkl.load(open(folder_path +'data/embedding_matrix_zh_0.5M.pkl', 'rb'))

In [0]:
weights_matrix_eng=generate_weights_matrix(train_output_lang_new.index2word, train_output_lang_new.word2index, idx2words_eng, words2id_eng, loaded_embeddings_eng)
weights_matrix_eng = torch.from_numpy(weights_matrix_eng).to(device)
pkl.dump(weights_matrix_eng, open(folder_path +'data/weights_matrix_eng_1M_final.pkl', 'wb'))


weights_matrix_zh=generate_weights_matrix(train_input_lang_new.index2word, train_input_lang_new.word2index, idx2words_zh, words2id_zh, loaded_embeddings_zh) 
weights_matrix_zh = torch.from_numpy(weights_matrix_zh).to(device)
pkl.dump(weights_matrix_zh, open(folder_path +'data/weights_matrix_zh_1M_final.pkl', 'wb'))

missing count:  473
missing count:  2969


In [0]:
# weights_matrix_eng=generate_weights_matrix(train_output_lang.index2word, train_output_lang.word2index, idx2words_eng, words2id_eng, loaded_embeddings_eng)
# weights_matrix_eng = torch.from_numpy(weights_matrix_eng).to(device)
# pkl.dump(weights_matrix_eng, open(folder_path +'data/weights_matrix_eng_1M_final.pkl', 'wb'))


# weights_matrix_zh=generate_weights_matrix(train_input_lang.index2word, train_input_lang.word2index, idx2words_zh, words2id_zh, loaded_embeddings_zh) 
# weights_matrix_zh = torch.from_numpy(weights_matrix_zh).to(device)
# pkl.dump(weights_matrix_zh, open(folder_path +'data/weights_matrix_zh_1M_final.pkl', 'wb'))

## Data Loader

In [0]:
def indexesFromSentence(lang, sentence):
    sentence=sentence.strip()
    
    result = []
    for word in sentence.split(" "):
      if (word!="") and (word!=" "):
        if word in lang.word2index:
          result.append(lang.word2index[word])
        else:
          result.append(UNK_IDX)
    result.append(EOS_token)
    return result

In [0]:
class VocabDataset(Dataset):
    """
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, pairs,input_language, output_language):
        """
        @param pairs: pairs of input and target sentences(raw text sentences)
        @param input_language: Class Lang of input languages (zh in this case)
        @param output_language: Class Lang of output languages (en in this case)

        """
        self.pairs = pairs
        self.inputs = [pair[0] for pair in pairs]
        self.input_lang = input_language
        self.output_lang = output_language
        self.outputs = [pair[1] for pair in pairs]
        
        
        #assert self.input_lang == self.target_lang
       
    def __len__(self):
         return len(self.pairs)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        
        #turn raw text sentecens into indices
        input_ = indexesFromSentence(self.input_lang, self.inputs[key])
        output = indexesFromSentence(self.output_lang, self.outputs[key])
        #print (output)
        #print both the length of the source sequence and the target sequence
        return [input_,len(input_),output,len(output)]
    
    
    def __gettext__(self,key):
      return [self.inputs[key],self.outputs[key]]

def vocab_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    input_data_list = []
    output_data_list = []
   
    
    for datum in batch:
      input_data_list.append(datum[0])
      output_data_list.append(datum[2])
      
      
    # Zip into pairs, sort by length (descending), unzip
    seq_pairs = sorted(zip(input_data_list, output_data_list), key=lambda p: len(p[0]), reverse=True)
    input_seqs, output_seqs = zip(*seq_pairs)
    
    #store the length of the sequences 
    input_data_len = [len(p) for p in input_seqs]
    output_data_len = [len(p) for p in output_seqs]
    #MAX_LENGTH=max
    #padding
    padded_vec_input = [np.pad(np.array(p),
                                 pad_width=((0,max(input_data_len)-len(p))),
                                 mode="constant", constant_values=PAD_IDX) for p in input_seqs]
        
    padded_vec_output = [np.pad(np.array(p),
                                 pad_width=((0,max(output_data_len)-len(p))),
                                 mode="constant", constant_values=PAD_IDX) for p in output_seqs]      
    
    
    input_var = Variable(torch.LongTensor(padded_vec_input))
    output_var = Variable(torch.LongTensor(padded_vec_output))
    input_data_len = Variable(torch.LongTensor(input_data_len))
    output_data_len = Variable(torch.LongTensor(output_data_len))
    
    
    return [input_var,input_data_len,output_var,output_data_len]

## Encoder

In [0]:
class EncoderRNN(nn.Module):
    def __init__(self, weights_matrix, input_size, hidden_size,n_layers=1):
        super(EncoderRNN, self).__init__()
     
        
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.n_layers = n_layers
        #self.batch_size = BATCH_SIZE
        self.num_embeddings, self.embedding_dim = weights_matrix.size()
        
        self.embedding_dropout = nn.Dropout(drop_out)
        
        self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)
        self.embedding.from_pretrained(weights_matrix, freeze=True, sparse=False)
        #self.embedding.weight.requires_grad = True

        
        self.gru = nn.GRU(self.embedding_dim, hidden_size, n_layers, bidirectional=True)
        

    def forward(self, input_seqs, input_len, hidden=None):
        
        self.batch_size = input_seqs.size()[0]
       
        embedded = self.embedding(input_seqs)
#         print("input",input_seqs.shape)
#         print ("embeded", embedded.shape)
        embedded = self.embedding_dropout(embedded)
        
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_len)
        output, hidden = self.gru(packed, hidden)

        output, output_len = torch.nn.utils.rnn.pad_packed_sequence(output)
        output = output[:, :, :self.hidden_size] + output[:, : ,self.hidden_size:]
        
        return output,hidden
    
#     def random_init_hidden(self, batch_size):
#         hidden = torch.zeros(n_layers*2, batch_size, self.hidden_size, device=device)
#         nn.init.xavier_normal_(hidden)
#         return hidden

## Decoder

In [0]:
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        
        self.method = method
        self.hidden_size = hidden_size
        
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(1, hidden_size))
     
    def forward(self, hidden, encoder_outputs):
#       print ("Attn Forward")
#       print ("hidden", hidden.shape)
#       print ("encoder_outputs", encoder_outputs.shape)
      attn_energies = self.score(hidden, encoder_outputs).squeeze(2)
#       print ("attn_energies", attn_energies.shape)
      #score = F.softmax(attn_energies, dim = 1).view(1, self.batch_size, -1)
      
      
      return F.softmax(attn_energies).unsqueeze(1)

    def score(self, hidden, encoder_output):
#       print ("hidden", hidden.shape)
#       print ("encoder_output", encoder_output.shape)
      '''
      Args
          hidden: size 1 x B x hidden_size
          encoder_output: size N x B x hidden_size
      Return 
          energy: size B x N x 1
      '''
#       print ("score")
#       print ("hidden", hidden.shape)
#       print ("encoder_outputs", encoder_output.shape)
#       print ("hidden_change", hidden.squeeze(0).unsqueeze(2).shape)
      self.batch_size = hidden.shape[1]
      if self.method == 'dot':
          energy = torch.bmm(encoder_output.transpose(1,0), hidden.squeeze(0).unsqueeze(2)) 
#           print ("energy", energy.shape)
          return energy 

      elif self.method == 'general':
          energy = torch.bmm(encoder_output.transpose(1,0), self.attn(hidden.squeeze(0)).unsqueeze(2)) 
          #print ("energy", energy.shape)
          return energy

      elif self.method == 'concat':
          concat = torch.cat((hidden.transpose(1,0).expand(self.batch_size ,encoder_output.shape[0], self.hidden_size), encoder_output.transpose(1,0)), dim = 2)
          #print("concat", concat)
          tanh = nn.Tanh()
          out = tanh(self.attn(concat)) #size: B x N x hidden_size
          #print ("out", out)
          #print ("v", self.v.expand(self.batch_size, 1, self.hidden_size))
          energy = torch.bmm(self.v.expand(self.batch_size, 1, self.hidden_size), out.transpose(2,1)).transpose(1,2)
          #print ("energy", energy)
#           print ("energy", energy.shape)
          return energy

class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, weights_matrix, hidden_size, output_size, drop_out, n_layers=1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.drop_out=drop_out
        #self.batch_size = BATCH_SIZE
        self.num_embeddings, self.embedding_dim = weights_matrix.size()


        # Define layers
        #self.embedding = nn.Embedding(output_size, hidden_size)
        self.embedding = nn.Embedding(self.num_embeddings,self.hidden_size)
        
#         self.embedding.weight.data.copy_(weights_matrix)
#         self.embedding.weight.requires_grad = True
        self.embedding.from_pretrained(weights_matrix, freeze=True, sparse=False)
        
        self.embedding_dropout = nn.Dropout(drop_out)
      
        #self.gru1 = nn.GRU(self.embedding_dim, hidden_size, n_layers)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=drop_out)
       # self.gru2 = nn.GRU(hidden_size, hidden_size,n_layers)
        
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        
        # Choose attention model
        if attn_model != 'none':
            self.attn = Attn(attn_model, hidden_size)
        self.softmax = nn.LogSoftmax(dim=1)
    def forward(self, input_seq, last_hidden, encoder_outputs):
        # Note: we run this one step at a time
#         print ("decoder:forward")
        # Get the embedding of the current input word (last output word)
#         print("input_seq", input_seq.shape)
#         print("last_hidden", last_hidden.shape)
#         print("encoder_outputs", encoder_outputs.shape)
        self.batch_size=input_seq.size(0)
        #print (self.batch_size)
        embedded = self.embedding(input_seq) # dim = Batch_Size x embedding_dim
#         print ("input_seq", input_seq)
        #print("embedding", embedded.shape)
        embedded = self.embedding_dropout(embedded)
        #print("embedding", embedded.shape)
#         print("embedding", embedded.shape)
        embedded = embedded.view(1, self.batch_size, self.hidden_size)
        #print("embedding", embedded.shape)
  # S=1 x Batch_Size x embedding_dim
#         print("embedding", embedded.shape)
        # Get current hidden state from input word and last hidden state
        # rnn_output : [1 x batch_size x hidden_size]
        # hidden: [layer x batch_size x hidden_size]
#         print (embedded.shape)
#         print (last_hidden.shape)
        rnn_output, hidden = self.gru(embedded, last_hidden)
#         print("rnn_output", rnn_output)
        #print("hidden", hidden.shape)
        #print("rnn_output", rnn_output.shape)
#         print("hidden", hidden.shape)
        # Calculate attention from current RNN state and all encoder outputs;
        # apply to encoder outputs to get weighted average
        #context, attn_weights = self.attn(rnn_output, encoder_outputs)
        
        attn_weights = self.attn(hidden[0].view(1, self.batch_size, self.hidden_size), encoder_outputs)
#         print("attn_weights", attn_weights)
        #print("attn_weights", attn_weights.shape)
        
 
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        #print ("context", context.shape)
        #print ("attn_weights", attn_weights.shape)
        
#         attn_weights = self.attn(rnn_output, encoder_outputs)
#         context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x S=1 x N

        # Attentional vector using the RNN hidden state and context vector
        # concatenated together (Luong eq. 5)
        rnn_output = rnn_output.squeeze(0) # S=1 x B x N -> B x N
        #print ("rnn_output", rnn_output.shape)
        context = context.squeeze(1)       # B x S=1 x N -> B x N
        #print ("context", context.shape)
        concat_input = torch.cat((rnn_output, context), 1)
        #print ("concat_input", concat_input.shape)
        concat_output = torch.tanh(self.concat(concat_input))
        #print ("concat_output", concat_output.shape)

        # Finally predict next token (Luong eq. 6, without softmax)
        output = self.out(concat_output)
        #print ("output", output.shape)
        #output = self.softmax(output)
#         print ("output", output.shape)
        #Return final output, hidden state, and attention weights (for visualization)
        return output, hidden, attn_weights
        #return attn_weights

## Training Function

In [0]:
#record the run time
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [0]:
def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

### Loss function

In [0]:
def sequence_mask(sequence_length, max_len=None):
    if max_len is None:
        max_len = sequence_length.data.max()
    batch_size = sequence_length.size(0)
    seq_range = torch.range(0, max_len - 1).long()
    seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
    seq_range_expand = Variable(seq_range_expand)
    if sequence_length.is_cuda:
        seq_range_expand = seq_range_expand.cuda()
    seq_length_expand = (sequence_length.unsqueeze(1)
                         .expand_as(seq_range_expand))
    return seq_range_expand < seq_length_expand


def masked_cross_entropy(logits, target, length):
    length = Variable(torch.LongTensor(length)).to(device)

    """
    Args:
        logits: A Variable containing a FloatTensor of size
            (batch, max_len, num_classes) which contains the
            unnormalized probability for each class.
        target: A Variable containing a LongTensor of size
            (batch, max_len) which contains the index of the true
            class for each corresponding step.
        length: A Variable containing a LongTensor of size (batch,)
            which contains the length of each data in a batch.
    Returns:
        loss: An average loss value masked by the length.
    """
#     print ("length", length.size())
    # logits_flat: (batch * max_len, num_classes)
    logits_flat = logits.view(-1, logits.size(-1))
    # log_probs_flat: (batch * max_len, num_classes)
    log_probs_flat = F.log_softmax(logits_flat)
    # target_flat: (batch * max_len, 1)
    target_flat = target.view(-1, 1)
    # losses_flat: (batch * max_len, 1)
#     print ("log_probs_flat", log_probs_flat.shape)
#     print ("target_flat", target_flat.shape)
    losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat)
    # losses: (batch, max_len)
    losses = losses_flat.view(*target.size())
    # mask: (batch, max_len)
    mask = sequence_mask(sequence_length=length, max_len=target.size(1))
    losses = losses * mask.float()
    loss = losses.sum() / length.float().sum()
    return loss

In [0]:
#the train function is now taking a batch at a time
def train(input_batch, input_lengths, target_batches, target_lengths, encoder, decoder, encoder_optimizer, 
          decoder_optimizer, criteron, max_length=MAX_LENGTH, if_attention = True):
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
#     print ("input_batch", input_batch.size())
#     print ("target_batches", target_batches.size()) 
    input_length, batch_size = input_batch.size()
    #batch_size_2, target_length = output_batch.size()
    
    loss =0
#     print("input_batch", input_batch)
#     print("input_lengths", input_lengths)
    encoder_outputs, encoder_hidden = encoder(input_batch, input_lengths)
#     print("encoder_outputs", encoder_outputs)
#     print("encoder_hidden", encoder_hidden)
    decoder_input = Variable(torch.LongTensor([SOS_token] * batch_size))
    decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder
    
    max_target_length = target_lengths.max().item()
    all_decoder_outputs = Variable(torch.zeros(max_target_length, batch_size, decoder.output_size))
    
    decoder_input = decoder_input.to(device)
    all_decoder_outputs = all_decoder_outputs.to(device)
    
#     print ("decoder_input", decoder_input)
#     print ("decoder_hidden", decoder_hidden)
#     print ("encoder_outputs", encoder_outputs)

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    if use_teacher_forcing:
      for t in range(max_target_length):
#           print("decoder_input", decoder_input.shape)
#           print("decoder_hidden", decoder_hidden.shape)
#           print("encoder_outputs", encoder_outputs.shape)
          decoder_output, decoder_hidden, decoder_attn = decoder(
              decoder_input, decoder_hidden, encoder_outputs
          )
#           print ("decoder_output", decoder_output)
#           print ("decoder_hidden", decoder_hidden)
#           print ("decoder_attn", decoder_attn)  
          all_decoder_outputs[t] = decoder_output
          #loss += criteron(decoder_output, target_batches[t])
          decoder_input = target_batches[t] # Next input is current target
    else:
      for t in range(max_target_length):
#           print("decoder_input", decoder_input.shape)
#           print("decoder_hidden", decoder_hidden.shape)
#           print("encoder_outputs", encoder_outputs.shape)
          decoder_output, decoder_hidden, decoder_attn = decoder(
              decoder_input, decoder_hidden, encoder_outputs
          )
          
          topv, topi = decoder_output.topk(1)
          decoder_input = topi.squeeze().detach()
          
          all_decoder_outputs[t] = decoder_output
          decoder_input = decoder_input.unsqueeze(0)

    # Loss calculation and backpropagation
#     1: torch.Size([32, 39, 2840])
#     2: torch.Size([32, 50])
#     print ("1:", all_decoder_outputs.transpose(0, 1).contiguous().shape)
#     print ("2:", target_batches.transpose(0, 1).contiguous().shape)
    loss = masked_cross_entropy(
        all_decoder_outputs.transpose(0, 1).contiguous(), # -> batch x seq
        target_batches.transpose(0, 1).contiguous(), # -> batch x seq
        target_lengths.to("cpu")
    )
    
    #print ("all_decoder_outputs", all_decoder_outputs.shape)
    #print ("target_batches", target_batches.shape)
    #loss= criterion(all_decoder_outputs.transpose(1, 2), target_batches)
    #print (loss)
    
#     print ("all_decoder_outputs", all_decoder_outputs.transpose(0, 1).contiguous())
#     print ("target_batches", target_batches.transpose(0, 1).contiguous())
#     print ("target_lengths", target_lengths)
#     print("loss", loss)
    loss.backward()
    
    # Clip gradient norms
    ec = torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
    dc = torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)

    # Update parameters with optimizers
    encoder_optimizer.step()
    decoder_optimizer.step()
#     print (loss.item())
    return loss.item(), ec, dc


## Evaluation

In [0]:
def to_output_lang(output_list, output_lang):
  result=[]
  for token_index in output_list:
    token=output_lang.index2word[token_index]
    result.append(token)
  return result    
def evaluate(encoder, decoder, sentence, input_lengths, translated, search='greedy', max_length=MAX_LENGTH):
    """
    Function that generate translation.
    First, feed the source sentence into the encoder and obtain the hidden states from encoder.
    Secondly, feed the hidden states into the decoder and unfold the outputs from the decoder.
    Lastly, for each outputs from the decoder, collect the corresponding words in the target language's vocabulary.
    And collect the attention for each output words.
    @param encoder: the encoder network
    @param decoder: the decoder network
    @param sentence: string, a sentence in source language to be translated
    @param max_length: the max # of words that the decoder can return
    @output decoded_words: a list of words in target language
    @output decoder_attentions: a list of vector, each of which sums up to 1.0
    """    
    # process input sentence
    with torch.no_grad():
        input_tensor = sentence.transpose(0,1).to(device)
        #input_length = sentence.size()[0]
        encoder_output, encoder_hidden = encoder(input_tensor, [input_lengths.item()], None)

        decoder_input = Variable(torch.LongTensor([SOS_token])).to(device) # SOS
        decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder 
        # output of this function
        decoded_words = ''

        for di in range(max_length):
            # for each time step, the decoder network takes two inputs: previous outputs and the previous hidden states
            decoder_output, decoder_hidden, attn_weight = decoder(
                decoder_input, decoder_hidden, encoder_output)
            
            
            # GREEDY
            topv, topi = decoder_output.data.topk(1) 

            if topi.item() == EOS_token:
                #decoded_words.append('<EOS>')
                break
            else:
                if topi.item() not in [SOS_token, EOS_token, UNK_IDX, PAD_IDX]:
                    decoded_words = decoded_words + ' ' + train_output_lang_new.index2word[topi.item()]
            
            decoder_input = topi[0].detach()
        
        translation = ''
        for i in translated: #expected translation
            if i.item() not in [SOS_token, EOS_token, UNK_IDX, PAD_IDX]:
                translation = translation + ' ' + train_output_lang_new.index2word[i.item()]

        return decoded_words, translation

def evaluate_batch(loader, encoder, decoder):
    
    decoded_sentences = []
    actual_sentences = []
    
    for i, (source, lengths1, target, lengths2) in enumerate(loader):
        #iterate over batch
        
        for n in range(len(source)):
            # Go sentence by sentence
            
            decoded, actual = evaluate(encoder, decoder, source[n].unsqueeze(0), lengths1[n], target[n])
            decoded_sentences.append(decoded)
            actual_sentences.append(actual)
            
    return decoded_sentences, actual_sentences


def evaluate_bleu(translation_list, reference_list):
    
    return corpus_bleu(translation_list, [reference_list])      

## Training Functions

In [0]:
def trainIters(Epoch, iters,  encoder, decoder, encoder_optimizer, decoder_optimizer, criteron, n_iters, train_loader, loss_list, print_every=1000, plot_every=100):
    bleu_score=0
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    loss_list=[]
    Iter_loss_list=[]
#     encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
#     decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
#    criterion = nn.NLLLoss()
    #iters = 0
    loss_iter = 0
    while iters <= n_iters:
      Epoch +=1
      print("Epoch:", Epoch)
      num_batch=0
      loss_total=0
      
      for i, (input_var,input_data_len,output_var,output_data_len) in enumerate(train_loader):
       
        iters += 1
        
        input_batch = input_var.transpose(0,1).to(device)
        output_batch = output_var.transpose(0,1).to(device)
        input_data_len=input_data_len.to(device)
        output_data_len=output_data_len.to(device)
        loss, _, _= train(input_batch,input_data_len,output_batch,output_data_len, encoder,
                        decoder, encoder_optimizer, decoder_optimizer, criteron)
        
#         a, b, c = train(input_batch,input_data_len,output_batch,output_data_len, encoder,
#                decoder, encoder_optimizer, decoder_optimizer, criteron)
        #return a, b,c 
        loss_total += loss
        loss_iter += loss
        
        if iters % print_every == 0:
          
          Iter_loss=loss_iter/print_every
          Iter_loss_list.append(Iter_loss)
          
          loss_list.append(Iter_loss)
          
          #decoded, actual = evaluate_batch(val_loader, encoder, decoder)
#           print (decoded)
#           print (actual)
          #Bleu_Score = evaluate_bleu(decoded, actual)
          
         
          
          print('Iteration Loss: %s (%d %d%%) %.4f' % (timeSince(start, iters / n_iters), iters, iters / n_iters * 100, Iter_loss))
         # print ("Bleu Score: ", Bleu_Score)
          loss_iter = 0
          
#           print('Average Loss: %s (%d %d%%) %.4f' % (timeSince(start, iters / n_iters),
#                                              iters, iters / n_iters * 100, print_loss_avg))
        if iters % plot_every == 0:
          state = {'Epoch': Epoch, "Itertion": iters, 'encoder_state_dict': encoder.state_dict(), 'decoder_state_dict': decoder.state_dict, 
               'encoder_optimizer': encoder_optimizer.state_dict(), 'decoder_optimizer': decoder_optimizer.state_dict(), "loss_list": loss_list, "loss_avg": Iter_loss_list, "Bleu":bleu_score}
      torch.save(state, folder_path+"model_saved/Dec13_stella_compare{}.pt".format(iters))
      decoded_sentences, actual_sentences= evaluate_batch(val_loader, encoder, decoder)
      bleu=corpus_bleu(decoded_sentences, [actual_sentences], use_effective_order=False)
      print ("Bleu: ", bleu)
        

In [0]:
def load_checkpoint(encoder, decoder, encoder_optimizer, decoder_optimizer, iteration_num):
    # Note: Input model & optimizer should be pre-defined.  This routine only updates their states.
    folder_path = os.getcwd() + '/gdrive/My Drive/NLP_Project/'
    start_epoch = 0
    filename=folder_path+"model_saved/Dec13_final_500_dot{}.pt".format(iteration_num)
    if os.path.isfile(filename):
        print("=> loading checkpoint '{}'".format(iteration_num))
        checkpoint = torch.load(filename, map_location=device)
        start_epoch = checkpoint['Epoch']
        itertion = checkpoint['Itertion']
        #model.load_state_dict(checkpoint['state_dict'])
        encoder.load_state_dict(checkpoint["encoder_state_dict"])
        decoder.load_state_dict(checkpoint["decoder_state_dict"]())
        encoder_optimizer.load_state_dict(checkpoint["encoder_optimizer"])
        decoder_optimizer.load_state_dict(checkpoint["decoder_optimizer"])
        loss_list=checkpoint["loss_list"]
        Iter_loss_list=checkpoint["loss_avg"]
        #optimizer.load_state_dict(checkpoint['optimizer'])
        #losslogger = checkpoint['losslogger']
        print("=> loaded checkpoint '{}' (epoch {})"
                  .format(filename, start_epoch))
    else:
        print("=> no checkpoint found at '{}'".format(filename))

    return start_epoch, itertion, encoder, decoder, encoder_optimizer, decoder_optimizer, loss_list, Iter_loss_list

## Start Training

In [0]:
batch_size=64

In [0]:
train_dataset = VocabDataset(train_pairs, train_input_lang_new, train_output_lang_new)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True,  drop_last = False)

In [0]:
#batch_size=128
val_input_lang, val_output_lang, val_pairs = prepareData(val_zh, val_en, reverse=False)
val_dataset = VocabDataset(val_pairs, train_input_lang_new, train_output_lang_new)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=batch_size,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True,  drop_last = False)

Reading lines...
Read 1261 sentence pairs
Trimmed to 756 sentence pairs
Counting words...
Counted words:
zh 2828
en 1817


In [0]:
start_epoch=0
itertion=0
hidden_size=1000
layers=2
drop_out=0.1
n_iters=100000
attn_model="dot"
loss_list=[]
learning_rate=0.001
teacher_forcing_ratio=1
decoder_learning_ratio = 1.0

clip=5
encoder = EncoderRNN(weights_matrix_zh, train_input_lang_new.n_words, hidden_size, n_layers = layers).to(device)
decoder= LuongAttnDecoderRNN(attn_model, weights_matrix_eng, hidden_size, train_output_lang_new.n_words, drop_out, n_layers = layers).to(device)
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate*decoder_learning_ratio)
#start_epoch, itertion, encoder, decoder, encoder_optimizer_, decoder_optimizer_, loss_list, Iter_loss_list = load_checkpoint(encoder, decoder, encoder_optimizer, decoder_optimizer, 9069)

In [0]:
criteron = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
trainIters(start_epoch, itertion, encoder, decoder, encoder_optimizer, decoder_optimizer, criteron, n_iters, train_loader, loss_list, print_every=100, plot_every=1000)

Epoch: 1


  """


Iteration Loss: 1m 17s (- 1283m 23s) (100 0%) 5.7723
Iteration Loss: 2m 33s (- 1278m 2s) (200 0%) 4.9995
Iteration Loss: 3m 51s (- 1280m 17s) (300 0%) 4.7587
Iteration Loss: 5m 8s (- 1279m 9s) (400 0%) 4.5606
Iteration Loss: 6m 25s (- 1278m 29s) (500 0%) 4.4381
Iteration Loss: 7m 42s (- 1278m 1s) (600 0%) 4.3277
Iteration Loss: 9m 0s (- 1276m 54s) (700 0%) 4.2530
Iteration Loss: 10m 17s (- 1275m 25s) (800 0%) 4.1867
Iteration Loss: 11m 34s (- 1273m 40s) (900 0%) 4.0994
Iteration Loss: 12m 51s (- 1272m 14s) (1000 1%) 4.0905
Iteration Loss: 14m 8s (- 1271m 23s) (1100 1%) 4.0111
Iteration Loss: 15m 25s (- 1269m 52s) (1200 1%) 3.9738
Iteration Loss: 16m 43s (- 1269m 40s) (1300 1%) 3.9334
Iteration Loss: 18m 0s (- 1267m 55s) (1400 1%) 3.8909
Iteration Loss: 19m 17s (- 1266m 47s) (1500 1%) 3.8797
Iteration Loss: 20m 34s (- 1265m 7s) (1600 1%) 3.8492
Iteration Loss: 21m 51s (- 1264m 13s) (1700 1%) 3.7874
Iteration Loss: 23m 9s (- 1263m 0s) (1800 1%) 3.7871
Iteration Loss: 24m 26s (- 1261m 40s

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


Bleu:  BLEU(score=5.505611571392188, counts=[2945, 652, 236, 84], totals=[8275, 7522, 6773, 6031], precisions=[35.58912386706949, 8.667907471417177, 3.48442344603573, 1.3928038467915769], bp=0.8851002348548135, sys_len=8275, ref_len=9285)
Epoch: 2
Iteration Loss: 31m 44s (- 1290m 49s) (2400 2%) 3.4685
Iteration Loss: 33m 1s (- 1288m 2s) (2500 2%) 3.3003
Iteration Loss: 34m 19s (- 1285m 50s) (2600 2%) 3.3219
Iteration Loss: 35m 36s (- 1283m 18s) (2700 2%) 3.3338
Iteration Loss: 36m 53s (- 1280m 39s) (2800 2%) 3.3162
Iteration Loss: 38m 10s (- 1278m 26s) (2900 2%) 3.3082
Iteration Loss: 39m 28s (- 1276m 20s) (3000 3%) 3.3435
Iteration Loss: 40m 45s (- 1274m 5s) (3100 3%) 3.3160
Iteration Loss: 42m 3s (- 1272m 5s) (3200 3%) 3.3415
Iteration Loss: 43m 20s (- 1270m 9s) (3300 3%) 3.3382
Iteration Loss: 44m 38s (- 1268m 10s) (3400 3%) 3.3418
Iteration Loss: 45m 55s (- 1266m 23s) (3500 3%) 3.3164
Iteration Loss: 47m 13s (- 1264m 36s) (3600 3%) 3.3133
Iteration Loss: 48m 31s (- 1262m 52s) (3700



Bleu:  BLEU(score=5.174722594340055, counts=[3167, 716, 235, 80], totals=[9954, 9198, 8443, 7691], precisions=[31.816355234076752, 7.7843009349858665, 2.7833708397489043, 1.0401768300611103], bp=1.0, sys_len=9954, ref_len=9285)
Epoch: 3
Iteration Loss: 62m 4s (- 1258m 33s) (4700 4%) 3.2661
Iteration Loss: 63m 20s (- 1256m 25s) (4800 4%) 2.7167
Iteration Loss: 64m 38s (- 1254m 28s) (4900 4%) 2.7427
Iteration Loss: 65m 55s (- 1252m 39s) (5000 5%) 2.7802
Iteration Loss: 67m 13s (- 1250m 56s) (5100 5%) 2.7824
Iteration Loss: 68m 30s (- 1248m 57s) (5200 5%) 2.8278
Iteration Loss: 69m 48s (- 1247m 15s) (5300 5%) 2.8633
Iteration Loss: 71m 5s (- 1245m 18s) (5400 5%) 2.8795
Iteration Loss: 72m 22s (- 1243m 30s) (5500 5%) 2.8661
Iteration Loss: 73m 39s (- 1241m 45s) (5600 5%) 2.8942
Iteration Loss: 74m 56s (- 1239m 55s) (5700 5%) 2.9019
Iteration Loss: 76m 14s (- 1238m 16s) (5800 5%) 2.9273
Iteration Loss: 77m 32s (- 1236m 38s) (5900 5%) 2.9397
Iteration Loss: 78m 49s (- 1235m 0s) (6000 6%) 2.9



Bleu:  BLEU(score=5.593242324400979, counts=[3219, 742, 249, 84], totals=[9628, 8872, 8117, 7362], precisions=[33.433734939759034, 8.363390441839496, 3.067635826044105, 1.1409942950285248], bp=1.0, sys_len=9628, ref_len=9285)
Epoch: 4
Iteration Loss: 93m 35s (- 1224m 34s) (7100 7%) 2.7126
Iteration Loss: 94m 52s (- 1222m 43s) (7200 7%) 2.4587
Iteration Loss: 96m 9s (- 1221m 1s) (7300 7%) 2.4851
Iteration Loss: 97m 25s (- 1219m 11s) (7400 7%) 2.4986
Iteration Loss: 98m 42s (- 1217m 25s) (7500 7%) 2.5410
Iteration Loss: 100m 0s (- 1215m 51s) (7600 7%) 2.5570
Iteration Loss: 101m 17s (- 1214m 12s) (7700 7%) 2.5781
Iteration Loss: 102m 34s (- 1212m 25s) (7800 7%) 2.5755
Iteration Loss: 103m 51s (- 1210m 51s) (7900 7%) 2.6200
Iteration Loss: 105m 8s (- 1209m 11s) (8000 8%) 2.6507
Iteration Loss: 106m 26s (- 1207m 36s) (8100 8%) 2.6544
Iteration Loss: 107m 43s (- 1205m 58s) (8200 8%) 2.6587
Iteration Loss: 109m 0s (- 1204m 25s) (8300 8%) 2.6791
Iteration Loss: 110m 17s (- 1202m 44s) (8400 8%



Bleu:  BLEU(score=5.636254252988955, counts=[3230, 759, 250, 79], totals=[9494, 8738, 7986, 7242], precisions=[34.02148725510849, 8.686198214694437, 3.130478337089907, 1.0908588787627727], bp=1.0, sys_len=9494, ref_len=9285)
Epoch: 5
Iteration Loss: 123m 48s (- 1193m 16s) (9400 9%) 2.7870
Iteration Loss: 125m 5s (- 1191m 35s) (9500 9%) 2.2857
Iteration Loss: 126m 21s (- 1189m 56s) (9600 9%) 2.3282
Iteration Loss: 127m 39s (- 1188m 25s) (9700 9%) 2.3518
Iteration Loss: 128m 57s (- 1186m 53s) (9800 9%) 2.3681
Iteration Loss: 130m 13s (- 1185m 9s) (9900 9%) 2.4141
Iteration Loss: 131m 31s (- 1183m 39s) (10000 10%) 2.4137
Iteration Loss: 132m 48s (- 1182m 7s) (10100 10%) 2.4375
Iteration Loss: 134m 5s (- 1180m 35s) (10200 10%) 2.4619
Iteration Loss: 135m 22s (- 1178m 59s) (10300 10%) 2.4815
Iteration Loss: 136m 40s (- 1177m 28s) (10400 10%) 2.4834
Iteration Loss: 137m 57s (- 1175m 52s) (10500 10%) 2.5250
Iteration Loss: 139m 13s (- 1174m 15s) (10600 10%) 2.5168
Iteration Loss: 140m 30s (- 



Bleu:  BLEU(score=5.620990273780996, counts=[3289, 778, 245, 77], totals=[9513, 8757, 8002, 7254], precisions=[34.57374119625775, 8.884321114536942, 3.0617345663584103, 1.0614833195478357], bp=1.0, sys_len=9513, ref_len=9285)
Epoch: 6
Iteration Loss: 155m 19s (- 1160m 56s) (11800 11%) 2.4247
Iteration Loss: 156m 36s (- 1159m 23s) (11900 11%) 2.2000
Iteration Loss: 157m 53s (- 1157m 55s) (12000 12%) 2.2322
Iteration Loss: 159m 11s (- 1156m 24s) (12100 12%) 2.2559
Iteration Loss: 160m 28s (- 1154m 55s) (12200 12%) 2.2920
Iteration Loss: 161m 46s (- 1153m 27s) (12300 12%) 2.3222
Iteration Loss: 163m 4s (- 1152m 3s) (12400 12%) 2.3462
Iteration Loss: 164m 22s (- 1150m 34s) (12500 12%) 2.3455
Iteration Loss: 165m 38s (- 1149m 1s) (12600 12%) 2.3687
Iteration Loss: 166m 56s (- 1147m 30s) (12700 12%) 2.3947
Iteration Loss: 168m 13s (- 1145m 59s) (12800 12%) 2.4200
Iteration Loss: 169m 30s (- 1144m 28s) (12900 12%) 2.4163
Iteration Loss: 170m 48s (- 1143m 3s) (13000 13%) 2.4689
Iteration Loss:



Bleu:  BLEU(score=5.802394631985424, counts=[3301, 771, 251, 88], totals=[9565, 8809, 8053, 7309], precisions=[34.51123889179299, 8.752412305596549, 3.1168508630324103, 1.203995074565604], bp=1.0, sys_len=9565, ref_len=9285)
Epoch: 7
Iteration Loss: 185m 34s (- 1130m 32s) (14100 14%) 2.4981
Iteration Loss: 186m 50s (- 1128m 58s) (14200 14%) 2.1311
Iteration Loss: 188m 7s (- 1127m 26s) (14300 14%) 2.1526
Iteration Loss: 189m 24s (- 1125m 56s) (14400 14%) 2.1765
Iteration Loss: 190m 41s (- 1124m 25s) (14500 14%) 2.2330
Iteration Loss: 191m 59s (- 1123m 0s) (14600 14%) 2.2305
Iteration Loss: 193m 15s (- 1121m 28s) (14700 14%) 2.2724
Iteration Loss: 194m 33s (- 1120m 0s) (14800 14%) 2.2814
Iteration Loss: 195m 50s (- 1118m 32s) (14900 14%) 2.3065
Iteration Loss: 197m 7s (- 1117m 5s) (15000 15%) 2.3170
Iteration Loss: 198m 25s (- 1115m 37s) (15100 15%) 2.3610
Iteration Loss: 199m 42s (- 1114m 9s) (15200 15%) 2.3496
Iteration Loss: 200m 59s (- 1112m 39s) (15300 15%) 2.3454
Iteration Loss: 20



Bleu:  BLEU(score=5.401833297886212, counts=[3325, 760, 244, 93], totals=[10231, 9475, 8719, 7968], precisions=[32.49926693382856, 8.021108179419524, 2.7984860649157013, 1.1671686746987953], bp=1.0, sys_len=10231, ref_len=9285)
Epoch: 8
Iteration Loss: 217m 4s (- 1098m 34s) (16500 16%) 2.2432
Iteration Loss: 218m 22s (- 1097m 6s) (16600 16%) 2.1137
Iteration Loss: 219m 39s (- 1095m 37s) (16700 16%) 2.1326
Iteration Loss: 220m 56s (- 1094m 9s) (16800 16%) 2.1768
Iteration Loss: 222m 13s (- 1092m 44s) (16900 16%) 2.1707
Iteration Loss: 223m 30s (- 1091m 15s) (17000 17%) 2.1871
Iteration Loss: 224m 48s (- 1089m 50s) (17100 17%) 2.2301
Iteration Loss: 226m 5s (- 1088m 24s) (17200 17%) 2.2317
Iteration Loss: 227m 22s (- 1086m 57s) (17300 17%) 2.2604
Iteration Loss: 228m 39s (- 1085m 29s) (17400 17%) 2.2725
Iteration Loss: 229m 57s (- 1084m 4s) (17500 17%) 2.2894
Iteration Loss: 231m 13s (- 1082m 33s) (17600 17%) 2.3103
Iteration Loss: 232m 30s (- 1081m 5s) (17700 17%) 2.2950
Iteration Loss:



Bleu:  BLEU(score=5.299051935210646, counts=[3250, 731, 224, 64], totals=[9279, 8523, 7767, 7014], precisions=[35.02532600495743, 8.576792209315968, 2.883996395004506, 0.9124607927003137], bp=0.9993535876113976, sys_len=9279, ref_len=9285)
Epoch: 9
Iteration Loss: 247m 14s (- 1067m 53s) (18800 18%) 2.3887
Iteration Loss: 248m 32s (- 1066m 27s) (18900 18%) 2.0734
Iteration Loss: 249m 49s (- 1065m 1s) (19000 19%) 2.0995
Iteration Loss: 251m 6s (- 1063m 36s) (19100 19%) 2.1288
Iteration Loss: 252m 24s (- 1062m 11s) (19200 19%) 2.1360
Iteration Loss: 253m 41s (- 1060m 46s) (19300 19%) 2.1448
Iteration Loss: 254m 58s (- 1059m 18s) (19400 19%) 2.1769
Iteration Loss: 256m 15s (- 1057m 51s) (19500 19%) 2.1896
Iteration Loss: 257m 32s (- 1056m 25s) (19600 19%) 2.2057
Iteration Loss: 258m 48s (- 1054m 57s) (19700 19%) 2.2160
Iteration Loss: 260m 6s (- 1053m 32s) (19800 19%) 2.2616
Iteration Loss: 261m 22s (- 1052m 5s) (19900 19%) 2.2650
Iteration Loss: 262m 39s (- 1050m 38s) (20000 20%) 2.2623
I



Bleu:  BLEU(score=5.6526956148979135, counts=[3239, 770, 249, 82], totals=[9580, 8824, 8068, 7313], precisions=[33.81002087682672, 8.726201269265639, 3.086266732771443, 1.121290851907562], bp=1.0, sys_len=9580, ref_len=9285)
Epoch: 10
Iteration Loss: 278m 45s (- 1036m 8s) (21200 21%) 2.1717
Iteration Loss: 280m 2s (- 1034m 42s) (21300 21%) 2.0681
Iteration Loss: 281m 19s (- 1033m 17s) (21400 21%) 2.0858
Iteration Loss: 282m 36s (- 1031m 50s) (21500 21%) 2.1104
Iteration Loss: 283m 53s (- 1030m 23s) (21600 21%) 2.1260
Iteration Loss: 285m 9s (- 1028m 57s) (21700 21%) 2.1339
Iteration Loss: 286m 26s (- 1027m 30s) (21800 21%) 2.1621
Iteration Loss: 287m 44s (- 1026m 7s) (21900 21%) 2.1787
Iteration Loss: 289m 0s (- 1024m 40s) (22000 22%) 2.1902
Iteration Loss: 290m 17s (- 1023m 14s) (22100 22%) 2.2066
Iteration Loss: 291m 34s (- 1021m 51s) (22200 22%) 2.2212
Iteration Loss: 292m 52s (- 1020m 28s) (22300 22%) 2.2613
Iteration Loss: 294m 9s (- 1019m 2s) (22400 22%) 2.2447
Iteration Loss: 29



Bleu:  BLEU(score=5.393580754702707, counts=[3268, 746, 246, 78], totals=[9798, 9042, 8286, 7530], precisions=[33.35374566238008, 8.25038708250387, 2.9688631426502536, 1.0358565737051793], bp=1.0, sys_len=9798, ref_len=9285)
Epoch: 11
Iteration Loss: 308m 59s (- 1005m 51s) (23500 23%) 2.3234
Iteration Loss: 310m 16s (- 1004m 27s) (23600 23%) 2.0539
Iteration Loss: 311m 33s (- 1003m 2s) (23700 23%) 2.0579
Iteration Loss: 312m 51s (- 1001m 39s) (23800 23%) 2.0767
Iteration Loss: 314m 8s (- 1000m 16s) (23900 23%) 2.0959
Iteration Loss: 315m 26s (- 998m 52s) (24000 24%) 2.1298
Iteration Loss: 316m 42s (- 997m 27s) (24100 24%) 2.1328
Iteration Loss: 318m 0s (- 996m 5s) (24200 24%) 2.1540
Iteration Loss: 319m 18s (- 994m 42s) (24300 24%) 2.1620
Iteration Loss: 320m 35s (- 993m 19s) (24400 24%) 2.1780
Iteration Loss: 321m 52s (- 991m 55s) (24500 24%) 2.2033
Iteration Loss: 323m 10s (- 990m 31s) (24600 24%) 2.2291
Iteration Loss: 324m 26s (- 989m 6s) (24700 24%) 2.2149
Iteration Loss: 325m 43s



Bleu:  BLEU(score=4.95462830138734, counts=[3209, 671, 200, 66], totals=[9462, 8706, 7950, 7202], precisions=[33.914605791587405, 7.707328279347577, 2.5157232704402515, 0.9164121077478479], bp=1.0, sys_len=9462, ref_len=9285)
Epoch: 12
Iteration Loss: 340m 30s (- 974m 11s) (25900 25%) 2.1121
Iteration Loss: 341m 47s (- 972m 47s) (26000 26%) 2.0411
Iteration Loss: 343m 4s (- 971m 24s) (26100 26%) 2.0615
Iteration Loss: 344m 21s (- 969m 59s) (26200 26%) 2.0767
Iteration Loss: 345m 38s (- 968m 34s) (26300 26%) 2.0939
Iteration Loss: 346m 56s (- 967m 13s) (26400 26%) 2.1244
Iteration Loss: 348m 13s (- 965m 50s) (26500 26%) 2.1248
Iteration Loss: 349m 31s (- 964m 27s) (26600 26%) 2.1635
Iteration Loss: 350m 48s (- 963m 3s) (26700 26%) 2.1709
Iteration Loss: 352m 5s (- 961m 41s) (26800 26%) 2.1845
Iteration Loss: 353m 23s (- 960m 19s) (26900 26%) 2.1948
Iteration Loss: 354m 40s (- 958m 57s) (27000 27%) 2.2133
Iteration Loss: 355m 58s (- 957m 35s) (27100 27%) 2.2214
Iteration Loss: 357m 16s (



Bleu:  BLEU(score=5.37912129368308, counts=[3252, 710, 234, 92], totals=[9943, 9193, 8443, 7693], precisions=[32.706426631801264, 7.723267703687588, 2.7715267085159305, 1.1958923696867283], bp=1.0, sys_len=9943, ref_len=9285)
Epoch: 13
Iteration Loss: 370m 48s (- 944m 5s) (28200 28%) 2.2549
Iteration Loss: 372m 5s (- 942m 42s) (28300 28%) 1.9878
Iteration Loss: 373m 22s (- 941m 18s) (28400 28%) 2.0340
Iteration Loss: 374m 39s (- 939m 56s) (28500 28%) 2.0543
Iteration Loss: 375m 56s (- 938m 31s) (28600 28%) 2.0691
Iteration Loss: 377m 13s (- 937m 9s) (28700 28%) 2.0827
Iteration Loss: 378m 29s (- 935m 43s) (28800 28%) 2.1062
Iteration Loss: 379m 46s (- 934m 19s) (28900 28%) 2.1235
Iteration Loss: 381m 4s (- 932m 57s) (29000 28%) 2.1341
Iteration Loss: 382m 22s (- 931m 36s) (29100 29%) 2.1643
Iteration Loss: 383m 40s (- 930m 15s) (29200 29%) 2.1694
Iteration Loss: 384m 57s (- 928m 53s) (29300 29%) 2.1893
Iteration Loss: 386m 14s (- 927m 30s) (29400 29%) 2.2018
Iteration Loss: 387m 31s (-



Bleu:  BLEU(score=5.011021739286931, counts=[3282, 693, 217, 75], totals=[9927, 9171, 8415, 7663], precisions=[33.06134783922635, 7.556427870461237, 2.5787284610814023, 0.9787289573274175], bp=1.0, sys_len=9927, ref_len=9285)
Epoch: 14
Iteration Loss: 402m 23s (- 912m 37s) (30600 30%) 2.0876
Iteration Loss: 403m 41s (- 911m 14s) (30700 30%) 2.0181
Iteration Loss: 404m 58s (- 909m 52s) (30800 30%) 2.0496
Iteration Loss: 406m 15s (- 908m 30s) (30900 30%) 2.0449
Iteration Loss: 407m 32s (- 907m 6s) (31000 31%) 2.0672


In [0]:
decoded_sentences, actual_sentences= evaluate_batch(val_loader, encoder, decoder)
bleu=corpus_bleu(decoded_sentences, [actual_sentences], use_effective_order=False)

