In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import io
import unicodedata
import string
import re
import random
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from torch.utils.data import Dataset
from torch.optim import lr_scheduler
import itertools
import glob
plt.switch_backend('agg')
import matplotlib.ticker as ticker
from sacrebleu import corpus_bleu
import sacrebleu
import math
import time
import copy
import pdb
from torch.autograd import Variable
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
# Global configuration shared by the cells below.
batch_size = 32  # sentence pairs per mini-batch; handed to both DataLoaders
words_to_load = 100000  # limit passed to load_embedding for each pretrained vector file
# Reserved vocabulary indices; they mirror the initial entries of Lang.word2index.
SOS_token = 0
EOS_token = 1
PAD_token = 2
UNK_token = 3
LR_RATE = 0.0008  # learning rate handed to trainIters
MAX_LENGTH = 35  # filterPair keeps at most MAX_LENGTH - 1 tokens per sentence
hidden_size = 512  # hidden size of the attention decoder
teacher_forcing_ratio = 0.9  # probability of teacher forcing for a training batch
EPOCH_NUM = 30  # passed to trainIters as n_iters
PRINT_FREQ = 500  # passed to trainIters as print_every
embed_dim = 300  # NOTE(review): create_weight does not read this constant
infrequent_count = 1  # default `count` threshold of Lang.buildVocab
add = '/scratch/wz1218'  # root directory for the corpus and embedding files

__Preprocess Data__

In [None]:
class Lang:
    """Vocabulary of one language.

    Words are first counted with addSentence/addWord; buildVocab then assigns
    an integer index to every counted word. Indices 0-3 are reserved for the
    special tokens <SOS>, <EOS>, <pad> and <unk>.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {"<SOS>": 0, "<EOS>": 1, "<pad>": 2, "<unk>": 3}
        self.word2count = {"<SOS>": 0, "<EOS>": 0, "<pad>": 0, "<unk>": 0}
        self.index2word = {0: "<SOS>", 1: "<EOS>", 2: "<pad>", 3: "<unk>"}
        self.n_words = 4  # Count SOS, EOS, pad and unk

    def addSentence(self, sentence):
        """Count every space-separated token of `sentence`."""
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        """Increment the occurrence count of `word`."""
        if word not in self.word2count:
            self.word2count[word] = 1
        else:
            self.word2count[word] += 1

    def buildVocab(self, count=infrequent_count, train = False, in_out = False):
        """Assign indices to the counted words.

        When both `train` and `in_out` are true, words seen `count` times or
        fewer are dropped from word2count first, so they later map to <unk>.
        Words that already have an index (the reserved tokens, or words from a
        previous call) keep it, so the reserved indices 0-3 are never
        overwritten and calling this method twice is harmless.
        """
        if train and in_out:
            rare_words = [word for word, seen in self.word2count.items() if seen <= count]
            for word in rare_words:
                self.word2count.pop(word)

        for word in self.word2count:
            if word in self.word2index:
                # Already indexed: re-adding would remap e.g. <pad> away from 2.
                continue
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.n_words += 1

In [None]:
def unicodeToAscii(s):
    """Return `s` NFD-decomposed with all non-spacing marks (category 'Mn') removed."""
    kept_chars = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue
        kept_chars.append(ch)
    return ''.join(kept_chars)



def normalizeString(s):
    """Lower-case, trim and accent-strip `s`, then tidy punctuation.

    A space is inserted before each of . ! ? and every run of characters
    other than letters and . ! ? is collapsed to a single space.
    """
    cleaned = unicodeToAscii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z.!?]+", r" "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def normalizeZh(s):
    """Trim `s` and collapse every run of whitespace into a single space."""
    s = s.strip()
    # Raw string: "\s" in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    s = re.sub(r"\s+", " ", s)
    return s

In [None]:
def filterPair(p):
    """Truncate every sentence in `p` to its first MAX_LENGTH - 1 space-separated tokens."""
    keep = MAX_LENGTH - 1
    return [' '.join(sentence.split(' ')[:keep]) for sentence in p]

def filterPairs(pairs):
    """Apply filterPair to each pair and return the results as a list."""
    return list(map(filterPair, pairs))

In [None]:
def readLangs(dataset, lang1, lang2):
    """Read one split of the parallel corpus and build both vocabularies.

    @param dataset: split name used in the file name ('train', 'dev', 'test')
    @param lang1: file suffix of the source side; its lines go through normalizeZh
    @param lang2: file suffix of the target side; its lines go through normalizeString
    @return: (input_lang, output_lang, pairs), pairs being a list of
             [source_sentence, target_sentence] truncated by filterPairs

    For the 'train' split, infrequent words are pruned when the vocabularies
    are built; other splits keep every word.
    """
    source_path = add+'/iwslt-zh-en/{}.tok.{}'.format(dataset, lang1)
    target_path = add+'/iwslt-zh-en/{}.tok.{}'.format(dataset, lang2)

    # Context managers close the files even if reading fails.
    with open(source_path, encoding='utf-8') as source_file:
        source_lines = source_file.read().strip().split('\n')
    with open(target_path, encoding='utf-8') as target_file:
        target_lines = target_file.read().strip().split('\n')

    # Pair by index over the source length: a shorter target file raises
    # IndexError rather than silently dropping sentences.
    pairs = [[normalizeZh(source_lines[i]), normalizeString(target_lines[i])]
             for i in range(len(source_lines))]
    pairs = filterPairs(pairs)
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])

    if dataset == 'train':
        input_lang.buildVocab(train = True, in_out = True)
        output_lang.buildVocab(train = True, in_out = True)
    else:
        input_lang.buildVocab()
        output_lang.buildVocab()
    return input_lang, output_lang, pairs

In [None]:
# Load the three corpus splits. Each call returns the source vocabulary, the
# target vocabulary and the list of truncated sentence pairs for that split.
train_input_lang, train_output_lang, train_pairs = readLangs('train', 'zh', 'en')
# NOTE(review): the validation dataset further down is built with the *train*
# vocabularies; the dev/test Lang objects do not appear to be used elsewhere
# in the visible code — confirm before relying on them.
val_input_lang, val_output_lang, val_pairs = readLangs('dev', 'zh', 'en')
test_input_lang, test_output_lang, test_pairs = readLangs('test', 'zh', 'en')

__Embedding__

In [None]:
def load_embedding(fname, words_to_load):
    """Read up to `words_to_load` word vectors from a text embedding file.

    The first line must hold two integers (it is parsed and otherwise
    ignored); every following line is "<word> <v1> <v2> ...", separated by
    single spaces.

    @param fname: path of the vector file
    @param words_to_load: maximum number of vectors to read
    @return: dict mapping each word to its vector as a list of floats
    """
    data = {}
    # The context manager closes the file even if a line fails to parse.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Header line; parsing it doubles as a cheap format check.
        n, d = map(int, fin.readline().split())
        for index, line in enumerate(fin):
            # `>=` so that exactly words_to_load vectors are kept
            # (the previous `>` let one extra line through).
            if index >= words_to_load:
                break
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = [float(i) for i in tokens[1:]]
    return data

In [None]:
# Paths of the pretrained word-vector files under the data root `add`.
fname_zh = add + '/zh/zh.vec'
# NOTE(review): the English vectors are read from the same '/zh/' directory as
# the Chinese ones — confirm this path is intended.
fname_eng = add + '/zh/fasttext300d.vec'
# word -> vector dictionaries, each limited by words_to_load.
embedding_mat_zh = load_embedding(fname_zh, words_to_load)
embedding_mat_en = load_embedding(fname_eng, words_to_load)

In [None]:
def create_weight(unique_token, embedding, emb_dim=300):
    """Build an embedding matrix with one row per token.

    @param unique_token: sized iterable of tokens; row i belongs to the i-th token
    @param embedding: dict mapping token -> vector of length emb_dim
    @param emb_dim: vector dimensionality (defaults to the previous fixed 300)
    @return: (weight_matrix, wnf) where weight_matrix is a float array of
             shape (len(unique_token), emb_dim) and wnf lists the row indices
             of tokens missing from `embedding`; those rows are drawn from a
             normal distribution with standard deviation 0.6
    """
    wnf = []
    weight_matrix = np.zeros((len(unique_token), emb_dim))
    for i, word in enumerate(unique_token):
        if word in embedding:
            weight_matrix[i] = embedding[word]
        else:
            weight_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, ))
            wnf.append(i)
    return weight_matrix, wnf

In [None]:
# Embedding matrices for the training vocabularies; rows follow the iteration
# order of word2index. The *_wnf lists hold the row indices of words that had
# no pretrained vector.
# NOTE(review): chinese_wm is not referenced again in the visible code (the
# Encoder creates its own nn.Embedding) — confirm whether that is intended.
chinese_wm, chin_wnf = create_weight(train_input_lang.word2index.keys(), embedding_mat_zh)
english_wm, eng_wnf = create_weight(train_output_lang.word2index.keys(), embedding_mat_en)

__Data Loader__

In [None]:
class NMTDataset(Dataset):
    """
    Sentence-pair dataset readable by a PyTorch DataLoader.
    Each item is a pair of token-id lists (source, target) with their lengths.
    """

    def __init__(self, input_lang, output_lang, pairs):
        """
        @param input_lang: vocabulary object for the source side (provides word2index)
        @param output_lang: vocabulary object for the target side (provides word2index)
        @param pairs: sequence of [source_sentence, target_sentence] strings
        """
        self.input_lang = input_lang
        self.output_lang = output_lang
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    @staticmethod
    def _encode(sentence, lang):
        """Map a space-separated sentence to ids (unknown words -> UNK_token) and append EOS_token."""
        ids = [lang.word2index.get(token, UNK_token) for token in sentence.split(' ')]
        ids.append(EOS_token)
        return ids

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i].
        Returns [input_ids, input_length, output_ids, output_length].
        """
        pair = self.pairs[key]
        input_indexes = self._encode(pair[0], self.input_lang)
        output_indexes = self._encode(pair[1], self.output_lang)
        return [input_indexes, len(input_indexes), output_indexes, len(output_indexes)]

    
def NMTDataset_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length.

    @param batch: list of [input_ids, input_length, output_ids, output_length] items
    @return: [inputs, input_lengths, outputs, output_lengths] as tensors on `device`;
             inputs/outputs are int64 and right-padded with PAD_token to the
             longest sequence in the batch
    """
    input_length_ls = [datum[1] for datum in batch]
    output_length_ls = [datum[3] for datum in batch]
    max_input = max(input_length_ls)
    max_output = max(output_length_ls)

    # Pad with the named constant instead of a bare 2 so padding stays in sync
    # with the vocabulary definition.
    input_ls = [list(datum[0]) + [PAD_token] * (max_input - datum[1]) for datum in batch]
    output_ls = [list(datum[2]) + [PAD_token] * (max_output - datum[3]) for datum in batch]

    # Build the tensors directly: wrapping an existing tensor in torch.tensor()
    # copy-constructs it and emits a warning.
    return [torch.tensor(input_ls, dtype=torch.long, device=device),
            torch.tensor(input_length_ls, device=device),
            torch.tensor(output_ls, dtype=torch.long, device=device),
            torch.tensor(output_length_ls, device=device)]

In [None]:
# Training data: shuffled, padded per batch by NMTDataset_collate_func.
train_dataset = NMTDataset(train_input_lang, train_output_lang, train_pairs)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size,
                                           collate_fn=NMTDataset_collate_func,
                                           shuffle=True,
                                           drop_last=True)

# Validation data is encoded with the *training* vocabularies so that token
# ids match the ones the model is trained on.
# NOTE(review): drop_last=True keeps every batch at exactly batch_size, but it
# also leaves the final partial batch of validation pairs out of evaluation.
val_dataset = NMTDataset(train_input_lang, train_output_lang, val_pairs)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                         batch_size=batch_size,
                                         collate_fn=NMTDataset_collate_func,
                                         shuffle=False,
                                         drop_last=True)

__Self-Attention Encoder__

In [None]:
def nopeak_mask(size, opt):
    """Return a (1, size, size) mask on `device` that is true on and below the diagonal.

    Position i may therefore only see positions <= i. `opt` is accepted but not used.
    """
    strictly_upper = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    allowed = torch.from_numpy(strictly_upper) == 0
    return allowed.to(device)

def create_masks(src, trg, opt):
    """Build the padding mask for `src` and, if `trg` is given, the target mask.

    The source mask marks positions different from opt.src_pad and gets an
    extra axis before the last one. The target mask combines the same kind of
    padding mask (using opt.trg_pad) with the no-peek mask from nopeak_mask.
    Returns (src_mask, trg_mask); trg_mask is None when `trg` is None.
    """
    src_mask = (src != opt.src_pad).unsqueeze(-2)
    if trg is None:
        return src_mask, None

    not_padding = (trg != opt.trg_pad).unsqueeze(-2)
    # trg.size(1) is the sequence length used for the square no-peek mask.
    no_peek = nopeak_mask(trg.size(1), opt).to(device)
    return src_mask, not_padding & no_peek


In [None]:
class Embedder(nn.Module):
    """Thin wrapper around nn.Embedding that maps token ids to d_model-sized vectors."""

    def __init__(self, vocab_size, d_model):
        super(Embedder, self).__init__()
        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        embedded = self.embed(x)
        return embedded
    
class PositionalEncoder(nn.Module):
    """Adds fixed sinusoidal position encodings to a batch of embeddings.

    The table `pe` has shape (1, max_seq_len, d_model): even feature columns
    hold sines and odd columns cosines of the position scaled by a geometric
    series of frequencies. It is registered as a buffer, so it follows the
    module across devices but is not trained.
    """

    def __init__(self, d_model, max_seq_len = MAX_LENGTH, dropout = 0.1):
        super().__init__()
        self.d_model = d_model
        # NOTE(review): this dropout layer is constructed but forward() does
        # not apply it — confirm whether that is intentional.
        self.dropout = nn.Dropout(dropout)
        # create constant 'pe' matrix with values dependent on pos and i
        pe = torch.zeros(max_seq_len, d_model)
        position = torch.arange(0., max_seq_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0., d_model, 2) * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return x plus the encodings of its first x.size(1) positions.

        x is indexed as (batch, seq_len, d_model); seq_len must not exceed
        max_seq_len. No scaling of the embeddings is performed.
        """
        # A buffer never requires grad, so the deprecated Variable wrapper
        # is unnecessary.
        return x + self.pe[:, :x.size(1)]

In [None]:
class Norm(nn.Module):
    """Normalisation over the last dimension with a learnable scale and shift.

    Computes alpha * (x - mean) / (std + eps) + bias, with mean and std taken
    over the last axis of the input.
    """

    def __init__(self, d_model, eps = 1e-6):
        super(Norm, self).__init__()
        self.size = d_model
        self.eps = eps
        # learnable scale (alpha) and shift (bias), one value per feature
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))

    def forward(self, x):
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias

def attention(q, k, v, d_k, mask=None, dropout=None):
    """Scaled dot-product attention.

    Scores are q @ k^T / sqrt(d_k); positions where `mask` equals 0 are set
    to -1e9 before the softmax over the last axis. `dropout`, if given, is
    applied to the attention weights. Returns the weights multiplied by v.
    """
    logits = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        logits = logits.masked_fill(mask == 0, -1e9)

    weights = F.softmax(logits, dim=-1)
    weights = weights if dropout is None else dropout(weights)
    return torch.matmul(weights, v)

class MultiHeadAttention(nn.Module):
    """Multi-head attention built on the module-level `attention` function.

    q, k and v are projected linearly, split into `heads` chunks of size
    d_k = d_model // heads, attended independently, concatenated and passed
    through a final linear layer.
    """

    def __init__(self, heads, d_model, dropout = 0.1):
        super().__init__()

        # Without this check the reshape in forward() would either fail or
        # silently regroup features across positions.
        if d_model % heads != 0:
            raise ValueError(
                "d_model ({}) must be divisible by heads ({})".format(d_model, heads))

        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        """Attend from q over k/v; inputs are indexed (batch, seq_len, d_model).

        `mask`, if given, gets a head axis inserted at position 1 so the same
        mask is applied to every head. Returns a tensor whose last dimension
        is d_model.
        """
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        bs = q.size(0)
        # project, then split the feature axis into (heads, d_k)
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)

        # move the head axis forward: bs * heads * seq_len * d_k
        k = k.transpose(1,2)
        q = q.transpose(1,2)
        v = v.transpose(1,2)

        attended = attention(q, k, v, self.d_k, mask, self.dropout)
        # concatenate heads and put through final linear layer
        concat = attended.transpose(1,2).contiguous().view(bs, -1, self.d_model)
        output = self.out(concat)
        return output

class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    The hidden width d_ff defaults to 2048; input and output widths are d_model.
    """

    def __init__(self, d_model, d_ff=2048, dropout = 0.1):
        super(FeedForward, self).__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        hidden = F.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

In [None]:
class EncoderLayer(nn.Module):
    """One pre-norm encoder layer: self-attention and feed-forward, each with a residual.

    Every sub-layer is applied to the normalised input, passed through
    dropout and added back to the un-normalised input.
    """

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        # Forward the configured dropout; previously the attention block always
        # fell back to its own default of 0.1.
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Apply the layer to x using `mask` for the self-attention."""
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2,x2,x2,mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x


In [None]:
def get_clones(module, N):
    """Return an nn.ModuleList holding N independent deep copies of `module`."""
    clones = nn.ModuleList()
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return clones

class Encoder(nn.Module):
    """Self-attention encoder: embedding, positional encoding, N encoder layers, final Norm."""

    def __init__(self, vocab_size, d_model = 300, N = 6, heads = 6, dropout = 0.1):
        super(Encoder, self).__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        # N deep copies of one template layer
        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        """Encode the token ids in `src`; `mask` is handed to every layer."""
        hidden = self.pe(self.embed(src))
        for depth in range(self.N):
            hidden = self.layers[depth](hidden, mask)
        return self.norm(hidden)

In [None]:
# Smoke test: push the first two training batches through a freshly built encoder.
# NOTE(review): `opt` is only assigned in a later cell (right after the
# Parameters class), so this cell raises NameError when the notebook is run
# top to bottom on a fresh kernel — confirm the intended cell order.
# No dropout argument is passed here, so Encoder's default of 0.1 applies.
encoder = Encoder(train_input_lang.n_words, opt.d_model, opt.n_layer, opt.heads).to(device)
for i, (src, i_l, trg, t_l) in enumerate(train_loader):
    if i > 1: 
        break
    # drop the last target position before building the masks
    target_input = trg[:, :-1]
    src_mask, trg_mask = create_masks(src, target_input, opt)
    preds = encoder(src, src_mask)

#### Decoder with Attention

In [None]:
class Attn(nn.Module):
    """Attention weights of a decoder state over the encoder outputs.

    NOTE(review): forward() always computes plain dot-product scores; the
    layers created for the 'general' and 'concat' methods are not used by it.
    Confirm whether the scoring was meant to depend on `method`.
    """

    def __init__(self, method, hidden_size, max_length=MAX_LENGTH):
        super(Attn, self).__init__()

        self.method = method
        self.hidden_size = hidden_size

        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # Keep this a real nn.Parameter: calling .to(device) on it returned
            # a plain tensor on GPU, so it was not registered with the module.
            # Zeros replace the previously uninitialised memory. The module's
            # own .to(device) moves it with everything else.
            self.other = nn.Parameter(torch.zeros(batch_size, 1, hidden_size))

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalised dot-product scores.

        `hidden` has its first two axes swapped and is batch-multiplied with
        `encoder_outputs` (last two axes swapped); the softmax runs over the
        last axis. Two leading singleton axes are added to the result, which
        the caller squeezes away again.
        """
        hidden = hidden.transpose(0, 1)
        encoder_outputs = encoder_outputs.transpose(1, 2)
        attn_energies = torch.bmm(hidden, encoder_outputs)
        result = F.softmax(attn_energies, dim = 2).unsqueeze(0).unsqueeze(0)
        return result

In [None]:
class AttnDecoderRNN(nn.Module):
    """Single-step GRU decoder with attention over the encoder outputs.

    The embedding layer is initialised from the pretrained matrix `english_wm`
    and stays trainable. Each step feeds the embedded previous token joined
    with the previous context vector into the GRU, attends over the encoder
    outputs with the new GRU output, and predicts log-probabilities over the
    output vocabulary from the GRU output joined with the new context.
    """

    def __init__(self, english_wm, attn_model, hidden_size, output_size, n_layers, dropout_p):
        super(AttnDecoderRNN, self).__init__()

        # Keep parameters for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = nn.Dropout(dropout_p)

        # Define layers
        embed_mat = torch.from_numpy(english_wm).float()
        n, embed_dim = embed_mat.shape
        self.embedding = nn.Embedding.from_pretrained(embed_mat, freeze = False)

        # The attribute is called `lstm` but holds a GRU; the name is kept so
        # existing checkpoints still load.
        self.lstm = nn.GRU(hidden_size  + embed_dim, hidden_size, n_layers, dropout=dropout_p)
        self.out = nn.Linear(hidden_size * 2, output_size)

        # Choose attention model
        if attn_model != 'none':
            self.attn = Attn(attn_model, hidden_size)

    def forward(self, word_input, last_context, last_hidden, encoder_outputs):
        """Run one decoding step.

        @param word_input: previous token ids, one row per batch element
        @param last_context: previous context vector, indexed (1, batch, hidden)
        @param last_hidden: previous GRU hidden state
        @param encoder_outputs: encoder states, indexed (batch, src_len, hidden)
        @return: (log-probabilities of shape (batch, output_size), new context,
                 new hidden state, attention weights)
        """
        # One time step for however many rows the batch really has; the
        # previous code reshaped with the globals n_layers and batch_size.
        step_batch = word_input.size(0)
        word_embedded = self.embedding(word_input).view(1, step_batch, -1) # S=1 x B x N
        word_embedded = self.dropout(word_embedded)
        rnn_input = torch.cat((word_embedded, last_context), 2)
        rnn_output, hidden = self.lstm(rnn_input, last_hidden)
        attn_weights = self.attn(rnn_output, encoder_outputs).squeeze(0).squeeze(0)
        context = attn_weights.bmm(encoder_outputs) # B x 1 x N
        output = F.log_softmax(self.out(torch.cat((rnn_output.transpose(0, 1), context), 2)), dim = 2).squeeze(1)
        return output, context, hidden, attn_weights

__Training__

In [None]:
def showPlot(points):
    """Plot `points` as a line with y-axis major ticks every 0.2."""
    # plt.subplots() already creates the figure; the extra plt.figure() call
    # that used to precede it left an empty figure open on every call.
    fig, ax = plt.subplots()
    ax.yaxis.set_major_locator(ticker.MultipleLocator(base=0.2))
    ax.plot(points)

In [None]:
import time
import math


def asMinutes(s):
    """Format a duration of `s` seconds as '<minutes>m <seconds>s'."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return 'elapsed (- remaining)' for work started at time `since`.

    `percent` is the completed fraction; the remaining time is extrapolated
    as elapsed / percent - elapsed.
    """
    elapsed = time.time() - since
    remaining = elapsed / (percent) - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [None]:
def train(input, target, input_len, target_len, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH, teach_forcing_ratio=0.5, encoder_cnn = False):
    """Run one optimisation step on a single batch.

    @param input: source token ids, one row per sentence
    @param target: target token ids, one row per sentence
    @param input_len: source lengths (not used)
    @param target_len: target lengths; the largest one sets the number of decoding steps
    @param encoder, decoder: the modules being trained
    @param encoder_optimizer, decoder_optimizer: their optimisers
    @param criterion: loss applied to the decoder output of every step
    @param max_length: accepted for compatibility; not used
    @param teach_forcing_ratio: probability that the whole batch is decoded with
           teacher forcing (gold token as next input) rather than with the
           decoder's own prediction
    @param encoder_cnn: when true the encoder is called without a mask
    @return: summed step loss divided by the number of decoding steps
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    n_batch = input.size(0)  # actual batch size rather than the global constant
    max_target_len = int(max(target_len))

    loss = 0

    if not encoder_cnn:
        # Only the source mask is needed by the encoder.
        src_mask, _ = create_masks(input, None, opt)
        encoder_output = encoder(input, src_mask)
    else:
        # NOTE(review): this branch used to store the result under another
        # name and then fail with NameError; it assumes the alternative encoder
        # returns per-position outputs like the default one — confirm.
        encoder_output = encoder(input)

    decoder_context = torch.zeros((1, n_batch, decoder.hidden_size), device = device)
    decoder_input = torch.tensor([[SOS_token]]*n_batch, device = device)
    # Initial decoder state: encoder output at the last source position.
    decoder_hidden = encoder_output.transpose(0, 1)[-1, :, :].unsqueeze(0).contiguous()
    encoder_outputs = encoder_output

    # Honour the argument; the global teacher_forcing_ratio used to be read
    # here, silently ignoring what the caller passed.
    use_teacher_forcing = random.random() < teach_forcing_ratio

    for di in range(max_target_len):
        decoder_output, decoder_context, decoder_hidden, attn_weights = decoder(
            decoder_input, decoder_context, decoder_hidden, encoder_outputs)
        decoder_context = decoder_context.transpose(0, 1)
        loss += criterion(decoder_output, target[:,di])
        if use_teacher_forcing:
            decoder_input = target[:,di].unsqueeze(1)  # Teacher forcing (batch_size, 1)
        else:
            # Feed back the most likely token, detached from the graph. The
            # loop is no longer cut short when the first sentence of the batch
            # emits EOS, which dropped the remaining steps of every other one.
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.detach()

    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.item() / float(max_target_len)

In [None]:
def exp_lr_scheduler(optimizer, epoch, lr_rate =0.001, lr_decay_epoch=7):
    """Step-decay the learning rate.

    When `epoch` is a multiple of `lr_decay_epoch`, the rate becomes
    lr_rate * 0.5 ** (epoch // lr_decay_epoch) and is written to every
    parameter group of `optimizer`; otherwise nothing changes.

    @return: (optimizer, learning rate now in effect for the given lr_rate)
    """
    if epoch % lr_decay_epoch == 0:
        lr_rate = lr_rate * (0.5**(epoch // lr_decay_epoch))
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr_rate
    return optimizer, lr_rate

In [None]:
def evaluate(encoder, decoder, input, input_len, target, target_len, encoder_cnn, max_length=MAX_LENGTH):
    """Greedy-decode one batch for `max_length` steps without tracking gradients.

    @param input: source token ids, one row per sentence
    @param input_len, target, target_len: accepted for compatibility; not used
    @param encoder_cnn: when true the encoder is called without a mask
    @return: integer array of shape (1, batch, max_length) with the predicted
             token id of every step
    """
    # Decode in eval mode so dropout is switched off, then restore whatever
    # mode the modules were in (this function is called from the training loop).
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            n_batch = input.size(0)  # actual batch size rather than the global constant

            if not encoder_cnn:
                src_mask, _ = create_masks(input, None, opt)
                encoder_output = encoder(input, src_mask)
            else:
                # NOTE(review): this branch used to fail with NameError; it
                # assumes the alternative encoder returns per-position
                # outputs like the default one — confirm.
                encoder_output = encoder(input)

            decoder_context = torch.zeros((1, n_batch, decoder.hidden_size), device = device)
            decoder_input = torch.tensor([[SOS_token]]*n_batch, device = device)
            decoder_hidden = encoder_output.transpose(0, 1)[-1, :, :].unsqueeze(0).contiguous()
            encoder_outputs = encoder_output

            decoded_words = []
            for di in range(max_length):
                # each step consumes the previous prediction, context and hidden state
                decoder_output, decoder_context, decoder_hidden, attn_weights = decoder(
                    decoder_input, decoder_context, decoder_hidden, encoder_outputs)
                decoder_context = decoder_context.transpose(0, 1)
                topv, topi = decoder_output.topk(1)
                decoded_words.append(topi.cpu().numpy())
                # topi already has one column per row; keeping that shape
                # also works for a batch of one.
                decoder_input = topi.detach()
            return np.asarray(decoded_words).T
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

In [None]:
def test(encoder, decoder, data_loader, encoder_cnn):
    """Decode every batch of `data_loader` and return the corpus BLEU score.

    Predictions are cut at the first PAD or EOS token, references at the first
    EOS token; both are mapped back to words with the training output
    vocabulary. The first prediction and the first reference are printed once.
    NOTE(review): the smooth/smooth_floor keywords follow an older sacrebleu
    signature — confirm against the installed version.
    """
    index2word = train_output_lang.index2word

    def to_sentence(token_ids, stop_ids):
        # Join the words of token_ids up to (excluding) the first stop id.
        words = []
        for token in token_ids:
            if token in stop_ids:
                break
            words.append(index2word[token])
        return ' '.join(words)

    candidate_corpus = []
    reference_corpus = []
    printed = 0  # 0: nothing shown yet, 1: prediction shown, 2: both shown

    for input, input_len, target, target_len in data_loader:
        decoded_words = evaluate(encoder, decoder, input, input_len, target, target_len, encoder_cnn)

        for ind in range(decoded_words.shape[1]):
            hypothesis = to_sentence(decoded_words[0][ind], (PAD_token, EOS_token))
            if printed == 0:
                print('predict: '+hypothesis)
                printed = 1
            candidate_corpus.append(hypothesis)

        for sent in target:
            reference = to_sentence((token.item() for token in sent), (EOS_token,))
            if printed == 1:
                print('target: '+reference)
                printed = 2
            reference_corpus.append(reference)

    return corpus_bleu(candidate_corpus, [reference_corpus], smooth='exp', smooth_floor=0.0, force=False).score

In [None]:
def trainIters(loader, encoder, decoder, n_iters, encoder_cnn, print_every=1000, plot_every=100, learning_rate=0.01, teacher_forcing_ratio = 0.9):
    """Train for `n_iters` epochs over `loader`, validating and checkpointing periodically.

    Every `print_every` batches the model is scored on the global val_loader
    with test(); if the BLEU score improves on the best so far, a checkpoint is
    written to save_path. A progress line is printed and appended to a log file.

    @param loader: DataLoader yielding (input, input_len, target, target_len)
    @param n_iters: number of epochs
    @param encoder_cnn: forwarded to train() and test()
    @param print_every: validation/logging interval in batches
    @param plot_every: interval for averaging losses into plot_losses
    @param learning_rate: Adam learning rate for both optimisers
    @param teacher_forcing_ratio: forwarded to train()
    """
    start = time.time()
    # NOTE(review): plot_losses is collected but neither returned nor plotted.
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    criterion = nn.NLLLoss()

    best_bleu = None
    save_path = '/scratch/wz1218/save_model/very_new.pt'
    log_path = '/scratch/wz1218/save_model/very_new.txt'
    for epoch in range(1, n_iters + 1):
        print_loss_total = 0  # Reset every print_every
        # Iterate the loader that was passed in; the global train_loader used
        # to be read here, which made the `loader` argument a no-op.
        for i, (input, input_len, target, target_len) in enumerate(loader):
            loss = train(input, target, input_len, target_len, encoder, decoder,
                         encoder_optimizer, decoder_optimizer, criterion,
                         max_length=MAX_LENGTH, teach_forcing_ratio=teacher_forcing_ratio, encoder_cnn = encoder_cnn)
            print_loss_total += loss
            plot_loss_total += loss

            if (i + 1) % print_every == 0:
                current_bleu = test(encoder, decoder, val_loader, encoder_cnn)
                # `is None` so that a legitimate score of 0.0 counts as a
                # previous best instead of "no score yet".
                if best_bleu is None or current_bleu > best_bleu:
                    # Update first so the checkpoint records the score of the
                    # weights it contains, not the previous best.
                    best_bleu = current_bleu
                    torch.save({
                                'epoch': epoch,
                                'encoder_state_dict': encoder.state_dict(),
                                'decoder_state_dict': decoder.state_dict(),
                                'encoder_optimizer_state_dict': encoder_optimizer.state_dict(),
                                'decoder_optimizer_state_dict': decoder_optimizer.state_dict(),
                                'train_loss': loss,
                                'best_BLEU': best_bleu
                                }, save_path)

                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                progress = ('%s (Epoch: %d %d%%) | Train Loss: %.4f | Best Bleu: %.4f | Current Blue: %.4f'
                            % (timeSince(start, epoch / n_iters), epoch, epoch / n_iters * 100, print_loss_avg, best_bleu, current_bleu))
                print(progress)
                with open(log_path, 'a') as f:
                    f.write(progress + '\n')
            if i % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0
        # Learning-rate decay via exp_lr_scheduler is currently disabled.


In [None]:
class Parameters:
    """Plain container for the encoder settings read elsewhere via attribute access.

    Holds the padding ids for source and target, the model width, the number
    of layers, the dropout rate and the number of attention heads.
    """

    def __init__(self, src_pad, trg_pad, d_model, n_layer, heads, dropout):
        self.src_pad, self.trg_pad = src_pad, trg_pad
        self.d_model, self.n_layer = d_model, n_layer
        self.dropout, self.heads = dropout, heads
# Batches are padded with PAD_token (2); the previous pad id of 0 is SOS_token,
# so create_masks never masked any padding position.
opt = Parameters(PAD_token, PAD_token, 512, 2, 8, 0.3)

In [None]:
# Build the final encoder/decoder pair and start training.
attn_model = 'general'  # attention variant handed to AttnDecoderRNN
dropout_p = 0.3  # decoder dropout probability
n_layers = 1  # number of GRU layers in the decoder
# NOTE(review): `clip` is not referenced anywhere in the visible code, so no
# gradient clipping actually takes place — confirm whether it was meant to be.
clip = 5.0
encoder = Encoder(train_input_lang.n_words, opt.d_model, opt.n_layer, opt.heads, opt.dropout).to(device)
attn_decoder = AttnDecoderRNN(english_wm, attn_model, hidden_size, train_output_lang.n_words, n_layers, dropout_p=dropout_p).to(device)
trainIters(train_loader, encoder, attn_decoder, n_iters=EPOCH_NUM, encoder_cnn=False, print_every=PRINT_FREQ, plot_every=1, learning_rate=LR_RATE, teacher_forcing_ratio = 0.9)