### DLP Lab5
The goal of this lab is to implement a conditional seq2seq VAE for English tense conversion.
1. Tense conversion: ‘access’ to ‘accessing’, or ‘accessed’ to ‘accesses’
2. Generative model: Gaussian noise + tense -> access, accesses, accessing, accessed

#### Requirements
1. Implement a conditional seq2seq VAE.
    * Modify encoder, decoder, and training functions
    * Implement evaluation function, dataloader, and reparameterization trick.
2. Plot and **compare** the CrossEntropy loss, KL loss and BLEU-4 score of testing data curves during training with different settings of your model
    * Teacher forcing ratio
    * KL annealing schedules (two methods)
3. Output the conversion results between tenses (from tense A to tense B)
4. Output the results generated by a Gaussian noise with 4 tenses.

#### Implementation details
1. Use LSTM
2. Log variance
3. Condition (tense)
    * Simply concatenate to the hidden_0 and z
    * Embed your condition to high dimensional space (or simply use one-hot)
4. KL loss annealing
    * Monotonic
    * Cyclical
5. Adopt BLEU-4 score function in NLTK (average 10 testing scores)
6. Adopt Gaussian_score() to compute the generation score
    * Randomly sample 100 noise vectors and decode each one with the 4 different tenses (400 words in total)
    * 4 words should exactly match the training data

#### Hyper parameters
* LSTM hidden size: 256 or 512
* Latent size: 32
* Condition embedding size: 8
* Teacher forcing ratio: 0~1 (>0.5) (??? 
* KL weight: 0~1 (???
* Learning rate: 0.05
* Optimizer: SGD
* Loss function: torch.nn.CrossEntropyLoss()

Date: 2020/05/

In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import gc
import random
import time
import math
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils import data
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
from os import system
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
# Token ids for start / end of sequence; they match the 'SOS' and 'EOS'
# entries of Vocabuary.word2index.
SOS_token = 0
EOS_token = 1
#----------Hyper Parameters----------#
# Width of the LSTM state before the tense condition is concatenated to it.
hidden_size = 256
# Dimension of the latent variable z.
laten_size = 32
# Width of the tense condition vector (condition_embedding emits 4-wide one-hots).
condition_size = 4
# Number of vocabulary entries (rows of the embedding tables).
# NOTE(review): presumably 4 special tokens + 26 letters -- confirm against
# Vocabuary.n_words after the vocabulary is built.
vocab_size = 30
teacher_forcing_ratio = 0.7
# NOTE(review): empty_input_ratio and KLD_weight are not read by any code
# visible in this notebook (kl_annealing uses its own local KLD_weight).
empty_input_ratio = 0.1
KLD_weight = 0.0
learning_rate = 0.01

In [3]:
def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'``.

    Both fields are rendered with ``%d``, so fractional seconds are truncated.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` for a job started at `since`.

    `since` is a ``time.time()`` timestamp and `percent` is the fraction of
    work already done (must be non-zero). The remaining time is extrapolated
    linearly from the elapsed time.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

### Fetch data
* train.txt: Each training pair includes 4 words: simple present(sp), third person(tp), present progressive(pg), simple past(p)
* test.txt: Each testing pair includes 2 words with a different combination of tenses

In [4]:
def getData(mode):
    """Load the raw words of one data split from ``./data``.

    mode: ``'train'`` or ``'test'``.

    Returns a pandas DataFrame (one row per line, space-delimited, no header)
    for ``'train'``, and a list of token lists for ``'test'`` where the second
    token of every line has its trailing newline stripped.
    """
    assert mode in ('train', 'test')

    if mode != 'train':
        pairs = []
        with open('./data/test.txt', 'r') as fp:
            for line in fp:
                tokens = line.split(' ')
                tokens[1] = tokens[1].strip('\n')
                pairs.append(tokens)
        return pairs

    return pd.read_csv('./data/' + mode + '.txt', delimiter=' ', header=None)

# Notebook display: show the training table (one row per verb, one column per tense).
getData("train")

Unnamed: 0,0,1,2,3
0,abandon,abandons,abandoning,abandoned
1,abet,abets,abetting,abetted
2,abdicate,abdicates,abdicating,abdicated
3,abduct,abducts,abducting,abducted
4,abound,abounds,abounding,abounded
...,...,...,...,...
1222,exhort,exhorts,exhorting,exhorted
1223,exhilarate,exhilarates,exhilarating,exhilarated
1224,exculpate,exculpates,exculpating,exculpated
1225,exasperate,exasperates,exasperating,exasperated


#### Vocabulary

In [5]:
class Vocabuary():
    """Character-level vocabulary built from the training words.

    Ids 0-3 are reserved for the special tokens SOS, EOS, PAD and UNK; every
    distinct character met in the training corpus gets the next free id.
    `max_length` records the length of the longest training word.
    """

    def __init__(self):
        specials = ('SOS', 'EOS', 'PAD', 'UNK')
        self.word2index = {token: idx for idx, token in enumerate(specials)}
        self.index2word = {idx: token for idx, token in enumerate(specials)}
        self.n_words = len(specials)
        self.max_length = 0
        self.build_vocab(getData('train'))

    def build_vocab(self, corpus):
        """Register every character of every word in the DataFrame `corpus`."""
        for _, row in corpus.iterrows():
            for word in row:
                self.max_length = max(self.max_length, len(word))

                for char in word:
                    if char in self.word2index:
                        continue
                    self.word2index[char] = self.n_words
                    self.index2word[self.n_words] = char
                    self.n_words += 1

    def word2indices(self, word, add_eos=False, add_sos=False):
        """Return `word` as a numpy array of character ids.

        Unknown characters map to the UNK id (3). SOS (0) is prepended and
        EOS (1) appended on request.
        """
        unk_id = 3
        ids = [self.word2index.get(char, unk_id) for char in word]

        if add_sos:
            ids = [0] + ids
        if add_eos:
            ids = ids + [1]

        return np.array(ids)

    def indices2word(self, indices):
        """Decode ids back to a string.

        Decoding stops at the first EOS; SOS, EOS and PAD ids (anything <= 2)
        produce no output.
        """
        eos_id = self.word2index['EOS']
        chars = []
        for idx in indices:
            if idx > 2:
                chars.append(self.index2word[idx])
            if idx == eos_id:
                # everything after EOS is ignored
                break
        return ''.join(chars)

In [6]:
# Smoke test: encode a word into character ids and decode it back again.
v = Vocabuary()
t = "exculpated"
idx = v.word2indices(t)
print(idx)
t = v.indices2word(idx)
print(t)

[12 29 14 15 20 17  4 13 12  7]
exculpated


#### DataLoader

In [7]:
class TenseLoader(data.Dataset):
    """Dataset over the train / test word files.

    In ``'train'`` mode an item is a list with one id tensor per column of the
    selected row. In any other mode an item is
    ``[(input_tense, input_ids), (target_tense, target_ids)]`` where the tense
    indices come from a fixed, position-based table.
    """

    # tense abbreviation -> tense index
    _TENSE_ORDER = {'sp': 0, 'tp': 1, 'pg': 2, 'p': 3}
    # (input tense, target tense) of the test pair at each position
    _TEST_CONDITIONS = (
        ("sp", "p"), ("sp", "pg"), ("sp", "tp"), ("sp", "tp"), ("p", "tp"),
        ("sp", "pg"), ("p", "sp"), ("pg", "sp"), ("pg", "p"), ("pg", "tp"),
    )

    def __init__(self, mode, vocab):
        self.mode = mode
        self.data = getData(mode)
        self.vocab = vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        encode = self.vocab.word2indices

        if self.mode == 'train':
            return [torch.tensor(encode(word)) for word in self.data.iloc[index, :]]

        source_name, target_name = self._TEST_CONDITIONS[index]
        source = (self._TENSE_ORDER[source_name], torch.tensor(encode(self.data[index][0])))
        target = (self._TENSE_ORDER[target_name], torch.tensor(encode(self.data[index][1])))
        return [source, target]

In [8]:
# Inspect one training item: a list of four id tensors, one per tense column.
trainset = TenseLoader('train', v)
trainset[12]

[tensor([ 4, 14, 21, 10, 12, 22, 12]),
 tensor([ 4, 14, 21, 10, 12, 22, 12,  9]),
 tensor([ 4, 14, 21, 10, 12, 22, 10,  6, 11]),
 tensor([ 4, 14, 21, 10, 12, 22, 12,  7])]

In [9]:
# Inspect one test item: [(input tense, input ids), (target tense, target ids)].
testset = TenseLoader('test', v)
testset[0]

[(0, tensor([4, 5, 4, 6, 7, 8, 6])),
 (3, tensor([ 4,  5,  4,  6,  7,  8,  6, 12,  7]))]

#### Show result
* Crossentropy loss curve
* KL loss curve
* BLEU-4 score curve

In [10]:
def show_result(scores, losses):
    """Draw the four training curves, each in its own 10x6 figure.

    scores: ``(bleu_scores, gaussian_scores)``.
    losses: ``(crossentropy_losses, kl_losses)``.

    Figures are drawn in the order CrossEntropy loss, KL loss, BLEU score,
    Gaussian score, with "Epochs" on the x axis.
    """
    bleu_score, gaussian_score = scores
    c_loss, kl_loss = losses

    # (title, y-axis label, series) in drawing order
    panels = (
        ("CrossEntropy Loss Curve", "Loss", c_loss),
        ("KL Loss Curve", "Loss", kl_loss),
        ("BLEU Score Curve", "Score", bleu_score),
        ("Gaussian Score Curve", "Score", gaussian_score),
    )

    for title, y_label, series in panels:
        plt.figure(figsize=(10, 6))
        plt.ylabel(y_label)
        plt.xlabel("Epochs")
        plt.title(title, fontsize=18)
        plt.plot(series)
        plt.show()

#### Condition
concatenate the condition part with the initial hidden part 
* nn.Embedding
* One-hot

In [11]:
def condition_embedding(condition_size, batch_size=1, condition=None):
    """Return one-hot tense condition vectors (always 4 wide, float32).

    condition_size: accepted for call compatibility but not used; the width is
        fixed by the four tenses. Callers pass inconsistent values here, so it
        must stay ignored.
    batch_size: number of rows per tensor when `condition` is None (values
        below 1 behave like 1).
    condition: a tense selector -- an integer index 0-3 (plain int, numpy
        integer or 0-d tensor) or one of the names 'sp', 'tp', 'pg', 'p'.

    Returns a ``(1, 4)`` tensor when `condition` is given. Otherwise returns a
    list of four ``(batch_size, 4)`` tensors in the order sp, tp, pg, p.
    Raises KeyError for an unknown tense.
    """
    order = {'sp': 0, 'tp': 1, 'pg': 2, 'p': 3}
    one_hot = {0: [1, 0, 0, 0], 1: [0, 1, 0, 0], 2: [0, 0, 1, 0], 3: [0, 0, 0, 1]}

    if condition is not None:
        # int() lets tensor / numpy scalars select a row; names go through `order`.
        tense_index = order[condition] if isinstance(condition, str) else int(condition)
        return torch.tensor(one_hot[tense_index], dtype=torch.float).view(1, -1)

    # Tile each one-hot row in a single op instead of growing it with
    # repeated torch.cat calls.
    rows = max(batch_size, 1)
    return [
        torch.tensor(one_hot[tense_index], dtype=torch.float).view(1, -1).repeat(rows, 1)
        for tense_index in order.values()
    ]
                     
# Notebook display: the four tense one-hots, each expanded to a batch of 32 rows.
condition_embedding(condition_size, 32)

[tensor([[1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.]]), tensor([[0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],


### Evaluation

In [12]:
#compute BLEU-4 score
def compute_bleu(output, reference):
    """
    Score a predicted word against its reference with NLTK's sentence_bleu,
    treating each character as a token (smoothing method1).

    reference = 'accessed'
    output = 'access'
    return BLEU score

    Three-character references are scored with 3-gram weights; every other
    length uses the usual BLEU-4 weights.
    """
    smoother = SmoothingFunction()
    weights = (0.33,0.33,0.33) if len(reference) == 3 else (0.25,0.25,0.25,0.25)
    return sentence_bleu(
        [reference],
        output,
        weights=weights,
        smoothing_function=smoother.method1,
    )

In [13]:
def BLEU_predict(encoder, decoder, vocab, batch_size=1, condition_size=8, plot_pred=False):
    """Convert every test word to its target tense and return the predictions.

    Each input word is encoded under its own tense, a latent sample is drawn
    via `reparameterize`, and the decoder is run greedily under the target
    tense for as many steps as the target word has characters.

    The `batch_size` argument is overridden (words are processed one at a
    time) and `plot_pred` is not used. Returns a list of predicted strings in
    test-set order.
    """
    testset = TenseLoader('test', vocab)
    # one word per forward pass, whatever was passed in
    batch_size = 1
    outputs = []

    with torch.no_grad():
        for sample_idx in range(len(testset)):
            (input_tense, input_word), (target_tense, target_word) = testset[sample_idx]

            input_embedded_tense = condition_embedding(condition_size, condition=input_tense)
            target_embedded_tense = condition_embedding(condition_size, condition=target_tense)

            # (seq_len,) -> (seq_len, 1)
            input_tensor = input_word.to(device).view(-1, 1)
            target_tensor = target_word.to(device).view(-1, 1)

            #----------encoder: consume the word one character at a time----------#
            encoder_hidden = encoder.initHidden(input_embedded_tense, batch_size)
            for step_input in input_tensor:
                _, encoder_hidden = encoder(step_input, encoder_hidden)

            # reparameterization trick, then project the latent to the decoder state size
            mu, logvar = encoder.variational(encoder_hidden)
            latent_state = decoder.in_layer(reparameterize(mu, logvar))

            #----------decoder: greedy decoding under the target tense----------#
            decoder_hidden = decoder.initHidden(latent_state, target_embedded_tense, batch_size)
            decoder_input = torch.tensor([[SOS_token]] * batch_size, device=device)

            target_length = target_tensor.size(0)
            predicted = torch.zeros(target_length, batch_size)
            for step in range(target_length):
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                # feed the arg-max character back in, detached from history
                decoder_input = decoder_output.topk(1)[1].squeeze().detach()
                predicted[step] = decoder_input

            # (target_length, batch_size) -> flat sequence of ids
            flat_ids = predicted.transpose(0, 1).view(-1)
            outputs.append(vocab.indices2word(flat_ids.data.numpy()))

    return outputs

In [14]:
# print the prediction and return the bleu score
def BLEU_score(prediction, plot_pred=False):
    """Return the mean BLEU score of `prediction` against the test targets.

    `prediction[i]` is compared with the second word of test pair `i`. With
    `plot_pred` each input / target / prediction triple is printed.
    """
    pairs = getData('test')

    running_total = 0
    for idx, predicted_word in enumerate(prediction):
        running_total += compute_bleu(predicted_word, pairs[idx][1])

        if plot_pred:
            output = "\ninput:  {}\ntarget: {}\npred:   {}".format(pairs[idx][0], pairs[idx][1], predicted_word)
            print ("="*30+output)

    return running_total/len(prediction)

In [15]:
def Gaussian_predict(encoder, decoder, vocab, batch_size=64, laten_size=32, condition_size=8, plot_pred=False):
    """Generate words from Gaussian noise, once per tense.

    100 latent vectors are sampled (the `batch_size` argument is overridden)
    and the same vectors are decoded greedily under each of the four tense
    conditions for `vocab.max_length` steps. `encoder` and `plot_pred` are
    not used.

    Returns a list of 100 entries, each a list of four strings ordered like
    the tenses returned by `condition_embedding`.
    """
    with torch.no_grad():
        batch_size = 100

        # sample 100 Gaussian latents, shaped (1, batch_size, laten_size)
        laten_variable = torch.randn((batch_size, laten_size), device=device).view(1, batch_size, -1)

        # one (batch_size, 4) condition tensor per tense
        embedded_tenses = condition_embedding(condition_size , batch_size)

        max_steps = vocab.max_length
        # (tense, seq_len, batch_size)
        output_tensors = torch.zeros(4, max_steps, batch_size)

        for tense_idx, embedded_tense in enumerate(embedded_tenses):
            decoder_input = torch.tensor([[SOS_token]] * batch_size, device=device)
            decoder_hidden = decoder.initHidden(decoder.in_layer(laten_variable), embedded_tense, batch_size)

            #----------greedy decoding----------#
            for step in range(max_steps):
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                # feed the arg-max character back in, detached from history
                decoder_input = decoder_output.topk(1)[1].squeeze().detach()
                output_tensors[tense_idx, step] = decoder_input

        # (tense, seq_len, batch_size) -> (batch_size, tense, seq_len)
        per_sample = output_tensors.permute(2, 0, 1)

        return [
            [vocab.indices2word(sequence.data.numpy()) for sequence in sample]
            for sample in per_sample
        ]

In [16]:
# compute generation score
def Gaussian_score(predictions, plot_pred=False):
    """
    Fraction of generated 4-tense groups that appear verbatim in train.txt.

    the order should be : simple present, third person, present progressive, past
    predictions = [['consult', 'consults', 'consulting', 'consulted'],...]
    return Gaussian_score score

    A prediction scores once for every training line it equals, so all four
    words have to match the same line. With plot_pred each prediction is
    printed.
    """
    with open('./data/train.txt','r') as fp:
        words_list = []
        for line in fp:
            tenses = line.split(' ')
            tenses[3] = tenses[3].strip('\n')
            words_list.append(tenses)

    score = 0
    for candidate in predictions:
        if plot_pred:
            print (candidate)
        score += words_list.count(candidate)
    return score/len(predictions)

In [17]:
def evaluate(encoder, decoder, vocab, batch_size=64, laten_size=32, condition_size=8, plot_pred=False):
    """Compute both evaluation metrics for the current model.

    First words are generated from Gaussian noise and scored with
    `Gaussian_score`, then the test pairs are converted and scored with
    `BLEU_score`. With `plot_pred` the predictions and both scores are
    printed.

    Returns ``(bleu_score, gaussian_score)``.
    """
    # generation from noise -> Gaussian score
    generated = Gaussian_predict(
        encoder, decoder, vocab,
        batch_size=batch_size, laten_size=laten_size,
        condition_size=condition_size, plot_pred=plot_pred,
    )
    gaussian_score = Gaussian_score(generated, plot_pred=plot_pred)
    if plot_pred:
        print ("Gaussian score: %.2f"%gaussian_score)

    # tense conversion on test.txt -> BLEU score
    converted = BLEU_predict(
        encoder, decoder, vocab,
        batch_size=1, condition_size=condition_size, plot_pred=False,
    )
    bleu_score = BLEU_score(converted, plot_pred=plot_pred)
    if plot_pred:
        print ("BLEU score: %.2f"%bleu_score)

    return bleu_score, gaussian_score

#### Reparameterization Trick

In [18]:
def reparameterize(mu, logvar):
    """Sample ``z = mu + sigma * eps`` with ``eps ~ N(0, I)``.

    `logvar` is the log-variance, so ``sigma = exp(0.5 * logvar)``. Drawing
    the noise separately keeps the sample differentiable w.r.t. `mu` and
    `logvar`.
    """
    sigma = torch.exp(0.5*logvar)
    noise = torch.randn_like(sigma)
    return mu + noise*sigma

### Encoder

In [19]:
class EncoderRNN(nn.Module):
    """Character-level LSTM encoder with the tense condition in its state.

    The LSTM state is ``hidden_size + condition_size`` wide: the condition
    vector is concatenated onto a zero hidden state in `initHidden`. Two
    linear heads (`fc1`, `fc2`) map the final hidden state to the latent
    size; callers unpack them as ``mu, logvar``.
    """

    def __init__(self, input_size, hidden_size, laten_size, condition_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.condition_size = condition_size
        state_width = hidden_size + condition_size

        self.fc1 = nn.Linear(state_width, laten_size)
        self.fc2 = nn.Linear(state_width, laten_size)

        self.embedding = nn.Embedding(input_size, state_width)
        self.lstm = nn.LSTM(state_width, state_width)

    def forward(self, input, hidden):
        """Run one time step; returns ``(output, (h, c))`` from the LSTM."""
        state_width = self.hidden_size + self.condition_size
        embedded = self.embedding(input).view(1, -1, state_width)
        return self.lstm(embedded, hidden)

    def variational(self, hidden):
        """Apply both latent heads to the hidden part of the ``(h, c)`` state."""
        h_n = hidden[0]
        return self.fc1(h_n), self.fc2(h_n)

    def initHidden(self, embedded_tense, batch_size=64):
        """Build the initial ``(h0, c0)``: zeros with the tense condition appended."""
        condition = embedded_tense.to(device).view(1, batch_size, -1)
        blank = torch.zeros(1, batch_size, self.hidden_size, device=device)
        h0 = torch.cat((blank, condition), 2)
        c0 = torch.cat((blank, condition), 2)
        return (h0, c0)

### Decoder

In [20]:
class DecoderRNN(nn.Module):
    """Character-level LSTM decoder conditioned on a latent vector and a tense.

    `in_layer` projects the latent sample to `hidden_size`; `initHidden` then
    appends the tense condition, giving an LSTM state of width
    ``hidden_size + condition_size``. `forward` returns raw scores over the
    vocabulary (the `softmax` module is created but not applied in `forward`).
    """

    def __init__(self, latent_size, hidden_size, output_size, condition_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.condition_size = condition_size
        state_width = hidden_size + condition_size

        self.in_layer = nn.Linear(latent_size, hidden_size)
        self.embedding = nn.Embedding(output_size, state_width)
        self.lstm = nn.LSTM(state_width, state_width)
        self.out = nn.Linear(state_width, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one decoding step; returns ``(scores, (h, c))``."""
        state_width = self.hidden_size + self.condition_size
        embedded = F.relu(self.embedding(input).view(1, -1, state_width))
        lstm_out, hidden = self.lstm(embedded, hidden)
        scores = self.out(lstm_out[0])
        return scores, hidden

    def initHidden(self, hidden_state, embedded_tense, batch_size):
        """Build the initial ``(h0, c0)``.

        h0 is the projected latent with the tense condition appended; c0 is
        zeros with the same condition appended.
        """
        condition = embedded_tense.to(device).view(1, batch_size, -1)
        blank = torch.zeros(1, batch_size, self.hidden_size, device=device)
        h0 = torch.cat((hidden_state, condition), 2)
        c0 = torch.cat((blank, condition), 2)
        return (h0, c0)

### Train
* Use teacher forcing
* Use KL loss annealing

In [21]:
def kl_annealing(epochs, mode):
    """Return the KL weight for epoch index `epochs`.

    mode == "monotonic": ramps linearly by 0.05 per epoch and stays at 1
        after epoch 20.
    mode == "cyclical": 20-epoch cycles; within a cycle the weight ramps by
        0.1 per epoch and stays at 1 once the position in the cycle exceeds 10.

    Any other mode trips the assertion.
    """
    assert mode in ("monotonic", "cyclical")

    if mode == "monotonic":
        return 1 if epochs > 20 else 0.05 * epochs

    phase = epochs % 20
    return 1 if phase > 10 else 0.1 * phase

In [22]:
def compute_kl_loss(mu, logvar):
    """KL divergence between N(mu, exp(logvar)) and the standard normal.

    Computes ``-0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)``, summed over
    every element of the inputs, and returns it as a 0-d tensor.
    """
    per_element = 1 + logvar - mu.pow(2) - logvar.exp()
    return -0.5 * torch.sum(per_element)

In [23]:
def train(input_tensors, encoder, decoder, encoder_optimizer, decoder_optimizer, 
          criterion, epochs, condition_size, teacher_forcing_ratio):
    """Run one optimisation step per tense on a single training row.

    For each of the four tenses the word is encoded under that tense, a
    latent sample is drawn, and the decoder is trained to reconstruct the
    same word under the same tense. The loss is the summed per-character
    cross entropy plus the KL term weighted by the monotonic
    `kl_annealing(epochs, ...)` schedule.

    input_tensors: the four id tensors of one row, indexed by tense.
        NOTE(review): each is assumed to be (1, seq_len), i.e. collated with
        a DataLoader batch size of 1 -- the batch size is hard-coded below.
    epochs: zero-based epoch index, only used for the KL weight.
    teacher_forcing_ratio: probability (per tense) of feeding the ground
        truth character, rather than the model's own prediction, as the next
        decoder input.

    Returns ``(ce, kl)`` as plain Python floats: the per-character cross
    entropy and the un-weighted KL loss, each averaged over the four tenses.
    """
    batch_size = 1
    # one (batch_size, 4) condition tensor per tense
    embedded_tenses = condition_embedding(condition_size, batch_size)

    # running sums over the 4 tenses (plain numbers, detached from autograd)
    kl_loss_total = 0
    ce_loss_total = 0

    for index, embedded_tense in enumerate(embedded_tenses):
        input_tensor = input_tensors[index].to(device)
        # (batch_size, seq_len) -> (seq_len, batch_size)
        input_tensor = input_tensor.transpose(0, 1)

        # init encoder hidden state and cat condition
        encoder_hidden = encoder.initHidden(embedded_tense, batch_size)

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        # number of time steps
        input_length = input_tensor.size(0)

        loss = 0
        ce_loss = 0

        #----------sequence to sequence part for encoder----------#
        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(
                input_tensor[ei], encoder_hidden)

        # reparameterization trick
        mu, logvar = encoder.variational(encoder_hidden)
        reparameterized_state = reparameterize(mu, logvar)
        reparameterized_state = decoder.in_layer(reparameterized_state)

        kl_loss = compute_kl_loss(mu, logvar)
        # .item() keeps the running total a plain float. Accumulating the
        # tensor itself would keep every step's autograd graph alive and hand
        # a requires-grad tensor back to the caller.
        kl_loss_total += kl_loss.item()
        loss += kl_annealing(epochs, "monotonic") * kl_loss

        # init decoder hidden state and cat condition
        decoder_hidden = decoder.initHidden(reparameterized_state, embedded_tense, batch_size)

        decoder_input = torch.tensor([[SOS_token] for i in range(batch_size)], device=device)

        # decided once per word, not per character
        use_teacher_forcing = random.random() < teacher_forcing_ratio

        #----------sequence to sequence part for decoder----------#
        for di in range(input_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            ce_loss += criterion(decoder_output, input_tensor[di])

            if use_teacher_forcing:
                # Teacher forcing: feed the target as the next input
                decoder_input = input_tensor[di]
            else:
                # Otherwise feed back the model's own arg-max prediction
                topv, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze().detach()  # detach from history as input

        loss += ce_loss
        ce_loss_total += (ce_loss.item()/input_length)
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

    return ce_loss_total/4, kl_loss_total/4

In [24]:
def trainIters(encoder, decoder, vocab, n_iters, print_every=1000, plot_every=100, 
               batch_size=64, learning_rate=0.01, laten_size=32, condition_size=8, teacher_forcing_ratio=1.0):
    """Train the encoder/decoder pair for `n_iters` epochs and track metrics.

    Every epoch iterates once over the shuffled training set (one `train`
    call per batch), then evaluates BLEU and Gaussian scores. Both models are
    saved under ``./models/`` whenever BLEU > 0.1 and Gaussian score > 0.2.

    print_every: log every this many epochs; the logged losses are the
        per-batch losses summed since the last log, divided by `print_every`.
    plot_every: accepted for compatibility; a history point is recorded every
        epoch regardless.
    batch_size: DataLoader batch size. NOTE(review): `train` hard-codes a
        batch size of 1, so other values are presumably unsupported -- confirm.

    Returns ``((bleu_scores, gaussian_scores), (crossentropy_losses,
    kl_losses))`` with one entry per epoch; the loss entries are the
    per-batch losses summed over the epoch, stored as Python floats.
    """
    # Local import: the notebook's import cell only brings in os.system.
    import os

    start = time.time()

    # Reset every print_every, for print log
    print_ce_loss_total = 0
    print_kl_loss_total = 0
    # Reset every epoch, for plot curve
    crossentropy_losses = []
    kl_losses = []
    plot_ce_loss_total = 0
    plot_kl_loss_total = 0
    # scores
    gaussian_scores = []
    bleu_scores = []

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    # create dataloader
    trainset = TenseLoader('train', vocab)
    trainloader = data.DataLoader(trainset, batch_size, shuffle = True)

    criterion = nn.CrossEntropyLoss()

    for epoch in range(1, n_iters + 1):
        for input_tensors in trainloader:
            ce_loss, kl_loss = train(input_tensors, encoder, decoder, encoder_optimizer, decoder_optimizer, 
                                     criterion, (epoch-1), condition_size, teacher_forcing_ratio)
            # Store plain floats. A graph-attached tensor in these totals
            # would retain autograd history for the whole epoch and cannot be
            # handed to matplotlib afterwards.
            ce_loss = float(ce_loss)
            kl_loss = float(kl_loss)
            print_ce_loss_total += ce_loss
            print_kl_loss_total += kl_loss
            plot_ce_loss_total += ce_loss
            plot_kl_loss_total += kl_loss

        # evaluate and save model
        bleu_score, gaussian_score = evaluate(encoder, decoder, vocab, laten_size=laten_size, condition_size=condition_size, plot_pred=False)
        bleu_scores.append(bleu_score)
        gaussian_scores.append(gaussian_score)
        if bleu_score > 0.1 and gaussian_score > 0.2:
            print ("Model save...")
            # make sure the checkpoint directory exists before the first save
            os.makedirs("./models", exist_ok=True)
            torch.save(encoder, "./models/encoder_{:.4f}_{:.4f}.ckpt".format(gaussian_score, bleu_score))
            torch.save(decoder, "./models/decoder_{:.4f}_{:.4f}.ckpt".format(gaussian_score, bleu_score))

        if epoch % print_every == 0:
            print_ce_loss_avg = print_ce_loss_total / print_every
            print_kl_loss_avg = print_kl_loss_total / print_every
            print('%s (%d %d%%) CE Loss: %.4f, KL Loss: %.4f, BLEU score: %.2f, Gaussian score: %.2f' % 
                  (timeSince(start, epoch / n_iters), epoch, epoch / n_iters * 100, print_ce_loss_avg, print_kl_loss_avg, bleu_score, gaussian_score))
            print_ce_loss_total = 0
            print_kl_loss_total = 0

        crossentropy_losses.append(plot_ce_loss_total)
        plot_ce_loss_total = 0
        kl_losses.append(plot_kl_loss_total)
        plot_kl_loss_total = 0

        gc.collect()

    return (bleu_scores, gaussian_scores), (crossentropy_losses, kl_losses)

In [None]:
# Build the models from the notebook-level hyper parameters and train for 100
# epochs with batch size 1, logging after every epoch, then plot the curves.
encoder1 = EncoderRNN(vocab_size, hidden_size, laten_size, condition_size).to(device)
decoder1 = DecoderRNN(laten_size, hidden_size, vocab_size, condition_size).to(device)
vocab = Vocabuary()
scores, losses = trainIters(encoder1, decoder1, vocab, 100, print_every=1, plot_every=1, 
                            batch_size=1, learning_rate=0.01, laten_size=32, condition_size=4, teacher_forcing_ratio=0.7)
show_result(scores, losses)

0m 41s (- 68m 42s) (1 1%) CE Loss: 3004.9755, KL Loss: 1373.0375, BLEU score: 0.05, Gaussian score: 0.00
1m 23s (- 68m 29s) (2 2%) CE Loss: 2752.4332, KL Loss: 2109.3921, BLEU score: 0.08, Gaussian score: 0.00
2m 8s (- 69m 8s) (3 3%) CE Loss: 2617.9485, KL Loss: 2623.5410, BLEU score: 0.08, Gaussian score: 0.00
2m 53s (- 69m 27s) (4 4%) CE Loss: 2508.6193, KL Loss: 2831.7866, BLEU score: 0.08, Gaussian score: 0.00
3m 37s (- 68m 57s) (5 5%) CE Loss: 2390.9205, KL Loss: 3479.9385, BLEU score: 0.08, Gaussian score: 0.00
4m 22s (- 68m 28s) (6 6%) CE Loss: 2255.9658, KL Loss: 3813.6475, BLEU score: 0.09, Gaussian score: 0.00
5m 6s (- 67m 49s) (7 7%) CE Loss: 2155.1569, KL Loss: 4249.3066, BLEU score: 0.09, Gaussian score: 0.00
5m 50s (- 67m 15s) (8 8%) CE Loss: 2115.7320, KL Loss: 4260.1543, BLEU score: 0.11, Gaussian score: 0.00
6m 35s (- 66m 40s) (9 9%) CE Loss: 2063.5628, KL Loss: 4219.1533, BLEU score: 0.11, Gaussian score: 0.00
7m 20s (- 66m 0s) (10 10%) CE Loss: 1987.6929, KL Loss: 41

In [None]:
# Final evaluation with predictions and both scores printed.
# NOTE(review): condition_size falls back to evaluate()'s default of 8 here,
# unlike the 4 used for training above -- confirm this is intended.
_, _  = evaluate(encoder1, decoder1, vocab, batch_size=32, plot_pred=True)