### DLP Lab5
The goal of this lab is to implement a conditional seq2seq VAE for English tense conversion.
1. Tense conversion: ‘access’ to ‘accessing’, or ‘accessed’ to ‘accesses’
2. Generative model: Gaussian noise + tense -> access, accesses, accessing, accessed

#### Requirements
1. Implement a conditional seq2seq VAE.
    * Modify encoder, decoder, and training functions
    * Implement evaluation function, dataloader, and reparameterization trick.
2. Plot and **compare** the CrossEntropy loss, KL loss and BLEU-4 score of testing data curves during training with different settings of your model
    * Teacher forcing ratio
    * KL annealing schedules (two methods)
3. Output the conversion results between tenses (from tense A to tense B)
4. Output the results generated by a Gaussian noise with 4 tenses.

#### Implementation details
1. Use LSTM
2. Log variance
3. Condition (tense)
    * Simply concatenate to the hidden_0 and z
    * Embed your condition to high dimensional space (or simply use one-hot)
4. KL loss annealing
    * Monotonic
    * Cyclical
5. Adopt BLEU-4 score function in NLTK (average 10 testing scores)
6. Adopt Gaussian_score() to compute the generation score
    * Randomly sample 100 noise vectors and generate one word per tense from each (400 words in total)
    * A sample only scores when all 4 generated words exactly match one row of the training data

#### Hyper parameters
* LSTM hidden size: 256 or 512
* Latent size: 32
* Condition embedding size: 8
* Teacher forcing ratio: 0~1 (>0.5) (??? 
* KL weight: 0~1 (???
* Learning rate: 0.05
* Optimizer: SGD
* Loss function: torch.nn.CrossEntropyLoss()

Date: 2020/05/

In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import time
import math
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils import data
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
from os import system
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

In [2]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Reserved token indices; they match the 'SOS' and 'EOS' entries of Vocabuary.
SOS_token = 0
EOS_token = 1
#----------Hyper Parameters----------#
# Encoder hidden width, before the condition embedding is appended.
hidden_size = 256
# Width of the latent vector; also passed to DecoderRNN as its hidden size.
laten_size = 32
# Width of the tense (condition) embedding.
condition_size = 8
# Number of vocabulary entries (presumably 4 special tokens + 26 letters -- confirm).
vocab_size = 30
teacher_forcing_ratio = 1.0
empty_input_ratio = 0.1
KLD_weight = 0.0
# NOTE(review): the visible trainIters call does not pass this value, so
# trainIters falls back to its own default of 0.01 -- confirm which is intended.
learning_rate = 0.05

In [3]:
def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``.

    The seconds field is right-aligned to a width of two characters and
    both fields are truncated to integers by the ``%d`` conversion.
    """
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %2ds' % (minutes, seconds)


def timeSince(since, percent):
    """Return elapsed time and estimated remaining time as one string.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work already done (must be non-zero).

    Returns:
        ``'<elapsed> (- <remaining>)'`` with both parts formatted by
        ``asMinutes``. The remaining time is extrapolated linearly.
    """
    elapsed = time.time() - since
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

### Fetch data
* train.txt: Each training pair includes 4 words: simple present(sp), third person(tp), present progressive(pg), simple past(p)
* test.txt: Each testing pair includes 2 words with a different combination of tenses

In [4]:
def getData(mode):
    """Load the raw word data for one split.

    Args:
        mode: ``'train'`` or ``'test'``; anything else fails the assertion.

    Returns:
        For ``'train'``: a pandas DataFrame read from ``./data/train.txt``
        (space-delimited, no header row).
        For ``'test'``: a list with one entry per line of
        ``./data/test.txt``; each entry is the line split on single spaces,
        with the newline stripped from the second field.
    """
    assert mode == 'train' or mode == 'test'
    if mode != 'train':
        pairs = []
        with open('./data/test.txt','r') as fp:
            for line in fp:
                fields = line.split(' ')
                fields[1] = fields[1].strip('\n')
                pairs.append(fields)
        return pairs
    return pd.read_csv('./data/'+mode+'.txt', delimiter=' ', header=None)

# Preview the training table; the notebook displays the returned DataFrame.
getData("train")

Unnamed: 0,0,1,2,3
0,abandon,abandons,abandoning,abandoned
1,abet,abets,abetting,abetted
2,abdicate,abdicates,abdicating,abdicated
3,abduct,abducts,abducting,abducted
4,abound,abounds,abounding,abounded
...,...,...,...,...
1222,exhort,exhorts,exhorting,exhorted
1223,exhilarate,exhilarates,exhilarating,exhilarated
1224,exculpate,exculpates,exculpating,exculpated
1225,exasperate,exasperates,exasperating,exasperated


#### Vocabulary

In [5]:
class Vocabuary():
    """Character-level vocabulary built from the training words.

    Indices 0-3 are reserved for the 'SOS', 'EOS', 'PAD' and 'UNK' tokens;
    every other index is a character, numbered in order of first appearance
    in the training data. ``max_length`` is the longest training word and
    sets the padded length produced by ``word2indices``.
    """

    def __init__(self):
        self.word2index = {'SOS': 0, 'EOS': 1, 'PAD': 2, 'UNK': 3}
        self.index2word = {index: token for token, index in self.word2index.items()}
        self.n_words = len(self.word2index)
        self.max_length = 0
        self.build_vocab(getData('train'))

    def build_vocab(self, corpus):
        """Register every character of every word in the DataFrame ``corpus``."""
        for row in range(corpus.shape[0]):
            for word in corpus.iloc[row, :]:
                self.max_length = max(self.max_length, len(word))
                for char in word:
                    if char in self.word2index:
                        continue
                    # next free index goes to the unseen character
                    self.word2index[char] = self.n_words
                    self.index2word[self.n_words] = char
                    self.n_words += 1

    def word2indices(self, word, add_eos=False, add_sos=False):
        """Encode ``word`` as a numpy array of indices, padded with PAD (2).

        Unknown characters map to UNK (3). SOS (0) is prepended and EOS (1)
        appended on request. Padding is ``max_length - len(word)`` entries,
        so the optional SOS/EOS tokens come on top of ``max_length``.
        """
        lookup = self.word2index
        indices = [lookup.get(char, 3) for char in word]
        if add_sos:
            indices = [0] + indices
        if add_eos:
            indices = indices + [1]
        indices += [2] * (self.max_length - len(word))
        return np.array(indices)

    def indices2word(self, indices):
        """Decode indices to a string, dropping SOS, EOS and PAD (index <= 2)."""
        return ''.join(self.index2word[idx] for idx in indices if idx > 2)

In [6]:
# Round-trip check: encode a word as padded indices, then decode it back.
v = Vocabuary()
t = "exculpated"
idx = v.word2indices(t)
print(idx)
t = v.indices2word(idx)
print(t)

[12 29 14 15 20 17  4 13 12  7  2  2  2  2  2]
exculpated


#### DataLoader

In [7]:
class TenseLoader(data.Dataset):
    """Dataset of index-encoded words for the tense-conversion task.

    Args:
        mode: ``'train'`` or ``'test'``, forwarded to ``getData``.
        vocab: object providing ``word2indices(word)``, which must return
            equally long index arrays for every word.

    Items:
        * train: long tensor with one row per column of the training
          table, i.e. one encoded word per tense.
        * test: long tensor with two rows, the encoded input word and the
          encoded target word of that test line.
    """

    # (source tense, target tense) pairs for the test items.
    # Presumably aligned with the lines of test.txt -- confirm against the data.
    test_conditions = [["sp", "p"], ["sp", "pg"], ["sp", "tp"], ["sp", "tp"], ["p", "tp"],
                       ["sp", "pg"], ["p", "sp"], ["pg", "sp"], ["pg", "p"], ["pg", "tp"]]

    def __init__(self, mode, vocab):
        self.mode = mode
        self.data = getData(self.mode)
        self.vocab = vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        if self.mode == 'train':
            words = self.data.iloc[index, :]
        else:
            # A test row is [input word, target word]. The original encoded
            # the input word twice and then returned an undefined variable.
            words = self.data[index][:2]
        encoded = np.stack([self.vocab.word2indices(word) for word in words])
        # Explicit long dtype: nn.Embedding needs integer indices, and numpy's
        # default integer width differs between platforms.
        return torch.tensor(encoded, dtype=torch.long)

In [8]:
# Build the training dataset and inspect its first item (one encoded row per tense).
trainset = TenseLoader('train', v)
trainset[0]

tensor([[ 4,  5,  4,  6,  7,  8,  6,  2,  2,  2,  2,  2,  2,  2,  2],
        [ 4,  5,  4,  6,  7,  8,  6,  9,  2,  2,  2,  2,  2,  2,  2],
        [ 4,  5,  4,  6,  7,  8,  6, 10,  6, 11,  2,  2,  2,  2,  2],
        [ 4,  5,  4,  6,  7,  8,  6, 12,  7,  2,  2,  2,  2,  2,  2]])

#### Show result
* Crossentropy loss curve
* KL loss curve
* BLEU-4 score curve

In [9]:
def show_result(score, c_loss, kl_loss):
    """Plot the BLEU score, cross-entropy loss and KL loss on one figure.

    Args:
        score: sequence of BLEU scores, one per epoch.
        c_loss: sequence of cross-entropy losses, one per epoch.
        kl_loss: sequence of KL losses, one per epoch.
    """
    plt.figure(figsize=(10, 6))

    plt.ylabel("Loss & Score")
    plt.xlabel("Epochs")
    plt.title("Training Curve", fontsize=18)
    plt.plot(score, label="BLEU Score")
    plt.plot(c_loss, label="Crossentropy Loss")
    plt.plot(kl_loss, label="KL Loss")
    # The labels above are only rendered once a legend is requested; without
    # it the three curves cannot be told apart.
    plt.legend()

    plt.show()

#### Condition
concatenate the condition part with the initial hidden part 
* nn.Embedding
* One-hot

In [10]:
def condition_embedding(condition_size, batch_size=1, condition=None, embedding=None):
    """Embed tense conditions with an ``nn.Embedding`` table.

    Args:
        condition_size: width of each tense embedding.
        batch_size: number of rows each per-tense tensor is repeated to
            (only used when ``condition`` is not given).
        condition: optional tense key, one of 'sp', 'tp', 'pg', 'p'.
        embedding: optional ``nn.Embedding(4, condition_size)`` to look the
            tenses up in. When omitted, a fresh randomly initialised table
            is created on every call, as before.
            NOTE(review): with a fresh table per call the tense vectors
            differ between calls and are never trained; pass a shared
            module here to get consistent, learnable conditions.

    Returns:
        With ``condition``: a ``(1, condition_size)`` tensor for that tense.
        Otherwise: a list of four ``(batch_size, condition_size)`` tensors
        in the order sp, tp, pg, p.
    """
    order = {'sp': 0, 'tp': 1, 'pg': 2, 'p': 3}
    if embedding is None:
        embedding = nn.Embedding(len(order), condition_size)
    # indices must live on the same device as the table being indexed
    table_device = embedding.weight.device

    if condition:
        # The original assigned to a misspelled name here and raised NameError.
        condition_tensor = torch.tensor(order[condition], device=table_device)
        return embedding(condition_tensor).view(1, -1)

    # The original never produced fewer than one row, even for batch_size < 1.
    rows = max(batch_size, 1)
    embedded_tense = []
    for tense_index in order.values():
        condition_tensor = torch.tensor(tense_index, device=table_device)
        embedded = embedding(condition_tensor).view(1, -1)
        # expand the batch dimension: (1, condition_size) -> (rows, condition_size)
        embedded_tense.append(embedded.repeat(rows, 1))

    return embedded_tense
                     
# Display the four per-tense embeddings, each repeated for a batch of 32.
condition_embedding(condition_size, 32)

[tensor([[ 0.7214, -1.1775,  1.0754, -0.4868, -0.6969, -0.8959, -2.2780,  1.5519],
         [ 0.7214, -1.1775,  1.0754, -0.4868, -0.6969, -0.8959, -2.2780,  1.5519],
         [ 0.7214, -1.1775,  1.0754, -0.4868, -0.6969, -0.8959, -2.2780,  1.5519],
         [ 0.7214, -1.1775,  1.0754, -0.4868, -0.6969, -0.8959, -2.2780,  1.5519],
         [ 0.7214, -1.1775,  1.0754, -0.4868, -0.6969, -0.8959, -2.2780,  1.5519],
         [ 0.7214, -1.1775,  1.0754, -0.4868, -0.6969, -0.8959, -2.2780,  1.5519],
         [ 0.7214, -1.1775,  1.0754, -0.4868, -0.6969, -0.8959, -2.2780,  1.5519],
         [ 0.7214, -1.1775,  1.0754, -0.4868, -0.6969, -0.8959, -2.2780,  1.5519],
         [ 0.7214, -1.1775,  1.0754, -0.4868, -0.6969, -0.8959, -2.2780,  1.5519],
         [ 0.7214, -1.1775,  1.0754, -0.4868, -0.6969, -0.8959, -2.2780,  1.5519],
         [ 0.7214, -1.1775,  1.0754, -0.4868, -0.6969, -0.8959, -2.2780,  1.5519],
         [ 0.7214, -1.1775,  1.0754, -0.4868, -0.6969, -0.8959, -2.2780,  1.5519],
    

### Evaluation

In [11]:
#compute BLEU-4 score
def compute_bleu(output, reference):
    """Score ``output`` against ``reference`` with NLTK's sentence BLEU.

    Example inputs: ``reference = 'accessed'``, ``output = 'access'``.

    Three equal weights of 0.33 are used when the reference has exactly
    three items, four weights of 0.25 otherwise; smoothing is
    ``SmoothingFunction().method1``.

    Returns:
        The BLEU score computed by ``sentence_bleu``.
    """
    weights = (0.33,0.33,0.33) if len(reference) == 3 else (0.25,0.25,0.25,0.25)
    smoothing = SmoothingFunction().method1
    return sentence_bleu([reference], output, weights=weights, smoothing_function=smoothing)

In [12]:
# compute generation score
def Gaussian_score(predictions, plot_pred=False):
    """Compute the generation score against ``./data/train.txt``.

    The order of each prediction should be: simple present, third person,
    present progressive, past, e.g.
    ``predictions = [['consult', 'consults', 'consulting', 'consulted'], ...]``.

    Args:
        predictions: list of word lists, one list per generated sample.
        plot_pred: when true, print every prediction.

    Returns:
        The number of (prediction, training row) exact matches divided by
        the number of predictions; ``0.0`` when ``predictions`` is empty.
    """
    if not predictions:
        # nothing to score; the original divided by zero here
        return 0.0

    words_list = []
    with open('./data/train.txt','r') as fp:
        for line in fp:
            # split() also drops the trailing newline and tolerates blank
            # lines, which used to raise IndexError
            words = line.split()
            if words:
                words_list.append(words)

    # Scoring does not need the file handle, so it runs after the file is closed.
    score = 0
    for t in predictions:
        if plot_pred:
            print (t)
        score += words_list.count(t)
    return score/len(predictions)

In [13]:
def predict(encoder, decoder, vocab, batch_size=64, plot_pred=False):
    """Generate words from Gaussian noise, one word per tense.

    100 noise vectors are drawn from a standard normal distribution. Each
    one is decoded greedily under the four tense conditions returned by
    ``condition_embedding``. Decoding runs for ``vocab.max_length`` steps
    and ``vocab.indices2word`` drops the special tokens afterwards.

    The original body read an ``input_tensors`` variable whose loader had
    been commented out, so it raised NameError on the first iteration.

    Args:
        encoder: unused; generation only needs the decoder. Kept so that
            existing callers keep working.
        decoder: module providing ``initHidden(state, tense, batch_size)``,
            ``hidden_size`` and a forward pass returning
            ``(scores, hidden)``.
        vocab: vocabulary providing ``max_length`` and ``indices2word``.
        batch_size: unused; every sample is decoded with a batch of one.
        plot_pred: unused here; printing is done by the caller.

    Returns:
        A list of 100 entries, each a list of four generated words.
    """
    n_samples = 100
    outputs = []

    with torch.no_grad():
        for _ in range(n_samples):
            # four tense embeddings, each with a batch dimension of one
            embedded_tenses = condition_embedding(condition_size, 1)

            # decoder.initHidden concatenates this state with the tense
            # embedding, so its width is the decoder's hidden_size
            noise = torch.randn(1, 1, decoder.hidden_size, device=device)

            words = []
            # the same noise vector is decoded once per tense
            for embedded_tense in embedded_tenses:
                decoder_hidden = decoder.initHidden(noise, embedded_tense, 1)
                decoder_input = torch.tensor([[SOS_token]], device=device)

                indices = []
                for _ in range(vocab.max_length):
                    decoder_output, decoder_hidden = decoder(
                        decoder_input, decoder_hidden)
                    _, topi = decoder_output.topk(1)
                    # greedy decoding: feed the best token back in
                    decoder_input = topi.detach()
                    indices.append(topi.item())

                words.append(vocab.indices2word(indices))

            outputs.append(words)

    return outputs

In [14]:
# def test(encoder, decoder, vocab, plot_pred=False):


In [15]:
def evaluate(encoder, decoder, vocab, batch_size=64, plot_pred=False):
    """Generate predictions and score them with ``Gaussian_score``.

    Args:
        encoder, decoder, vocab, batch_size: forwarded to ``predict``.
        plot_pred: forwarded to ``predict`` and ``Gaussian_score``; when
            true the Gaussian score is printed as well.

    Returns:
        ``(predictions, gaussian_score)``.
    """
    generated = predict(encoder, decoder, vocab,
                        batch_size=batch_size, plot_pred=plot_pred)
    score = Gaussian_score(generated, plot_pred=plot_pred)

    if plot_pred:
        print ("Gaussian score: %.2f"%score)

    # BLEU evaluation on test.txt is not implemented in this function yet.
    return generated, score

#### Reparameterization Trick

In [16]:
def reparameterize(mu, logvar):
    """Draw ``mu + eps * std`` with ``eps`` sampled from a standard normal.

    ``std`` is derived from the log-variance as ``exp(0.5 * logvar)``, and
    the noise has the same shape, dtype and device as ``std``.
    """
    sigma = (0.5*logvar).exp()
    noise = torch.randn_like(sigma)
    return mu + noise*sigma

### Encoder

In [17]:
class EncoderRNN(nn.Module):
    """LSTM encoder whose state carries the tense condition.

    The LSTM width is ``hidden_size + condition_size``. The initial state
    is zeros concatenated with the tense embedding, and the final hidden
    state is projected to a mean and a log-variance of width
    ``laten_size``.

    Args:
        input_size: number of vocabulary entries.
        hidden_size: state width before the condition is appended.
        laten_size: width of the latent mean / log-variance.
        condition_size: width of the tense embedding.
    """

    def __init__(self, input_size, hidden_size, laten_size, condition_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.condition_size = condition_size

        width = hidden_size + condition_size
        self.fc1 = nn.Linear(width, laten_size)  # mean head
        self.fc2 = nn.Linear(width, laten_size)  # log-variance head

        self.embedding = nn.Embedding(input_size, width)
        self.lstm = nn.LSTM(width, width)

    def forward(self, input, hidden):
        """Run one time step; ``input`` holds one token index per batch item."""
        # Use the instance's own condition size instead of the notebook
        # global, which silently had to match the constructor argument.
        width = self.hidden_size + self.condition_size
        embedded = self.embedding(input).view(1, -1, width)
        output, hidden = self.lstm(embedded, hidden)
        return output, hidden

    def variational(self, hidden):
        """Return ``(mu, logvar)`` projected from the LSTM hidden state ``hidden[0]``."""
        return self.fc1(hidden[0]), self.fc2(hidden[0])

    def initHidden(self, embedded_tense, batch_size=64):
        """Build the initial ``(h0, c0)``: zeros concatenated with the tense embedding."""
        # follow the module's own device rather than a global variable
        target_device = self.embedding.weight.device
        embedded_tense = embedded_tense.to(target_device).view(1, batch_size, -1)
        zeros = torch.zeros(1, batch_size, self.hidden_size, device=target_device)
        return (torch.cat((zeros, embedded_tense), 2),
                torch.cat((zeros, embedded_tense), 2))

### Decoder

In [18]:
class DecoderRNN(nn.Module):
    """LSTM decoder conditioned on a tense embedding.

    The LSTM width is ``hidden_size + condition_size``; the initial state
    is the given state concatenated with the tense embedding.

    Args:
        hidden_size: width of the state passed to ``initHidden`` (the
            visible notebook passes the latent size here).
        output_size: number of vocabulary entries.
        condition_size: width of the tense embedding.
    """

    def __init__(self, hidden_size, output_size, condition_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.condition_size = condition_size

        width = hidden_size + condition_size
        self.embedding = nn.Embedding(output_size, width)
        self.lstm = nn.LSTM(width, width)
        self.out = nn.Linear(width, output_size)
        # Not applied in forward(), which returns raw scores; kept so the
        # module's attributes stay unchanged.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one time step and return ``(scores, hidden)``.

        ``scores`` has one row per batch item and one column per
        vocabulary entry.
        """
        # Use the instance's own condition size instead of the notebook
        # global, which silently had to match the constructor argument.
        width = self.hidden_size + self.condition_size
        output = self.embedding(input).view(1, -1, width)
        output = F.relu(output)
        output, hidden = self.lstm(output, hidden)
        output = self.out(output[0])
        return output, hidden

    def initHidden(self, hidden_state, embedded_tense, batch_size):
        """Build the initial ``(h0, c0)`` from a state and a tense embedding."""
        # concatenation needs both tensors on the same device; follow the state
        embedded_tense = embedded_tense.to(hidden_state.device).view(1, batch_size, -1)
        return (torch.cat((hidden_state, embedded_tense), 2),
                torch.cat((hidden_state, embedded_tense), 2))

### Train
* Use teacher forcing
* Use KL loss annealing

In [19]:
def kl_annealing(epochs, mode):
    """Return the KL weight for the given epoch.

    Args:
        epochs: current epoch (non-negative number).
        mode: ``"monotonic"`` or ``"cyclical"``.

    Returns:
        * monotonic: rises linearly from 0 and is 1 after 1e4 epochs.
        * cyclical: repeats every 1e4 epochs; rises linearly from 0 to 1
          over the first 5e3 epochs of a cycle, then stays at 1.
    """
    assert mode == "monotonic" or mode == "cyclical"
    period = 1e4
    ramp = 5e3

    if mode == "monotonic":
        if epochs > period:
            return 1
        return 1e-4 * epochs

    phase = epochs % period
    if phase > ramp:
        return 1
    # The ramp follows the position inside the current cycle. The original
    # multiplied the absolute epoch by 0.2, which is unbounded and never
    # restarts.
    return phase / ramp

In [20]:
def compute_kl_loss(mu, logvar):
    """Return ``-0.5 * sum(1 + logvar - mu^2 - exp(logvar))`` over all elements.

    ``logvar`` is log(sigma^2), so ``exp(logvar)`` is the variance.
    """
    per_element = 1 + logvar - mu.pow(2) - logvar.exp()
    return -0.5 * torch.sum(per_element)

In [21]:
# save model every epoch
def train(input_tensors, encoder, decoder, encoder_optimizer, decoder_optimizer,
          criterion, epochs, teacher_forcing_ratio):
    """Run one optimisation step per tense on a batch of words.

    For each tense the word is encoded, a latent state is sampled with the
    reparameterization trick, and the decoder reconstructs the same word.
    The loss is the summed cross-entropy plus the annealed KL term.

    Args:
        input_tensors: index tensor of shape (batch_size, tense, seq_len).
        encoder, decoder: the seq2seq modules.
        encoder_optimizer, decoder_optimizer: their optimizers; gradients
            are reset and a step is taken once per tense.
        criterion: loss applied to (decoder scores, target indices).
        epochs: current epoch, used for the monotonic KL weight.
        teacher_forcing_ratio: probability of feeding the target token
            (instead of the decoder's own prediction) as the next input.

    Returns:
        ``(ce_loss, kl_loss)`` as Python floats averaged over the tenses:
        the per-time-step cross-entropy and the weighted KL loss. The KL
        value used to be a graph-attached tensor, which callers kept alive
        by summing it across batches.
    """
    batch_size = input_tensors.size(0)
    # get 4 tense embedding tensor
    embedded_tenses = condition_embedding(condition_size, batch_size)
    # the weight only depends on the epoch, so compute it once per call
    kl_weight = kl_annealing(epochs, "monotonic")

    # running totals over the tenses, kept as plain floats
    kl_loss_total = 0.0
    ce_loss_total = 0.0

    # transpose tensor from (batch_size, tense, seq_len) to (tense, seq_len, batch_size)
    input_tensors = input_tensors.permute(1, 2, 0)

    for index, embedded_tense in enumerate(embedded_tenses):
        input_tensor = input_tensors[index]  # (seq_len, batch_size)
        input_length = input_tensor.size(0)  # number of time steps

        # init encoder hidden state and cat condition
        encoder_hidden = encoder.initHidden(embedded_tense, batch_size)

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        #----------sequence to sequence part for encoder----------#
        for ei in range(input_length):
            _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

        # reparameterization trick
        mu, logvar = encoder.variational(encoder_hidden)
        reparameterized_state = reparameterize(mu, logvar)

        # weighted KL term of this tense
        kl_loss = kl_weight * compute_kl_loss(mu, logvar)
        kl_loss_total += kl_loss.item()

        # init decoder hidden state and cat condition
        decoder_hidden = decoder.initHidden(reparameterized_state, embedded_tense, batch_size)
        decoder_input = torch.tensor([[SOS_token] for i in range(batch_size)], device=device)

        # one draw decides the feeding strategy for the whole sequence
        use_teacher_forcing = random.random() < teacher_forcing_ratio

        #----------sequence to sequence part for decoder----------#
        ce_loss = 0
        for di in range(input_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            ce_loss += criterion(decoder_output, input_tensor[di])
            if use_teacher_forcing:
                # Teacher forcing: feed the target as the next input
                decoder_input = input_tensor[di]
            else:
                # otherwise feed the prediction, detached from history
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze().detach()

        loss = kl_loss + ce_loss
        ce_loss_total += (ce_loss.item()/input_length)
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

    return ce_loss_total/4, kl_loss_total/4

In [22]:
def trainIters(encoder, decoder, vocab, n_iters, print_every=1000, plot_every=100,
               batch_size=64, learning_rate=0.01, teacher_forcing_ratio=1.0):
    """Train the encoder/decoder for ``n_iters`` epochs over the training set.

    After every epoch the model is evaluated with ``evaluate``. Once more
    than 500 epochs have finished, both modules are saved under
    ``./models``. A progress line is printed every ``print_every`` epochs.

    Args:
        encoder, decoder: modules to train, each with its own SGD optimizer.
        vocab: vocabulary used to build the training dataset.
        n_iters: number of epochs.
        print_every: epochs between progress lines.
        plot_every: unused; per-epoch totals are recorded every epoch.
        batch_size: DataLoader batch size.
        learning_rate: SGD learning rate.
        teacher_forcing_ratio: forwarded to ``train``.

    Returns:
        ``(gaussian_scores, crossentropy_losses)``, one entry per epoch.
        Each cross-entropy entry is the sum of the per-batch losses of that
        epoch.
    """
    import os  # local import: only needed for the checkpoint directory

    start = time.time()

    # Reset every print_every, for print log
    print_ce_loss_total = 0
    print_kl_loss_total = 0
    # per-epoch curves
    crossentropy_losses = []
    # NOTE(review): collected but not returned, to keep the two-value return
    kl_losses = []
    gaussian_scores = []

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    # create dataloader
    trainset = TenseLoader('train', vocab)
    trainloader = data.DataLoader(trainset, batch_size, shuffle = True)

    criterion = nn.CrossEntropyLoss()

    # "epoch" instead of "iter" so the builtin iter() is not shadowed
    for epoch in range(1, n_iters + 1):
        epoch_ce_loss = 0
        epoch_kl_loss = 0
        for input_tensors in trainloader:
            input_tensors = input_tensors.to(device)

            ce_loss, kl_loss = train(input_tensors, encoder, decoder, encoder_optimizer, decoder_optimizer,
                                     criterion, (epoch-1), teacher_forcing_ratio)
            print_ce_loss_total += ce_loss
            print_kl_loss_total += kl_loss
            epoch_ce_loss += ce_loss
            epoch_kl_loss += kl_loss

        # evaluate and save model
        _, gaussian_score = evaluate(encoder, decoder, vocab, plot_pred=False)
        gaussian_scores.append(gaussian_score)
        if (epoch-1) > 500:
            print ("Model save...")
            # Create the target directory first. Without this the first save,
            # several hundred epochs in, fails when ./models is missing.
            os.makedirs("./models", exist_ok=True)
            torch.save(encoder, "./models/encoder_{}.ckpt".format((epoch-1)))
            torch.save(decoder, "./models/decoder_{}.ckpt".format((epoch-1)))

        if epoch % print_every == 0:
            print_ce_loss_avg = print_ce_loss_total / print_every
            print_kl_loss_avg = print_kl_loss_total / print_every
            print('%s (%d %2d%%) CE Loss: %.4f, KL Loss: %.4f, Gaussian score: %.2f' %
                  (timeSince(start, epoch / n_iters), epoch, epoch / n_iters * 100, print_ce_loss_avg, print_kl_loss_avg, gaussian_score))
            print_ce_loss_total = 0
            print_kl_loss_total = 0

        crossentropy_losses.append(epoch_ce_loss)
        kl_losses.append(epoch_kl_loss)

    return gaussian_scores, crossentropy_losses

In [None]:
# Build both models on the selected device and train them for 1000 epochs.
encoder1 = EncoderRNN(vocab_size, hidden_size, laten_size, condition_size).to(device)
# The decoder's hidden_size argument is the latent size: its initial state is
# the sampled latent vector concatenated with the tense embedding.
decoder1 = DecoderRNN(laten_size, vocab_size, condition_size).to(device)
vocab = Vocabuary()
# NOTE(review): learning_rate is not passed, so trainIters uses its default
# of 0.01 rather than the global learning_rate -- confirm this is intended.
gaussian_scores, crossentropy_losses = trainIters(encoder1, decoder1, vocab, 1000, print_every=100, plot_every=1, teacher_forcing_ratio=1)
# show_result(scores, losses)

3m 51s (- 34m 41s) (100 10%) CE Loss: 19.9754, KL Loss: 0.1189, Gaussian score: 0.00
7m 49s (- 31m 16s) (200 20%) CE Loss: 13.4573, KL Loss: 0.0200, Gaussian score: 0.00
11m 47s (- 27m 30s) (300 30%) CE Loss: 12.0685, KL Loss: 0.0130, Gaussian score: 0.00
15m 51s (- 23m 46s) (400 40%) CE Loss: 11.7996, KL Loss: 0.0102, Gaussian score: 0.00
19m 56s (- 19m 56s) (500 50%) CE Loss: 11.6990, KL Loss: 0.0089, Gaussian score: 0.00
24m  3s (- 16m  2s) (600 60%) CE Loss: 11.6481, KL Loss: 0.0081, Gaussian score: 0.00
28m  6s (- 12m  2s) (700 70%) CE Loss: 11.6116, KL Loss: 0.0073, Gaussian score: 0.00
32m  7s (- 8m  1s) (800 80%) CE Loss: 11.5874, KL Loss: 0.0066, Gaussian score: 0.00


In [None]:
# Generate predictions with the trained models; plot_pred=True also prints
# them together with the Gaussian score.
pred,_  = evaluate(encoder1, decoder1, vocab, batch_size=64, plot_pred=True)