# Variational Auto-Encoder
Trained using the code in https://github.com/kaletap/Sentence-VAE (fork of timbmg/Sentence-VAE github repo).

In [18]:
import json
import os
import sys

import torch
from nltk.tokenize import TweetTokenizer

In [11]:
sys.path.append('../Sentence-VAE')
from model import SentenceVAE
from ptb import PTB
from utils import idx2word

## Loading 

In [3]:
params =  {
     "vocab_size": 9877,
     "sos_idx": 2,
     "eos_idx": 3,
     "pad_idx": 0,
     "unk_idx": 1,
     "max_sequence_length": 60,
     "embedding_size": 300,
     "rnn_type": "gru",
     "hidden_size": 256,
     "word_dropout": 0,
     "embedding_dropout": 0.5,
     "latent_size": 16,
     "num_layers": 1,
     "bidirectional": False
 }

In [4]:
model = SentenceVAE(**params)
model.load_state_dict(torch.load('../Sentence-VAE/bin/2020-Oct-08-13:44:02/E9.pytorch'))

<All keys matched successfully>

In [5]:
device = torch.device('cuda')
model.to(device)

SentenceVAE(
  (embedding): Embedding(9877, 300)
  (embedding_dropout): Dropout(p=0.5, inplace=False)
  (encoder_rnn): GRU(300, 256, batch_first=True)
  (decoder_rnn): GRU(300, 256, batch_first=True)
  (hidden2mean): Linear(in_features=256, out_features=16, bias=True)
  (hidden2logv): Linear(in_features=256, out_features=16, bias=True)
  (latent2hidden): Linear(in_features=16, out_features=256, bias=True)
  (outputs2vocab): Linear(in_features=256, out_features=9877, bias=True)
)

In [6]:
with open('../Sentence-VAE/data/ptb.vocab.json', 'r') as file:
    vocab = json.load(file)

w2i, i2w = vocab['w2i'], vocab['i2w']

## Usage

### Sampling

In [14]:
samples, z = model.inference(n=10)

In [16]:
samples.shape

torch.Size([10, 60])

In [17]:
print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

the house version of the house ' s <unk> bill is a <unk> reminder <eos>
the <unk> of the <unk> <unk> <unk> that has been <unk> by the <unk> of the <unk> and <unk> <unk> <unk> <unk> & <unk> <unk> <unk> & woods <eos>
in a separate <unk> <unk> <unk> and <unk> <unk> & son of the <unk> group of the company ' s <unk> trucking corp . said it agreed to buy its <unk> businesses for a <unk> of <unk> <unk> <eos>
the company said the company has been completed in the past year <eos>
the company said it will be a <unk> of the <unk> of the <unk> of the <unk> of the <unk> <eos>
the board is subject to approval of a definitive agreement between the company ' s common stock and warrants to sell the stock of the <unk> of the <unk> <eos>
cross & trecker is a <unk> <unk> in newark n . y . in los angeles <eos>
mr . <unk> says he believes the market is n't going to be done <eos>
in a separate speech prepared by mr . bush ' s office of justice department ' s office in washington d . c . a . <unk> by moscow o

We see a classic problem in generated sentences: a lot of `<unk>` predictions. I think we should either disable `<unk>` prediction or use something like BPE tokenizer like in GPT or Roberta (or WordPiece tokenizer used in Bert). Besides, generated examples kind of suck.

### Encoding-decoding
We want to take any sentence, and obtain it's hidden representation ($z$, encoding). Then, we shall decode it and see how generated sentence looks like. That will allow us to work on hidden space and interpolate between two actual sentences.

In [51]:
class Tokenizer:
    def __init__(self, w2i: dict, max_sequence_length: int = 512):
        self.w2i = w2i
        self.max_sequence_length = max_sequence_length
        self.tokenizer = TweetTokenizer(preserve_case=False)
        
    def tokenize(self, line: str):
        words = self.tokenizer.tokenize(line)

        model_input = ['<sos>'] + words
        model_input = model_input[:self.max_sequence_length]

        length = len(model_input)

        model_input.extend(['<pad>'] * (self.max_sequence_length-length))

        model_input = [self.w2i.get(w, self.w2i['<unk>']) for w in model_input]
        
        return {
            'input': model_input,
            'length': length
        }
    
    def __call__(self, lines: list, device=None):
        device = device or torch.device('cuda')
        tokenized_lines = []
        lengths = []
        for line in lines:
            tokenizer_output = self.tokenize(line)
            tokenized_lines.append(tokenizer_output['input'])
            lengths.append(tokenizer_output['length'])
        return {
            'input': torch.tensor(tokenized_lines).to(device),
            'length': torch.tensor(lengths).to(device)
        }

In [70]:
tokenizer = Tokenizer(w2i, max_sequence_length=60)

In [139]:
sentence1 = "you either believe seymour can do it again or you do n't"
sentence2 = "he was previously vice president"

tokenizer_output = tokenizer([sentence1, sentence2])

In [144]:
logp, mean, logv, z = model(tokenizer_output['input'], tokenizer_output['length'])
samples, z = model.inference(z=mean)
print(z)
print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

tensor([[-0.3910,  0.8752, -0.9751, -1.6617, -1.0646, -0.0298,  1.0742,  0.1252,
         -0.8899, -0.8038, -0.7011, -0.8372, -0.2074,  0.1503,  0.2320, -0.3837],
        [ 0.0779,  0.8346, -0.7624,  0.6026, -0.1528, -0.2572,  0.1297,  1.2691,
         -0.6488,  0.1550,  0.8086,  0.0057, -0.1798, -0.0842,  0.2896,  0.1095]],
       device='cuda:0', grad_fn=<AddmmBackward>)
it ' s not going to be done <eos>
he said the company was n't disclosed <eos>


Standard deviations of these samples is rather large, but less than 1.

In [147]:
std = torch.exp(0.5 * logv)
std

tensor([[0.7512, 0.8605, 0.7109, 0.6675, 0.7622, 0.8093, 0.7694, 0.7702, 0.6683,
         0.7675, 0.7924, 0.7573, 0.8994, 0.5838, 0.8782, 0.7484],
        [0.7555, 0.8419, 0.7829, 0.6717, 0.8348, 0.8340, 0.7731, 0.8177, 0.7462,
         0.8183, 0.8166, 0.7111, 0.8510, 0.6047, 0.8640, 0.8839]],
       device='cuda:0', grad_fn=<ExpBackward>)