In [4]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import numpy as np
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

from torchnlp.datasets import imdb_dataset
from torchnlp.datasets import penn_treebank_dataset
%matplotlib inline

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BiLSTMEncoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, embedding_matrix):
        super(BiLSTMEncoder, self).__init__()
        
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, embedding_size) 
        self.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.lstm = nn.LSTM(embedding_size, int(hidden_size/2),
                            bidirectional=True,
                            batch_first=True)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1,1,-1)
        output = embedded
        output, hidden = self.lstm(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1*2, 1, int(self.hidden_size/2))

class BiLSTMDecoder(nn.Module):
    def __init__(self, output_size, embedding_size, hidden_size, embedding_matrix):
        super(BiLSTMDecoder, self).__init__()
        
        self.output_size = output_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size

        # layers
        self.embedding = nn.Embedding(output_size, embedding_size)
        self.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.lstm = nn.LSTM(embedding_size, hidden_size,
                            bidirectional=False,
                            batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1,1,-1)
        output = F.relu(output)

        output, hidden = self.lstm(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

In [3]:
SOS_token = 0
EOS_token = 1
MASKED_token = 2
MAX_LENGTH = 42

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {"SOSTOKEN": 0, "EOSTOKEN": 1, "MASKEDTOKEN": 2}
        self.index2word = {0: "SOSTOKEN", 1: "EOSTOKEN", 2: "MASKEDTOKEN"}
        self.word2count = {"SOSTOKEN": 0, "EOSTOKEN": 0, "MASKEDTOKEN": 0}
        
        self.n_words = 3  # Count SOS and EOS and Masked token

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [5]:
def unicodeToAscii(s):
    """
    Turn a Unicode string to plain ASCII, thanks to
    https://stackoverflow.com/a/518232/2809427
    """
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):  # Lowercase, trim, and remove non-letter characters
    s = unicodeToAscii(s.lower().strip())
    #s = re.sub(r"([.!?])", r" \1", s)
    #s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"[^a-zA-Z]+", r" ", s)
    s = " ".join(s.split()[:40])
    return s

In [6]:
def readLang(dataset_title):
    """
    Args:
        dataset_title: either 'imdb' or 'ptb'
    """
    print("Reading lines...")
    if dataset_title == 'imdb':
        train = imdb_dataset(train=True, directory='../data/')
        # Read the dataset and split into lines
        lines = [train[ind]['text'].strip() for ind, doc in enumerate(train)]
        # Normalize lines
        lines = [' '.join(["SOSTOKEN", normalizeString(s), "EOSTOKEN"]) for s in lines]
        lang = Lang(dataset_title)
    elif dataset_title == 'ptb':
        raise NotImplementedError
    return lang, lines

In [7]:
def prepareData(dataset_title):
    lang, lines = readLang(dataset_title)
    print("Read %s sentence pairs" % len(lines))
    print("Counting words...")
    for l in lines:
        lang.addSentence(l)
    print("Counted words:")
    print(lang.name, lang.n_words)
    return lang, lines

In [8]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    #indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorsForTrain(lang, sentence):
    # mask = generate_mask(len(sentence))
    target_tensor = tensorFromSentence(lang, sentence)
    # transformed_sentence = " ".join(transform_input_with_is_missing_token(sentence.split(), mask))
    #input_tensor = tensorFromSentence(lang, transformed_sentence)
    return target_tensor # , target_tensor

def indexFromTensor(lang, decoder_output):
    return decoder_output.max(0)[1]

In [None]:
MAX_LENGTH = 42 # max(map(lambda x: len(x.split()), imdb_lines)) == 2516

def train(input_tensor, model, model_optimizer, criterion, max_length=MAX_LENGTH):
    #c_ = time()
    model_hidden = model.initHidden()

    model_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    
    model_outputs = torch.zeros(max_length, model.input_size, device=device)

    loss = 0

    for ei in range(input_length - 1):
        model_output, model_hidden = model(
            input_tensor[ei], model_hidden)
        #print(model_output, input_tensor.shape, input_tensor[0].shape)
        loss += criterion(model_output[0], input_tensor[ei + 1])
        model_outputs[ei] = model_output[0]
    
    loss.backward()

    
    model_optimizer.step()

    return loss.item() / input_length

In [40]:
def generate_mask(sequence_length, batch_size=None, is_present=0.7):
    """
    e.g.
    returns: [1, 1, 0, 1, 0, 1]
    """
    if batch_size is not None:
        mask = np.random.binomial(1, is_present, size=(batch_size, sequence_length))
    elif batch_size is None:
        mask = np.random.binomial(1, is_present, size=(sequence_length,))
    return torch.from_numpy(mask).long().view(len(mask), 1)

In [9]:
%%time
imdb_lang, imdb_lines = prepareData('imdb')
#print(random.choice(imdb_lines))

aclImdb_v1.tar.gz: 0.00B [00:00, ?B/s]

Reading lines...


aclImdb_v1.tar.gz: 84.1MB [00:03, 25.6MB/s]                            


Read 25000 sentence pairs
Counting words...
Counted words:
imdb 32736
CPU times: user 22.4 s, sys: 3.83 s, total: 26.2 s
Wall time: 29.1 s


In [42]:
from tqdm import tqdm
# mask = generate_mask()
# spm_tokenize = tokenize.SentencePieceTokenizer(args.spm_prefix)

# # Compute Batch Size
# max_tokens_per_device = 48000
# # max_tokens_per_device = 1000
# n_devices = torch.cuda.device_count()
# max_tokens = max_tokens_per_device * n_devices
# truncate_length = 20
# batch_size = int(max_tokens/truncate_length)

# checkpoint_path = "/home/ipyaternev/dl_cshse_2019/"
# saver = Saver(checkpoint_path)

# train_path = os.path.join(args.path, 'train')
# dev_path = os.path.join(args.path, 'test')

# train_dataset = 

# # Constructed vocabulary from train
# vocab = train_dataset.vocab
# Task = namedtuple('Task', 'source_dictionary target_dictionary')
# task = Task(source_dictionary=vocab, 
#         target_dictionary=vocab)

# trainer = MGANTrainer(args, task, saver, visdom, vocab)
# def loader(dataset):
#     _loader = DataLoader(dataset, batch_size=batch_size, 
#             collate_fn=TensorIMDbDataset.collate, 
#             shuffle=True, num_workers=8)
#     return _loader

# #trainer.validate_dataset(loader(train_dataset))

# dev_dataset = TensorIMDbDataset(
#         dev_path, spm_tokenize, 
#         rmask, truncate_length,
#         vocab 
# )

# Datasets = namedtuple('Dataset', 'train dev')
# datasets = Datasets(
#         train=train_dataset,
#         dev=dev_dataset
# )



# for epoch in tqdm(range(args.max_epochs), total=args.max_epochs, desc='epoch'):
#     train_loader = loader(datasets.train)
#     pbar = tqdm(train_loader, desc='training', leave=True)
#     for i, samples in enumerate(pbar):
#         trainer.run(epoch, samples)
#         if i % args.validate_every == 0:
#             validation_samples = 1000
#             val_idxs = random.sample(range(len(datasets.dev)), validation_samples)
#             subset = torch.utils.data.Subset(datasets.dev, val_idxs)
#             trainer.validate_dataset(loader(subset))

max_epochs = 1

for epoch in tqdm(range(max_epochs), total=max_epochs, desc='epoch'):
    
    #model_optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    training_sentences = [tensorFromSentence(imdb_lang, imdb_lines[0]) for i in range(10)]
    masks = [generate_mask(el.size()[0]) for el in training_sentences]
    masked_sentences = [s * t for (s, t) in zip(training_sentences, masks)]
    criterion = nn.CrossEntropyLoss()
#    criterion = critic()
    

epoch: 100%|██████████| 1/1 [00:00<00:00, 813.16it/s]


In [38]:
training_sentences[0].view(42)

tensor([ 0,  3,  4,  5,  6,  7,  8,  9, 10, 11,  5, 12, 13, 14, 15, 16,  3, 17,
        18, 19, 20, 21, 22, 23,  3, 24, 25, 26, 27, 28, 29, 30, 10,  5, 31,  3,
        32, 33, 34,  3, 35,  1])

In [43]:
masked_sentences[0]

tensor([[ 0],
        [ 3],
        [ 4],
        [ 0],
        [ 6],
        [ 0],
        [ 0],
        [ 9],
        [10],
        [ 0],
        [ 0],
        [12],
        [13],
        [14],
        [15],
        [16],
        [ 3],
        [ 0],
        [ 0],
        [19],
        [20],
        [21],
        [22],
        [23],
        [ 0],
        [24],
        [ 0],
        [26],
        [27],
        [28],
        [29],
        [ 0],
        [10],
        [ 5],
        [31],
        [ 3],
        [ 0],
        [33],
        [34],
        [ 3],
        [35],
        [ 1]])

In [31]:
masks

[tensor([0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1,
         1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1])]

In [39]:
training_sentences[3].size()

torch.Size([42, 1])

In [36]:
training_sentences

[tensor([[ 0],
         [ 3],
         [ 4],
         [ 5],
         [ 6],
         [ 7],
         [ 8],
         [ 9],
         [10],
         [11],
         [ 5],
         [12],
         [13],
         [14],
         [15],
         [16],
         [ 3],
         [17],
         [18],
         [19],
         [20],
         [21],
         [22],
         [23],
         [ 3],
         [24],
         [25],
         [26],
         [27],
         [28],
         [29],
         [30],
         [10],
         [ 5],
         [31],
         [ 3],
         [32],
         [33],
         [34],
         [ 3],
         [35],
         [ 1]])]

In [49]:
def create_generator(hparams, inputs, targets, present, is_training, is_validating, reuse=None):
    '''
    Use seq2seq_vd from tf repo
    ''' 
    sequence, logits, log_probs, initial_state, final_state = seq2seq.generator(hparams, 
                                                                                  inputs, 
                                                                                  targets, 
                                                                                  present, 
                                                                                  is_training=is_training, 
                                                                                  is_validating=is_validating, 
                                                                                  reuse=reuse)
    return (sequence, logits, log_probs, initial_state, final_state, encoder_states)

In [54]:
def create_discriminator(hparams,
                         sequence,
                         is_training,
                         reuse=None,
                         initial_state=None,
                         inputs=None,
                         present=None):
    
    '''Use seq2seq_vd (maybe birectional) from tf repo'''
    
    if FLAGS.discriminator_model == 'seq2seq_vd':
        predictions = seq2seq_vd.discriminator(
                                                hparams,
                                                inputs,
                                                present,
                                                sequence,
                                                is_training=is_training,
                                                reuse=reuse)
    elif FLAGS.discriminator_model == 'bidirectional_vd':
        predictions = bidirectional_vd.discriminator(
                                                hparams,
                                                sequence,
                                                is_training=is_training,
                                                reuse=reuse,
                                                initial_state=initial_state)
    else:
        raise NotImplementedError
    return predictions

In [56]:
def create_critic(hparams, sequence, is_training, reuse=None):
    """Create the Critic model specified by the FLAGS and hparams.
    Args:
    hparams:  Hyperparameters for the MaskGAN.
    sequence:  tf.int32 Tensor sequence of shape [batch_size, sequence_length]
    is_training:  Whether the model is training.
    reuse (Optional):  Whether to reuse the model.
    Returns:
    values:  tf.float32 Tensor of predictions of shape [batch_size,
      sequence_length]
    """
    if FLAGS.baseline_method == 'critic':
        if FLAGS.discriminator_model == 'seq2seq_vd':
            values = critic_vd.critic_seq2seq_vd_derivative(
                  hparams, sequence, is_training, reuse=reuse)
        else:
            raise NotImplementedError
    else:
        raise NotImplementedError
    return values

In [None]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers=num_layers)
        self.fc = nn.Linear(hidden_size, input_size)
        
    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.lstm(output, hidden)
        output = self.fc(output)
        return output, hidden

    def initHidden(self):
        return (torch.zeros(1, 1, self.hidden_size, device=device),
                torch.zeros(1, 1, self.hidden_size, device=device))

In [None]:
class VariationalDropoutWrapper(tf.contrib.rnn.RNNCell):
  """Add variational dropout to a RNN cell."""

      def __init__(self, cell, batch_size, input_size, recurrent_keep_prob,
                   input_keep_prob):
        self._cell = cell
        self._recurrent_keep_prob = recurrent_keep_prob
        self._input_keep_prob = input_keep_prob

        def make_mask(keep_prob, units):
          random_tensor = keep_prob
          # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob)
          random_tensor += tf.random_uniform(tf.stack([batch_size, units]))
          return tf.floor(random_tensor) / keep_prob

        self._recurrent_mask = make_mask(recurrent_keep_prob,
                                         self._cell.state_size[0])
        self._input_mask = self._recurrent_mask

      @property
      def state_size(self):
        return self._cell.state_size

      @property
      def output_size(self):
        return self._cell.output_size

      def __call__(self, inputs, state, scope=None):
        dropped_inputs = inputs * self._input_mask
        dropped_state = (state[0], state[1] * self._recurrent_mask)
        new_h, new_state = self._cell(dropped_inputs, dropped_state, scope)
        return new_h, new_state

In [57]:
class VariationalDropout(nn.Module):
    def __init__(self, cell, batch_size, input_size, recurrent_keep_prob, input_keep_prob):
        super(VariationalDropout, self).__init__()
        
        self._cell = cell
        self._recurrent_keep_prob = recurrent_keep_prob
        self._input_keep_prob = input_keep_prob
        
        def make_mask(keep_prob, units):
            random_tensor = keep_prob
            random_tensor += torch.random(torch.)
        
    def kl(self):
        c1 = 1.16145124
        c2 = -1.50204118
        c3 = 0.58629921
        
        alpha = self.log_alpha.exp()
        
        negative_kl = 0.5 * self.log_alpha + c1 * alpha + c2 * alpha**2 + c3 * alpha**3
        
        kl = -negative_kl
        
        return kl.mean()
    
    def forward(self, x):
        """
        Sample noise   e ~ N(1, alpha)
        Multiply noise h = h_ * e
        """
        if self.train():
            # N(0,1)
            epsilon = torch.randn(x.size())
            if x.is_cuda:
                epsilon = epsilon.cuda()

            # Clip alpha
            self.log_alpha.data = torch.clamp(self.log_alpha.data, max=self.max_alpha)
            alpha = self.log_alpha.exp()

            # N(1, alpha)
            epsilon = epsilon * alpha

            return x * epsilon
        else:
            return x