In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from keras.datasets import reuters, imdb

Using TensorFlow backend.


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset

In [4]:
import numpy as np
import math
import pickle
import argparse
import os
import math
import matplotlib.pyplot as plt

In [5]:
from pathlib import Path
from collections import OrderedDict
import pickle, gzip, math, torch, matplotlib as mpl
import matplotlib.pyplot as plt

Path.ls = lambda x: list(x.iterdir())

## Constants

In [6]:
max_words = 2000
maxlen = 400

In [7]:
bs = 200
en1_units=100
en2_units=100
num_topic=50
num_input=max_words
variance=0.995
init_mult=1.0
learning_rate=0.002
batch_size=200
momentum=0.99
num_epoch=100
nogpu=True
drop_rate=0.2

## Topic Model Utility Functions

In [8]:
def listify(o):
    if o is None: return []
    if isinstance(o, list): return o
    if isinstance(o, str): return [o]
    if isinstance(o, Iterable): return list(o)
    return [o]
def setify(o): return o if isinstance(o,set) else set(listify(o))
def compose(x, funcs, *args, order_key='_order', **kwargs):
    key = lambda o: getattr(o, order_key, 0)
    for f in sorted(listify(funcs), key=key): x = f(x, **kwargs)
    return x

In [9]:
def print_perp(model):
    cost=[]
    model.eval()                        # switch to testing mode
    input_ = tensor_te
    recon, loss = model(input_, compute_loss=True, avg_loss=False)
    loss = loss.data
    counts = tensor_te.sum(1)
    avg = (loss / counts).mean()
    print('The approximated perplexity is: ', math.exp(avg))
    
def print_perp(model):
    cost=[]
    model.eval()                        # switch to testing mode
    input_, _ = next(iter(test_dl))
    recon, loss = model(input_, compute_loss=True, avg_loss=False)
    loss = loss.data
    counts = input_.sum(1)
    avg = (loss / counts).mean()
    print('The approximated perplexity is: ', math.exp(avg))

def print_top_words(beta, feature_names, n_top_words=10):
    print ('---------------Printing the Topics------------------')
    for i in range(len(beta)):
        line = " ".join([feature_names[j] 
                         for j in beta[i].argsort()[:-n_top_words - 1:-1]])
        print('{}'.format(line))
    print ('---------------End of Topics------------------')

## Data Utility Functions

In [10]:
def collate(b):
    x, y = zip(*b)
    return torch.stack(x), torch.stack(y)

class Numpyify():
    _order=0
    def __call__(self, item):
        return np.array(item)

class Onehotify():
    _order=1
    def __init__(self, vocab_size):
        self.vocab_size = vocab_size
    def __call__(self, item):
        return np.array(np.bincount(item.astype('int'), minlength=self.vocab_size))
    
class YToOnehot():
    _order=1
    def __init__(self, num_classes):
        self.num_classes = num_classes
    def __call__(self, item):
        categorical = np.zeros((1, self.num_classes))
        categorical[0, item] = 1
        return categorical

class Tensorify():
    _order=2
    def __call__(self, item):
        return torch.from_numpy(item)

class Floatify():
    _order=3
    def __call__(self, item):
        return item.float()
    
class CheckAndCudify():
    _order=100
    def __init__(self):
        self.ic = torch.cuda.is_available()
    def __call__(self, item):
        return item.cuda() if self.ic else item
    
class IMDBDataset(Dataset):
    def __init__(self, x, y, tfms_x, tfms_y): 
        self.x, self.y = x, y
        self.x_tfms = tfms_x
        self.y_tfms = tfms_y
    def __len__(self): 
        return len(self.x)
    def __getitem__(self, i): 
        return compose(self.x[i], self.x_tfms), compose(self.y[i], self.y_tfms)
    
class Sampler():
    def __init__(self, ds, bs, shuffle=False):
        self.n,self.bs,self.shuffle = len(ds),bs,shuffle
        
    def __iter__(self):
        self.idxs = torch.randperm(self.n) if self.shuffle else torch.arange(self.n)
        for i in range(0, self.n, self.bs): yield self.idxs[i:i+self.bs]

class DataLoader():
    def __init__(self, ds, sampler, collate_fn=collate):
        self.ds,self.sampler,self.collate_fn = ds,sampler,collate_fn
        
    def __iter__(self):
        for s in self.sampler: yield self.collate_fn([self.ds[i] for i in s])

In [11]:
def imdb_vocab(index_from=3):
    word_to_id = imdb.get_word_index()
    word_to_id = {k:(v + index_from) for k,v in word_to_id.items()}
    word_to_id["<PAD>"] = 0
    word_to_id["<START>"] = 1
    word_to_id["<UNK>"] = 2
    word_to_id["<FCK>"] = 3
    
    id_to_word = {value:key for key,value in word_to_id.items()}

    return id_to_word, word_to_id

## Load Data

In [12]:
vocab, id_vocab = imdb_vocab()

In [13]:
(train_x, train_y), (test_x, test_y) = imdb.load_data(num_words=max_words)

In [14]:
num_classes = np.max(train_y) + 1

In [15]:
tfms_x = [Numpyify(), Onehotify(vocab_size=max_words), Tensorify(), Floatify(), CheckAndCudify()]
tfms_y = [YToOnehot(num_classes=num_classes), Tensorify(), Floatify(), CheckAndCudify()]

In [16]:
train_ds = IMDBDataset(train_x, train_y, tfms_x=tfms_x, tfms_y=tfms_y)
test_ds = IMDBDataset(test_x, test_y, tfms_x=tfms_x, tfms_y=tfms_y)

In [17]:
train_samp = Sampler(train_ds, bs, shuffle=True)
test_samp = Sampler(test_ds, bs, shuffle=False)

In [18]:
train_dl = DataLoader(train_ds, sampler=train_samp, collate_fn=collate)
test_dl = DataLoader(test_ds, sampler=test_samp, collate_fn=collate)

## Define Model

In [19]:
def encoder(in_feature, hidden_feature1, hidden_feature2, drop_rate):
    return nn.Sequential(OrderedDict([
                ('linear1', nn.Linear(in_feature, hidden_feature1)),
                ('act1', nn.Softplus()),
                ('linear2', nn.Linear(hidden_feature1, hidden_feature2)),
                ('act2', nn.Softplus()),
                ('dropout', nn.Dropout(drop_rate))
            ]))

def decoder(in_feature, out_feature, drop_rate):
     return nn.Sequential(OrderedDict([
                ('act1', nn.Softmax(dim=-1)),
                ('dropout', nn.Dropout(drop_rate)),
                ('linear', nn.Linear(in_feature, out_feature)),
                ('batchnorm', nn.BatchNorm1d(out_feature)),
                ('act2', nn.Softmax(dim=-1))
            ]))

In [20]:
def hidden(in_feature, out_feature):
    return nn.Sequential(OrderedDict([
                ('linear', nn.Linear(in_feature, out_feature)),
                ('batchnorm', nn.BatchNorm1d(out_feature))
            ]))

In [21]:
class ProdLDA(nn.Module):
    def __init__(self, num_input, en1_units, en2_units, num_topic, drop_rate, init_mult):
        super(ProdLDA, self).__init__()
        self.num_input, self.en1_units, self.en2_units, \
        self.num_topic, self.drop_rate, self.init_mult = num_input, en1_units, en2_units, \
                                                            num_topic, drop_rate, init_mult
        # encoder
        self.en = encoder(num_input, en1_units, en2_units, drop_rate)
        self.mean = hidden(en2_units, num_topic)
        self.logvar = hidden(en2_units, num_topic)
        # decoder
        self.de = decoder(num_topic, num_input, drop_rate)
        # prior mean and variance as constant buffers
        self.prior_mean   = torch.Tensor(1, num_topic).fill_(0)
        self.prior_var    = torch.Tensor(1, num_topic).fill_(variance)
        self.prior_mean   = nn.Parameter(self.prior_mean, requires_grad=False)
        self.prior_var    = nn.Parameter(self.prior_var, requires_grad=False)
        self.prior_logvar = nn.Parameter(self.prior_var.log(), requires_grad=False)
        # initialize decoder weight
        if init_mult != 0:
            #std = 1. / math.sqrt( init_mult * (num_topic + num_input))
            self.de.linear.weight.data.uniform_(0, init_mult)
        # remove BN's scale parameters
        for component in [self.mean, self.logvar, self.de]:
            component.batchnorm.weight.requires_grad = False
            component.batchnorm.weight.fill_(1.0)

    def encode(self, input_):
        encoded = self.en(input_)
        posterior_mean = self.mean(encoded)
        posterior_logvar = self.logvar(encoded)
        return encoded, posterior_mean, posterior_logvar
    
    def decode(self, input_, posterior_mean, posterior_var):
        # take sample
        eps = input_.data.new().resize_as_(posterior_mean.data).normal_() # noise 
        z = posterior_mean + posterior_var.sqrt() * eps                   # reparameterization
        # do reconstruction
        recon = self.de(z)          # reconstructed distribution over vocabulary
        return recon
    
    def forward(self, input_, compute_loss=False, avg_loss=True):
        # compute posterior
        en2, posterior_mean, posterior_logvar = self.encode(input_) 
        posterior_var    = posterior_logvar.exp()
        
        recon = self.decode(input_, posterior_mean, posterior_var)
        if compute_loss:
            return recon, self.loss(input_, recon, posterior_mean, posterior_logvar, posterior_var, avg_loss)
        else:
            return recon

    def loss(self, input_, recon, posterior_mean, posterior_logvar, posterior_var, avg=True):
        # NL
        NL  = -(input_ * (recon + 1e-10).log()).sum(1)
        # KLD, see Section 3.3 of Akash Srivastava and Charles Sutton, 2017, 
        # https://arxiv.org/pdf/1703.01488.pdf
        prior_mean   = self.prior_mean.expand_as(posterior_mean)
        prior_var    = self.prior_var.expand_as(posterior_mean)
        prior_logvar = self.prior_logvar.expand_as(posterior_mean)
        var_division    = posterior_var  / prior_var
        diff            = posterior_mean - prior_mean
        diff_term       = diff * diff / prior_var
        logvar_division = prior_logvar - posterior_logvar
        # put KLD together
        KLD = 0.5 * ( (var_division + diff_term + logvar_division).sum(1) - self.num_topic)
        # loss
        loss = (NL + KLD)
        # in traiming mode, return averaged loss. In testing mode, return individual loss
        if avg:
            return loss.mean()
        else:
            return loss

## Train

In [22]:
model = ProdLDA(num_input, en1_units, en2_units, num_topic, drop_rate, init_mult)
optimizer = torch.optim.Adam(model.parameters(), learning_rate, betas=(momentum, 0.999))

In [23]:
if torch.cuda.is_available():
    model = model.cuda()
    optimizer = optimizer.cuda()

In [24]:
for epoch in range(num_epoch):
    loss_epoch = 0.0
    model.train()                    # switch to training mode
    for input_, _ in train_dl:
        recon, loss = model(input_, compute_loss=True)
        # optimize
        optimizer.zero_grad()        # clear previous gradients
        loss.backward()              # backprop
        optimizer.step()             # update parameters
        # report
        loss_epoch += loss.item()    # add loss to loss_epoch
    if epoch % 5 == 0:
        print('Epoch {}, loss={}'.format(epoch, loss_epoch / len(input_)))

Epoch 0, loss=1124.9376000976563
Epoch 5, loss=908.3554571533203
Epoch 10, loss=843.8213909912109
Epoch 15, loss=820.7206262207031
Epoch 20, loss=811.3518713378907
Epoch 25, loss=805.9802789306641
Epoch 30, loss=803.7289654541015
Epoch 35, loss=802.1388592529297
Epoch 40, loss=801.1045501708984
Epoch 45, loss=800.4813134765625
Epoch 50, loss=800.144482421875
Epoch 55, loss=799.8823211669921
Epoch 60, loss=799.4978063964844
Epoch 65, loss=799.3657940673828
Epoch 70, loss=799.1760681152343
Epoch 75, loss=798.948642578125
Epoch 80, loss=798.9252777099609
Epoch 85, loss=798.6046411132812
Epoch 90, loss=798.714306640625
Epoch 95, loss=798.5576092529296


## Test

In [25]:
emb = model.de.linear.weight.data.cpu().numpy().T
print_top_words(emb, vocab)
print_perp(model)

---------------Printing the Topics------------------
season episodes remake explanation happening twist series questions nightmare seven
decides meets delightful her candy ex nudity kills husband noir
decides mom dad meets sees her kills charlie finds wants
jokes stupid hate show sucks crap naked thats kinda seriously
disappointment disappointed cheesy expecting seemed bland wasn't budget weren't animation
season episodes show shows jerry cartoon episode series adam batman
zombie zombies arts martial fights crap scientist cop rent sucks
supporting alan noir apartment arthur comedic finest cage he manages
won stewart greatest jr excellent award outstanding kong martial don
seemed adaptation unnecessary development historical hadn't disappointed depth failed developed
he adam scared his cage serial actor film fan festival
vhs memories season childhood episodes thank you favourite copy shows
awesome likable batman supporting plus average bruce chemistry pace okay
waste write you crap your