## Replicate the prodlda pyro notebook


In [1]:
import os
import pyro
import pyro.distributions as dist
from pyro.infer import MCMC, NUTS
import torch

pyro.__version__

'1.7.0'

In [2]:
def model(counts):
    theta = pyro.sample('theta', dist.Dirichlet(torch.ones(6)))
    total_count = int(counts.sum())
    pyro.sample('counts', dist.Multinomial(total_count, theta), obs=counts)
    
data = torch.tensor([5, 4, 2, 5, 6, 5, 3, 3, 1, 5, 5, 3, 5, 3, 5, \
                     3, 5, 5, 3, 5, 5, 3, 1, 5, 3, 3, 6, 5, 5, 6])
counts = torch.unique(data, return_counts=True)[1].float()
counts

tensor([ 2.,  1.,  9.,  1., 14.,  3.])

In [3]:
nuts_kernel = NUTS(model)
num_samples, warmup_steps = (1000, 200)
mcmc = MCMC(nuts_kernel, num_samples=num_samples, warmup_steps=warmup_steps)
mcmc.run(counts)

Sample: 100%|███████████████████████████████████████| 1200/1200 [00:10, 118.61it/s, step size=6.99e-01, acc. prob=0.925]             


In [4]:
hmc_samples = {k: v.detach().cpu().numpy() for k, v, in mcmc.get_samples().items()}
print (hmc_samples['theta'])

[[0.12436687 0.01852442 0.32760313 0.03508684 0.3575466  0.13687217]
 [0.0805072  0.09267426 0.19635084 0.09863936 0.44795045 0.08387788]
 [0.09843706 0.02473691 0.41141734 0.0394576  0.25750944 0.16844165]
 ...
 [0.15093319 0.11145476 0.18145242 0.05172162 0.41899315 0.08544484]
 [0.04414841 0.01040715 0.41621354 0.0722887  0.39905074 0.0578914 ]
 [0.03533946 0.00421056 0.27795708 0.07612259 0.5142106  0.09215973]]


In [5]:
means = hmc_samples['theta'].mean(axis=0)
stds = hmc_samples['theta'].std(axis=0)
for i in range(6):
    print('%d: %.2f \u00B1 %.2f' % (i+1, means[i], stds[i]))
    

1: 0.08 ± 0.04
2: 0.06 ± 0.04
3: 0.28 ± 0.07
4: 0.05 ± 0.04
5: 0.42 ± 0.08
6: 0.11 ± 0.06


Now using categorical instead instead

In [6]:
def model(data):
    theta = pyro.sample('theta', dist.Dirichlet(torch.ones(6)))
    with pyro.plate('data', len(data)):
        pyro.sample('obs', dist.Categorical(theta), obs=data)
    
nuts_kernel = NUTS(model)
num_samples, warmup_steps = (1000, 200)
mcmc = MCMC(nuts_kernel, num_samples=num_samples, warmup_steps=warmup_steps)
mcmc.run(data - 1)

Sample: 100%|███████████████████████████████████████| 1200/1200 [00:10, 109.39it/s, step size=5.76e-01, acc. prob=0.938]             


In [7]:
hmc_samples = {k: v.detach().cpu().numpy() for k, v in mcmc.get_samples().items()}
means = hmc_samples['theta'].mean(axis=0)
stds = hmc_samples['theta'].std(axis=0)
for i in range(6):
    print('%d: %.2f \u00B1 %.2f' % (i+1, means[i], stds[i]))

1: 0.08 ± 0.04
2: 0.05 ± 0.04
3: 0.28 ± 0.08
4: 0.06 ± 0.04
5: 0.42 ± 0.08
6: 0.11 ± 0.05


the prodlda bit

In [8]:
import pandas as pd
import numpy as mp
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
news = fetch_20newsgroups(subset='all')
vectorizer = CountVectorizer(max_df=0.5, min_df=20, stop_words='english')
docs = torch.from_numpy(vectorizer.fit_transform(news['data']).toarray())
print(docs.shape)

torch.Size([18846, 12722])


In [10]:
vocab = pd.DataFrame(columns=['word', 'index'])
vocab['word'] = vectorizer.get_feature_names()
vocab['index'] = vocab.index
print(vocab)

         word  index
0          00      0
1         000      1
2        0001      2
3        0002      3
4         001      4
...       ...    ...
12717    zoom  12717
12718    zuma  12718
12719  zurich  12719
12720      zx  12720
12721      zz  12721

[12722 rows x 2 columns]


In [11]:
print('Dictionary size: %d' % len(vocab))
print('Corpus size: {}'.format(docs.shape))

Dictionary size: 12722
Corpus size: torch.Size([18846, 12722])


In [12]:
import math
import torch.nn as nn
import torch.nn.functional as F
from pyro.infer import SVI, TraceMeanField_ELBO
from tqdm import trange

In [25]:
class Encoder(nn.Module): 
    # takes inputs: counts of word in a doc
    # outputs: logtheta_loc and log_theta_scale, each of size=# topics 
    # encoder is used in the guide function
    def __init__(self, vocab_size, num_topics, hidden, dropout):
        super().__init__()
        self.drop = nn.Dropout(dropout)
        self.fc1 = nn.Linear(vocab_size, hidden) # layer 1 of linear transformation
        self.fc2 = nn.Linear(hidden, hidden) # layer 2 of linear transformation
        self.fcmu = nn.Linear(hidden, num_topics) # layer 3
        self.fclv = nn.Linear(hidden, num_topics) # layer 4, actually, as far as I understand, #layers is our choice
        self.bnmu = nn.BatchNorm1d(num_topics, affine=False)
        self.bnlv = nn.BatchNorm1d(num_topics, affine=False)
        
    def forward(self, inputs): #inputs is a vector of size=# words in the dictionary
        h = F.softplus(self.fc1(inputs))
        h = F.softplus(self.fc2(h))
        h = self.drop(h)
        logtheta_loc = self.bnmu(self.fcmu(h))
        logtheta_loc = logtheta_loc
        logtheta_logvar = self.bnlv(self.fclv(h))
        logtheta_scale = (0.5 * logtheta_logvar).exp()
        return logtheta_loc, logtheta_scale
    
class Decoder(nn.Module):
    def __init__(self, vocab_size, num_topics, dropout):
        super().__init__()
        self.beta = nn.Linear(num_topics, vocab_size, bias=False)
        self.bn = nn.BatchNorm1d(vocab_size, affine=False)
        self.drop = nn.Dropout(dropout)
        
    def forward(self, inputs):
        # inputs: theta (topic mixture parameters)
        # outputs: vector of size=#words in dictionary, probabilities of observing a word 
        # in the paper, this prob. is (beta*theta)
        inputs = self.drop(inputs)
        return F.softmax(self.bn(self.beta(inputs)), dim=1)
    
class ProdLDA(nn.Module):
    def __init__(self, vocab_size, num_topics, hidden, dropout):
        super().__init__()
        self.vocab_size = vocab_size
        self.num_topics = num_topics
        self.encoder = Encoder(vocab_size, num_topics, hidden, dropout)
        self.decoder = Decoder(vocab_size, num_topics, dropout)
        
    def model(self, docs):
        pyro.module('decoder', self.decoder)
        # Dirichlet prior 𝑝(𝜃|𝛼) is replaced by a logistic-normal distribution
        with pyro.plate('documents', docs.shape[0]):
            # each doc gets a vector of theta (topic mixture)
            logtheta_loc = docs.new_zeros((docs.shape[0], self.num_topics))
            logtheta_scale = docs.new_ones((docs.shape[0], self.num_topics))
            logtheta = pyro.sample(
                'logtheta', dist.Normal(logtheta_loc, logtheta_scale).to_event(1))
            theta = F.softmax(logtheta, -1)            
            # conditional distribution of 𝑤𝑛 is defined as
            # 𝑤𝑛|𝛽,𝜃 ~ Categorical(𝜎(𝛽𝜃))
            count_param = self.decoder(theta)
            # Currently, PyTorch Multinomial requires `total_count` to be homogeneous.
            # Because the numbers of words across documents can vary,
            # we will use the maximum count accross documents here.
            # This does not affect the result because Multinomial.log_prob does
            # not require `total_count` to evaluate the log probability.
            total_count = int(docs.sum(-1).max())
            pyro.sample(
                'obs',
                dist.Multinomial(total_count, count_param),
                obs=docs
            )
            
    def guide(self, docs):
        pyro.module('encoder', self.encoder)
        with pyro.plate('documents', docs.shape[0]):
            logtheta_loc, logtheta_scale = self.encoder(docs)
            logtheta = pyro.sample(
                'logtheta', dist.Normal(logtheta_loc, logtheta_scale).to_event(1))
            
    def beta(self):
        return self.decoder.beta.weight.cpu().detach().T
    
    def get_posterior_topic(self, docs):
        logtheta_loc, logtheta_scale = self.encoder(docs)
        theta_loc = F.softmax(logtheta_loc, -1)
        return(theta_loc)

In [20]:
seed = 0
torch.manual_seed(seed)
pyro.set_rng_seed(seed)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

num_topics = 20
docs = docs.float().to(device)
batch_size = 32
learning_rate = 1e-3
num_epochs = 5 #50
print(docs.shape) # torch.Size([18846, 12722]): (# documents, # words in the dictionary)--> values: counts

torch.Size([18846, 12722])


In [26]:
pyro.clear_param_store()
prodLDA = ProdLDA(
    vocab_size = docs.shape[1],
    num_topics = num_topics,
    hidden = 100,
    dropout=0.2
)
prodLDA.to(device)

optimizer = pyro.optim.Adam({"lr": learning_rate})
svi = SVI(prodLDA.model, prodLDA.guide, optimizer, loss=TraceMeanField_ELBO())
num_batches = int(math.ceil(docs.shape[0] / batch_size))

bar = trange(num_epochs)
for epoch in bar:
    running_loss = 0.0
    for i in range(num_batches):
        batch_docs = docs[i * batch_size:(i+1) * batch_size, :]
        loss = svi.step(batch_docs)
        running_loss += loss / batch_docs.size(0)
        
    bar.set_postfix(epoch_loss='{:.2e}'.format(running_loss))

100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [02:14<00:00, 26.96s/it, epoch_loss=4.08e+05]


In [17]:
def plot_word_cloud(b, ax, v, n):
    sorted_, indices = torch.sort(b, descending=True)
    df = pd.DataFrame(indices[:100].numpy(), columns=['index'])
    words = pd.merge(df, vocab[['index', 'word']],
                     how='left', on='index')['word'].values.tolist()
    sizes = (sorted_[:100] * 10000).int().numpy().tolist()
    freqs = {words[i]: sizes[i] for i in range(len(words))}
    wc = WordCloud(background_color="white", width=800, height=500)
    wc = wc.generate_from_frequencies(freqs)
    ax.set_title('Topic %d' % (n+1))
    ax.imshow(wc, interpolation='bilinear')
    ax.axis('off')
    
import matplotlib.pyplot as plt
from wordcloud import WordCloud # this does not work right now on my setup

beta = prodLDA.beta()
fig, axs = plt.subplots(7, 3, figsize=(14,24))
for n in range(beta.shape[0]):
    i, j = divmod(n, 3)
    plot_word_cloud(beta[n], axs[i, j], vocab, n)
axs[-1,-1].axis('off');

plt.show()

ModuleNotFoundError: No module named 'wordcloud'

In [33]:
topic_posterior = prodLDA.get_posterior_topic(docs)
print(topic_posterior.shape)
print(topic_posterior.max())
print(topic_posterior.sum(axis = 1))

torch.Size([18846, 20])
tensor(0.9999, grad_fn=<MaxBackward1>)
tensor([1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
       grad_fn=<SumBackward1>)
