<font color=purple>  
### Model Internal pipeline
This notebook walks through the internal pipeline of the unlabelled Embedded Topic Model (ETM), following a single training batch through the forward and backward pass. 

In [2]:
import argparse
import torch
import pickle 
import numpy as np 
import os 
import math 
import random 
import sys
import matplotlib.pyplot as plt 
import scipy.io

from torch import nn, optim
from torch.nn import functional as F

sys.path.append('..')
import data
from etm import ETM
from utils import nearest_neighbors, get_topic_coherence, get_topic_diversity

# Seed every random number generator in play so that the shuffling and the
# sampling further down are reproducible after Restart & Run All.
SEED = 2019
random.seed(SEED)  # Python's own RNG is imported above but was previously left unseeded
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

<font color=purple>  
### Input data
The input data contains three important objects: `vocab`, `tokens`, and `counts`. 
`vocab` holds the actual word strings, `tokens` holds the indices that point to the respective words in `vocab`, and `counts`, as the name suggests, holds how many times each of those words occurs.

In [3]:
## Load the `20ng` dataset
# 1. vocabulary plus the train / validation / test splits
vocab, train, valid, test = data.get_data(os.path.join('../data/20ng'))
vocab_size = len(vocab)

# 2. token indices and counts for the train, dev and test sets
train_tokens, train_counts = train['tokens'], train['counts']
num_docs_train = len(train_tokens)

valid_tokens, valid_counts = valid['tokens'], valid['counts']
num_docs_valid = len(valid_tokens)

test_tokens, test_counts = test['tokens'], test['counts']
num_docs_test = len(test_tokens)

# the test split additionally carries two numbered token/count sets
test_1_tokens, test_1_counts = test['tokens_1'], test['counts_1']
num_docs_test_1 = len(test_1_tokens)

test_2_tokens, test_2_counts = test['tokens_2'], test['counts_2']
num_docs_test_2 = len(test_2_tokens)

# Peek at the first vocabulary entries and at the first training document's
# token indices and matching counts.
display(vocab[:5])
display(train_tokens[0][0])
display(train_counts[0][0])

['ii', 'plate', 'duke', 'greatly', 'holds']

array([  94,  100,  233,  327,  357,  504,  530,  597,  662,  720,  805,
        859,  889,  897,  898,  987,  996, 1024, 1098, 1177, 1178, 1532,
       1642, 1658, 1706, 1728, 1732, 1808, 1822, 1857, 1858, 1890, 1895,
       1957, 2013, 2059, 2091, 2118, 2174, 2236, 2386, 2478, 2522, 2539,
       2566, 2569, 2662, 2673, 2790, 2812, 2887, 2934, 3018, 3048, 3071],
      dtype=int32)

array([1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 3, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 3, 1, 1,
       1, 2, 1, 2, 3, 1, 1, 1, 1, 1, 1])

<font color=purple>  
### Settings and checkpoint

In [4]:
# --- experiment settings ---------------------------------------------------
dataset = '20ng'
num_topics = 50
t_hidden_size = 800     # encoding dimension
optimizer = 'adam'      # optimizer name; only used to build the checkpoint path below
clip = 0
theta_act = 'relu'      # activation
lr = 0.005              # learning rate
wdecay = 1.2e-6
enc_drop = 0.0          # dropout rate on the encoder
batch_size = 1000
rho_size = 300          # dimension of rho, the word embedding?
emb_size = 300
train_embeddings = 1

# --- checkpoint path: the run's hyperparameters are spelled out in the name --
ckpt = os.path.join(
    "train_ckpt",
    f"etm_{dataset}_K_{num_topics}_Htheta_{t_hidden_size}_Optim_{optimizer}"
    f"_Clip_{clip}_ThetaAct_{theta_act}_Lr_{lr}_Bsz_{batch_size}"
    f"_RhoSize_{rho_size}_trainEmbeddings_{train_embeddings}",
)


<font color=purple>  
Importantly, there are no pre-trained embeddings here. 

In [None]:
# No pre-trained word embeddings are supplied; None is what gets passed to ETM below.
embeddings = None

<font color=purple>  
### Init the model  
ETM is a pytorch module with a whole bunch of parameters. 
Some questions would be:  
    <font color="red">  
1. What's the difference between `emsize` and `rho_size`? One should be the topic embedding size and the other the word embedding size, but the two should have the same dimension and are even related to each other.   
2. What is `t_hidden_size`?
        
        
```
# Excerpt of the ETM module's constructor, quoted for reference.
# NOTE(review): as quoted, `def __init__` sits at the same indentation as
# `class ETM`; in a runnable file it would be indented under the class.
class ETM(nn.Module):
def __init__(self, num_topics, vocab_size, t_hidden_size, rho_size, emsize, 
                theta_act, embeddings=None, train_embeddings=True, enc_drop=0.5):
    super(ETM, self).__init__()

        
        
        
    """ 
    Define hyperparameters
    """
    self.num_topics = num_topics
    self.vocab_size = vocab_size
        
    # width of both hidden layers of the q_theta encoder defined further down
    self.t_hidden_size = t_hidden_size # ?
    
    # rho_size sizes both `rho` and `alphas` below; in the lines shown here,
    # emsize is only stored (and locally re-bound in the pre-fitted branch).
    self.rho_size = rho_size
    self.emsize = emsize
        
    self.enc_drop = enc_drop
    self.t_drop = nn.Dropout(enc_drop)

    # theta_act arrives as a string and is resolved by get_activation
    # (not shown in this excerpt)
    self.theta_act = self.get_activation(theta_act)

        
        
        
        
    """
    define the word embedding matrix \rho
    """
    if train_embeddings:
        # learnable embeddings: the Linear layer's weight has shape (vocab_size, rho_size)
        self.rho = nn.Linear(rho_size, vocab_size, bias=False)
    else:
        # pre-fitted embeddings: keep a float copy of the given tensor on `device`
        # (`device` is not defined in this excerpt).
        # NOTE(review): the local `rho` Embedding is not used in the lines shown.
        num_embeddings, emsize = embeddings.size()
        rho = nn.Embedding(num_embeddings, emsize)
        self.rho = embeddings.clone().float().to(device)

        
        
        
        
    """
    define the matrix containing the topic embeddings
    """
    # one rho_size-dimensional embedding per topic, stored as the weight of a bias-free Linear
    self.alphas = nn.Linear(rho_size, num_topics, bias=False)#nn.Parameter(torch.randn(rho_size, num_topics))

    ## define variational distribution for \theta_{1:D} via amortization
    # encoder: vocab_size -> t_hidden_size -> t_hidden_size, with theta_act after each layer
    self.q_theta = nn.Sequential(
            nn.Linear(vocab_size, t_hidden_size), 
            self.theta_act,
            nn.Linear(t_hidden_size, t_hidden_size),
            self.theta_act,
        )
    # two output heads on top of the encoder, each producing num_topics values
    self.mu_q_theta = nn.Linear(t_hidden_size, num_topics, bias=True)
    self.logsigma_q_theta = nn.Linear(t_hidden_size, num_topics, bias=True)
```

In [5]:
# No pre-fitted embeddings are supplied.
embeddings = None

# Build the ETM with every argument spelled out by name (names follow the
# constructor signature quoted above), then move it to the chosen device.
model = ETM(
    num_topics=num_topics,              # the all-important number of topics
    vocab_size=vocab_size,              # needed for input shape sizes, possibly redundant
    t_hidden_size=t_hidden_size,        # size of the document encoding
    rho_size=rho_size,                  # embedding size of word embedding
    emsize=emb_size,                    # embedding size of word embedding # redundant!
    theta_act=theta_act,                # activation function (string)
    embeddings=embeddings,              # prefit embeddings
    train_embeddings=train_embeddings,  # binary, for whether to train embeddings
    enc_drop=enc_drop,                  # encoder dropout
).to(device)

<font color=purple>  
Set to training mode:   
https://pytorch.org/docs/stable/nn.html  
Essentially, this switches layers that behave differently during training and evaluation, namely `Dropout` and `BatchNorm`, into their training behaviour.

In [6]:
# Put the model in training mode; the trailing semicolon suppresses the cell's echoed output.
model.train();

<font color=purple>  
Setting the train indices

In [7]:
# Running totals for the loss terms and a batch counter
# (none of them is updated in the cells shown here).
acc_loss, acc_kl_theta_loss, cnt = 0, 0, 0

num_docs_train = len(train_tokens)
batch_size = 1000

# Shuffle the training document ids and cut them into chunks of batch_size.
indices = torch.randperm(num_docs_train).split(batch_size)

# This walkthrough only follows the first mini-batch.
idx = 0
ind = indices[idx]

<font color=purple>  
As is typical of PyTorch training scripts, the gradients are set to zero before the forward pass.

In [8]:
# Adam over every model parameter, with the learning rate and weight decay
# from the settings cell.
# NOTE: this rebinds `optimizer`, which held the string 'adam' in the settings
# cell; re-run that cell before rebuilding `ckpt`.
optimizer = optim.Adam(
    model.parameters(),
    lr=lr,
    weight_decay=wdecay,
)

# Start from clean gradients, both through the optimizer and on the module itself.
optimizer.zero_grad()
model.zero_grad()

<font color=purple>  
### retrieving batch train data
`data_batch` will be of shape (batch size, vocab size).   
`sums` is the total word count of each document, which may be used to normalize the bag of words, `bow`. 

In [79]:
# Bag-of-words batch for the selected document ids.
data_batch = data.get_batch(train_tokens, train_counts, ind, vocab_size, device)

# Row totals, kept as a column so the division below broadcasts per document.
# (A row summing to zero would make the division produce NaNs.)
sums = data_batch.sum(dim=1, keepdim=True)
normalized_data_batch = data_batch / sums

display(data_batch.shape)
display(sums[:4])

torch.Size([1000, 3072])

tensor([[61.],
        [27.],
        [66.],
        [34.]], device='cuda:0')

In [178]:
# Echo the bag-of-words batch (the repr is abbreviated for large tensors).
data_batch

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')

<font color=purple>  
### Forward pass within model
There are two inputs: `data_batch` and `normalized_data_batch`. 
`recon_loss, kld_theta = model(data_batch, normalized_data_batch)`

The forward function is as such:  
```
    def forward(self, bows, normalized_bows, theta=None, aggregate=True):
        """Compute the reconstruction loss and the KL term for one batch.

        Args:
            bows: bag-of-words batch, multiplied element-wise with the predictions.
            normalized_bows: input handed to `get_theta` when `theta` is not given.
            theta: optional precomputed topic proportions; when supplied,
                `get_theta` is skipped and the returned KL term is None.
            aggregate: when true, the per-row loss is averaged into a single value.

        Returns:
            The pair (recon_loss, kld_theta).
        """
        ## get \theta
        if theta is None:
            theta, kld_theta = self.get_theta(normalized_bows)
        else:
            # caller supplied theta, so there is no KL term to report
            kld_theta = None

        ## get \beta
        beta = self.get_beta()

        ## get prediction loss
        # decode is not shown in this excerpt; its output is weighted by the
        # observed counts and summed over dim 1, then negated.
        preds = self.decode(theta, beta)
        recon_loss = -(preds * bows).sum(1)
        if aggregate:
            recon_loss = recon_loss.mean()
        return recon_loss, kld_theta
```

<font color=purple>   
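## Forward pass #1: Retrieving topic proportions, theta
The first part of the forward pass is to get the topic proportions, `theta`, which corresponds to $\delta$ in the paper (more precisely, as the code below shows, `theta` is the softmax of the Gaussian variable that the paper calls $\delta_d$).  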
This tells us how much of each document belongs to each topic. 

<font color=purple>   
`normalized_data_batch` is first encoded.  
This will give us a `theta` which really is $\delta$ in the paper. It refers to the topic proportions, that is, how much of each document belongs to each topic. Thus, `theta` is of shape (batch size, no. of topics)

In [86]:
# Inference network (the variational parameters): encode each normalised
# bag-of-words into a document encoding of shape (batch size, t_hidden_size).
q_theta = model.q_theta(normalized_data_batch)

# Dropout on the encoding is skipped entirely when the rate is zero.
if model.enc_drop > 0:
    q_theta = model.t_drop(q_theta)

# The output layer of the inference network.
# "The inference network ingests the document w_d
# and outputs a mean and a variance of delta_d (theta)"
mu_theta = model.mu_q_theta(q_theta)
logsigma_theta = model.logsigma_q_theta(q_theta)

# Sample with the reparameterisation trick: a draw from N(mu, std^2) can be
# written as mu + eps * std with eps ~ N(0, I). All the randomness lives in
# eps, so the sample stays differentiable with respect to mu and std.
# logsigma_theta is treated as a log-variance, hence the factor 0.5.
std = torch.exp(0.5 * logsigma_theta)
eps = torch.randn_like(std)
# Out-of-place arithmetic keeps `eps` intact for later inspection; the former
# in-place `eps.mul_(std).add_(mu_theta)` overwrote the noise with z itself.
z = mu_theta + eps * std

# Softmax of a Gaussian draw: theta follows a logistic-normal distribution and
# each row is a set of topic proportions summing to one.
theta = F.softmax(z, dim=-1)

In [146]:
# Expected: (batch size, number of topics).
theta.shape

torch.Size([1000, 50])

<font color=purple>   
The loss function of forward pass #1 is in `kld_theta` or `kl_theta`.  
The authors write: _"[This] term encourages [the topic proportions, $\delta_{d}$] to be close to the prior, $p(\delta_{d})$_."  
This may prove useful for seeding. 

In [95]:
# Closed-form KL divergence between the diagonal Gaussian
# q = N(mu_theta, exp(logsigma_theta)) and the standard normal prior N(0, I):
#     KL = -0.5 * sum_k (1 + log sigma_k^2 - mu_k^2 - sigma_k^2)
# logsigma_theta is treated as a log-variance (the sampling cell uses
# std = exp(0.5 * logsigma_theta)). Summed over topics, averaged over the batch.
kld_theta = kl_theta = -0.5 * (
    1 + logsigma_theta - mu_theta.pow(2) - logsigma_theta.exp()
).sum(dim=-1).mean()

<font color=purple>   
## Forward pass #2: Retrieve beta  
    
"`beta` denotes a traditional topic; ie, a distribution over all the words."  
Thus, it makes sense that beta follows the shape (no. of topics, size of vocab)
    </br>  
    
If `model.alphas` holds the topic representations and `model.rho` holds the word representations, then their dot products measure the agreement between each topic and each word.  

</br>  
  
Applying a softmax over these agreement values, across the vocabulary, intuitively gives each topic a probability distribution over the words, that is:   
$$\mathrm{softmax}(\alpha \cdot \rho) = \beta$$

In [172]:
# Report the two embedding tables that are combined into beta.
# NOTE(review): the second label reads "logit", but the value it prints is the
# shape of rho's weight matrix.
print( f'alpha: {model.alphas} - (embsize x topics)' )
print( f'logit: {model.rho.weight.shape} - (vocab x embsize)' )

# Feed every word embedding (one row of rho's weight) through the topic layer:
# one agreement score per (word, topic) pair.
logit = model.alphas(model.rho.weight)

# Normalise over the vocabulary axis (dim 0), then flip to (topics, vocab) so
# that each row of beta is one topic's distribution over the words.
beta = F.softmax(logit, dim=0).t()

print( f"beta shape: {beta.shape}")

alpha: Linear(in_features=300, out_features=50, bias=False) - (embsize x topics)
logit: torch.Size([3072, 300]) - (vocab x embsize)
beta shape: torch.Size([50, 3072])


<font color=purple>   
## Forward pass #3: Reconstruct bag of words and calculate loss
How much does our model agree with the data? This is observed in `recon_loss`.

In [176]:
# Decode: mix the topic-word distributions by each document's topic proportions.
res = theta.mm(beta)
# The small offset keeps log() away from log(0); preds are log-probabilities.
preds = (res + 1e-6).log()
# Negative log-likelihood per document: log-probabilities weighted by the
# observed word counts, summed over the vocabulary.
recon_loss = -(preds * data_batch).sum(1)

print('\n' + '_'*10)
print("actual bag of words")
display(data_batch)

print('\n' + '_'*10)
print("predicted bag of words")
display(preds)

print('\n' + '_'*10)
print(f"reconstruction loss shape before aggregating: {recon_loss.size(0)}")

# Collapse the per-document losses into a single batch average.
recon_loss = recon_loss.mean()
print(f"reconstruction loss: {recon_loss}")


__________
actual bag of words


tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')


__________
predicted bag of words


tensor([[-8.0245, -8.0295, -8.0288,  ..., -8.0267, -8.0191, -8.0231],
        [-8.0304, -8.0275, -8.0292,  ..., -8.0236, -8.0212, -8.0251],
        [-8.0257, -8.0284, -8.0244,  ..., -8.0295, -8.0160, -8.0211],
        ...,
        [-8.0320, -8.0251, -8.0266,  ..., -8.0343, -8.0200, -8.0196],
        [-8.0291, -8.0227, -8.0251,  ..., -8.0234, -8.0173, -8.0263],
        [-8.0300, -8.0369, -8.0218,  ..., -8.0206, -8.0241, -8.0191]],
       device='cuda:0', grad_fn=<LogBackward>)


__________
reconstruction loss shape before aggregating: 1000
reconstruction loss: 668.0684204101562


<font color=purple>  
## Backward pass

In [112]:
# Total objective for this batch: reconstruction loss plus the KL term on theta.
total_loss = recon_loss + kld_theta
# Back-propagate through the whole forward pass (no optimizer step is taken in the cells shown).
total_loss.backward()