# User Latent Dirichlet Allocation

In [1]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import pickle
# Make the sibling `vae_playground` checkout and this project's parent
# directory importable (both paths are resolved relative to the notebook's
# working directory).
sys.path.append(os.path.abspath('../../vae_playground'))
sys.path.append(os.path.abspath('..'))

# Autograd anomaly detection adds extra checks to every backward pass and
# slows training noticeably; it is a debugging aid, so keep it off by default
# and flip this flag only while hunting NaNs / in-place modification errors.
DETECT_ANOMALY = False
torch.autograd.set_detect_anomaly(DETECT_ANOMALY)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f348c272310>

In [28]:
%load_ext autoreload
%autoreload 2

from lib.models import InductiveLDA as InductiveLDA
from lib import utils as utils

import vae_playground.datasets as datasets

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Import Data

In [3]:
class Corpus:
    """Train/test splits plus the vocabulary found in one data directory.

    Attributes set by the constructor:
        datapaths: paths of ``train.txt.npy`` and ``test.txt.npy`` in ``datadir``.
        vocab: object unpickled from ``vocab.pkl`` (its ``len`` is used as the
            vocabulary size).
        train, test: one ``Data`` instance per split.
    """

    def __init__(self, datadir):
        self.datapaths = [
            os.path.join(datadir, name)
            for name in ('train.txt.npy', 'test.txt.npy')
        ]

        # NOTE: pickle.load can execute arbitrary code; only point this at
        # data directories you trust.
        vocab_path = os.path.join(datadir, 'vocab.pkl')
        with open(vocab_path, 'rb') as handle:
            self.vocab = pickle.load(handle)

        vocab_size = len(self.vocab)
        train_path, test_path = self.datapaths
        self.train = Data(train_path, vocab_size)
        self.test = Data(test_path, vocab_size)


class Data:
    """Bag-of-words view of one corpus split stored as a ``.npy`` file.

    The file is expected to hold one array of token ids per document.

    Attributes:
        documents: the token-id arrays of the documents that were kept, in
            file order. Row ``i`` of ``data`` is built from ``documents[i]``.
        data: integer count matrix with one row per kept document and
            ``vocab_size`` columns (``np.bincount`` of the token ids).
    """

    def __init__(self, datapath, vocab_size):
        # NOTE: allow_pickle=True can execute arbitrary code when loading an
        # untrusted file; only load data you trust.
        raw_docs = np.load(datapath, allow_pickle=True, encoding='bytes')

        # Keep a document only when its token ids sum to a positive value.
        # This drops empty documents (and, as in the original filter, any
        # document made up solely of token id 0).
        keep = np.array([np.sum(doc) > 0 for doc in raw_docs], dtype=bool)

        # Apply the same filter to the raw documents and to the count matrix
        # so that the two stay aligned row-for-row.
        self.documents = raw_docs[keep]
        self.data = np.array([
            np.bincount(doc.astype('int'), minlength=vocab_size)
            for doc in self.documents
        ])

    @property
    def size(self):
        """Number of documents kept in this split."""
        return len(self.data)

    def get_batch(self, batch_size, start_id=None):
        """Return a float32 tensor holding a batch of document count vectors.

        Args:
            batch_size: number of documents requested.
            start_id: when ``None``, rows are drawn uniformly at random with
                replacement; otherwise the consecutive rows starting at
                ``start_id`` are returned.

        Returns:
            Tensor with one row per selected document. With ``start_id`` set,
            the final batch is truncated at the end of the data instead of
            indexing out of bounds, so it may hold fewer than ``batch_size``
            rows.
        """
        if start_id is None:
            batch_idx = np.random.choice(np.arange(self.size), batch_size)
        else:
            stop = min(start_id + batch_size, self.size)
            batch_idx = np.arange(start_id, stop)
        batch_data = self.data[batch_idx]
        return torch.from_numpy(batch_data).float()

In [4]:
# Load the train/test splits and vocabulary from the `20news` data directory
# (path is relative to the notebook's working directory).
corpus = Corpus("../data/20news")

In [5]:
# Seed every random number generator used below. NumPy drives
# `Data.get_batch`; torch drives parameter initialisation, dropout, DataLoader
# shuffling and any sampling done inside the model, so it must be seeded too
# for a run to be repeatable.
RANDOM_SEED = 2112
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

In [6]:
# Training-split count matrix: one row per kept document, one column per vocabulary entry.
X_document = corpus.train.data

In [7]:
# Unpack the matrix shape; `vocab_size` is reused below as the model's input dimension.
num_docs, vocab_size = X_document.shape
print(f"Number of documents: {num_docs}, vocab size: {vocab_size}")

Number of documents: 11258, vocab size: 1995


In [8]:
class Dataset(torch.utils.data.Dataset):
    """Minimal map-style dataset that serves the rows of an indexable container."""

    def __init__(self, X):
        # Stored as-is; no copy or conversion is performed.
        self.X = X

    def __len__(self):
        """Number of items, i.e. ``len`` of the wrapped container."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the item stored at position ``index``."""
        return self.X[index]

In [9]:
# Wrap the count matrix as a float32 tensor so the DataLoader can batch it.
X_document_tensor = torch.tensor(X_document).to(torch.float32)
dataset = Dataset(X_document_tensor)

In [10]:
num_topics = 10
prodlda = False
conv = False
num_layers = 0 # num hidden layers in the NN block. There are at least (1+num_heads) hidden layers in the encoder and decoder. num_layers comes on top of those. Total number of hidden layers is (1+num_layers+num_heads).
num_neurons = 500
dropout = True
dropout_rate = 0.5
batch_normalization = False
prior_params = {'alpha': 1.0}
decoder_temperature = 1.0
encoder_temperature = 1.0

In [11]:
# Build the model from the configuration above. The input dimension is the
# vocabulary size; note that `num_layers` is passed as `num_hidden_layers`.
model = InductiveLDA(input_dim=vocab_size,
                    num_topics=num_topics,
                    prior_params=prior_params,
                    conv=conv,
                    prodlda=prodlda,
                    decoder_temperature=decoder_temperature,
                    encoder_temperature=encoder_temperature,
                    num_hidden_layers=num_layers,
                    num_neurons=num_neurons,
                    dropout=dropout,
                    dropout_rate=dropout_rate,
                    batch_normalization=batch_normalization,
                    )

In [12]:
# Optimisation settings consumed by `model.fit` below.
lr = 2e-3
batch_size = 200
num_epochs = 1000
# Passed to `model.fit` as `beta`; presumably the weight on the KL term of the
# ELBO (TODO confirm against lib.models). Not to be confused with the
# topic-word matrix returned by `model.decoder.get_beta()`.
beta = 1.0
learn_prior = False

In [13]:
# Train the model with the settings defined above. Mini-batches of
# `batch_size` documents are reshuffled every epoch; one Monte Carlo sample is
# requested per step and TensorBoard logging is switched off.
# NOTE: the full `num_epochs` run is long (the log below shows ~2.3 s/epoch).
model.fit(lr=lr,
            dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True),
            epochs = num_epochs,
            beta = beta,
            mc_samples = 1,
            learn_prior = learn_prior,
            tensorboard = False,
            )

  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
                                                       
Epoch:   0%|          | 2/1000 [00:05<36:17,  2.18s/it]            

Iteration: 100 -- ELBO=-6.72e+02 / RLL=-6.65e+02 / KL=6.39e+00


                                                       
Epoch:   0%|          | 4/1000 [00:10<42:49,  2.58s/it]            

Iteration: 200 -- ELBO=-6.01e+02 / RLL=-5.95e+02 / KL=6.62e+00


                                                       
Epoch:   1%|          | 6/1000 [00:14<39:28,  2.38s/it]            

Iteration: 300 -- ELBO=-6.48e+02 / RLL=-6.41e+02 / KL=6.65e+00


                                                       
Epoch:   1%|          | 8/1000 [00:18<39:06,  2.37s/it]   

Iteration: 400 -- ELBO=-6.40e+02 / RLL=-6.33e+02 / KL=6.62e+00


                                                       
Epoch:   1%|          | 9/1000 [00:22<38:33,  2.33s/it]            

Iteration: 500 -- ELBO=-6.09e+02 / RLL=-6.03e+02 / KL=6.34e+00


                                                        
Epoch:   1%|          | 11/1000 [00:26<37:55,  2.30s/it]           

Iteration: 600 -- ELBO=-5.78e+02 / RLL=-5.71e+02 / KL=6.70e+00


                                                        
Epoch:   1%|▏         | 13/1000 [00:30<37:30,  2.28s/it]           

Iteration: 700 -- ELBO=-7.72e+02 / RLL=-7.65e+02 / KL=7.05e+00


                                                        
Epoch:   2%|▏         | 15/1000 [00:34<38:01,  2.32s/it]  

Iteration: 800 -- ELBO=-6.43e+02 / RLL=-6.36e+02 / KL=7.06e+00


                                                        
Epoch:   2%|▏         | 16/1000 [00:38<38:02,  2.32s/it]           

Iteration: 900 -- ELBO=-6.05e+02 / RLL=-5.98e+02 / KL=7.04e+00


                                                        
Epoch:   2%|▏         | 18/1000 [00:42<37:21,  2.28s/it]           

Iteration: 1000 -- ELBO=-5.06e+02 / RLL=-4.99e+02 / KL=7.11e+00


                                                        
Epoch:   2%|▏         | 20/1000 [00:46<37:05,  2.27s/it]           

Iteration: 1100 -- ELBO=-7.86e+02 / RLL=-7.79e+02 / KL=7.82e+00


                                                        
Epoch:   2%|▏         | 22/1000 [00:50<36:50,  2.26s/it]  

Iteration: 1200 -- ELBO=-4.50e+02 / RLL=-4.43e+02 / KL=7.07e+00


                                                        
Epoch:   2%|▏         | 23/1000 [00:54<37:24,  2.30s/it]           

Iteration: 1300 -- ELBO=-6.42e+02 / RLL=-6.34e+02 / KL=7.78e+00


                                                        
Epoch:   2%|▎         | 25/1000 [00:58<36:56,  2.27s/it]           

Iteration: 1400 -- ELBO=-5.89e+02 / RLL=-5.81e+02 / KL=7.57e+00


                                                        
Epoch:   3%|▎         | 27/1000 [01:02<36:37,  2.26s/it]           

Iteration: 1500 -- ELBO=-5.33e+02 / RLL=-5.26e+02 / KL=7.52e+00


                                                        
Epoch:   3%|▎         | 29/1000 [01:06<36:31,  2.26s/it]          

Iteration: 1600 -- ELBO=-5.00e+02 / RLL=-4.93e+02 / KL=7.60e+00


                                                        
Epoch:   3%|▎         | 30/1000 [01:10<36:23,  2.25s/it]           

Iteration: 1700 -- ELBO=-7.31e+02 / RLL=-7.23e+02 / KL=7.83e+00


                                                        
Epoch:   3%|▎         | 32/1000 [01:14<36:14,  2.25s/it]           

Iteration: 1800 -- ELBO=-4.84e+02 / RLL=-4.76e+02 / KL=7.43e+00


                                                        
Epoch:   3%|▎         | 34/1000 [01:18<36:14,  2.25s/it]           

Iteration: 1900 -- ELBO=-6.85e+02 / RLL=-6.76e+02 / KL=8.55e+00


                                                        
Epoch:   4%|▎         | 36/1000 [01:22<36:20,  2.26s/it]          

Iteration: 2000 -- ELBO=-5.88e+02 / RLL=-5.80e+02 / KL=8.21e+00


                                                        
Epoch:   4%|▎         | 37/1000 [01:26<36:17,  2.26s/it]           

Iteration: 2100 -- ELBO=-5.09e+02 / RLL=-5.01e+02 / KL=8.01e+00


                                                        
Epoch:   4%|▍         | 39/1000 [01:30<36:02,  2.25s/it]           

Iteration: 2200 -- ELBO=-6.18e+02 / RLL=-6.09e+02 / KL=8.79e+00


                                                        
Epoch:   4%|▍         | 41/1000 [01:34<35:41,  2.23s/it]           

Iteration: 2300 -- ELBO=-5.36e+02 / RLL=-5.28e+02 / KL=8.16e+00


                                                        
Epoch:   4%|▍         | 43/1000 [01:38<35:38,  2.23s/it]          

Iteration: 2400 -- ELBO=-6.12e+02 / RLL=-6.03e+02 / KL=8.65e+00


                                                        
Epoch:   4%|▍         | 44/1000 [01:42<35:47,  2.25s/it]           

Iteration: 2500 -- ELBO=-6.15e+02 / RLL=-6.06e+02 / KL=8.84e+00


                                                        
Epoch:   5%|▍         | 46/1000 [01:45<35:45,  2.25s/it]           

Iteration: 2600 -- ELBO=-6.46e+02 / RLL=-6.37e+02 / KL=9.20e+00


                                                        
Epoch:   5%|▍         | 48/1000 [01:49<35:49,  2.26s/it]           

Iteration: 2700 -- ELBO=-5.36e+02 / RLL=-5.27e+02 / KL=8.89e+00


                                                        
Epoch:   5%|▌         | 50/1000 [01:54<37:08,  2.35s/it]          

Iteration: 2800 -- ELBO=-5.66e+02 / RLL=-5.57e+02 / KL=8.66e+00


                                                        
Epoch:   5%|▌         | 51/1000 [01:58<38:09,  2.41s/it]           

Iteration: 2900 -- ELBO=-5.15e+02 / RLL=-5.06e+02 / KL=8.70e+00


                                                        
Epoch:   5%|▌         | 53/1000 [02:02<36:47,  2.33s/it]           

Iteration: 3000 -- ELBO=-4.60e+02 / RLL=-4.52e+02 / KL=8.26e+00


                                                        
Epoch:   6%|▌         | 55/1000 [02:06<36:05,  2.29s/it]           

Iteration: 3100 -- ELBO=-7.25e+02 / RLL=-7.16e+02 / KL=9.32e+00


Epoch:   6%|▌         | 56/1000 [02:07<35:54,  2.28s/it]

KeyboardInterrupt: 

In [27]:
# Inspect the decoder's topic-word matrix; `.detach()` returns a tensor cut off from the autograd graph.
model.decoder.get_beta().detach()

tensor([[2.1380e-05, 6.7945e-03, 1.6753e-02,  ..., 2.5484e-05, 1.0430e-05,
         4.0732e-05],
        [2.7504e-05, 7.4103e-02, 8.7654e-03,  ..., 1.4902e-04, 3.5682e-05,
         3.6110e-04],
        [3.9541e-05, 3.7813e-03, 1.8193e-02,  ..., 1.9339e-05, 6.6842e-06,
         1.0731e-05],
        ...,
        [4.8998e-05, 6.4741e-03, 6.7097e-03,  ..., 8.4822e-06, 2.7553e-04,
         8.8048e-05],
        [3.1692e-05, 8.1897e-03, 5.0928e-03,  ..., 6.1062e-06, 5.5307e-05,
         7.3982e-06],
        [3.8582e-06, 2.5370e-03, 3.7577e-03,  ..., 2.7722e-05, 1.8260e-05,
         3.8726e-05]])

In [29]:
# Display the prior's `alpha` after conversion by `utils.to_alpha`
# (with learn_prior=False it is presumably unchanged by training — TODO confirm).
utils.to_alpha(model.prior_params["alpha"])

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [30]:
# Switch the model to evaluation mode before inference (assuming InductiveLDA is a
# torch.nn.Module, this deactivates the Dropout layers listed in the repr below).
model.eval()

InductiveLDA(
  (encoder): DirichletNN(
    (parameterizer): ParameterizerNN(
      (block_dict): ModuleDict(
        (input): NNBlock(
          (input_layer): Sequential(
            (0): Linear(in_features=1995, out_features=500, bias=True)
            (1): ELU(alpha=1.0)
            (2): Dropout(p=0.5, inplace=False)
          )
          (middle_layers): ModuleList()
          (output_layer): Sequential(
            (0): Linear(in_features=500, out_features=500, bias=True)
            (1): ELU(alpha=1.0)
            (2): Dropout(p=0.5, inplace=False)
          )
        )
        (alpha): NNBlock(
          (input_layer): Sequential(
            (0): Linear(in_features=500, out_features=500, bias=True)
            (1): ELU(alpha=1.0)
          )
          (middle_layers): ModuleList()
          (output_layer): Sequential(
            (0): Linear(in_features=500, out_features=10, bias=True)
          )
        )
      )
    )
  )
  (decoder): BetaDecoder()
)

In [None]:
# Inference only: run the forward pass over the whole training matrix without
# recording an autograd graph, which would otherwise hold every intermediate
# activation for all documents in memory.
with torch.no_grad():
    _, theta_dict = model(torch.tensor(X_document).to(torch.float32))

# Encoder output parameters, transposed so that documents index the columns
# (assumes `alpha` comes back as (num_docs, num_topics) — TODO confirm).
gamma_document = theta_dict['params']['alpha'].detach().T

# Normalise each column to sum to one (the mean of a Dirichlet with these
# concentration parameters).
gamma_document_mean = gamma_document / gamma_document.sum(dim=0, keepdim=True)

: 

In [31]:
from gensim.models import CoherenceModel
from gensim.corpora.dictionary import Dictionary
import gensim.corpora as corpora

In [32]:
# Decode the training documents: invert the vocabulary (word -> id becomes
# id -> word) and map every token id of every document back to its word.
id2word = dict((idx, word) for word, idx in corpus.vocab.items())
texts = [list(map(id2word.__getitem__, doc)) for doc in corpus.train.documents]

In [34]:
# Topic-word matrix as a NumPy array (one row per topic), detached from autograd first.
Beta = model.decoder.get_beta().detach().numpy()

In [35]:
# Number of highest-weight words to report for each topic.
top_k = 10

result = {"topic-word-matrix": Beta}

if top_k > 0:
    # argsort is ascending, so take the last `top_k` indices and flip them to
    # list each topic's words from highest to lowest weight.
    topics_output = [
        [id2word[i] for i in np.argsort(topic)[-top_k:][::-1]]
        for topic in result["topic-word-matrix"]
    ]
    result["topics"] = topics_output

In [36]:
# Show the top words of every topic, highest weight first.
result["topics"]

[['get', 'like', 'think', 'go', 'one', 'know', 'say', 'see', 'thing', 'time'],
 ['write',
  'article',
  'good',
  'like',
  'know',
  'car',
  'post',
  'get',
  'one',
  'never'],
 ['one',
  'university',
  'work',
  'look',
  'cost',
  'money',
  'bill',
  'report',
  'also',
  'use'],
 ['game', 'year', 'go', 'team', 'play', 'win', 'get', 'last', 'run', 'one'],
 ['use',
  'thanks',
  'card',
  'anyone',
  'drive',
  'work',
  'please',
  'know',
  'software',
  'system'],
 ['new',
  'include',
  'send',
  'list',
  'line',
  'address',
  'information',
  'may',
  'email',
  'group'],
 ['god',
  'say',
  'one',
  'people',
  'believe',
  'write',
  'know',
  'christian',
  'think',
  'make'],
 ['use',
  'file',
  'program',
  'window',
  'run',
  'also',
  'system',
  'version',
  'like',
  'windows'],
 ['key',
  'law',
  'use',
  'government',
  'write',
  'chip',
  'make',
  'public',
  'encryption',
  'state'],
 ['people',
  'year',
  'say',
  'kill',
  'war',
  'israel',
  'armen

In [37]:
# Initialise the NPMI topic-coherence metric.
# `c_npmi` is estimated from sliding windows over the tokenised `texts`, so no
# bag-of-words `corpus` is needed. The dense count matrix previously passed as
# `corpus=` is not a gensim corpus (a list of (token_id, count) pairs) and its
# columns follow `corpus.vocab`, not the ids assigned by `Dictionary(texts)`.
npmi = CoherenceModel(
    topics=result["topics"],
    texts=texts,
    dictionary=Dictionary(texts),
    coherence="c_npmi",
    topn=top_k)

In [38]:
# Compute and display the coherence score for the topics configured above.
npmi.get_coherence()

0.04501569333317622