<a href="https://colab.research.google.com/github/kristynpantoja/math689project/blob/master/AVITMtoMiao.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch

Collecting torch
[?25l  Downloading https://files.pythonhosted.org/packages/49/0e/e382bcf1a6ae8225f50b99cc26effa2d4cc6d66975ccf3fa9590efcbedce/torch-0.4.1-cp36-cp36m-manylinux1_x86_64.whl (519.5MB)
[K    100% |████████████████████████████████| 519.5MB 23kB/s 
tcmalloc: large alloc 1073750016 bytes == 0x5a25a000 @  0x7f7b19b5e2a4 0x594e17 0x626104 0x51190a 0x4f5277 0x510c78 0x5119bd 0x4f5277 0x4f3338 0x510fb0 0x5119bd 0x4f5277 0x4f3338 0x510fb0 0x5119bd 0x4f5277 0x4f3338 0x510fb0 0x5119bd 0x4f6070 0x510c78 0x5119bd 0x4f5277 0x4f3338 0x510fb0 0x5119bd 0x4f6070 0x4f3338 0x510fb0 0x5119bd 0x4f6070
[?25hInstalling collected packages: torch
Successfully installed torch-0.4.1


In [0]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import Parameter
import torch.nn.functional as F
import math

In [0]:
class ProdLDA(nn.Module):

    def __init__(self, net_arch):
        super(ProdLDA, self).__init__()
        ac = net_arch
        self.net_arch = net_arch
        # encoder
        self.en1_fc     = nn.Linear(ac.num_input, ac.en1_units)             # 1995 -> 100
        self.en2_fc     = nn.Linear(ac.en1_units, ac.en2_units)             # 100  -> 100
        self.en2_drop   = nn.Dropout(0.2)
        self.mean_fc    = nn.Linear(ac.en2_units, ac.num_topic)             # 100  -> 50
        self.mean_bn    = nn.BatchNorm1d(ac.num_topic)                      # bn for mean
        self.logvar_fc  = nn.Linear(ac.en2_units, ac.num_topic)             # 100  -> 50
        self.logvar_bn  = nn.BatchNorm1d(ac.num_topic)                      # bn for logvar
        # z
        self.p_drop     = nn.Dropout(0.2)
        # decoder
        self.decoder    = nn.Linear(ac.num_topic, ac.num_input)             # 50   -> 1995
        self.decoder_bn = nn.BatchNorm1d(ac.num_input)                      # bn for decoder
        # prior mean and variance as constant buffers
        prior_mean   = torch.Tensor(1, ac.num_topic).fill_(0)
        prior_var    = torch.Tensor(1, ac.num_topic).fill_(ac.variance)
        prior_logvar = prior_var.log()
        self.register_buffer('prior_mean',    prior_mean)
        self.register_buffer('prior_var',     prior_var)
        self.register_buffer('prior_logvar',  prior_logvar)
        # initialize decoder weight
        if ac.init_mult != 0:
            #std = 1. / math.sqrt( ac.init_mult * (ac.num_topic + ac.num_input))
            self.decoder.weight.data.uniform_(0, ac.init_mult)
        # remove BN's scale parameters
#         self.logvar_bn .register_parameter('weight', None)
#         self.mean_bn   .register_parameter('weight', None)
#         self.decoder_bn.register_parameter('weight', None)
#         self.decoder_bn.register_parameter('weight', None)


        ###
        self.word_embedding = nn.Embedding(ac.num_input, 50) # decoder
        self.word_embedding_bn = nn.BatchNorm1d(50)
        self.topic_embedding = nn.Embedding(ac.num_topic, 50) # decoder
        self.topic_embedding_bn = nn.BatchNorm1d(50)
        
        self.beta = torch.zeros([ac.num_topic, ac.num_input], dtype = torch.float32) # decoder
        ###

        
    def forward(self, input, compute_loss=False, avg_loss=True):
        # compute posterior
        assert input.shape[1] == doc_term_matrix_tensor.shape[1], "input isn't batch size x vocab size"
        en1 = F.softplus(self.en1_fc(input))                            # en1_fc   output
        en2 = F.softplus(self.en2_fc(en1))                              # encoder2 output
        en2 = self.en2_drop(en2)
        posterior_mean   = self.mean_bn  (self.mean_fc  (en2))          # posterior mean
        posterior_logvar = self.logvar_bn(self.logvar_fc(en2))          # posterior log variance
        posterior_var    = posterior_logvar.exp()
        # take sample
        eps = Variable(input.data.new().resize_as_(posterior_mean.data).normal_()) # noise
        z = posterior_mean + posterior_var.sqrt() * eps                 # reparameterization
        assert z.shape[1] == self.net_arch.num_topic, "hidden variable z (from TR) isn't batch size x num_topic"
        p = F.softmax(z)                                                # mixture probability
        p = self.p_drop(p)
        assert p.shape[1] == self.net_arch.num_topic, "p (theta) isn't same size as z"
        # do reconstruction
        word_vec = self.word_embedding_bn(self.word_embedding.weight)
        topic_vec = self.topic_embedding_bn(self.topic_embedding.weight)
        self.beta = F.softmax(word_vec.mm(topic_vec.t()), dim = 0) # Vx100 times 100xK => beta is VxK
        recon = p.mm(self.beta.t())         # reconstructed distribution over vocabulary
        # p is batchxK so batchxK times KxV => batchxV
        assert input.shape[1] == doc_term_matrix_tensor.shape[1], "output isn't batch size x vocab size"
        
        if compute_loss:
            return recon, self.loss(input, recon, posterior_mean, posterior_logvar, posterior_var, avg_loss)
        else:
            return recon

    def loss(self, input, recon, posterior_mean, posterior_logvar, posterior_var, avg=True):
        # NL
        NL  = -(input * (recon+1e-10).log()).sum(1)
        # KLD, see Section 3.3 of Akash Srivastava and Charles Sutton, 2017, 
        # https://arxiv.org/pdf/1703.01488.pdf
        prior_mean   = Variable(self.prior_mean).expand_as(posterior_mean)
        prior_var    = Variable(self.prior_var).expand_as(posterior_mean)
        prior_logvar = Variable(self.prior_logvar).expand_as(posterior_mean)
        var_division    = posterior_var  / prior_var
        diff            = posterior_mean - prior_mean
        diff_term       = diff * diff / prior_var
        logvar_division = prior_logvar - posterior_logvar
        # put KLD together
        KLD = 0.5 * ( (var_division + diff_term + logvar_division).sum(1) - self.net_arch.num_topic )
        # loss
        loss = (NL + KLD)
        # in traiming mode, return averaged loss. In testing mode, return individual loss
        if avg:
            return loss.mean()
        else:
            return loss
          
def train():
    for epoch in range(args.num_epoch):
        all_indices = torch.randperm(doc_term_matrix_tensor.size(0)).split(args.batch_size)
        loss_epoch = 0.0
        model.train()                   # switch to training mode
        for batch_indices in all_indices:
            if not args.nogpu: batch_indices = batch_indices.cuda()
            input = Variable(doc_term_matrix_tensor[batch_indices])
#             print(batch_indices.shape)
#             print(input.shape)
            recon, loss = model(input, compute_loss=True)
            # optimize
            optimizer.zero_grad()       # clear previous gradients
            loss.backward()             # backprop
            optimizer.step()            # update parameters
            # report
            loss_epoch += loss.data[0]    # add loss to loss_epoch
        if epoch % 5 == 0:
            print('Epoch {}, loss={}'.format(epoch, loss_epoch / len(all_indices)))

In [0]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# categories = ['talk.politics.guns', 'sci.space', 'soc.religion.christian',
#               'misc.forsale', 'rec.sport.baseball', 'comp.sys.mac.hardware']
categories = ['talk.politics.guns', 'sci.space']
# newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_train = fetch_20newsgroups(subset='train')


Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [0]:
vectorizer = CountVectorizer(stop_words = 'english', min_df=.01, max_df=0.9, 
                             token_pattern = u'(?ui)\\b[a-z]{3,}\\b')
count_vecs = vectorizer.fit_transform(newsgroups_train.data)
doc_term_matrix = count_vecs.toarray()
doc_term_matrix.shape # number of documents, number of words (in vocab)
tokenizer = vectorizer.build_tokenizer()

# note: vectorizer.get_feature_names() != vectorizer.vocabulary_

doc_term_matrix_tensor = torch.from_numpy(doc_term_matrix).float()

In [0]:
import argparse
from types import SimpleNamespace

In [0]:
args_dict = {"en1_units" : 100, "en2_units" : 100, "num_topic" : 50, 
             "batch_size" : 200, "optimizer" : 80, "learning_rate" : 0.002, 
             "momentum" : 0.99, "num_epoch" : 80, "init_mult" : 1, 
             "variance" : 0.995, "start" : True, "nogpu" : True}
args = SimpleNamespace(**args_dict)
args.num_input = doc_term_matrix_tensor.shape[1]

In [0]:
model = ProdLDA(args)

In [0]:
optimizer = torch.optim.Adam(model.parameters(), args.learning_rate, betas=(args.momentum, 0.999))

In [31]:
train()



Epoch 0, loss=624.6458129882812
Epoch 5, loss=566.2608032226562
Epoch 10, loss=559.2239379882812
Epoch 15, loss=556.8187866210938
Epoch 20, loss=558.3740844726562
Epoch 25, loss=555.32275390625
Epoch 30, loss=558.042236328125
Epoch 35, loss=553.9308471679688
Epoch 40, loss=555.9872436523438
Epoch 45, loss=551.849365234375
Epoch 50, loss=550.748779296875
Epoch 55, loss=550.667236328125
Epoch 60, loss=549.7816772460938
Epoch 65, loss=552.714111328125
Epoch 70, loss=549.1235961914062
Epoch 75, loss=547.1510009765625


In [0]:
associations = {
    'jesus': ['prophet', 'jesus', 'matthew', 'christ', 'worship', 'church'],
    'comp ': ['floppy', 'windows', 'microsoft', 'monitor', 'workstation', 'macintosh', 
              'printer', 'programmer', 'colormap', 'scsi', 'jpeg', 'compression'],
    'car  ': ['wheel', 'tire'],
    'polit': ['amendment', 'libert', 'regulation', 'president'],
    'crime': ['violent', 'homicide', 'rape'],
    'midea': ['lebanese', 'israel', 'lebanon', 'palest'],
    'sport': ['coach', 'hitter', 'pitch'],
    'gears': ['helmet', 'bike'],
    'nasa ': ['orbit', 'spacecraft'],
}
def identify_topic_in_line(line):
    topics = []
    for topic, keywords in associations.items():
        for word in keywords:
            if word in line:
                topics.append(topic)
                break
    return topics
def print_top_words(beta, feature_names, n_top_words=10):
    print('---------------Printing the Topics------------------')
    for i in range(len(beta)):
        line = " ".join([feature_names[j] 
                            for j in beta[i].argsort()[:-n_top_words - 1:-1]])
        topics = identify_topic_in_line(line)
        print('|'.join(topics))
        print('     {}'.format(line))
    print('---------------End of Topics------------------')

In [0]:
sorted(vectorizer.vocabulary_, key = lambda x: x[1])

In [23]:
emb = model.beta.detach().numpy().T
print("shape of beta is " + str(emb.shape))
print_top_words(emb, vectorizer.get_feature_names(), n_top_words = 20)

shape of beta is (6, 1739)
---------------Printing the Topics------------------
comp 
     edu com article writes host nntp posting university max like just distribution know don reply good windows thanks think computer

     edu com article writes like don just think know people university time does new use posting good way right make

     people don edu god use know like just com think time say said does new right writes information make did

     edu com writes max article host nntp posting university like just distribution reply know good don thanks drive new usa

     edu people com don just use like know think writes time does article new god make say way right did

     people edu com use don like think god know just time does way say said new make article file information
---------------End of Topics------------------


In [0]:
beta = model.decoder.weight.detach()#.softmax(0)
print(beta.sum(0))
print(beta.shape)
_, ind = torch.sort(beta, 0)
print(ind.shape)
# ind.numpy()[0:50, 0] - ind.numpy()[0:50, 1]
print(np.array(sorted(vectorizer.get_feature_names(), key = lambda x: x[1]))[ind.numpy()][0:25, 0])
print(np.array(sorted(vectorizer.get_feature_names(), key = lambda x: x[1]))[ind.numpy()][0:25, 1])
print(np.array(sorted(vectorizer.get_feature_names(), key = lambda x: x[1]))[ind.numpy()][0:25, 2])
print(np.array(sorted(vectorizer.get_feature_names(), key = lambda x: x[1]))[ind.numpy()][0:25, 3])
print(np.array(sorted(vectorizer.get_feature_names(), key = lambda x: x[1]))[ind.numpy()][0:25, 4])
print(np.array(sorted(vectorizer.get_feature_names(), key = lambda x: x[1]))[ind.numpy()][0:25, 5])
print(np.array(sorted(vectorizer.get_feature_names(), key = lambda x: x[1]))[ind.numpy()][0:25, 6])
print(np.array(sorted(vectorizer.get_feature_names(), key = lambda x: x[1]))[ind.numpy()][0:25, 7])
print(np.array(sorted(vectorizer.get_feature_names(), key = lambda x: x[1]))[ind.numpy()][0:25, 8])
print(np.array(sorted(vectorizer.get_feature_names(), key = lambda x: x[1]))[ind.numpy()][0:25, 9])

# print(np.array(vectorizer.get_feature_names())[ind.numpy()][0:25, 4])

tensor([691.0438, 749.8492, 650.2081, 758.7484, 738.9827, 847.6075, 663.5479,
        851.0903, 808.6369, 837.2359, 813.2128, 679.8199, 592.9649, 769.5212,
        881.0038, 834.2672, 849.1828, 854.6403, 863.4843, 860.0076, 687.0235,
        715.2632, 702.8358, 867.0287, 775.7951, 790.4180, 842.8163, 840.6353,
        823.6600, 806.1698, 751.1285, 817.8627, 761.5077, 688.9834, 677.0889,
        675.3782, 781.6658, 829.0858, 625.1631, 790.7856, 818.1710, 876.3959,
        644.7657, 797.1841, 760.3964, 860.3668, 661.1928, 840.3171, 735.9488,
        763.0490])
torch.Size([1739, 50])
torch.Size([1739, 50])
['normal' 'protect' 'kill' 'dave' 'function' 'cost' 'unfortunately'
 'years' 'tell' 'normally' 'answer' 'navy' 'background' 'saw' 'room'
 'keeping' 'uses' 'sent' 'brought' 'wait' 'readers' 'issues' 'members'
 'order' 'steve']
['resources' 'equivalent' 'looking' 'errors' 'notice' 'includes' 'size'
 'lives' 'models' 'project' 'face' 'gateway' 'books' 'possibly' 'options'
 'make' 'earth' '

In [0]:
print(model.decoder.weight)
model.decoder.weight.data.cpu()

Parameter containing:
tensor([[0.6879, 0.2637, 0.0701,  ..., 0.6469, 0.4965, 0.9987],
        [0.2294, 0.0525, 0.0407,  ..., 0.2914, 0.8304, 0.1538],
        [0.1853, 0.3867, 0.4497,  ..., 0.9726, 0.5935, 0.1058],
        ...,
        [0.0036, 0.3131, 0.8020,  ..., 0.0704, 0.1168, 0.8991],
        [0.0182, 0.4061, 0.5788,  ..., 0.2914, 0.6085, 0.9265],
        [0.7966, 0.5985, 0.6315,  ..., 0.0715, 0.7860, 0.9757]],
       requires_grad=True)


tensor([[0.6879, 0.2637, 0.0701,  ..., 0.6469, 0.4965, 0.9987],
        [0.2294, 0.0525, 0.0407,  ..., 0.2914, 0.8304, 0.1538],
        [0.1853, 0.3867, 0.4497,  ..., 0.9726, 0.5935, 0.1058],
        ...,
        [0.0036, 0.3131, 0.8020,  ..., 0.0704, 0.1168, 0.8991],
        [0.0182, 0.4061, 0.5788,  ..., 0.2914, 0.6085, 0.9265],
        [0.7966, 0.5985, 0.6315,  ..., 0.0715, 0.7860, 0.9757]])

In [0]:
from itertools import product

def topic_coherence(beta, M, doc_term_matrix):
  K = beta.shape[1] # beta has dim V x K
  coherences = np.zeros(K)
  for t in range(K):
    index = np.argsort(-beta[:, t])[0:M]
    cart_prod = product(list(index), list(index))
    for ind1, ind2 in cart_prod:
      if ind1 == ind2:
        pass
      else:
        d_ind1 = (doc_term_matrix[:, ind1] > 0).sum()
        d_ind12 = ((doc_term_matrix[:, ind1] > 0) & (doc_term_matrix[:, ind2] > 0)).sum()
        coherences[t] += np.log1p(d_ind12) - np.log(d_ind1)

  return coherences

In [21]:
topic_coherence(emb.T, 20, doc_term_matrix)

array([-500.25683263, -394.54622521, -457.73329493, -499.41127946,
       -419.44022283, -485.29432162])

In [0]:
sorted(vectorizer.vocabulary_, key = lambda x: x[1])[0]

'car'

In [0]:
vocab = {"hi": 13, "bye": 2, "hello": 3}
foo = zip(*sorted(vocab.items(), key = lambda x: x[1]))
list(foo)

[('bye', 'hello', 'hi'), (2, 3, 13)]

In [0]:
def print_top_words(beta, feature_names, n_top_words=10):
    print('---------------Printing the Topics------------------')
    for i in range(len(beta)): # for all the rows (words in vocab) in beta,
        line = " ".join([feature_names[j] 
                            for j in beta[i].argsort()[:-n_top_words - 1:-1]])
#         topics = identify_topic_in_line(line)
#         print('|'.join(topics))
        print('     {}'.format(line))
    print('---------------End of Topics------------------')

In [0]:
print_top_words(beta.numpy().T, sorted(vectorizer.vocabulary_, key = lambda x: x[1]))

---------------Printing the Topics------------------
     view bear normal false fear waco arms mind english lot
     bit fact choice circuit budget trouble communication technical jews leads
     away mind happened america fear univ pick looking resources changes
     thread equal christ bet claim mil important greatly facts player
     carried store microsoft race helpful tax methods million england logic
     parts suggest works poster distribution german capable circuit serial peter
     trouble electrical bit fact choice field facts hot approach flame
     christ jesus start images face mil purchase toronto exists mike
     written wants newsgroups gas vice refer discussed carnegie brian faster
     face kept christ turkish studies type bother apart start current
     connection carried early tax blue million school large documentation background
     state trouble past bit recognize driving people fact communication accepted
     serial references athos circuit poster lack happen

['car',
 'park',
 'saw',
 'day',
 'late',
 'early',
 'called',
 'mail',
 'washington',
 'fair',
 'cards',
 'days',
 'base',
 'haven',
 'mac',
 'gave',
 'way',
 'market',
 'machine',
 'maybe',
 'make',
 'taking',
 'machines',
 'daily',
 'dangerous',
 'far',
 'harvard',
 'launch',
 'cambridge',
 'basically',
 'values',
 'lawrence',
 'makes',
 'mass',
 'say',
 'hard',
 'gas',
 'hands',
 'says',
 'easily',
 'later',
 'magazine',
 'faster',
 'range',
 'fast',
 'data',
 'facts',
 'said',
 'hardware',
 'fault',
 'want',
 'david',
 'james',
 'main',
 'saying',
 'case',
 'fall',
 'man',
 'wants',
 'hand',
 'faith',
 'happy',
 'water',
 'packard',
 'east',
 'nasa',
 'vax',
 'mark',
 'based',
 'management',
 'lab',
 'language',
 'major',
 'sale',
 'tape',
 'happened',
 'war',
 'caused',
 'making',
 'save',
 'fact',
 'page',
 'takes',
 'handle',
 'rates',
 'cars',
 'paying',
 'taken',
 'rate',
 'pay',
 'dan',
 'damn',
 'law',
 'calls',
 'handling',
 'happen',
 'navy',
 'california',
 'san',
 'date

In [0]:
np.bincount(np.array([4,6,3,6,8,2,6,78,89,5]))

array([0, 0, 1, 1, 1, 1, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1])

In [0]:
"?".join("bsc")

'b?s?c'

In [0]:
doc_term_matrix.