<a href="https://colab.research.google.com/github/kristynpantoja/math689project/blob/master/pytorchM3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install torch

In [0]:
!pip install torchvision

Model Parameters

In [0]:
!pip install gensim

In [0]:
from __future__ import print_function
import argparse
import torch
import torch.utils.data
from torch import nn, optim
from torch.nn import functional as F
from torchvision import datasets, transforms
from torchvision.utils import save_image

from types import SimpleNamespace

from sklearn.datasets import fetch_20newsgroups
import numpy as np
import torch

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.feature_extraction.text import CountVectorizer

from gensim.models import Word2Vec, KeyedVectors

Data: 20newsgroups
We get the document-term matrix

In [0]:
# Two-group subset of 20 Newsgroups. A wider six-group selection is kept
# commented out for reference:
# categories = ['talk.politics.guns', 'sci.space', 'soc.religion.christian',
#               'misc.forsale', 'rec.sport.baseball', 'comp.sys.mac.hardware']
categories = ['talk.politics.guns', 'sci.space']
newsgroups_train = fetch_20newsgroups(categories=categories, subset='train')

# Bag-of-words counts. Tokens are runs of at least three letters a-z
# (case-insensitive pattern); English stop words are removed, and min_df/max_df
# are given as document-frequency proportions.
vectorizer = CountVectorizer(
    token_pattern=u'(?ui)\\b[a-z]{3,}\\b',
    stop_words='english',
    max_df=0.9,
    min_df=.01,
)
count_vecs = vectorizer.fit_transform(newsgroups_train.data)

# Dense matrix: one row per document, one column per vocabulary term.
doc_term_matrix = count_vecs.toarray()
doc_term_matrix.shape  # (number of documents, vocabulary size); not displayed mid-cell
tokenizer = vectorizer.build_tokenizer()

In [0]:
# Display the number of columns of the document-term matrix.
doc_term_matrix.shape[1] # vocab size

In [0]:
# Display how many feature names the fitted vectorizer reports; compare with
# doc_term_matrix.shape[1].
# NOTE(review): newer scikit-learn releases expose this as
# get_feature_names_out() -- confirm against the installed version.
len(vectorizer.get_feature_names())

In [0]:
# Map every vocabulary term to its column sum in the document-term matrix,
# i.e. its total count over all training documents.
dict_word_freq = {
    term: total
    for term, total in zip(vectorizer.get_feature_names(), list(doc_term_matrix.sum(0)))
}

Ragged array of the words in each document (by index in the vocabulary)

In [0]:
# def countsToInput(row):
#   return np.repeat(np.arange(doc_term_matrix.shape[1]),row)
  
# def numWords(row):
#   return row.sum()

# N_train = np.apply_along_axis(numWords, axis=1, arr=doc_term_matrix)
# data_train = []
# for d in range(doc_term_matrix.shape[0]):
#   data_train.append(torch.from_numpy(countsToInput(doc_term_matrix[d])))

In [0]:
from google.colab import drive


## Word2vec

In [0]:
# Split every training document on whitespace to get one token list per document.
# NOTE(review): these tokens keep their case and attached punctuation, while the
# vocabulary below comes from the CountVectorizer token pattern, so many raw
# tokens will not match a vocabulary entry. Consider tokenising with
# vectorizer.build_analyzer() instead -- left unchanged here.
newsgroups_train_preproc = [document.split() for document in newsgroups_train.data]

# Skip-gram (sg=1) model with negative sampling and 100-dimensional vectors.
w2v = Word2Vec(sg=1, negative=5, size=100, window=10, min_count=1, max_vocab_size=None, max_final_vocab=None)
# Use exactly the CountVectorizer vocabulary (with its corpus frequencies)
# instead of scanning the token lists.
# w2v.build_vocab(newsgroups_train_preproc)
w2v.build_vocab_from_freq(word_freq = dict_word_freq)
# Train the model. total_examples must be the number of sentences (documents)
# handed to train(); the vocabulary size was passed here before.
w2v.train(sentences=newsgroups_train_preproc, epochs=10, total_examples=len(newsgroups_train_preproc))
# save the model
# w2v.save("sg_1_M2")

In [0]:
# Number of distinct whitespace tokens across all training documents.
len({token for doc_tokens in newsgroups_train_preproc for token in doc_tokens})

In [0]:
# Display the shape of the Word2Vec model's `syn1neg` array once wrapped in a tensor.
# NOTE(review): presumably the negative-sampling output weights, one row per
# vocabulary word -- confirm against the gensim documentation.
torch.tensor(w2v.syn1neg).shape

In [0]:
# Display the number of entries in the Word2Vec vocabulary.
len(w2v.wv.vocab)

In [0]:
# Check that the Word2Vec vocabulary has as many entries as the document-term
# matrix has columns.
doc_term_matrix.shape[1] == len(w2v.wv.vocab)

In [0]:
# Display the raw `syn1neg` array of the trained Word2Vec model (full dump).
w2v.syn1neg

In [0]:
# Sanity check of the learned vectors: words most similar to "university".
w2v.wv.most_similar("university")

## Setup for Model

In [0]:
# Run configuration, exposed as attributes on `args`.
args_dict = {
    "batch_size": 50,
    "epochs": 30,
    "no_cuda": False,
    "seed": 1,
    "log_interval": 100,
}
args = SimpleNamespace(**args_dict)
# Use the GPU only when one is available and it has not been disabled.
args.cuda = torch.cuda.is_available() and not args.no_cuda

torch.manual_seed(args.seed)

device = torch.device("cuda") if args.cuda else torch.device("cpu")

# Extra DataLoader options that are only passed when running on CUDA.
if args.cuda:
    kwargs = {'num_workers': 1, 'pin_memory': True}
else:
    kwargs = {}

# NOTE(review): these MNIST loaders look unrelated to the newsgroups topic
# model, and `train_loader` is rebuilt from doc_term_matrix further down --
# confirm whether they are still needed.
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.ToTensor()),
    batch_size=args.batch_size,
    shuffle=True,
    **kwargs
)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.ToTensor()),
    batch_size=args.batch_size,
    shuffle=True,
    **kwargs
)





Define model

In [0]:
class VAE(nn.Module):
    """Variational-autoencoder topic model over bag-of-words documents.

    The encoder maps a document's count vector to the mean and log-variance of
    a diagonal Gaussian. The decoder turns a sample from that Gaussian into
    topic proportions `theta` and mixes them with topic-word distributions
    `beta`, which are built from word and topic embeddings.

    Args:
        vocab_size: number of vocabulary terms (length of a document vector).
        num_docs: number of documents; stored only, not used by the layers.
        wordvec_dim: dimension of the word and topic embeddings.
        encoder_hidden: width of the encoder's hidden layer.
        rp_normal_dim: dimension of the Gaussian latent variable.
        num_samples: number of latent samples averaged in `forward`.
        K: number of topics.
    """

    def __init__(self, vocab_size, num_docs, wordvec_dim, encoder_hidden, rp_normal_dim, num_samples, K):
        super(VAE, self).__init__()

        self.vocab_size = vocab_size
        self.num_samples = num_samples
        self.num_docs = num_docs

        # Layers are created in a fixed order so seeded initialisation is stable.
        self.word_embedding = nn.Embedding(vocab_size, wordvec_dim) # decoder
#         self.word_embedding = nn.Embedding.from_pretrained(torch.tensor(w2v.syn1neg), freeze=True)
        self.topic_embedding = nn.Embedding(K, wordvec_dim) # decoder
        self.lin1 = nn.Linear(vocab_size, encoder_hidden) # encoder
        self.mean = nn.Linear(encoder_hidden, rp_normal_dim) # encoder
        self.logvar = nn.Linear(encoder_hidden, rp_normal_dim) # encoder
        self.lin2 = nn.Linear(rp_normal_dim, K) # decoder
        self.dropout = nn.Dropout(p=0.8)

        # Most recent topic-word matrix, overwritten by every decode() call.
        # Initialised as V x K so it has the same layout decode() stores.
        self.beta = torch.zeros([vocab_size, K], dtype = torch.float32) # decoder

    def encode(self, x):
        """Return (mean, log-variance) of the latent Gaussian for a batch of documents."""
        h1 = F.relu(self.lin1(x))
        h2 = self.dropout(h1)
        return self.mean(h2), self.logvar(h2)

    def reparameterize(self, mu, logvar):
        """Draw z ~ N(mu, diag(exp(logvar))) via the reparameterisation trick."""
        std = torch.exp(0.5*logvar) # sigma
        eps = torch.randn_like(std) # standard-normal noise with the shape of std
        # Element-wise scaling is enough because the covariance is diagonal.
        return eps.mul(std).add_(mu)

    def decode(self, z):
        """Return log word probabilities (batch size x V) for latent samples `z`.

        Also stores the current topic-word matrix (V x K) in `self.beta`.
        """
        theta = F.softmax(self.lin2(z), dim = 1) # topic proportions, batch size x K
        # Softmax over dim 0 makes every topic column sum to one over the vocabulary.
        self.beta = F.softmax(torch.mm(self.word_embedding.weight,
                                       torch.transpose(self.topic_embedding.weight, 0, 1)), dim = 0) # V x K
        log_theta_dot_beta = torch.log(torch.mm(theta, torch.transpose(self.beta, 0, 1))) # batch size x V
        return log_theta_dot_beta

    def forward(self, doc):
        """Average the decoded log-probabilities over `num_samples` latent draws.

        Returns:
            (log_p, mu, logvar, topic_embedding_weight), where log_p has shape
            (batch size, vocab_size).
        """
        mu, logvar = self.encode(doc)
        # Size the accumulator from the actual batch and keep it on the
        # encoder output's device/dtype, so partial final batches and GPU
        # runs work.
        log_p = torch.zeros(doc.shape[0], self.vocab_size, dtype=mu.dtype, device=mu.device)
        for _ in range(self.num_samples):
            z = self.reparameterize(mu, logvar)
            log_p = log_p + self.decode(z)
        log_p = log_p / self.num_samples
        return log_p, mu, logvar, self.topic_embedding.weight

Load training data (separate into batches)

In [0]:
# Serve the document-term matrix in shuffled mini-batches. Each batch is a
# one-element list holding a (batch size x vocab size) tensor of word counts.
train_data = torch.utils.data.TensorDataset(torch.tensor(doc_term_matrix))
train_loader = torch.utils.data.DataLoader(
    train_data,
    shuffle=True,
    batch_size=args.batch_size,
)

Instantiate the model and define the loss and training functions

In [0]:
# Two-topic model sized from the document-term matrix (rows = documents,
# columns = vocabulary terms).
model = VAE(
    vocab_size=doc_term_matrix.shape[1],
    num_docs=doc_term_matrix.shape[0],
    wordvec_dim=100,
    encoder_hidden=256,
    rp_normal_dim=75,
    num_samples=1,
    K=2,
).to(device)

# Single optimizer over all parameters.
# NOTE(review): train() picks its own encoder/decoder optimizers, so this one
# looks unused -- confirm.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# optimizer = optim.RMSprop(model.parameters(), lr = 1e-3)

# Reconstruction + KL divergence losses summed over all elements and batch
def loss_function(data, train_loader, log_theta_dot_beta, x, mu, logvar, t, verbose=True):
    """Scaled negative ELBO for one mini-batch of bag-of-words documents.

    Args:
        data: the batch as yielded by the loader; `data[0]` is the count tensor
            and only its first dimension (batch size) is read here.
        train_loader: loader whose `dataset` length gives the corpus size.
        log_theta_dot_beta: log word probabilities, batch size x vocab size.
        x: observed word counts for the batch, either the count tensor itself
            or a list/tuple whose first element is that tensor.
        mu, logvar: mean and log-variance of the approximate posterior.
        t: topic vectors; unused (kept for a disabled topic-diversity
            regulariser that was prototyped here).
        verbose: when True (default), print the individual terms.

    Returns:
        (batch size / corpus size) * (-log-likelihood + KL divergence).
    """
    # Earlier attempts at other scalings (vocab_size / corpus size and its
    # inverse) did not work; see
    # https://vxlabs.com/2017/12/08/variational-autoencoder-in-pytorch-commented-and-annotated/

    counts = x[0] if isinstance(x, (list, tuple)) else x
    counts = counts.to(device=log_theta_dot_beta.device, dtype=log_theta_dot_beta.dtype)

    # KL(q(z|x) || N(0, I)), summed over latent dimensions and the batch.
    KLD = (-0.5) * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

    # Log-likelihood of the observed words: every vocabulary entry's log
    # probability is weighted by how often the word occurs in the document.
    # (Previously all log probabilities were summed regardless of the counts.)
    BCE = (counts * log_theta_dot_beta).sum()

    if verbose:
        print("BCE: " + "{:.2f}".format(float(BCE)))
        print("KLD: " + "{:.2f}".format(float(KLD)))
        print("Loss: " + "{:.2f}".format(float(- BCE + KLD)))

    return data[0].shape[0] * 1.0 / len(train_loader.dataset) * (-BCE + KLD)

  
  


# Parameter groups for the alternating encoder / decoder updates in train().
enc_variables = list(model.lin1.parameters()) + list(model.mean.parameters()) + list(model.logvar.parameters())
dec_variables = list(model.word_embedding.parameters()) + list(model.lin2.parameters()) + list(model.topic_embedding.parameters())

# Nesterov SGD is the optimizer in effect for both groups. The Adam optimizers
# that used to be built first were overwritten immediately and never used, so
# they are kept only as commented alternatives.
# optim_enc = optim.Adam(enc_variables, lr=1e-3)
# optim_dec = optim.Adam(dec_variables, lr=1e-3)
optim_enc = optim.SGD(enc_variables, nesterov=True, lr=1e-4, momentum = .5)
optim_dec = optim.SGD(dec_variables, nesterov=True, lr=1e-4, momentum = .5)

def train(epoch):
    """Run one epoch of alternating optimisation and return its average loss.

    The epoch makes two passes over `train_loader`: the first updates only the
    encoder parameters (`optim_enc`), the second only the decoder parameters
    (`optim_dec`). Previously the function returned from inside the phase loop,
    so the decoder pass never ran.

    Args:
        epoch: epoch number, used only in the progress messages.

    Returns:
        The summed batch losses of both passes divided by
        (2 * number of training documents).
    """
    model.train()
    num_examples = len(train_loader.dataset)
    phases = (
        ("updating encoder variables", optim_enc),
        ("updating decoder variables", optim_dec),
    )
    epoch_loss = 0.0
    for message, optimizer in phases:
        print(message)
        phase_loss = 0.0
        for batch_idx, data in enumerate(train_loader):
            # Counts are stored as integers; the model needs floats on its own device.
            counts = data[0].float().to(device)
            batch = [counts]
            optimizer.zero_grad()
            log_theta_beta, mu, logvar, topic_vecs = model(counts)
            loss = loss_function(batch, train_loader, log_theta_beta, batch, mu, logvar, topic_vecs)
            loss.backward()
            phase_loss += loss.item()
            optimizer.step()
            if batch_idx % args.log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * data[0].shape[0], num_examples,
                    100. * batch_idx / len(train_loader),
                    loss.item() ))

        print('====> Epoch: {} Average loss: {:.4f}'.format(
              epoch, phase_loss / num_examples))
        epoch_loss += phase_loss

    return epoch_loss / (len(phases) * num_examples)


# def test(epoch):
#     model.eval()
#     test_loss = 0
#     with torch.no_grad():
#         for i, (data, _) in enumerate(test_loader):
#             data = data.to(device)
#             recon_batch, mu, logvar = model(data)
#             test_loss += loss_function(recon_batch, data, mu, logvar).item()
#             if i == 0:
#                 n = min(data.size(0), 8)
#                 comparison = torch.cat([data[:n],
#                                       recon_batch.view(args.batch_size, 1, 28, 28)[:n]])
# #                 save_image(comparison.cpu(),
# #                          'results/reconstruction_' + str(epoch) + '.png', nrow=n)

#     test_loss /= len(test_loader.dataset)
#     print('====> Test set loss: {:.4f}'.format(test_loss))

# if __name__ == "__main__":
#     for epoch in range(1, args.epochs + 1):
#         train(epoch)
#         test(epoch)
#         with torch.no_grad():
#             sample = torch.randn(64, 20).to(device)
#             sample = model.decode(sample).cpu()
#             save_image(sample.view(64, 1, 28, 28),
#                        'results/sample_' + str(epoch) + '.png')

Train the model

In [0]:
# Train for args.epochs epochs, recording the value returned by train() each time.
losses = []
for epoch in range(1, args.epochs + 1):
    losses.append(train(epoch))
# Disabled early stopping: quit once the loss changes by less than 1e-2 between epochs.
#     if epoch > 1:
#         if np.abs(losses[epoch-1] - losses[epoch-2]) < 1e-2:
#             break



# Open question: why does the KL term go to 0 (sometimes printed with a negative
# sign even though it is 0)? This occurs when switch is off.
# When switch is on, the KL term is small but never 0.
# A smaller dimension for the normal parameters leads to a smaller KL term.



In [0]:
# Print the 25 highest-probability words of every topic.
# model.beta is V x K after a forward pass, so sorting along dim 0 ranks the
# vocabulary within each topic. The sort must be descending: the default
# ascending order listed the 25 *least* probable words.
vocab_words = np.array(vectorizer.get_feature_names())
_, ind = torch.sort(model.beta.detach().cpu(), 0, descending=True)
top_words = vocab_words[ind.numpy()]
for topic_idx in range(top_words.shape[1]):
    print(top_words[0:25, topic_idx])

##  Stuff

In [0]:
# Display the topic-word matrix most recently stored on the model by decode().
model.beta

In [0]:
# Scratch cell: two identical 2 x 3 float tensors.
# a = torch.randn(100, 128)
a = torch.tensor([[1,2,3], [4,5,6]]).float()
b = torch.tensor([[1,2,3], [4,5,6]]).float()

# F.cosine_similarity(a, b)
# torch.zeros_like(a)
# The result of a.add(b) is not assigned here, and `a` is displayed afterwards.
a.add(b)
a

Get topic distributions

In [0]:
# unscaled_topics = torch.mm(model.word_embedding(torch.tensor(np.arange(doc_term_matrix.shape[1]))),
#          torch.transpose(model.topicslayer.weight, 0, 1))
# topic_dist = torch.softmax(unscaled_topics, dim = 0)
# topic_dist.sum(dim = 0)

This one helped us a lot

In [0]:
# Manual walk-through of encode -> sample -> decode on the first two documents.
# The cell used model.fc3 / model.fc4, which the VAE class does not define.
# It now uses model.lin2 for the topic proportions and the word/topic embedding
# product for beta, mirroring VAE.decode. The softmax dims are explicit, and
# the builtin `input` is no longer shadowed.
input_docs = torch.tensor(doc_term_matrix).float()[[0, 1], ].to(device)
mu, sigma = model.encode(input_docs)  # `sigma` holds the log-variance output
z = model.reparameterize(mu, sigma)

x = model.lin2(z)
theta = F.softmax(x, dim=1) # topic proportions, one row per document
# Embeddings of the first 14 vocabulary words only, to keep the printout small.
embedding_matrix = model.word_embedding(torch.arange(14, device=input_docs.device))
word_dot_topic = torch.mm(embedding_matrix,
                          torch.transpose(model.topic_embedding.weight, 0, 1)) # words x topics
beta = F.softmax(word_dot_topic, dim=0) # normalise over words, as decode() does
log_theta_dot_beta = torch.log(torch.mm(theta, torch.transpose(beta, 0, 1)))
log_theta_dot_beta_normalized = log_theta_dot_beta - torch.logsumexp(log_theta_dot_beta, dim = 0)
print(beta.shape)
print(beta)
print(log_theta_dot_beta)
print(torch.exp(log_theta_dot_beta_normalized))


In [0]:
# Same walk-through over the full vocabulary, storing the result in model.beta.
# The cell called model.topicslayer, which the VAE class does not define, and
# it is replaced by the word/topic embedding product used in VAE.decode. The
# embedding lookup now spans vocab_size indices rather than num_docs, since the
# embedding has one row per vocabulary word.
input_docs = torch.tensor(doc_term_matrix).float()[[0, 1], ].to(device)
print(input_docs)
mu, sigma = model.encode(input_docs)  # `sigma` holds the log-variance output
z = model.reparameterize(mu, sigma)
print(z)

x = model.lin2(z)
theta = F.softmax(x, 1) # topic proportions
print(theta.sum(1)) # every row should sum to 1
embedding_matrix = model.word_embedding.weight
print(model.word_embedding(torch.arange(model.vocab_size, device=embedding_matrix.device)).shape)
print(embedding_matrix.shape)
word_dot_topic = torch.mm(embedding_matrix,
                          torch.transpose(model.topic_embedding.weight, 0, 1)) # V x K
model.beta = F.softmax(word_dot_topic, dim = 0)
log_theta_dot_beta = torch.log(torch.mm(theta, torch.transpose(model.beta, 0, 1)))
log_theta_dot_beta_normalized = log_theta_dot_beta - torch.logsumexp(log_theta_dot_beta, dim = 0)


In [0]:
###########################
# Manual check of the loss terms on a pretend batch of two documents.
###########################

test_input = torch.tensor(doc_term_matrix).float()[[0, 1], ].to(device) # pretend_batch_size = 2
mu, logvar = model.encode(test_input)
# Sample with the log-variance computed just above. The cell used a `sigma`
# variable left over from earlier cells instead.
z = model.reparameterize(mu, logvar)
output = model.decode(z) # log word probabilities, 2 x vocab size

pretend_num_docs = 50
pretend_batch_size = test_input.shape[0]
BCE = pretend_num_docs * 1.0 / pretend_batch_size * output.sum() # we sum the log probabilities
KLD0 = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
print(logvar)
# Abandoned alternative formulations of the KL term:
#KLD1 = -0.5 * torch.sum(2 + torch.sum(torch.cumprod(logvar)) - torch.mm(torch.transpose(mu, 1, 0),mu) - logvar.exp())
#KLD2 = 0.5 * (torch.sum(logvar.exp()) + torch.dot(mu, mu) - 50 - torch.log(torch.cumprod(logvar)))

# print("BCE: " + "{:.2f}".format(float(BCE)))
# print("KLD: " + "{:.2f}".format(float(KLD)))
# print("Loss: " + "{:.2f}".format(float(- BCE + KLD)))

In [0]:
# X'X
# Scratch check: torch.randn_like applied to a zero-dimensional float tensor.
torch.randn_like(torch.tensor(1.), )

# Topic Coherence

In [0]:
from itertools import product

def topic_coherence(beta, M, doc_term_matrix):
  """Co-occurrence coherence score for every topic.

  For each topic, the M words with the largest beta value are taken. Every
  ordered pair of distinct top words (w1, w2) contributes
  log(1 + #docs containing both) - log(#docs containing w1).

  Args:
    beta: array of shape V x K (vocabulary x topics).
    M: number of top words per topic.
    doc_term_matrix: array of word counts, documents x vocabulary.

  Returns:
    Array of length K with one coherence value per topic.
  """
  num_topics = beta.shape[1]
  coherences = np.zeros(num_topics)
  for topic in range(num_topics):
    top_words = list(np.argsort(-beta[:, topic])[0:M])
    for first in top_words:
      for second in top_words:
        if first == second:
          continue
        first_present = doc_term_matrix[:, first] > 0
        both_present = first_present & (doc_term_matrix[:, second] > 0)
        coherences[topic] += np.log1p(both_present.sum()) - np.log(first_present.sum())

  return coherences

In [0]:
# Coherence of every topic based on its 20 top words. The tensor is moved to
# the CPU first because .numpy() is not available on CUDA tensors.
topic_coherence(model.beta.detach().cpu().numpy(), 20, doc_term_matrix)

In [0]:
# Scratch check: torch.zeros_like applied to a one-element integer tensor.
torch.zeros_like(torch.tensor([10]))