<a href="https://colab.research.google.com/github/kristynpantoja/math689project/blob/master/pytorchM1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install torch



In [2]:
!pip install torchvision



Model Parameters

In [252]:
# from types import SimpleNamespace

# args_dict = {"batch_size" : 50, "epochs" : 50, "no_cuda" : False, "seed" : 1, "log_interval" : 10}
# args = SimpleNamespace(**args_dict)
# args.epochs

50

Data: 20newsgroups
We get the document-term matrix

In [5]:
from __future__ import print_function
import argparse
import torch
import torch.utils.data
from torch import nn, optim
from torch.nn import functional as F
from torchvision import datasets, transforms
from torchvision.utils import save_image

from types import SimpleNamespace

from sklearn.datasets import fetch_20newsgroups
import numpy as np
import torch

from sklearn.feature_extraction.text import CountVectorizer
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
vectorizer = CountVectorizer(min_df=.01)
count_vecs = vectorizer.fit_transform(newsgroups_train.data)
doc_term_matrix = count_vecs.toarray()
doc_term_matrix.shape

(2034, 2400)

ragged array of words in each document (by index in vocabulary)

In [0]:
def countsToInput(row):
  return np.repeat(np.arange(doc_term_matrix.shape[1]),row)
  
def numWords(row):
  return row.sum()

N_train = np.apply_along_axis(numWords, axis=1, arr=doc_term_matrix)
data_train = []
for d in range(doc_term_matrix.shape[0]):
  data_train.append(torch.from_numpy(countsToInput(doc_term_matrix[d])))

In [7]:
data_train[1030]

tensor([  49,  158,  161,  169,  188,  188,  188,  188,  188,  262,  297,  297,
         297,  322,  371,  398,  493,  644,  707,  707,  721,  721,  855,  889,
         889,  897,  945,  970, 1015, 1029, 1029, 1042, 1060, 1072, 1072, 1072,
        1074, 1114, 1127, 1160, 1167, 1245, 1247, 1252, 1303, 1356, 1371, 1410,
        1410, 1410, 1429, 1434, 1455, 1460, 1460, 1472, 1476, 1497, 1510, 1582,
        1747, 1760, 1768, 1877, 1893, 1895, 1906, 1976, 1990, 2056, 2121, 2124,
        2124, 2124, 2124, 2125, 2125, 2125, 2127, 2131, 2133, 2142, 2164, 2164,
        2164, 2193, 2238, 2298, 2333, 2345, 2373, 2388, 2388])

Setup and packages

In [0]:


# from types import SimpleNamespace


# args_dict = {"batch_size" : 50, "epochs" : 50, "no_cuda" : False, "seed" : 1, "log_interval" : 10}
# args = SimpleNamespace(**args_dict)
# args.cuda = not args.no_cuda and torch.cuda.is_available()


args_dict = {"batch_size" : 50, "epochs" : 50, "no_cuda" : False, "seed" : 1, "log_interval" : 10}
args = SimpleNamespace(**args_dict)






####3


args.cuda = not args.no_cuda and torch.cuda.is_available()

torch.manual_seed(args.seed)

device = torch.device("cuda" if args.cuda else "cpu")

kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.ToTensor()),
    batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.ToTensor()),
    batch_size=args.batch_size, shuffle=True, **kwargs)





Define model

In [0]:


class VAE(nn.Module):
    def __init__(self, num_docs):
        super(VAE, self).__init__()
        
        vocab_size = doc_term_matrix.shape[1]
        wordvec_dim = 25
        K = 4
        self.num_docs = num_docs
        self.word_embedding = nn.Embedding(vocab_size, wordvec_dim)
        self.lin1 = nn.Linear(vocab_size, 10)
        self.mean = nn.Linear(10, 3)
        self.logvar = nn.Linear(10, 3)
        self.lin2 = nn.Linear(3, K)
        self.topicslayer = nn.Linear(wordvec_dim, K)


    def encode(self, x):
        h1 = F.relu(self.lin1(x))
#         h1 = self.lin1(x)
        return self.mean(h1), self.logvar(h1)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5*logvar)
        eps = torch.randn_like(std)
        return eps.mul(std).add_(mu) # this gives x ~ N(mu, var)

      
    def decode(self, z):
        x = self.lin2(z)
        theta = F.softmax(x) # to get theta
        embedding_matrix = self.word_embedding(torch.tensor(np.arange(self.num_docs)))
        word_dot_topic = self.topicslayer(embedding_matrix) # weights corresp to topic vector
        beta = F.softmax(word_dot_topic)
        log_theta_dot_beta = torch.log(torch.mm(theta, torch.transpose(beta, 0, 1)))
        #theta_dot_beta = torch.exp(log_theta_dot_beta - torch.logsumexp(log_theta_dot_beta, dim = 0))
        log_theta_dot_beta_normalized = log_theta_dot_beta - torch.logsumexp(log_theta_dot_beta, dim = 0)
        return log_theta_dot_beta_normalized
        
    def forward(self, doc):
        mu, logvar = self.encode(doc)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar


      


Load training data (separate into batches)

In [0]:
# mnist_train_loader = torch.utils.data.DataLoader(
#     datasets.MNIST('../data', train=True, download=True,
#                    transform=transforms.ToTensor()),
#     batch_size=args.batch_size, shuffle=True, **kwargs)
# # enumerate(train_loader)

train_data = torch.utils.data.TensorDataset(torch.tensor(doc_term_matrix))
train_loader = torch.utils.data.DataLoader(train_data,                                            
    batch_size = args.batch_size, shuffle = True)

instantiate model and define functions for training

In [0]:
model = VAE(doc_term_matrix.shape[1]).to(device) 
      
# optimizer = optim.Adam(model.parameters(), lr=1e-3)
optimizer = optim.RMSprop(model.parameters(), lr = 1e-3)

# Reconstruction + KL divergence losses summed over all elements and batch
def loss_function(log_theta_dot_beta_normalized, x, mu, logvar):
    BCE = log_theta_dot_beta_normalized.sum() # ?
#     print("BCE: " + str(BCE.max()))
    #BCE = F.binary_cross_entropy(recon_x, x.view(-1, 784), reduction='sum')

    # see Appendix B from VAE paper:
    # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
    # https://arxiv.org/abs/1312.6114
    # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    
    # KLD = 0.5 * (1/logvar.exp() + )
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
#     print("KLD: " + str(KLD))
#     print("logvar: " + str(logvar.max()))
#     print("mu: " + str(mu.max()))
    return - BCE + KLD

def train(epoch):
    model.train()
    train_loss = 0
    for batch_idx, data in enumerate(train_loader):
        #data = data.to(device)
        optimizer.zero_grad()
        log_theta_beta, mu, logvar = model(data[0].float())
        loss = loss_function(log_theta_beta, data, mu, logvar)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * data[0].shape[0], len(train_loader.dataset),
                100. * batch_idx / len(train_loader),
                loss.item() / data[0].shape[0]))

    print('====> Epoch: {} Average loss: {:.4f}'.format(
          epoch, train_loss / len(train_loader.dataset)))


# def test(epoch):
#     model.eval()
#     test_loss = 0
#     with torch.no_grad():
#         for i, (data, _) in enumerate(test_loader):
#             data = data.to(device)
#             recon_batch, mu, logvar = model(data)
#             test_loss += loss_function(recon_batch, data, mu, logvar).item()
#             if i == 0:
#                 n = min(data.size(0), 8)
#                 comparison = torch.cat([data[:n],
#                                       recon_batch.view(args.batch_size, 1, 28, 28)[:n]])
# #                 save_image(comparison.cpu(),
# #                          'results/reconstruction_' + str(epoch) + '.png', nrow=n)

#     test_loss /= len(test_loader.dataset)
#     print('====> Test set loss: {:.4f}'.format(test_loss))

# if __name__ == "__main__":
#     for epoch in range(1, args.epochs + 1):
#         train(epoch)
#         test(epoch)
#         with torch.no_grad():
#             sample = torch.randn(64, 20).to(device)
#             sample = model.decode(sample).cpu()
#             save_image(sample.view(64, 1, 28, 28),
#                        'results/sample_' + str(epoch) + '.png')

<generator object Module.parameters at 0x7f9834eb9ca8>

 train the model

In [0]:
#model = VAE(doc_term_matrix.shape[1])#.to(device)

In [35]:
for epoch in range(1, args.epochs + 1):
    train(epoch)



====> Epoch: 1 Average loss: 9380.6761
====> Epoch: 2 Average loss: 9375.1091
====> Epoch: 3 Average loss: 9374.1038
====> Epoch: 4 Average loss: 9373.7083
====> Epoch: 5 Average loss: 9373.5442
====> Epoch: 6 Average loss: 9373.4737
====> Epoch: 7 Average loss: 9373.4449
====> Epoch: 8 Average loss: 9373.4310
====> Epoch: 9 Average loss: 9373.4227
====> Epoch: 10 Average loss: 9373.4190
====> Epoch: 11 Average loss: 9373.4156
====> Epoch: 12 Average loss: 9373.4247
====> Epoch: 13 Average loss: 9373.4175
====> Epoch: 14 Average loss: 9373.4124
====> Epoch: 15 Average loss: 9373.4047
====> Epoch: 16 Average loss: 9373.3942
====> Epoch: 17 Average loss: 9373.4191
====> Epoch: 18 Average loss: 9373.3927
====> Epoch: 19 Average loss: 9373.3927
====> Epoch: 20 Average loss: 9373.3968
====> Epoch: 21 Average loss: 9373.3880
====> Epoch: 22 Average loss: 9373.3896
====> Epoch: 23 Average loss: 9373.4233
====> Epoch: 24 Average loss: 9373.4033
====> Epoch: 25 Average loss: 9373.4011
====> Epo

In [84]:
model = VAE(doc_term_matrix.shape[1]).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
train(epoch)



====> Epoch: 7 Average loss: nan


In [70]:
optimizer.
model.word_embedding.weight
model.lin1.weight
model.lin2.weight
model.topicslayer.weight

Parameter containing:
tensor([[ 0.5191, -0.4505,  0.4671,  0.4755, -0.3381, -0.4420,  0.4226, -0.4525,
          0.3360,  0.4570,  0.3691, -0.2974, -0.4897, -0.4309,  0.4802,  0.3489,
          0.2622, -0.2527,  0.3144,  0.2765, -0.4011,  0.4457, -0.3800, -0.4889,
          0.4484],
        [-0.4174,  0.4940, -0.4468, -0.4230,  0.3201, -0.2856, -0.3884,  0.4389,
          0.0144, -0.4571, -0.3816,  0.4967,  0.4741,  0.2404, -0.3002, -0.4741,
         -0.4107,  0.4685, -0.4682, -0.4351,  0.3360, -0.2891,  0.4288,  0.3620,
         -0.4533],
        [ 0.3202, -0.2215, -0.0019,  0.0865, -0.2934,  0.2802,  0.1807, -0.0014,
         -0.2988,  0.0420,  0.1480,  0.2780, -0.2968, -0.2348,  0.3020,  0.2106,
          0.2236,  0.0830, -0.0792,  0.2026, -0.0954,  0.2134, -0.0432, -0.0955,
          0.2622],
        [ 0.3895, -0.2256,  0.4482,  0.1591, -0.3488,  0.4473,  0.4237,  0.3158,
         -0.4711,  0.2832,  0.3595, -0.2066, -0.1595,  0.3625,  0.4815,  0.2891,
         -0.4965, -0.3233,  0.

In [77]:
train(epoch)



====> Epoch: 4 Average loss: -561390227454822543453852434169856.0000


Get word vectors and topic vectors

In [187]:
model.word_embedding.weight

Parameter containing:
tensor([[ 1.0982, -0.6489,  0.0177,  ...,  0.7344,  0.6163,  0.6307],
        [-0.6461,  0.5238, -0.1179,  ..., -1.1260, -0.5322,  0.7694],
        [ 1.2182,  0.6254,  0.0470,  ...,  1.6700,  0.4856,  2.6060],
        ...,
        [ 0.5594,  1.2776,  1.0770,  ...,  0.8158,  1.5054, -0.4085],
        [ 1.0393, -0.1039,  0.9977,  ...,  1.5477, -0.1037,  0.0440],
        [ 0.7130,  1.8861, -1.6776,  ..., -0.1981,  1.5925, -1.3581]],
       requires_grad=True)

Get topic distributions

In [183]:
unscaled_topics = torch.mm(model.word_embedding(torch.tensor(np.arange(doc_term_matrix.shape[1]))),
         torch.transpose(model.topicslayer.weight, 0, 1))
topic_dist = torch.softmax(unscaled_topics, dim = 0) 
topic_dist.sum(dim = 0)

tensor([1.0000, 1.0000, 1.0000, 1.0000], grad_fn=<SumBackward1>)

This one helped us a lot

In [175]:
#model.encode(torch.LongTensor(doc_term_matrix[0]))
#input = torch.tensor(doc_term_matrix).float()
input = torch.tensor(doc_term_matrix).float()[[0, 1], ]
mu, sigma = model.encode(input)
z = model.reparameterize(mu, sigma)
# model.decode(x, input.shape[0])

x = model.fc3(z)
theta = F.softmax(x) # to get theta
embedding_matrix = model.word_embedding(torch.tensor(np.arange(14)))
word_dot_topic = model.fc4(embedding_matrix) # weights corresp to topic vector
beta = F.softmax(word_dot_topic)
log_theta_dot_beta = torch.log(torch.mm(theta, torch.transpose(beta, 0, 1)))
#theta_dot_beta = torch.exp(log_theta_dot_beta - torch.logsumexp(log_theta_dot_beta, dim = 0))
log_theta_dot_beta_normalized = log_theta_dot_beta - torch.logsumexp(log_theta_dot_beta, dim = 0)
# print(theta.shape)
# print(theta)
# print(embedding_matrix)
# print(word_dot_topic)
print(beta.shape)
print(beta)
print(log_theta_dot_beta)
print(torch.exp(log_theta_dot_beta_normalized))


torch.Size([14, 4])
tensor([[0.3401, 0.2617, 0.1955, 0.2027],
        [0.0803, 0.1339, 0.6496, 0.1362],
        [0.1823, 0.2116, 0.3501, 0.2560],
        [0.2025, 0.2731, 0.2038, 0.3205],
        [0.2845, 0.1723, 0.3624, 0.1809],
        [0.3429, 0.2672, 0.1781, 0.2118],
        [0.3087, 0.0926, 0.4911, 0.1075],
        [0.4425, 0.1420, 0.1383, 0.2771],
        [0.0997, 0.5205, 0.0475, 0.3323],
        [0.2193, 0.2567, 0.1770, 0.3470],
        [0.2371, 0.2123, 0.4225, 0.1282],
        [0.2663, 0.1078, 0.4305, 0.1954],
        [0.2647, 0.2212, 0.3442, 0.1700],
        [0.1429, 0.4773, 0.1642, 0.2157]], grad_fn=<SoftmaxBackward>)
tensor([[-1.3482, -1.6479, -1.4619, -1.3612, -1.4926, -1.3389, -1.6200, -1.4568,
         -1.1139, -1.3719, -1.4637, -1.5954, -1.4334, -1.1614],
        [-1.5109, -1.1875, -1.2897, -1.3329, -1.3813, -1.5150, -1.3666, -1.4759,
         -1.4383, -1.3252, -1.3899, -1.3060, -1.4082, -1.4945]],
       grad_fn=<LogBackward>)
tensor([[0.5406, 0.3869, 0.4571, 0.4929, 0.

  import sys
  # Remove the CWD from sys.path while we load stuff.
