<a href="https://colab.research.google.com/github/kristynpantoja/math689project/blob/master/pytorchM1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
!pip install torch



In [2]:
!pip install torchvision



Model Parameters

In [252]:
# from types import SimpleNamespace

# args_dict = {"batch_size" : 50, "epochs" : 50, "no_cuda" : False, "seed" : 1, "log_interval" : 10}
# args = SimpleNamespace(**args_dict)
# args.epochs

50

Data: 20newsgroups
We get the document-term matrix

In [36]:
from __future__ import print_function
import argparse
import torch
import torch.utils.data
from torch import nn, optim
from torch.nn import functional as F
from torchvision import datasets, transforms
from torchvision.utils import save_image

from types import SimpleNamespace

from sklearn.datasets import fetch_20newsgroups
import numpy as np
import torch

from sklearn.feature_extraction.text import CountVectorizer
# categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
# newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_train = fetch_20newsgroups(subset='train')
vectorizer = CountVectorizer(min_df=.01, max_df=0.05, stop_words = 'english',
                             token_pattern = u'(?ui)\\b[a-z]{3,}\\b')
count_vecs = vectorizer.fit_transform(newsgroups_train.data)
doc_term_matrix = count_vecs.toarray()
doc_term_matrix.shape # number of documents, number of words (in vocab)

(11314, 1544)

In [0]:
vectorizer.get_feature_names()

ragged array of words in each document (by index in vocabulary)

In [0]:
def countsToInput(row):
  return np.repeat(np.arange(doc_term_matrix.shape[1]),row)
  
def numWords(row):
  return row.sum()

N_train = np.apply_along_axis(numWords, axis=1, arr=doc_term_matrix)
data_train = []
for d in range(doc_term_matrix.shape[0]):
  data_train.append(torch.from_numpy(countsToInput(doc_term_matrix[d])))



In [20]:
data_train[1030]

tensor([  27,   65,  160,  160,  161,  161,  240,  266,  323,  339,  416,  427,
         549,  617,  658,  772,  844,  844,  844, 1082, 1083, 1099, 1120, 1120,
        1120, 1120, 1120, 1124, 1165, 1214, 1292, 1297, 1318, 1333, 1449])

Setup and packages

In [0]:


# from types import SimpleNamespace


# args_dict = {"batch_size" : 50, "epochs" : 50, "no_cuda" : False, "seed" : 1, "log_interval" : 10}
# args = SimpleNamespace(**args_dict)
# args.cuda = not args.no_cuda and torch.cuda.is_available()


args_dict = {"batch_size" : 50, "epochs" : 50, "no_cuda" : False, "seed" : 1, "log_interval" : 10}
args = SimpleNamespace(**args_dict)






####3


args.cuda = not args.no_cuda and torch.cuda.is_available()

torch.manual_seed(args.seed)

device = torch.device("cuda" if args.cuda else "cpu")

kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.ToTensor()),
    batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.ToTensor()),
    batch_size=args.batch_size, shuffle=True, **kwargs)





Define model

In [0]:
class VAE(nn.Module):
    def __init__(self, num_docs):
        super(VAE, self).__init__()
        
        vocab_size = doc_term_matrix.shape[1]
        wordvec_dim = 100
        K = 20
        self.num_docs = num_docs
        self.word_embedding = nn.Embedding(vocab_size, wordvec_dim) # decoder
        self.lin1 = nn.Linear(vocab_size, 100) # encoder
        self.mean = nn.Linear(100, 25) # encoder
        self.logvar = nn.Linear(100, 25) # encoder
        self.lin2 = nn.Linear(25, K) # decoder 
        self.topicslayer = nn.Linear(wordvec_dim, K) # decoder
        self.beta = torch.zeros([K, vocab_size], dtype = torch.float32) # decoder

    def encode(self, x):
        h1 = F.relu(self.lin1(x))
#         h1 = self.lin1(x)
        return self.mean(h1), self.logvar(h1)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5*logvar)
        eps = torch.randn_like(std)
        return eps.mul(std).add_(mu) # this gives x ~ N(mu, var)

      
    def decode(self, z):
        x = self.lin2(z)
        theta = F.softmax(x, dim = 1) # to get theta
        word_dot_topic = self.topicslayer(self.word_embedding.weight) # weights corresp to topic vector
        self.beta = F.softmax(word_dot_topic, dim = 0)
        log_theta_dot_beta = torch.log(torch.mm(theta, torch.transpose(self.beta, 0, 1)))
        #theta_dot_beta = torch.exp(log_theta_dot_beta - torch.logsumexp(log_theta_dot_beta, dim = 0))
        log_theta_dot_beta_normalized = log_theta_dot_beta - torch.logsumexp(log_theta_dot_beta, dim = 0)
        return log_theta_dot_beta_normalized
        
    def forward(self, doc):
        mu, logvar = self.encode(doc)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar
        


      


Load training data (separate into batches)

In [0]:
# mnist_train_loader = torch.utils.data.DataLoader(
#     datasets.MNIST('../data', train=True, download=True,
#                    transform=transforms.ToTensor()),
#     batch_size=args.batch_size, shuffle=True, **kwargs)
# # enumerate(train_loader)

train_data = torch.utils.data.TensorDataset(torch.tensor(doc_term_matrix))
train_loader = torch.utils.data.DataLoader(train_data,                                            
    batch_size = args.batch_size, shuffle = True)

instantiate model and define functions for training

In [0]:
model = VAE(doc_term_matrix.shape[1]).to(device) 
      
# optimizer = optim.Adam(model.parameters(), lr=1e-3)
# optimizer = optim.RMSprop(model.parameters(), lr = 1e-3)

# Reconstruction + KL divergence losses summed over all elements and batch
def loss_function(log_theta_dot_beta_normalized, x, mu, logvar):
    BCE = log_theta_dot_beta_normalized.sum() # ?
    #print("BCE: " + str(BCE.max()))
    #BCE = F.binary_cross_entropy(recon_x, x.view(-1, 784), reduction='sum')

    # see Appendix B from VAE paper:
    # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
    # https://arxiv.org/abs/1312.6114
    # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    
    # KLD = 0.5 * (1/logvar.exp() + )
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    #print("KLD: " + str(KLD))
#     print("logvar: " + str(logvar.max()))
#     print("mu: " + str(mu.max()))
    return - BCE + KLD

enc_variables = list(model.lin1.parameters()) + list(model.mean.parameters()) +  list(model.logvar.parameters())
dec_variables = list(model.word_embedding.parameters()) + list(model.lin2.parameters()) + list(model.topicslayer.parameters())


optim_enc = optim.Adam(enc_variables, lr=1e-3)
optim_dec = optim.Adam(dec_variables, lr=1e-3)

def train(epoch):
    model.train()
    train_loss = 0
    for switch in range(0,2):
        if switch == 0:
            print("updating encoder variables")
            optimizer = optim_enc
        else:
            print("updating decoder variables")
            optimizer = optim_dec
        for batch_idx, data in enumerate(train_loader):
            #data = data.to(device)
            optimizer.zero_grad()
            log_theta_beta, mu, logvar = model(data[0].float())
            loss = loss_function(log_theta_beta, data, mu, logvar)
            loss.backward()
            train_loss += loss.item()
            optimizer.step()
            if batch_idx % args.log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * data[0].shape[0], len(train_loader.dataset),
                    100. * batch_idx / len(train_loader),
                    loss.item() / data[0].shape[0]))

        print('====> Epoch: {} Average loss: {:.4f}'.format(
              epoch, train_loss / len(train_loader.dataset)))


# def test(epoch):
#     model.eval()
#     test_loss = 0
#     with torch.no_grad():
#         for i, (data, _) in enumerate(test_loader):
#             data = data.to(device)
#             recon_batch, mu, logvar = model(data)
#             test_loss += loss_function(recon_batch, data, mu, logvar).item()
#             if i == 0:
#                 n = min(data.size(0), 8)
#                 comparison = torch.cat([data[:n],
#                                       recon_batch.view(args.batch_size, 1, 28, 28)[:n]])
# #                 save_image(comparison.cpu(),
# #                          'results/reconstruction_' + str(epoch) + '.png', nrow=n)

#     test_loss /= len(test_loader.dataset)
#     print('====> Test set loss: {:.4f}'.format(test_loss))

# if __name__ == "__main__":
#     for epoch in range(1, args.epochs + 1):
#         train(epoch)
#         test(epoch)
#         with torch.no_grad():
#             sample = torch.randn(64, 20).to(device)
#             sample = model.decode(sample).cpu()
#             save_image(sample.view(64, 1, 28, 28),
#                        'results/sample_' + str(epoch) + '.png')

 train the model

In [0]:
#model = VAE(doc_term_matrix.shape[1])#.to(device)

In [268]:
for epoch in range(1, args.epochs + 1):
    train(epoch)

updating encoder variables
====> Epoch: 1 Average loss: 6041.7642
updating decoder variables
====> Epoch: 1 Average loss: 12080.3936
updating encoder variables
====> Epoch: 2 Average loss: 6037.8031
updating decoder variables
====> Epoch: 2 Average loss: 12075.5474
updating encoder variables
====> Epoch: 3 Average loss: 6037.7357
updating decoder variables
====> Epoch: 3 Average loss: 12075.4682
updating encoder variables
====> Epoch: 4 Average loss: 6037.7323
updating decoder variables
====> Epoch: 4 Average loss: 12075.4651
updating encoder variables
====> Epoch: 5 Average loss: 6037.7307
updating decoder variables
====> Epoch: 5 Average loss: 12075.4610
updating encoder variables
====> Epoch: 6 Average loss: 6037.7304
updating decoder variables
====> Epoch: 6 Average loss: 12075.4609
updating encoder variables


KeyboardInterrupt: ignored

In [269]:
model.beta
_, ind = torch.sort(model.beta, 0)
# ind.numpy()[0:50, 0] - ind.numpy()[0:50, 1]
print(np.array(vectorizer.get_feature_names())[ind.numpy()][0:20, 0])
print(np.array(vectorizer.get_feature_names())[ind.numpy()][0:20, 1])
print(np.array(vectorizer.get_feature_names())[ind.numpy()][0:20, 2])
print(np.array(vectorizer.get_feature_names())[ind.numpy()][0:20, 3])
print(np.array(vectorizer.get_feature_names())[ind.numpy()][0:20, 4])
print(np.array(vectorizer.get_feature_names())[ind.numpy()][0:20, 5])
print(np.array(vectorizer.get_feature_names())[ind.numpy()][0:20, 7])
print(np.array(vectorizer.get_feature_names())[ind.numpy()][0:20, 8])
print(np.array(vectorizer.get_feature_names())[ind.numpy()][0:20, 9])

['operations' 'san' 'price' 'release' 'andy' 'color' 'require' 'described'
 'edge' 'america' 'criminals' 'self' 'feel' 'honest' 'meaning' 'monitor'
 'silver' 'direct' 'suggestions' 'definition']
['san' 'release' 'operations' 'self' 'silver' 'color' 'america'
 'understand' 'require' 'suggestions' 'separate' 'direct' 'occur' 'war'
 'andy' 'friends' 'meaning' 'joseph' 'criminals' 'honest']
['operations' 'san' 'release' 'color' 'self' 'require' 'andy' 'price'
 'america' 'direct' 'described' 'silver' 'edge' 'meaning' 'suggestions'
 'honest' 'feel' 'criminals' 'separate' 'monitor']
['release' 'san' 'operations' 'self' 'edge' 'require' 'color' 'america'
 'suggestions' 'criminals' 'described' 'silver' 'andy' 'monitor' 'war'
 'school' 'direct' 'feel' 'understand' 'separate']
['operations' 'san' 'self' 'require' 'release' 'andy' 'color' 'school'
 'direct' 'america' 'suggestions' 'separate' 'meaning' 'price' 'silver'
 'war' 'edge' 'honest' 'cpu' 'described']
['operations' 'san' 'self' 'release' '

In [13]:
ind[:, 0]

NameError: ignored

In [43]:
torch.tensor((4, 25))

tensor([ 4, 25])

Get word vectors and topic vectors

In [255]:
for i in range(0, 2):
  print(i)

0
1


In [187]:
model.word_embedding.weight

Parameter containing:
tensor([[ 1.0982, -0.6489,  0.0177,  ...,  0.7344,  0.6163,  0.6307],
        [-0.6461,  0.5238, -0.1179,  ..., -1.1260, -0.5322,  0.7694],
        [ 1.2182,  0.6254,  0.0470,  ...,  1.6700,  0.4856,  2.6060],
        ...,
        [ 0.5594,  1.2776,  1.0770,  ...,  0.8158,  1.5054, -0.4085],
        [ 1.0393, -0.1039,  0.9977,  ...,  1.5477, -0.1037,  0.0440],
        [ 0.7130,  1.8861, -1.6776,  ..., -0.1981,  1.5925, -1.3581]],
       requires_grad=True)

Get topic distributions

In [183]:
unscaled_topics = torch.mm(model.word_embedding(torch.tensor(np.arange(doc_term_matrix.shape[1]))),
         torch.transpose(model.topicslayer.weight, 0, 1))
topic_dist = torch.softmax(unscaled_topics, dim = 0) 
topic_dist.sum(dim = 0)

tensor([1.0000, 1.0000, 1.0000, 1.0000], grad_fn=<SumBackward1>)

This one helped us a lot

In [270]:
#model.encode(torch.LongTensor(doc_term_matrix[0]))
#input = torch.tensor(doc_term_matrix).float()
input = torch.tensor(doc_term_matrix).float()[[0, 1], ]
mu, sigma = model.encode(input)
z = model.reparameterize(mu, sigma)
# model.decode(x, input.shape[0])

x = model.fc3(z)
theta = F.softmax(x) # to get theta
embedding_matrix = model.word_embedding(torch.tensor(np.arange(14)))
word_dot_topic = model.fc4(embedding_matrix) # weights corresp to topic vector
beta = F.softmax(word_dot_topic)
log_theta_dot_beta = torch.log(torch.mm(theta, torch.transpose(beta, 0, 1)))
#theta_dot_beta = torch.exp(log_theta_dot_beta - torch.logsumexp(log_theta_dot_beta, dim = 0))
log_theta_dot_beta_normalized = log_theta_dot_beta - torch.logsumexp(log_theta_dot_beta, dim = 0)
# print(theta.shape)
# print(theta)
# print(embedding_matrix)
# print(word_dot_topic)
print(beta.shape)
print(beta)
print(log_theta_dot_beta)
print(torch.exp(log_theta_dot_beta_normalized))


AttributeError: ignored

In [284]:
#model.encode(torch.LongTensor(doc_term_matrix[0]))
#input = torch.tensor(doc_term_matrix).float()
input = torch.tensor(doc_term_matrix).float()[[0, 1], ]
print(input)
mu, sigma = model.encode(input)
z = model.reparameterize(mu, sigma)
print(z)
# model.decode(x, input.shape[0])


# x = model.lin2(z)
# theta = F.softmax(x) # to get theta
# embedding_matrix = model.word_embedding(torch.tensor(np.arange(model.num_docs)))
# word_dot_topic = model.topicslayer(embedding_matrix) # weights corresp to topic vector
# model.beta = F.softmax(word_dot_topic, dim = 0)
# log_theta_dot_beta = torch.log(torch.mm(theta, torch.transpose(model.beta, 0, 1)))
# #theta_dot_beta = torch.exp(log_theta_dot_beta - torch.logsumexp(log_theta_dot_beta, dim = 0))
# log_theta_dot_beta_normalized = log_theta_dot_beta - torch.logsumexp(log_theta_dot_beta, dim = 0)
# print(embedding_matrix.shape) # dim of embedding matrix is 1544 x 100


x = model.lin2(z)
theta = F.softmax(x, 1) # to get theta
print(theta.sum(1))
embedding_matrix = model.word_embedding.weight
print(model.word_embedding(torch.tensor(np.arange(model.num_docs))).shape)
print(embedding_matrix.shape)
word_dot_topic = model.topicslayer(embedding_matrix) # weights corresp to topic vector
model.beta = F.softmax(word_dot_topic, dim = 0)
log_theta_dot_beta = torch.log(torch.mm(theta, torch.transpose(model.beta, 0, 1)))
#theta_dot_beta = torch.exp(log_theta_dot_beta - torch.logsumexp(log_theta_dot_beta, dim = 0))
log_theta_dot_beta_normalized = log_theta_dot_beta - torch.logsumexp(log_theta_dot_beta, dim = 0)
# print(embedding_matrix.shape) # dim of embedding matrix is still 1544 x 100


tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
tensor([[-0.3542, -0.9824,  1.6668,  0.2909, -1.7727,  1.6447, -0.0335, -0.3574,
         -1.0229,  0.9524,  0.7662, -0.0573,  0.7221, -0.9453,  0.6689, -0.3142,
          1.1116, -0.5060, -0.5279,  0.1749,  0.5559, -0.7530, -0.2019, -0.1538,
          1.5552],
        [ 1.1045,  0.5738, -1.3214,  0.2868,  0.1640, -0.9203,  0.2256, -1.9079,
         -0.8485, -0.1582,  0.7142, -0.5238,  0.4466, -0.1109,  0.0060,  0.5935,
         -2.4257, -1.1120, -0.1708, -0.0022, -3.3380, -0.6230, -0.6414,  2.1400,
          0.5324]], grad_fn=<ThAddBackward>)
tensor([1.0000, 1.0000], grad_fn=<SumBackward1>)
torch.Size([1544, 100])
torch.Size([1544, 100])
