In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import Parameter
import torch.nn.functional as F
import math

import TopicVAE

from sklearn.datasets import fetch_20newsgroups
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

import argparse
from types import SimpleNamespace

import gensim.downloader as api
from gensim.models import Word2Vec, FastText, KeyedVectors
from os.path import isfile

import tools

import random
random.seed(1234)

import pandas as pd

Import the data (20NewsGroups) and make the doc-term matrix, which is the input to all of our models

In [2]:
newsgroups_train = fetch_20newsgroups(subset='train')

vectorizer = CountVectorizer(stop_words = 'english', min_df=.01, max_df=0.9, 
                             token_pattern = u'(?ui)\\b[a-z]{3,}\\b')
count_vecs = vectorizer.fit_transform(newsgroups_train.data)
doc_term_matrix = count_vecs.toarray()
doc_term_matrix.shape # number of documents, number of words (in vocab)

# note: vectorizer.get_feature_names() != vectorizer.vocabulary_

doc_term_matrix_tensor = torch.from_numpy(doc_term_matrix).float()

args_dict = {"en1_units" : 100, "en2_units" : 100, "num_topic" : 50, 
             "batch_size" : 200, "optimizer" : 80, "learning_rate" : 0.002, 
             "momentum" : 0.99, "num_epoch" : 80, "init_mult" : 1, 
             "variance" : 0.995, "start" : True, "nogpu" : True, 
             "embedding_dim" : 300, "freeze" : False}
args = SimpleNamespace(**args_dict)
args.num_input = doc_term_matrix_tensor.shape[1]

In [3]:
newsgroups_test = fetch_20newsgroups(subset='test')

count_vecs_test = vectorizer.transform(newsgroups_test.data)
doc_term_matrix_test = count_vecs_test.toarray()

# note: vectorizer.get_feature_names() != vectorizer.vocabulary_

doc_term_tensor_test = torch.from_numpy(doc_term_matrix_test).float()

# Experiments

## Getting Pretrained Vectors (20NewsGroups)

In [4]:
### make input to language models (word2vec, fasttext, etc.) ###

# we would do some more preprocessing later
newsgroups_train_preproc = []
for document in newsgroups_train.data:
    newsgroups_train_preproc.append(document.split())
    
# dict_word_freq = dict(zip(vectorizer.get_feature_names(), list(doc_term_matrix.sum(0))))

### Word2Vec: Skip-Gram

In [5]:
### make language model using word2vec ###

w2v = Word2Vec(sg=1, negative=5, size=300, window=10, min_count=1, max_vocab_size=None, seed=1, workers=1)
lm_w2v_20newsgroups = tools.create_language_model("lm_w2v_20newsgroups", w2v, doc_term_matrix,
                                            vectorizer.get_feature_names(), 
                                            sentences = newsgroups_train_preproc)

### get embedding matrix for word2vec language model trained on 20newsgroups ###
embedding_matrix_w2v_20newsgroups = tools.create_embedding_matrix(lm_w2v_20newsgroups, 
                                                                  vectorizer.get_feature_names())


### FastText: Skip-Gram

In [6]:
fasttext = FastText(sg=1, negative=5,size=300, window=10, min_count=1, max_vocab_size=None, seed=1, workers=1)
lm_fasttext_20newsgroups = tools.create_language_model("lm_fasttext_20newsgroups", fasttext, doc_term_matrix,
                                                       vectorizer.get_feature_names(), 
                                                       sentences = newsgroups_train_preproc)

### get embedding matrix for word2vec language model trained on 20newsgroups ###
embedding_matrix_fasttext_20newsgroups = tools.create_embedding_matrix(lm_fasttext_20newsgroups, 
                                                                       vectorizer.get_feature_names())


### Word2Vec: CBOW

In [7]:
### make language model using word2vec ###

w2v_cbow = Word2Vec(sg=0, negative=5, size=300, window=10, min_count=1, max_vocab_size=None, seed=1, workers=1)
lm_w2v_cbow_20newsgroups = tools.create_language_model("lm_w2v_cbow_20newsgroups", w2v_cbow, doc_term_matrix,
                                                       vectorizer.get_feature_names(), sentences = newsgroups_train_preproc)

### get embedding matrix for word2vec language model trained on 20newsgroups ###
embedding_matrix_w2v_cbow_20newsgroups = tools.create_embedding_matrix(lm_w2v_cbow_20newsgroups, 
                                                                  vectorizer.get_feature_names())


### FastText: CBOW

In [8]:
fasttext_cbow = FastText(sg=0, negative=5,size=300, window=10, min_count=1, max_vocab_size=None, seed=1, workers=1)
lm_fasttext_cbow_20newsgroups = tools.create_language_model("lm_fasttext_cbow_20newsgroups", fasttext_cbow,
                                                            doc_term_matrix, vectorizer.get_feature_names(), 
                                                            sentences = newsgroups_train_preproc)

### get embedding matrix for word2vec language model trained on 20newsgroups ###
embedding_matrix_fasttext_cbow_20newsgroups = tools.create_embedding_matrix(lm_fasttext_cbow_20newsgroups, 
                                                                       vectorizer.get_feature_names())


## Getting Pretrained Vectors (trained on outside)

### FastText: from Wiki

In [9]:
# pretrained_language_model = api.load("fasttext-wiki-news-subwords-300")
# pretrained_language_model.save("fasttext-wiki-news-subwords-300")

lm_fasttext_wiki = KeyedVectors.load("fasttext-wiki-news-subwords-300")

embedding_matrix_fasttext_wiki = np.random.randn(len(vectorizer.get_feature_names()), 300)
iterator = 0
for word in vectorizer.get_feature_names():
    if word in lm_fasttext_wiki.wv.vocab:
        embedding_matrix_fasttext_wiki[iterator] = lm_fasttext_wiki.wv.word_vec(word)
    else:
        continue
        # embedding_matrix2[iterator] = pretrained_language_model.wv.most_similar(word)
        # or something like that
    iterator += 1




### Word2Vec: from ???

# Create Models

This isn't actually the method described in Miao et. al., since the encoder is different (it's not MLP) - however, the decoder is (I think) the same.

In [10]:
#### Want several models

n = 5

GSMLDA_w2v_20news = []
GSMLDA_fasttext_20news = []
GSMLDA_w2v_cbow_20news = []
GSMLDA_fasttext_cbow_20news = []
GSMLDA_fasttext_wiki = []
GSMLDA = []
NVLDA = []


for i in range(n):
    random.seed(1234 + i)
    GSMLDA_w2v_20news_model = TopicVAE.GSMLDA(args, embedding_matrix_w2v_20newsgroups)
    GSMLDA_fasttext_20news_model = TopicVAE.GSMLDA(args, embedding_matrix_fasttext_20newsgroups)
    GSMLDA_w2v_cbow_20news_model = TopicVAE.GSMLDA(args, embedding_matrix_w2v_cbow_20newsgroups)
    GSMLDA_fasttext_cbow_20news_model = TopicVAE.GSMLDA(args, embedding_matrix_fasttext_cbow_20newsgroups)
    GSMLDA_fasttext_wiki_model = TopicVAE.GSMLDA(args, embedding_matrix_fasttext_wiki)
    GSMLDA_model = TopicVAE.GSMLDA(args)
    NVLDA_model = TopicVAE.LDA(args)
    
    GSMLDA_w2v_20news.append(tools.create_TopicVAE_model("GSMLDA_w2v_20news_" + str(i), 
                                                         GSMLDA_w2v_20news_model, args, doc_term_matrix_tensor))
    GSMLDA_fasttext_20news.append(tools.create_TopicVAE_model("GSMLDA_fasttext_20news_" + str(i), 
                                                              GSMLDA_fasttext_20news_model, args, doc_term_matrix_tensor))
    GSMLDA_w2v_cbow_20news.append(tools.create_TopicVAE_model("GSMLDA_w2v_cbow_20news_" + str(i), 
                                                              GSMLDA_w2v_cbow_20news_model, args, doc_term_matrix_tensor))
    GSMLDA_fasttext_cbow_20news.append(tools.create_TopicVAE_model("GSMLDA_fasttext_cbow_20news_" + str(i), 
                                                                   GSMLDA_fasttext_cbow_20news_model, args, doc_term_matrix_tensor))
    GSMLDA_fasttext_wiki.append(tools.create_TopicVAE_model("GSMLDA_fasttext_wiki_" + str(i), 
                                                            GSMLDA_fasttext_wiki_model, args, doc_term_matrix_tensor))
    GSMLDA.append(tools.create_TopicVAE_model("GSMLDA" + str(i), GSMLDA_model, args, doc_term_matrix_tensor))
    NVLDA.append(tools.create_TopicVAE_model("NVLDA" + str(i), NVLDA_model, args, doc_term_matrix_tensor))
    



In [11]:
models = [GSMLDA_w2v_20news, GSMLDA_fasttext_20news, GSMLDA_w2v_cbow_20news, 
          GSMLDA_fasttext_cbow_20news, GSMLDA_fasttext_wiki, GSMLDA, NVLDA]

coherences = []
for model in models:
    coherences.append([tools.topic_coherence_NPMI(sub_model.get_beta(), 20, doc_term_matrix) for sub_model in model])



KeyboardInterrupt: 

In [None]:
coherence_means = [np.mean(coherence) for coherence in coherences]
coherence_ses = [np.std(coherence)/np.sqrt(5) for coherence in coherences]



In [None]:
print(coherence_means)
print(coherence_sds)


# Compare Perplexities

In [12]:

perplexities = []
for model in models:
    perplexities.append([tools.perplexity(sub_model, doc_term_tensor_test) for sub_model in model])
    
[np.mean(perplexity) for perplexity in perplexities]

  p = F.softmax(z)                                                # mixture probability
  p = F.softmax(z)                                                # mixture probability


[868.49884, 870.25653, 871.25244, 875.3202, 2296132600.0, 892.83215, 911.44305]

In [13]:
[np.std(perplexity)/np.sqrt(5) for perplexity in perplexities]

[1.6202070471342904,
 1.5407734242771198,
 1.0327445741654535,
 1.7373556152569092,
 1257643168.408669,
 1.8429923067331657,
 1.5674330569926398]

In [None]:
perplexities = []
for model in models:
    perplexities.append([tools.perplexity(sub_model, doc_term_tensor_test) for sub_model in model])
    
[float(perplexity) for perplexity in perplexities]

## Compare Coherences OLD.

In [None]:
import matplotlib.pyplot as plt

In [None]:
# plt.style.use("seaborn-deep")

x = GSMLDA_without_embedding_coherence
y = GSMLDA2_20newsgroups_coherence

plt.hist([x, y], label = ["without embedding", "with 20newsgroups embedding"])
plt.legend(loc = 'upper right')
plt.show()

# t test

In [None]:
new_word_vecs = dict(zip(vectorizer.get_feature_names(), [model_GSMLDA2.word_embedding.weight[i] for i in range(model_GSMLDA2.word_embedding.weight.shape[0])]))


In [None]:
model_GSMLDA2.word_embedding.weight.detach().numpy()

In [None]:
len(vectorizer.get_feature_names())

In [None]:
len([model_GSMLDA2.word_embedding.weight[i] for i in range(model_GSMLDA2.word_embedding.weight.shape[0])])


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
cos_sim_matrix = cosine_similarity(model_GSMLDA2.word_embedding.weight.detach().numpy(), 
                                   model_GSMLDA2.word_embedding.weight.detach().numpy())

In [None]:
def n_closest_words(word, cos_sim_matrix, n):
    word_index = vectorizer.get_feature_names().index(word)
    close_words_indices = np.argsort(cos_sim_matrix[word_index])[-n:]
    print(close_words_indices)
    return [vectorizer.get_feature_names()[j] for j in close_words_indices]
    

In [None]:
n_closest_words("nasa", cos_sim_matrix, 20)

In [None]:
model_GSMLDA_cos_sim_matrix = cosine_similarity(model_GSMLDA_without_embedding.word_embedding.weight.detach().numpy(), 
                                   model_GSMLDA_without_embedding.word_embedding.weight.detach().numpy())
n_closest_words("amendment", model_GSMLDA_cos_sim_matrix, 20)


In [None]:
lm_20newsgroups.most_similar("nasa")

In [None]:
tools.perplexity(model_GSMLDA, doc_term_tensor_test)

In [None]:
doc_term_tensor_test.shape

In [None]:
doc_term_matrix_tensor.shape

In [None]:
doc_term_matrix_test.shape