In [12]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import Parameter
import torch.nn.functional as F
import math

import TopicVAE

from sklearn.datasets import fetch_20newsgroups
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

import argparse
from types import SimpleNamespace

import gensim.downloader as api
from gensim.models import KeyedVectors

In [4]:
# Load the training split of 20 Newsgroups; the raw documents are read from
# `newsgroups_train.data` below.
newsgroups_train = fetch_20newsgroups(subset='train')

# Bag-of-words vectorizer: tokens are runs of three or more letters, English
# stop words are removed, and min_df / max_df bound how rare or common a term
# may be across documents before it is dropped from the vocabulary.
vectorizer = CountVectorizer(
    token_pattern=u'(?ui)\\b[a-z]{3,}\\b',
    stop_words='english',
    max_df=0.9,
    min_df=.01,
)
tokenizer = vectorizer.build_tokenizer()

# Sparse counts first, then a dense array of shape
# (number of documents, number of words in the vocabulary).
count_vecs = vectorizer.fit_transform(newsgroups_train.data)
doc_term_matrix = count_vecs.toarray()

# Note: vectorizer.get_feature_names() is not the same object as
# vectorizer.vocabulary_ -- do not use them interchangeably.

# Float copy of the count matrix for use with torch.
doc_term_matrix_tensor = torch.from_numpy(doc_term_matrix).float()

# Settings collected in one place; exposed as attributes via `args`.
args_dict = {
    "en1_units": 100,
    "en2_units": 100,
    "num_topic": 50,
    "batch_size": 200,
    # NOTE(review): a numeric value for "optimizer" looks unusual (it equals
    # num_epoch) -- confirm what the consumer of `args` expects here.
    "optimizer": 80,
    "learning_rate": 0.002,
    "momentum": 0.99,
    "num_epoch": 80,
    "init_mult": 1,
    "variance": 0.995,
    "start": True,
    "nogpu": True,
}
# num_input is the vocabulary size (second dimension of the doc-term tensor);
# it lives only on `args`, not in `args_dict`.
args = SimpleNamespace(**args_dict, num_input=doc_term_matrix_tensor.shape[1])

In [7]:
# Fetch the pretrained vectors registered as "fasttext-wiki-news-subwords-300"
# through gensim's downloader module (imported above as `api`).
pretrained_language_model = api.load("fasttext-wiki-news-subwords-300")



In [9]:
# Persist the loaded vectors to a local file named after the model, so they can
# be read back with KeyedVectors.load instead of going through the downloader.
pretrained_language_model.save("fasttext-wiki-news-subwords-300")

In [13]:
# Path of the locally saved vectors (the same string that was passed to .save()).
EMBEDDING_FILE = "fasttext-wiki-news-subwords-300"
# Round-trip check: read the saved vectors back from disk.
# NOTE(review): the visible cells below keep using `pretrained_language_model`,
# not this reloaded object -- confirm which one is meant to be used.
try_loading_pretrained_lm = KeyedVectors.load(EMBEDDING_FILE)

In [14]:
# Sanity check of the loaded vectors: show the words ranked most similar to
# "cat" together with their similarity scores.
pretrained_language_model.most_similar("cat")

[('cats', 0.8368596434593201),
 ('housecat', 0.767471194267273),
 ('-cat', 0.7602992057800293),
 ('dog', 0.7502298355102539),
 ('kitten', 0.7480818033218384),
 ('feline', 0.7353992462158203),
 ('super-cat', 0.7305206060409546),
 ('supercat', 0.7163283824920654),
 ('pet', 0.709028422832489),
 ('moggy', 0.7057286500930786)]

In [16]:
# Build the (vocab_size, 300) embedding matrix: row i holds the pretrained
# vector of the i-th CountVectorizer feature (i.e. column i of doc_term_matrix).
#
# Rows start out as standard-normal noise, so words missing from the pretrained
# model keep a random vector. np.random.randn takes the dimensions as separate
# integer arguments, not as a list.
embedding_matrix = np.random.randn(doc_term_matrix.shape[1], 300)

# Newer scikit-learn exposes get_feature_names_out(); older releases only have
# get_feature_names(). Both list the terms in column order.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# enumerate() ties the row index to the word's position in the vocabulary, so
# an out-of-vocabulary word skips only its own row and never shifts the rows
# of the words that follow it.
for row_index, word in enumerate(feature_names):
    # Membership test and item lookup on the vectors object avoid the
    # version-specific `.vocab` / `.wv.word_vec` accessors.
    if word in pretrained_language_model:
        embedding_matrix[row_index] = pretrained_language_model[word]
    # TODO: for out-of-vocabulary words, consider falling back to a nearby
    # word's vector instead of leaving the random initialisation in place.



In [20]:
# Count the rows of embedding_matrix whose entries sum to exactly zero.
# The recorded output below dates from a run where the matrix was initialised
# with np.zeros (per the original note); presumably those rows are vocabulary
# words that never received a pretrained vector -- confirm.
row_totals = embedding_matrix.sum(axis=1)
sum(row_totals == 0)

20