# TOPIC2VEC algorithm by using gensim and according to the second hint given by Gordon Mohr.  
(https://groups.google.com/forum/#!topic/gensim/BVu5-pD6910)


1. Vectorization of docs by using CountVectorizer (with or without tfidf) with no lemmatization
2. Latent Dirichlet Allocation 
3. Topic2Vec of the entire dataset (20 NewsGroups)   

It saves:
* the topic2vec model obtained

Changes:
1. Minor changes to be compatible with newer gensim library

In [51]:
# !pip install pyorient

In [2]:
import numpy as np; import pandas as pd; import matplotlib.pyplot as plt
%matplotlib inline
import codecs 
from glob import glob
import os
import pickle
import copy
import pyorient
import ast

In [3]:
from __future__ import print_function
from time import time, sleep
import string
import re
# random
from random import shuffle

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.decomposition import LatentDirichletAllocation

In [4]:
from gensim import corpora, models, similarities

In [5]:
n_top_words = 20

## 1. IMPORTING DOCS FROM 20 NEWSGROUPS DATASET

In [6]:
from sklearn.datasets import fetch_20newsgroups
categories = ['comp.sys.ibm.pc.hardware',
'comp.sys.mac.hardware',
'comp.windows.x',
'rec.sport.baseball',
'rec.sport.hockey',
'sci.med',
'sci.space',
'soc.religion.christian']

n_topics = len(categories)

categories_source = {}

for cat in categories:
    categories_source[cat] = cat.replace('.', '_')

In [7]:
categories_source

{'comp.sys.ibm.pc.hardware': 'comp_sys_ibm_pc_hardware',
 'comp.sys.mac.hardware': 'comp_sys_mac_hardware',
 'comp.windows.x': 'comp_windows_x',
 'rec.sport.baseball': 'rec_sport_baseball',
 'rec.sport.hockey': 'rec_sport_hockey',
 'sci.med': 'sci_med',
 'sci.space': 'sci_space',
 'soc.religion.christian': 'soc_religion_christian'}

In [8]:
newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=categories)

In [9]:
for i,j in categories_source.items():
    print(i,j)

soc.religion.christian soc_religion_christian
comp.sys.mac.hardware comp_sys_mac_hardware
comp.windows.x comp_windows_x
sci.space sci_space
sci.med sci_med
rec.sport.baseball rec_sport_baseball
comp.sys.ibm.pc.hardware comp_sys_ibm_pc_hardware
rec.sport.hockey rec_sport_hockey


#### TOTAL NUMBER OF DOC

In [10]:
n_docs = newsgroups_train.filenames.shape[0]
n_docs

4744

In [11]:
type(newsgroups_train)

sklearn.utils.Bunch

In [12]:
newsgroups_train.filenames

array(['/home/aimladmin/scikit_learn_data/20news_home/20news-bydate-train/sci.space/61065',
       '/home/aimladmin/scikit_learn_data/20news_home/20news-bydate-train/rec.sport.hockey/52618',
       '/home/aimladmin/scikit_learn_data/20news_home/20news-bydate-train/comp.windows.x/67032',
       ...,
       '/home/aimladmin/scikit_learn_data/20news_home/20news-bydate-train/rec.sport.hockey/52576',
       '/home/aimladmin/scikit_learn_data/20news_home/20news-bydate-train/soc.religion.christian/20809',
       '/home/aimladmin/scikit_learn_data/20news_home/20news-bydate-train/soc.religion.christian/20733'],
      dtype='<U96')

# 2. LDA to find the topic most-associated with each word

## 2.1 From Strings to Vectors

### WITH Lemmatization

In [33]:
# Run this once before doing Lemmatization
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')

In [31]:
# Return the casting of the original tag in a single
# character which is accepted by the lemmatizer
import nltk.corpus  # splits on punctuactions   
stop_words = nltk.corpus.stopwords.words('english')

import re
def get_wordnet_pos(treebank_tag):

    # I recognize the initial character of the word, identifying the type
    if treebank_tag.startswith('J'):
        return nltk.corpus.reader.wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return nltk.corpus.reader.wordnet.VERB
    elif treebank_tag.startswith('N'):
        return nltk.corpus.reader.wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return nltk.corpus.reader.wordnet.ADV
    else:
        return None

from nltk import word_tokenize, pos_tag        
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        tokenized_doc = word_tokenize(doc) # splits on punctuactions  
        tagged_doc = pos_tag(tokenized_doc)
        
        lemmatized_doc = []
        # Scan the (word, tag) tuples which are the elements of tagged_tweet1
        for word, tag in tagged_doc:
            ret_value = get_wordnet_pos(tag)
            # If the function does not return None I provide the ret_value
            if ret_value != None:
                lemmatized_doc.append(self.wnl.lemmatize(word, get_wordnet_pos(tag)))
            # If the function returns None I do not provide the ret_value
            else:
                lemmatized_doc.append(self.wnl.lemmatize(word))
        nonumbers_nopunct_lemmatized_doc = [word for word in lemmatized_doc if re.search('[a-zA-Z]{2,}', word)]
#        nonumbers_nopunct_lemmatized_doc = [word for word in nopunct_lemmatized_doc if not re.search('\d+', word)]
        lemmatized_doc_stopw = [word for word in nonumbers_nopunct_lemmatized_doc if word not in stop_words]
        
        return lemmatized_doc_stopw #[self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [38]:
t0 = time()
tf_vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), encoding='utf-8', analyzer='word',
                                stop_words=["'s", "fx"], ngram_range = (1,1), min_df = 2).fit(newsgroups_train.data)
print("fit vectorizer with lemmatization done in %0.3fs." % (time() - t0))

fit vectorizer with lemmatization done in 86.246s.


In [39]:
tf_vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words=["'s", 'fx'],
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<__main__.LemmaTokenizer object at 0x7f2eab3324e0>,
        vocabulary=None)

### WITHOUT Lemmatization

In [13]:
t0 = time()
tf_vectorizer = CountVectorizer(encoding='utf-8', analyzer='word', stop_words='english',
                                ngram_range = (1,1), min_df = 2, token_pattern = '[a-zA-Z]{2,}').fit(newsgroups_train.data)
print("fit vectorizer without lemmatization done in %0.3fs." % (time() - t0))

fit vectorizer without lemmatization done in 0.904s.


In [14]:
CountVectorizer

sklearn.feature_extraction.text.CountVectorizer

In [15]:
tf_vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='[a-zA-Z]{2,}', tokenizer=None,
        vocabulary=None)

### Vectorization

In [16]:
# # get_feature_names() returns vocabulary
n_features = len(tf_vectorizer.get_feature_names())

In [17]:
n_features

19012

In [18]:
[tf_vectorizer.get_feature_names()[:10], tf_vectorizer.get_feature_names()[-10:]]

[['aa',
  'aaa',
  'aardvark',
  'aaron',
  'aas',
  'ab',
  'abandon',
  'abandoned',
  'abbott',
  'abbreviation'],
 ['zoomed',
  'zooming',
  'zorro',
  'zou',
  'zpixmap',
  'zterm',
  'zubov',
  'zupancic',
  'zupcic',
  'zyxel']]

In [19]:
newsgroups_train.data[0]

"Hmmm. I seem to recall that the attraction of solid state record-\nplayers and radios in the 1960s wasn't better performance but lower\nper-unit cost than vacuum-tube systems.\n\n\tMind you, my father was a vacuum-tube fan in the 60s (Switched\nto solid-state in the mid-seventies and then abruptly died; no doubt\nthere's a lesson in that) and his account could have been biased."

In [20]:
tf_docs = tf_vectorizer.transform(newsgroups_train.data)

In [21]:
tf_docs

<4744x19012 sparse matrix of type '<class 'numpy.int64'>'
	with 259893 stored elements in Compressed Sparse Row format>

### WITH TFIDF

tfidf_vectorizer = TfidfTransformer(sublinear_tf=False, use_idf = True).fit(tf_docs)
tfidf_docs = tfidf_vectorizer.transform(tf_docs)

## 2.2 LDA implementation

In [22]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [23]:
LatentDirichletAllocation

sklearn.decomposition.online_lda.LatentDirichletAllocation

In [24]:
print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (n_docs, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf_docs)
print("done in %0.3fs." % (time() - t0))

Fitting LDA models with tf features, n_samples=4744 and n_features=19012...




done in 11.349s.


In [25]:
# tf_feature_names is vocabulary
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0:
lssu gl cpu maria dma mask stormed dominating highlighting thirty belonged numlock mod wrist ik sgi rex indiana datahand cache
Topic #1:
like use just know does god people don time think problem drive edu new work good way window want thanks
Topic #2:
game year team games like season don think good win right just play players got didn did won hockey league
Topic #3:
food msg health disease cause patients diet lib medical diseases blood eat libxmu xmu foods day brain common effect doctor
Topic #4:
entry output com pts file edu period la buf entries pp rules power printf van oname build eof pt vs
Topic #5:
pitt gordon banks skepticism patients soon geb intellect jxp chastity dsl shameful cadre surrender candida pain edu yeast medical treatment
Topic #6:
space nasa launch orbit research center data earth shuttle satellite april lunar national moon year mission university information years science
Topic #7:
church catholic pope holy orthodox son schism autho

In [26]:
# row# of lda.components_ is topic#
# columns are the topic's unnormalized word weights (for all vocabulary words), in the sequence of the vocabulary word order
per_topic_distr_LDA = lda.components_
per_topic_distr_LDA.shape
#per_topic_distr_LDA.sum(axis=1)

(8, 19012)

## pyLDAVis

In [28]:
LDAvis_data_filepath = os.path.join('./', 'ldavis_prepared')

In [29]:
import pyLDAvis
import pyLDAvis.sklearn
import warnings

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:
    LDAvis_prepared = pyLDAvis.sklearn.prepare(lda, tf_docs, tf_vectorizer, mds='tsne')
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

In [30]:
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

pyLDAvis.display(LDAvis_prepared)

# 3. TOPIC2VEC

In [51]:
# argmax example:
# >>> kkk
# array([[1, 2, 3],
#       [0, 4, 2]])
# >>> np.argmax(kkk, 0)
# array([0, 1, 0])
# >>> np.argmax(kkk, 1)
# array([2, 1])
#
# this will select the topic with the most word weight for each word in the vocabulary
# after this, we can easily lookup the best topic of each vocabulary word by 
# most_p_topic[word_voca_index] -> word's topic (0 -7 in this case) 
most_p_topic = np.argmax(per_topic_distr_LDA, axis=0)

In [52]:
# per_topic_distr_LDA

In [53]:
most_p_topic.shape

(17197,)

In [54]:
word_and_topic = zip(tf_feature_names, most_p_topic)
# word2topic_dict = {word : 'topic_' + np.array_str(topic) for word, topic in word_and_topic}
word2topic_dict = {word : 'topic_{}'.format(topic) for word, topic in word_and_topic}

In [55]:
# list the top 5 words and their belonged topics
from itertools import islice
list(islice(word2topic_dict.items(), 5))

[('disrespectful', 'topic_4'),
 ('closed', 'topic_4'),
 ('fraud', 'topic_5'),
 ('ventura', 'topic_7'),
 ('powerful', 'topic_1')]

## 3.1 Tokenization

In [56]:
def tokenizer(document):
    text = "".join([ch for ch in document if ch not in string.punctuation])
    text_list = text.split()
    normalized_text = [x.lower() for x in text_list]
    # Define an empty list
    nostopwords_text = []
    # Scan the words
    for word in normalized_text:
        # Determine if the word is contained in the stop words list
        if word not in ENGLISH_STOP_WORDS:
            # If the word is not contained I append it
            nostopwords_text.append(word)
    tokenized_text = [word for word in nostopwords_text if re.search('[a-zA-Z]{2,}', word)]
            
    return tokenized_text

In [57]:
def map_doc_to_topic(tokenized_text, prefix, doc_id_number, word2topic_dict):
    doc_to_topic_list = [prefix + '_' + str(doc_id_number)]
    # print('adding doc_to_topic header element {}'.format(doc_to_topic_list[0]))

    for word in tokenized_text:
        if word in word2topic_dict.keys():
            doc_to_topic_list.append(word2topic_dict[word])
        # else:
        #    print('{} not found in word2topic_dict.keys'.format(word))

    return doc_to_topic_list

In [58]:
from gensim.models.deprecated.doc2vec import LabeledSentence

In [59]:
class LabeledLineSentence_training(object):
    def __init__(self, sources, word2topic_dict):
        self.labels_list = word2topic_dict
        self.sources = sources
        flipped = {}
        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')
    
    def __iter__(self):
        print('len of sources is {}'.format(len(self.sources)))
        for source, prefix in self.sources.items():
            print(source)
            newsgroups_train_cat = fetch_20newsgroups(subset='train',
                                                      remove=('headers', 'footers', 'quotes'),
                                                      categories=[source])
            # print('len of newsgroups_train_cat is {}'.format(len(newsgroups_train_cat)))
            # (Pdb) newsgroups_train_cat.keys() -> 
            # dict_keys(['data', 'filenames', 'target', 'description', 'DESCR', 'target_names'])
            # import pdb; pdb.set_trace()
            for idx, doc in enumerate(newsgroups_train_cat.data):
                words_doc=tokenizer(doc)
                tags_doc = map_doc_to_topic(words_doc, prefix, idx, word2topic_dict)
                yield LabeledSentence(words = words_doc,
                                                     tags = tags_doc)
                
    def to_array(self):
        self.sentences = []
        print('len of sources is {}'.format(len(self.sources)))
        for source, prefix in self.sources.items():
            newsgroups_train_cat = fetch_20newsgroups(subset='train',
                                                      remove=('headers', 'footers', 'quotes'),
                                                      categories=[source])
            # print('len of newsgroups_train_cat is {}'.format(len(newsgroups_train_cat)))
            # import pdb; pdb.set_trace()
            # (Pdb) type(newsgroups_train_cat) -> <class 'sklearn.utils.Bunch'> => len is 6
            # (Pdb) type(newsgroups_train_cat.data) -> <class 'list'>
            # (Pdb) len(newsgroups_train_cat.data) -> 593
            # (Pdb) newsgroups_train_cat.data[0] -> document 1 strings, with newlines inside
            # (Pdb) newsgroups_train_cat.data[1] -> document 2 strings, with newlines inside
            # (Pdb) newsgroups_train_cat.target.shape -> (593,)
            # (Pdb) newsgroups_train_cat.target.max() -> 0
            for idx, doc in enumerate(newsgroups_train_cat.data):
                words_doc=tokenizer(doc)
                tags_doc = map_doc_to_topic(words_doc, prefix, idx, word2topic_dict)
                self.sentences.append(LabeledSentence(words = words_doc,
                                                     tags = tags_doc))
        return self.sentences
            
    def sentences_perm(self):
        shuffle(self.sentences)
        return self.sentences

## 3.1 Training

### Revisit parameters before training

In [60]:
categories_source

{'comp.sys.ibm.pc.hardware': 'comp_sys_ibm_pc_hardware',
 'comp.sys.mac.hardware': 'comp_sys_mac_hardware',
 'comp.windows.x': 'comp_windows_x',
 'rec.sport.baseball': 'rec_sport_baseball',
 'rec.sport.hockey': 'rec_sport_hockey',
 'sci.med': 'sci_med',
 'sci.space': 'sci_space',
 'soc.religion.christian': 'soc_religion_christian'}

In [61]:
list(islice(word2topic_dict.items(), 5))

[('disrespectful', 'topic_4'),
 ('closed', 'topic_4'),
 ('fraud', 'topic_5'),
 ('ventura', 'topic_7'),
 ('powerful', 'topic_1')]

### Tokenization

In [62]:
# For all input news group documents
#    For all sentences in that document
#        Generate gensim.models.deprecated.doc2vec.LabeledSentence
#            (words, tags with group name and word's topics)
it = LabeledLineSentence_training(categories_source, word2topic_dict)

#### Quote notes about LabeledSentence and TaggedDocument
1. LabeledSentence is an older, deprecated name for the same simple object-type to encapsulate a text-example that is now called TaggedDocument. 
2. Any objects that have words and tags properties, each a list, will do.
    - words is always a list of strings
    - tags can be a mix of integers and strings, but in the common and most-efficient case, is just a list with a single id integer, starting at 0.)

#### [Info about how to use Gensim doc2vec](https://medium.com/@mishra.thedeepak/doc2vec-in-a-simple-way-fa80bfe81104)
1. In this example it uses filename and doc label
2. And after the training it can print the vector of the file using its name

    ```
    docvec = d2v_model.docvecs[‘1.txt’] #if string tag used in training
    print docvec
    ```
3. Or to get most similar document with similarity scores using document-index

    ```
    similar_doc = d2v_model.docvecs.most_similar(14) 
    print similar_doc
    ```

In [63]:
# print the first new group's item #1
inspect_item = next(iter(it))
print(type(inspect_item))
print(inspect_item)
print(len(inspect_item.tags), len(inspect_item.words))
print(inspect_item.tags[:10], inspect_item.words[:10])

len of sources is 8
sci.space
<class 'gensim.models.deprecated.doc2vec.LabeledSentence'>
LabeledSentence(['lunar', 'satellite', 'needs', 'fuel', 'regular', 'orbit', 'corrections', 'fuel', 'runs', 'crash', 'months', 'orbits', 'apollo', 'motherships', 'changed', 'noticeably', 'lunar', 'missions', 'lasting', 'days', 'possible', 'stable', 'orbits', 'moons', 'gravitational', 'field', 'poorly', 'mapped', 'know', 'perturbations', 'sun', 'earth', 'relatively', 'minor', 'issues', 'low', 'altitudes', 'big', 'problem', 'moons', 'gravitational', 'field', 'quite', 'lumpy', 'irregular', 'distribution', 'mass', 'moon'], ['sci_space_0', 'topic_5', 'topic_5', 'topic_5', 'topic_7', 'topic_5', 'topic_5', 'topic_1', 'topic_5', 'topic_5', 'topic_1', 'topic_1', 'topic_5', 'topic_5', 'topic_1', 'topic_1', 'topic_5', 'topic_5', 'topic_1', 'topic_7', 'topic_1', 'topic_1', 'topic_1', 'topic_5', 'topic_5', 'topic_1', 'topic_5', 'topic_5', 'topic_5', 'topic_5'])
30 48
['sci_space_0', 'topic_5', 'topic_5', 'topic_



In [64]:
# type(models.Doc2Vec)
model = models.Doc2Vec(size=100, window=10, min_count=4, dm=1, dbow_words=1,
                              workers=50, alpha=0.025, min_alpha=0.025) # use fixed learning rate
model.build_vocab(it.to_array())



len of sources is 8




In [65]:
from tqdm import tqdm
for epoch in tqdm(range(20)):
    model.train(it.sentences_perm(), total_examples=model.corpus_count, epochs=1)
    model.alpha -= 0.002 # decrease the learning rate
    model.min_alpha = model.alpha # fix the learning rate, no decay

100%|██████████| 20/20 [04:39<00:00, 13.99s/it]


In [66]:
fname =  os.getcwd() # Prints the working directory
fname = fname + '/topic2vec_20NG_2_ndoc' + str(n_docs) + 'n_topic' + str(n_topics) + '.model'
model.save(fname)

### Show results
A quick info about [how to use gensim doc2vec model to query words by label vector or vice versa](https://github.com/RaRe-Technologies/gensim/issues/1397)

1. search words using word

    ```
    model.most_similar('word')
    # only similar words were returned but not labels
    ```
2. search label by label
    - use model.docvecs.most_similar to search for similar labels using labels
3. search words by label

    ```
    model.docvecs['label']
    model.similar_by_vector(label_vec)
    # only similar words were returned
    ```
4. search labels by word

    ```
    word_vec = model['word']
    model.docvecs.most_similar([word_vec])
    # returns similar labels
    ```

In [67]:
from gensim import corpora, models, similarities

# load the model back
fname = fname if fname is not None else 'topic2vec_20NG_2_ndoc4744n_topic8.model'
print('loading model from {}'.format(fname))
d2v_model = models.doc2vec.Doc2Vec.load(fname)

loading model from /home/aimladmin/notebooks/home/ksong/Topic2Vec/topic2vec_20NG_2_ndoc4744n_topic8.model


In [68]:
# list the top 5 tags in the model
from itertools import islice

paragraphs_tag = d2v_model.docvecs.doctags
type(paragraphs_tag)
print(len(paragraphs_tag), list(islice(paragraphs_tag.items(),5)))

4752 [('comp_sys_ibm_pc_hardware_354', Doctag(offset=2148, word_count=224, doc_count=1)), ('comp_sys_ibm_pc_hardware_326', Doctag(offset=2120, word_count=13, doc_count=1)), ('comp_windows_x_442', Doctag(offset=4601, word_count=72, doc_count=1)), ('soc_religion_christian_326', Doctag(offset=1521, word_count=10, doc_count=1)), ('comp_sys_ibm_pc_hardware_291', Doctag(offset=2085, word_count=58, doc_count=1))]


In [69]:
ragraphs_vector = d2v_model.docvecs.doctag_syn0
ragraphs_vector.shape

  """Entry point for launching an IPython kernel.


(4752, 100)

In [105]:
d2v_model.docvecs.most_similar(positive = ['sci_space_96'])

[('rec_sport_baseball_389', 0.35365036129951477),
 ('rec_sport_baseball_378', 0.34690025448799133),
 ('comp_sys_ibm_pc_hardware_563', 0.3393045663833618),
 ('sci_med_524', 0.3304837942123413),
 ('comp_sys_mac_hardware_518', 0.32284557819366455),
 ('sci_med_125', 0.3165658116340637),
 ('rec_sport_hockey_94', 0.28935739398002625),
 ('comp_sys_mac_hardware_292', 0.289227157831192),
 ('rec_sport_hockey_203', 0.285552978515625),
 ('rec_sport_baseball_449', 0.28499162197113037)]

In [115]:
label_vec = d2v_model.docvecs['sci_space_96']
d2v_model.wv.similar_by_vector(label_vec)

[('lords', 0.39834630489349365),
 ('doesnt', 0.3908179998397827),
 ('darling', 0.3805291950702667),
 ('ahola', 0.35309669375419617),
 ('worry', 0.3530368208885193),
 ('destroyed', 0.3495197892189026),
 ('tale', 0.33816206455230713),
 ('cpus', 0.33736613392829895),
 ('lzone', 0.325408935546875),
 ('ears', 0.3231354355812073)]

In [70]:
for topic_idx in range(8):
    print('>>> top 10 relevant words of topic {}'.format(topic_idx))
    topic_vec = d2v_model.docvecs['topic_{}'.format(topic_idx)]
    print(d2v_model.wv.similar_by_vector(topic_vec))

>>> top 10 relevant words of topic 0
[('al', 0.8068424463272095), ('holds', 0.7901829481124878), ('rd3', 0.7826679944992065), ('percentage', 0.7784439921379089), ('behalf', 0.7727761268615723), ('spouse', 0.7723900079727173), ('intensive', 0.7632749080657959), ('crime', 0.7625982165336609), ('molecular', 0.7587553858757019), ('experimental', 0.7542406320571899)]
>>> top 10 relevant words of topic 1
[('echohostname', 0.922170877456665), ('hank', 0.9113080501556396), ('echo', 0.9076107144355774), ('set', 0.9072037935256958), ('woof', 0.8928797841072083), ('aaron', 0.8849927186965942), ('tail', 0.8846719264984131), ('iivx', 0.8805248141288757), ('finished', 0.8791848421096802), ('cdrom', 0.8782916069030762)]
>>> top 10 relevant words of topic 2
[('decs', 0.8631792068481445), ('create', 0.8595727682113647), ('exposuremask', 0.8494325876235962), ('waking', 0.8476630449295044), ('spoke', 0.8470355272293091), ('event', 0.8446255922317505), ('program', 0.8435776829719543), ('meditating', 0.842

In [15]:
word_vec = d2v_model['nasa']
d2v_model.docvecs.most_similar([word_vec])

[('sci_space_411', 0.6604948043823242),
 ('comp_sys_mac_hardware_42', 0.6437797546386719),
 ('soc_religion_christian_278', 0.6072692275047302),
 ('rec_sport_baseball_549', 0.5811659097671509),
 ('comp_windows_x_404', 0.5668190717697144),
 ('rec_sport_hockey_233', 0.5499863028526306),
 ('soc_religion_christian_28', 0.5372079610824585),
 ('comp_sys_mac_hardware_476', 0.47990018129348755),
 ('comp_sys_mac_hardware_125', 0.47634610533714294),
 ('soc_religion_christian_226', 0.4659426808357239)]

In [107]:
d2v_model.docvecs.n_similarity(['topic_0', 'topic_2'], ['topic_3', 'topic_4'])

0.644150725372538

In [108]:
d2v_model.docvecs.similarity('topic_0', 'topic_2')

0.3563553612509076