# Topic2Vec_20newsgroups on Gensim

In [51]:
# !pip install pyorient

In [327]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
from tqdm import tqdm

## 1. IMPORTING DOCS FROM 20 NEWSGROUPS DATASET

In [328]:
from sklearn.datasets import fetch_20newsgroups
categories = ['comp.sys.ibm.pc.hardware',
'comp.sys.mac.hardware',
'comp.windows.x',
'rec.sport.baseball',
'rec.sport.hockey',
'sci.med',
'sci.space',
'soc.religion.christian']

n_topics = len(categories)

categories_source = {}

for cat in categories:
    categories_source[cat] = cat.replace('.', '_')

In [329]:
categories_source

{'comp.sys.ibm.pc.hardware': 'comp_sys_ibm_pc_hardware',
 'comp.sys.mac.hardware': 'comp_sys_mac_hardware',
 'comp.windows.x': 'comp_windows_x',
 'rec.sport.baseball': 'rec_sport_baseball',
 'rec.sport.hockey': 'rec_sport_hockey',
 'sci.med': 'sci_med',
 'sci.space': 'sci_space',
 'soc.religion.christian': 'soc_religion_christian'}

In [330]:
newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=categories)

In [331]:
for i,j in categories_source.items():
    print(i,j)

comp.sys.ibm.pc.hardware comp_sys_ibm_pc_hardware
comp.sys.mac.hardware comp_sys_mac_hardware
comp.windows.x comp_windows_x
rec.sport.baseball rec_sport_baseball
rec.sport.hockey rec_sport_hockey
sci.med sci_med
sci.space sci_space
soc.religion.christian soc_religion_christian


#### TOTAL NUMBER OF DOC

In [332]:
n_docs = newsgroups_train.filenames.shape[0]
n_docs

4744

In [333]:
type(newsgroups_train)

sklearn.utils.Bunch

In [334]:
newsgroups_train.filenames

array(['/home/aimladmin/scikit_learn_data/20news_home/20news-bydate-train/sci.space/61065',
       '/home/aimladmin/scikit_learn_data/20news_home/20news-bydate-train/rec.sport.hockey/52618',
       '/home/aimladmin/scikit_learn_data/20news_home/20news-bydate-train/comp.windows.x/67032',
       ...,
       '/home/aimladmin/scikit_learn_data/20news_home/20news-bydate-train/rec.sport.hockey/52576',
       '/home/aimladmin/scikit_learn_data/20news_home/20news-bydate-train/soc.religion.christian/20809',
       '/home/aimladmin/scikit_learn_data/20news_home/20news-bydate-train/soc.religion.christian/20733'],
      dtype='<U96')

In [335]:
len(newsgroups_train.data)

4744

In [336]:
newsgroups_train.data[0]

"Hmmm. I seem to recall that the attraction of solid state record-\nplayers and radios in the 1960s wasn't better performance but lower\nper-unit cost than vacuum-tube systems.\n\n\tMind you, my father was a vacuum-tube fan in the 60s (Switched\nto solid-state in the mid-seventies and then abruptly died; no doubt\nthere's a lesson in that) and his account could have been biased."

### Spacy Playground

In [76]:
import spacy
import pandas as pd

# for quick reference goes to https://github.com/hailusong/lda2vec/blob/master/lda2vec/preprocess.py
# nlp = spacy.load('en')
nlp = spacy.load('en_core_web_sm', disable=['parser'])

if 1 == 1:
    parsed_review = nlp('Hello the > < wolrd!\nWhere we Are are Be become never today unfortunately?\nWill that The Walnurs or they or he or she join us as a lovely team\'s party')
    token_attributes = [(token.orth_,
                     token.lemma_,
                     token.prob,
                     token.is_stop,
                     token.is_alpha,
                     token.is_punct,
                     token.is_space,
                     token.like_num,
                     token.is_oov)
                    for token in parsed_review]

    df = pd.DataFrame(token_attributes,
                      columns=['text',
                               'lemma',
                               'log_probability',
                               'stop?',
                               'isalpha?',
                               'punctuation?',
                               'whitespace?',
                               'number?',
                               'out of vocab.?'])

    # df.loc[:, 'stop?':'out of vocab.?'] = (df.loc[:, 'stop?':'out of vocab.?']
    #                                       .applymap(lambda x: 'Yes' if x else ''))

    print(df)

             text          lemma  log_probability  stop?  isalpha?  \
0           Hello          hello            -20.0  False      True   
1             the            the            -20.0   True      True   
2               >              >            -20.0  False     False   
3               <              <            -20.0  False     False   
4           wolrd          wolrd            -20.0  False      True   
5               !              !            -20.0  False     False   
6              \n             \n            -20.0  False     False   
7           Where          where            -20.0  False      True   
8              we         -PRON-            -20.0   True      True   
9             Are             be            -20.0  False      True   
10            are             be            -20.0   True      True   
11             Be             be            -20.0  False      True   
12         become         become            -20.0   True      True   
13          never   

## Phrase Modeling
Code from [Modern NLP in Python](http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb)

In [337]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
import spacy
import pandas as pd
import itertools as it

nlp = spacy.load('en_core_web_sm', disable=['parser'])

In [338]:
removed_tokens = set()

def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuation or whitespace
    """
    
    result = token.is_punct or token.is_space or token.is_stop or not token.is_alpha
    # result = token.is_punct or token.is_space
    if result:
       removed_tokens.add(token.text)

    return result

def line_review(filename):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text
    """
    
    with codecs.open(filename, encoding='utf_8') as f:
        for review in f:
            yield review.replace('\\n', '\n')

def line_asis(filename):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text
    """
    
    # with codecs.open(filename, encoding='utf_8') as f:
    with open(filename, 'r') as f:
        # for line in f:
        #     yield line
        return [line.rstrip('\n') for line in f]

def line_sklearn_data(sklearn_data):
    """
    generator function to read in sklearn data from the list
    and un-escape the original line breaks in the text
    """
    
    for data in sklearn_data:
        yield data.replace('\\n', '\n').replace('\n', ' ').lower()
        # yield data
    
def line_sklearn_data_keepcase(sklearn_data):
    """
    generator function to read in sklearn data from the list
    and un-escape the original line breaks in the text
    """
    
    for data in sklearn_data:
        yield data.replace('\\n', '\n').replace('\n', ' ')
        # yield data
    
def lemmatized_sentence_corpus(sentences):
    """
    generator function to use spaCy to parse reviews,
    lemmatize the text, and yield sentences
    """
    
    for sent in nlp.pipe(sentences,
                                  batch_size=10000, n_threads=4):
        # for sent in parsed_review.sents:
        # for sent in parsed_review:
        yield ' '.join([token.lemma_ for token in sent if not punct_space(token)])

def no_lemmatized_sentence_corpus(sentences):
    """
    generator function to use spaCy to parse reviews,
    lemmatize the text, and yield sentences
    """
    
    for sent in nlp.pipe(sentences,
                                  batch_size=10000, n_threads=4):
        # for sent in parsed_review.sents:
        # for sent in parsed_review:
        yield ' '.join([token.text for token in sent if not punct_space(token)])


### Prepare corups in different flavours: with/without NLP, with/without Lemma, etc.

In [303]:
import codecs

if 1 == 1:
    with codecs.open('./withoutnlp.txt', 'w', encoding='utf_8') as f:
        for sentence in tqdm(line_sklearn_data(newsgroups_train.data)):
            f.write(sentence + '\n')

4744it [00:00, 91037.43it/s]


In [304]:
import itertools as it

# nlp - https://spacy.io/api/language
# https://spacy.io/usage/linguistic-features#section-sbd
print('Checking {} {}'.format(type(nlp), type(nlp.pipe)))

strs = ['Hello my word. Here am I again.', 'where is it?']

for data in line_sklearn_data(strs):
    print('>> >>{}'.format(data))

print('>>0>>{}'.format(newsgroups_train.data[0]))

for data in line_sklearn_data(newsgroups_train.data[0:1]):
    print('>>1>>{}'.format(data))

for data in no_lemmatized_sentence_corpus(line_sklearn_data(strs)):
    print('>>2a>>{}'.format(data))

for data in no_lemmatized_sentence_corpus(line_sklearn_data(newsgroups_train.data[0:1])):
    print('>>2>>{}'.format(data))

for data in lemmatized_sentence_corpus(line_sklearn_data(newsgroups_train.data[0:1])):
    print('>>3>>{}'.format(data))

Checking <class 'spacy.lang.en.English'> <class 'method'>
>> >>hello my word. here am i again.
>> >>where is it?
>>0>>Hmmm. I seem to recall that the attraction of solid state record-
players and radios in the 1960s wasn't better performance but lower
per-unit cost than vacuum-tube systems.

	Mind you, my father was a vacuum-tube fan in the 60s (Switched
to solid-state in the mid-seventies and then abruptly died; no doubt
there's a lesson in that) and his account could have been biased.
>>1>>hmmm. i seem to recall that the attraction of solid state record- players and radios in the 1960s wasn't better performance but lower per-unit cost than vacuum-tube systems.  	mind you, my father was a vacuum-tube fan in the 60s (switched to solid-state in the mid-seventies and then abruptly died; no doubt there's a lesson in that) and his account could have been biased.
>>2a>>hello word
>>2a>>
>>2>>hmmm recall attraction solid state players radios better performance lower unit cost vacuum tube sys

In [305]:
if 1 == 1:
    with codecs.open('./withnlp-nolemma.txt', 'w', encoding='utf_8') as f:
        for sentence in tqdm(no_lemmatized_sentence_corpus(line_sklearn_data(newsgroups_train.data))):
            f.write(sentence + '\n')

    with codecs.open('./withnlp-lemma.txt', 'w', encoding='utf_8') as f:
        for sentence in tqdm(lemmatized_sentence_corpus(line_sklearn_data(newsgroups_train.data))):
            f.write(sentence + '\n')

4744it [01:57, 40.34it/s]
4744it [01:58, 39.91it/s]


In [306]:
if 1 == 1:
    with codecs.open('./withoutnlp-keepcase.txt', 'w', encoding='utf_8') as f:
        for sentence in tqdm(line_sklearn_data_keepcase(newsgroups_train.data)):
            f.write(sentence + '\n')    

4744it [00:00, 96856.34it/s]


### Unigram

In [339]:
import os
import codecs

In [340]:
intermediate_directory = './'

In [341]:
unigram_sentences_filepath = os.path.join(intermediate_directory,
                                          'unigram_sentences_all.txt')

In [325]:
%%time
from tqdm import tqdm

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:
    removed_tokens = set()
    print('Original >>>')
    print(newsgroups_train.data[1:2])

    print('New >>>')
    for sentence in tqdm(lemmatized_sentence_corpus(line_sklearn_data(newsgroups_train.data[1:2]))):
        print(sentence + '\n')

    print(removed_tokens)

CPU times: user 10 µs, sys: 1e+03 ns, total: 11 µs
Wall time: 14.5 µs


In [326]:
%%time
from tqdm import tqdm

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:
    with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for sentence in tqdm(lemmatized_sentence_corpus(line_sklearn_data(newsgroups_train.data))):
            f.write(sentence + '\n')

4744it [02:00, 39.22it/s]

CPU times: user 3min, sys: 18.1 s, total: 3min 18s
Wall time: 2min





In [342]:
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [343]:
# print a few example lemmatized sentences
for unigram_sentence in it.islice(unigram_sentences, 230, 240):
    print(' '.join(unigram_sentence))
    print('')

right suppose problem xmncolormap xtncolormap truly literate being set want start application new colormap chicken egg sort problem look xt faq example let know maybe improve example

read right say essence large economy nation discretionary fund waste lunar facility certainly partially case apollo real lunar colony probably require continue military scientific commercial reason money approach conceivable luna military purpose possible luna commercial purpose likely luna scientific purpose year lunar base predicate funding level little different find antarctic base person base moon million year use grad student gary

probably mean blood pressure go treadmill normal ask mean answer person gordon bank skepticism chastity intellect shameful surrender soon

hey folk course develope x window application encounter problem transform x window bitmap postscript file library routine source code job

post game interview larussa claim sparky good manager basebal explain history sparky soften blow 

### Bigram

In [344]:
bigram_model_filepath = os.path.join(intermediate_directory, 'bigram_model_all')

In [345]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 1 == 1:
    # gensim Phrases
    # automatically detect common phrases – multi-word expressions / word n-grams – from a stream of sentences.
    #
    # example
    # >>> sentences = Text8Corpus(datapath('testcorpus.txt'))     <- load training example data
    # >>> phrases = Phrases(sentences, min_count=1, threshold=1)  <- train the Phrases model
    # >>> phrases[[u'trees', u'graph', u'minors']]                <- apply trained model to sentence
    # [u'trees_graph', u'minors']                                 <- identify phrases 'trees_graph'
    #
    # >>> phrases.add_vocab([["hello", "world"], ["meow"]])       <- update model with new sentences
    # 
    # >>> bigram = Phraser(phrases)                               <- construct faster model (this is only an wrapper)
    # >>> bigram[[u'trees', u'graph', u'minors']]                 <- apply model to sentence
    # [u'trees_graph', u'minors']
    #
    bigram_model = Phrases(unigram_sentences)
    bigram_model.save(bigram_model_filepath)

CPU times: user 2.04 s, sys: 60.5 ms, total: 2.1 s
Wall time: 2.15 s


In [346]:
# load the finished model from disk
bigram_model = Phrases.load(bigram_model_filepath)

In [347]:
# Prepare to apply the trained Phrases model to the unigram lemmatized sentences
bigram_sentences_filepath = os.path.join(intermediate_directory,
                                         'bigram_sentences_all.txt')

In [348]:
%%time
from tqdm import tqdm

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:
    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for unigram_sentence in tqdm(unigram_sentences):
            bigram_sentence = ' '.join(bigram_model[unigram_sentence])
            f.write(bigram_sentence + '\n')

4623it [00:02, 2146.21it/s]

CPU times: user 2.14 s, sys: 24.8 ms, total: 2.16 s
Wall time: 2.16 s





In [349]:
# print out a few examples of bigram on lemmatized sentences.
# note that vice_versa, original_question, etc. are now considered as phrases
bigram_sentences = LineSentence(bigram_sentences_filepath)

for bigram_sentence in it.islice(bigram_sentences, 230, 240):
    print(' '.join(bigram_sentence))
    print('')

right suppose problem xmncolormap xtncolormap truly literate being set want start application new colormap chicken egg sort problem look xt faq example let_know maybe improve example

read right say essence large economy nation discretionary fund waste lunar facility certainly partially case apollo real lunar_colony probably require continue military scientific commercial reason money approach conceivable luna military purpose possible luna commercial purpose likely luna scientific purpose year lunar_base predicate funding level little different find antarctic base person base moon million year use grad_student gary

probably mean blood_pressure go treadmill normal ask mean answer person gordon_bank skepticism_chastity intellect_shameful surrender_soon

hey folk course develope x window application encounter problem transform x window bitmap postscript file library routine source_code job

post game interview larussa claim sparky good manager basebal explain history sparky soften blow 

### Trigram

In [350]:
trigram_model_filepath = os.path.join(intermediate_directory,
                                      'trigram_model_all')

In [351]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 1 == 1:
    trigram_model = Phrases(bigram_sentences)
    trigram_model.save(trigram_model_filepath)

CPU times: user 1.27 s, sys: 11.8 ms, total: 1.28 s
Wall time: 1.35 s


In [352]:
# load the finished model from disk
trigram_model = Phrases.load(trigram_model_filepath)

In [353]:
type(trigram_model)

gensim.models.phrases.Phrases

In [354]:
trigram_sentences_filepath = os.path.join(intermediate_directory,
                                          'trigram_sentences_all.txt')

In [355]:
%%time
from tqdm import tqdm

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:
    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for bigram_sentence in tqdm(bigram_sentences):
            trigram_sentence = ' '.join(trigram_model[bigram_sentence])
            f.write(trigram_sentence + '\n')

4623it [00:02, 2153.12it/s]

CPU times: user 2.14 s, sys: 24.7 ms, total: 2.16 s
Wall time: 2.15 s





In [356]:
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [357]:
# print a few trigram examples on lemmatized sentences
# note that bigram words are still there but we see a few trigrams now.
# like san_jose_sharks, learn_how_to
for trigram_sentence in it.islice(trigram_sentences, 230, 240):
    print(' '.join(trigram_sentence))
    print('')

right suppose problem xmncolormap xtncolormap truly literate being set want start application new colormap chicken egg sort problem look xt faq example let_know maybe improve example

read right say essence large economy nation discretionary fund waste lunar facility certainly partially case apollo real lunar_colony probably require continue military scientific commercial reason money approach conceivable luna military purpose possible luna commercial purpose likely luna scientific purpose year lunar_base predicate funding level little different find antarctic base person base moon million year use grad_student gary

probably mean blood_pressure go treadmill normal ask mean answer person gordon_bank_skepticism_chastity intellect_shameful_surrender_soon

hey folk course develope x_window application encounter problem transform x_window bitmap postscript file library routine source_code job

post game interview larussa claim sparky good manager basebal explain history sparky soften blow 

In [358]:
trigram_reviews_filepath = os.path.join(intermediate_directory,
                                        'trigram_transformed_reviews_all.txt')

In [359]:
%%time

# NOT NEED TO RUN THIS AS WE HAVE DONE ALL STEPS HERE BEFORE
# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:
    with codecs.open(trigram_reviews_filepath, 'w', encoding='utf_8') as f:
        for parsed_review in nlp.pipe(line_review(review_txt_filepath),
                                      batch_size=10000, n_threads=4):
            # lemmatize the text, removing punctuation and whitespace
            unigram_review = [token.lemma_ for token in parsed_review
                              if not punct_space(token)]
            
            # apply the first-order and second-order phrase models
            bigram_review = bigram_model[unigram_review]
            trigram_review = trigram_model[bigram_review]
            
            # remove any remaining stopwords
            trigram_review = [term for term in trigram_review
                              if term not in spacy.en.STOPWORDS]
            
            # write the transformed review as a line in the new file
            trigram_review = u' '.join(trigram_review)
            f.write(trigram_review + '\n')

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.39 µs


In [360]:
# print('Original:' + u'\n')
#
# for review in it.islice(line_review(review_txt_filepath), 11, 12):
#    print(review)
trigram_reviews_filepath = trigram_sentences_filepath

print('----' + u'\n')
print('Transformed:' + u'\n')

with codecs.open(trigram_reviews_filepath, encoding='utf_8') as f:
    for review in it.islice(f, 11, 12):
        print(review)

----

Transformed:

simply wish thank dave mielke share tract concern god_love welcome great source comfort



# 2. LDA to find the topic most-associated with each word
both **sklearn** and **gensim** LDA implementation

### Create Vocabulary

In [124]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from tqdm import tqdm
import spacy
import pandas as pd
import itertools as it

nlp = spacy.load('en_core_web_sm', disable=['parser'])

In [125]:
import os

intermediate_directory = '.'
trigram_dictionary_filepath = os.path.join(intermediate_directory,
                                           'trigram_dict_all.dict')

In [126]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to learn the dictionary yourself.
if 1 == 1:
    trigram_reviews_filepath = './withnlp-lemma.txt'
    print('loading data from {}'.format(trigram_reviews_filepath))
    trigram_reviews = LineSentence(trigram_reviews_filepath)

    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary(tqdm(trigram_reviews))
    
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.03)
    trigram_dictionary.compactify()

    trigram_dictionary.save(trigram_dictionary_filepath)

727it [00:00, 7179.25it/s]

loading data from ./withnlp-lemma.txt


4623it [00:00, 7484.93it/s]

CPU times: user 678 ms, sys: 8.37 ms, total: 686 ms
Wall time: 676 ms





In [127]:
# load the finished dictionary from disk
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

In [128]:
# This module implements the concept of a Dictionary – a mapping between words and their integer ids.
# ops supported: 
# - doc2bow
# - doc2idx
# - filter_extremes
# - filter_n_most_frequent
# - compactify
#
# compactify: Assign new word ids to all words, shrinking any gaps.
print(type(trigram_dictionary), trigram_dictionary.num_nnz, trigram_dictionary.num_docs, trigram_dictionary.num_pos)

<class 'gensim.corpora.dictionary.Dictionary'> 246561 4623 390822


In [129]:
trigram_bow_filepath = os.path.join(intermediate_directory,
                                    'trigram_bow_corpus_all.mm')

In [130]:
def trigram_bow_generator(filepath):
    """
    generator function to read reviews from a file
    and yield a bag-of-words representation
    """
    
    for review in LineSentence(filepath):
        yield trigram_dictionary.doc2bow(review)

In [131]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to build the bag-of-words corpus yourself.
if 1 == 1:
    print('MmCorpus from bow generator on {}'.format(trigram_reviews_filepath))
    # generate bag-of-words representations for
    # all reviews and save them as a matrix
    MmCorpus.serialize(trigram_bow_filepath,
                       trigram_bow_generator(trigram_reviews_filepath))

MmCorpus from bow generator on ./withnlp-lemma.txt
CPU times: user 654 ms, sys: 12 ms, total: 666 ms
Wall time: 667 ms


In [132]:
# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

In [133]:
# Corpus in the Matrix Market format.
print(type(trigram_bow_corpus))

<class 'gensim.corpora.mmcorpus.MmCorpus'>


In [134]:
document_index = 0
[trigram_dictionary[id] for (id, bow_count) in trigram_bow_corpus[document_index]]

['account',
 'bias',
 'die',
 'doubt',
 'father',
 'hmmm',
 'lesson',
 'mid',
 'performance',
 'radio',
 'recall',
 'solid',
 'switch',
 'tube',
 'unit',
 'vacuum']

In [135]:
document_index = 1
[trigram_dictionary.id2token[id] for (id, bow_count) in trigram_bow_corpus[document_index]][:20]

['alabama',
 'atlanta',
 'bay',
 'beach',
 'begin',
 'believer',
 'blue',
 'bruin',
 'canada',
 'canuck',
 'capital',
 'car',
 'conference',
 'cool',
 'dalla',
 'devil',
 'diego',
 'direction',
 'dream',
 'drug']

## 2.2a Sklearn LDA implementation

In [13]:
from time import time

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.decomposition import LatentDirichletAllocation

import spacy 
# nlp = spacy.load('en_core_web_sm')
nlp = spacy.load('en_core_web_sm', disable=['parser'])

### No Lemmatization

In [383]:
t0 = time()

intermediate_directory = '.'
# trigram_reviews_filepath = os.path.join(intermediate_directory, 'trigram_transformed_reviews_all.txt')
# trigram_sentences_filepath = os.path.join(intermediate_directory, 'trigram_sentences_all.txt')
# train_data = line_review(trigram_sentences_filepath)
# train_data = lemmatized_sentence_corpus(line_sklearn_data(newsgroups_train.data))
# train_data = newsgroups_train.data
# train_data = no_lemmatized_sentence_corpus(line_sklearn_data(newsgroups_train.data))
# train_data = line_asis('./withoutnlp.txt')
# train_data = line_asis('./withnlp-lemma.txt')
# train_data = line_asis('./withnlp-nolemma.txt')
train_data = line_asis('./trigram_sentences_all.txt')

# for train_line in it.islice(train_data, 0, 10):
#    print(train_line)

In [384]:
(train_data[:2], newsgroups_train.data[:2])

(['hmmm recall attraction solid_state player radio good performance low unit cost vacuum tube system mind father vacuum tube fan switch solid_state mid seventy abruptly die doubt lesson account bias',
  'nhl year get reliable source dream year_ago initially think take strong drug realization begin league start direction walt disney conference anaheim mighty chipmunk change new la king hockey movie la flame see san_jose_shark san_diego bruin tijuana red_wing hockey_team follow car industry dalla star houston oiler texas_ranger seattle canuck norm green conference alabama white hawk biloxi blue tampa_bay_lightning miami blade helsinki jet hear start get anthem montreal quebecois sp canada atlanta devil orlando penquin key west islander hartford_whaler whaler huh palm beach capital anahaim team becomm real begin believe rest message sure future turn believer nhl abandond ice rink expensive cool rink subtropic local hardly know ice nhl roller skate hockey_league way create public interest 

In [385]:
# train_data = line_review(trigram_sentences_filepath)
# train_data = lemmatized_sentence_corpus(line_sklearn_data(newsgroups_train.data))

tf_vectorizer = CountVectorizer(encoding='utf-8', analyzer='word', stop_words='english',
                                ngram_range = (1,1), min_df = 2, token_pattern = '[a-zA-Z]{2,}').fit(train_data)
print("fit vectorizer without lemmatization done in %0.3fs." % (time() - t0))

fit vectorizer without lemmatization done in 7.571s.


In [386]:
CountVectorizer

sklearn.feature_extraction.text.CountVectorizer

In [387]:
tf_vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='[a-zA-Z]{2,}', tokenizer=None,
        vocabulary=None)

### Vectorization

In [388]:
# # get_feature_names() returns vocabulary
n_features = len(tf_vectorizer.get_feature_names())

In [389]:
n_features

14196

In [390]:
[tf_vectorizer.get_feature_names()[:10], tf_vectorizer.get_feature_names()[-10:]]

[['aa',
  'aaa',
  'aaron',
  'aas',
  'ab',
  'abandon',
  'abbott',
  'abbreviation',
  'abc',
  'abd'],
 ['zoom',
  'zooming',
  'zorro',
  'zou',
  'zpixmap',
  'zterm',
  'zubov',
  'zupancic',
  'zupcic',
  'zyxel']]

In [391]:
# train_data = line_review(trigram_sentences_filepath)
# train_data = lemmatized_sentence_corpus(line_sklearn_data(newsgroups_train.data))
# train_data = newsgroups_train.data
# train_data = no_lemmatized_sentence_corpus(line_sklearn_data(newsgroups_train.data))
# train_data = line_asis('./withoutnlp.txt')
# train_data = line_asis('./withoutnlp-keepcase.txt')
# train_data = line_asis('./withnlp-lemma.txt')
# train_data = line_asis('./withnlp-nolemma.txt')
train_data = line_asis('./trigram_sentences_all.txt')

tf_docs = tf_vectorizer.transform(train_data)

In [392]:
tf_docs

<4623x14196 sparse matrix of type '<class 'numpy.int64'>'
	with 223639 stored elements in Compressed Sparse Row format>

### Sklearn LDA

In [393]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [394]:
LatentDirichletAllocation

sklearn.decomposition.online_lda.LatentDirichletAllocation

In [395]:
# print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
#       % (n_docs, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf_docs)
print("done in %0.3fs." % (time() - t0))



done in 15.652s.


In [396]:
# tf_feature_names is vocabulary
print("\nTopics in LDA model:")
n_top_words = 20
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0:
gm baltimore john utica adirondack moncton rochester providence springfield binghamton cape cdi vs breton st fredericton disc vision terminator halifax
Topic #1:
file window program use entry server widget application include available run set motif version line display information source code list
Topic #2:
game year good win team think player play season like run hit league score look right fan lose come time
Topic #3:
space launch nasa year center orbit satellite mission research program earth report shuttle health high information national lunar use food
Topic #4:
play pt game team hockey period la goal nhl new playoff pittsburgh player season power cup shot detroit van pp
Topic #5:
jb het sea spacewalk scott te ns en utrecht kan andrew karma boy compaq mil snd dortmund cin tex wright
Topic #6:
god people think know believe good say time christian jesus come like thing church day way question want life mean
Topic #7:
drive know like work use problem 

In [275]:
# row# of lda.components_ is topic#
# columns are the topic's unnormalized word weights (for all vocabulary words), in the sequence of the vocabulary word order
per_topic_distr_LDA = lda.components_
per_topic_distr_LDA.shape
#per_topic_distr_LDA.sum(axis=1)

(8, 14196)

## 2.2b Gensim LDA implementation

In [136]:
lda_model_filepath = os.path.join(intermediate_directory, 'lda_model_all')

In [137]:
%%time
import warnings
from gensim.models import LdaModel

# this is a bit time consuming - make the if statement True
# if you want to train the LDA model yourself.
if 1 == 1:
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        # lda = LdaMulticore(trigram_bow_corpus,
        #                   num_topics=7,
        #                   id2word=trigram_dictionary,
        #                   workers=3)
        lda = LdaModel(trigram_bow_corpus,
                           num_topics=7,
                           id2word=trigram_dictionary)

    lda.save(lda_model_filepath)

CPU times: user 8.88 s, sys: 268 ms, total: 9.15 s
Wall time: 5.13 s


In [138]:
# load the finished LDA model from disk
lda = LdaModel.load(lda_model_filepath)

In [139]:
# find out topics by word
for test_word in ['nasa', 'space', 'good', 'go']:
    test_id = trigram_dictionary.token2id[test_word]
    test_topics = lda.get_term_topics(test_id)
    print('{} id is {}, topics is {}'.format(trigram_dictionary[test_id], test_id, test_topics))

nasa id is 823, topics is []


KeyError: 'space'

In [140]:
def explore_topic(topic_number, topn=10):
    """
    accept a user-supplied topic number and
    print out a formatted list of the top terms
    """
        
    # print('{:20} {}'.format('term', 'frequency') + '\n')

    # tf_list = [(term, frequency) for term, frequency in lda.show_topic(topic_number, topn)]
    tf_list = [term for term, frequency in lda.show_topic(topic_number, topn)]
    print('Topic {}: {}'.format(topic_number, ' '.join(tf_list)))

In [141]:
for topic_id in range(0,7):
    explore_topic(topic_id, topn=20)

Topic 0: law truth keyboard key paul colormap score pain hell belief pick argument puck faith value human evidence offense mouse brave
Topic 1: motif manager server widget xt user offer client port modem screen xlib function serial mit code resource patch toolkit character
Topic 2: entry period king la flame pt pittsburgh chicago blue april vs pp goal trade playoff calgary captain shot chi score
Topic 3: nasa launch mission orbit shuttle dc rocket flight center satellite spacecraft m bank fund faq gordon earth tax skepticism april
Topic 4: msg food disease center religion belief doctor bible earth christianity cause patient switch lunar father faith moon adaptec reaction cancer
Topic 5: scsi controller driver mb bus tape device mode floppy s ide interface ram adaptec fast video hardware server transfer port
Topic 6: church simms catholic law gm service st nasa station baseball bishop canon pope scripture de american orthodox language bible development


### 2.3 pyLDAVis

In [136]:
LDAvis_data_filepath = os.path.join(intermediate_directory, 'ldavis_prepared')

In [137]:
%%time
import pickle
import pyLDAvis
import pyLDAvis.gensim
import warnings

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                              trigram_dictionary)

    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

CPU times: user 6.4 s, sys: 193 ms, total: 6.6 s
Wall time: 4.28 s


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [138]:
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

pyLDAvis.display(LDAvis_prepared)

# 3. TOPIC2VEC

In [51]:
# argmax example:
# >>> kkk
# array([[1, 2, 3],
#       [0, 4, 2]])
# >>> np.argmax(kkk, 0)
# array([0, 1, 0])
# >>> np.argmax(kkk, 1)
# array([2, 1])
#
# this will select the topic with the most word weight for each word in the vocabulary
# after this, we can easily lookup the best topic of each vocabulary word by 
# most_p_topic[word_voca_index] -> word's topic (0 -7 in this case) 
most_p_topic = np.argmax(per_topic_distr_LDA, axis=0)

In [52]:
# per_topic_distr_LDA

In [53]:
most_p_topic.shape

(17197,)

In [54]:
word_and_topic = zip(tf_feature_names, most_p_topic)
# word2topic_dict = {word : 'topic_' + np.array_str(topic) for word, topic in word_and_topic}
word2topic_dict = {word : 'topic_{}'.format(topic) for word, topic in word_and_topic}

In [55]:
# list the top 5 words and their belonged topics
from itertools import islice
list(islice(word2topic_dict.items(), 5))

[('disrespectful', 'topic_4'),
 ('closed', 'topic_4'),
 ('fraud', 'topic_5'),
 ('ventura', 'topic_7'),
 ('powerful', 'topic_1')]

## 3.1 Tokenization

In [56]:
def tokenizer(document):
    text = "".join([ch for ch in document if ch not in string.punctuation])
    text_list = text.split()
    normalized_text = [x.lower() for x in text_list]
    # Define an empty list
    nostopwords_text = []
    # Scan the words
    for word in normalized_text:
        # Determine if the word is contained in the stop words list
        if word not in ENGLISH_STOP_WORDS:
            # If the word is not contained I append it
            nostopwords_text.append(word)
    tokenized_text = [word for word in nostopwords_text if re.search('[a-zA-Z]{2,}', word)]
            
    return tokenized_text

In [57]:
def map_doc_to_topic(tokenized_text, prefix, doc_id_number, word2topic_dict):
    doc_to_topic_list = [prefix + '_' + str(doc_id_number)]
    # print('adding doc_to_topic header element {}'.format(doc_to_topic_list[0]))

    for word in tokenized_text:
        if word in word2topic_dict.keys():
            doc_to_topic_list.append(word2topic_dict[word])
        # else:
        #    print('{} not found in word2topic_dict.keys'.format(word))

    return doc_to_topic_list

In [58]:
from gensim.models.deprecated.doc2vec import LabeledSentence

In [59]:
class LabeledLineSentence_training(object):
    def __init__(self, sources, word2topic_dict):
        self.labels_list = word2topic_dict
        self.sources = sources
        flipped = {}
        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')
    
    def __iter__(self):
        print('len of sources is {}'.format(len(self.sources)))
        for source, prefix in self.sources.items():
            print(source)
            newsgroups_train_cat = fetch_20newsgroups(subset='train',
                                                      remove=('headers', 'footers', 'quotes'),
                                                      categories=[source])
            # print('len of newsgroups_train_cat is {}'.format(len(newsgroups_train_cat)))
            # (Pdb) newsgroups_train_cat.keys() -> 
            # dict_keys(['data', 'filenames', 'target', 'description', 'DESCR', 'target_names'])
            # import pdb; pdb.set_trace()
            for idx, doc in enumerate(newsgroups_train_cat.data):
                words_doc=tokenizer(doc)
                tags_doc = map_doc_to_topic(words_doc, prefix, idx, word2topic_dict)
                yield LabeledSentence(words = words_doc,
                                                     tags = tags_doc)
                
    def to_array(self):
        self.sentences = []
        print('len of sources is {}'.format(len(self.sources)))
        for source, prefix in self.sources.items():
            newsgroups_train_cat = fetch_20newsgroups(subset='train',
                                                      remove=('headers', 'footers', 'quotes'),
                                                      categories=[source])
            # print('len of newsgroups_train_cat is {}'.format(len(newsgroups_train_cat)))
            # import pdb; pdb.set_trace()
            # (Pdb) type(newsgroups_train_cat) -> <class 'sklearn.utils.Bunch'> => len is 6
            # (Pdb) type(newsgroups_train_cat.data) -> <class 'list'>
            # (Pdb) len(newsgroups_train_cat.data) -> 593
            # (Pdb) newsgroups_train_cat.data[0] -> document 1 strings, with newlines inside
            # (Pdb) newsgroups_train_cat.data[1] -> document 2 strings, with newlines inside
            # (Pdb) newsgroups_train_cat.target.shape -> (593,)
            # (Pdb) newsgroups_train_cat.target.max() -> 0
            for idx, doc in enumerate(newsgroups_train_cat.data):
                words_doc=tokenizer(doc)
                tags_doc = map_doc_to_topic(words_doc, prefix, idx, word2topic_dict)
                self.sentences.append(LabeledSentence(words = words_doc,
                                                     tags = tags_doc))
        return self.sentences
            
    def sentences_perm(self):
        shuffle(self.sentences)
        return self.sentences

## 3.1 Training

### Revisit parameters before training

In [60]:
categories_source

{'comp.sys.ibm.pc.hardware': 'comp_sys_ibm_pc_hardware',
 'comp.sys.mac.hardware': 'comp_sys_mac_hardware',
 'comp.windows.x': 'comp_windows_x',
 'rec.sport.baseball': 'rec_sport_baseball',
 'rec.sport.hockey': 'rec_sport_hockey',
 'sci.med': 'sci_med',
 'sci.space': 'sci_space',
 'soc.religion.christian': 'soc_religion_christian'}

In [61]:
list(islice(word2topic_dict.items(), 5))

[('disrespectful', 'topic_4'),
 ('closed', 'topic_4'),
 ('fraud', 'topic_5'),
 ('ventura', 'topic_7'),
 ('powerful', 'topic_1')]

### Tokenization

In [62]:
# For all input news group documents
#    For all sentences in that document
#        Generate gensim.models.deprecated.doc2vec.LabeledSentence
#            (words, tags with group name and word's topics)
it = LabeledLineSentence_training(categories_source, word2topic_dict)

#### Quote notes about LabeledSentence and TaggedDocument
1. LabeledSentence is an older, deprecated name for the same simple object-type to encapsulate a text-example that is now called TaggedDocument. 
2. Any objects that have words and tags properties, each a list, will do.
    - words is always a list of strings
    - tags can be a mix of integers and strings, but in the common and most-efficient case, is just a list with a single id integer, starting at 0.)

#### [Info about how to use Gensim doc2vec](https://medium.com/@mishra.thedeepak/doc2vec-in-a-simple-way-fa80bfe81104)
1. In this example it uses filename and doc label
2. And after the training it can print the vector of the file using its name

    ```
    docvec = d2v_model.docvecs[‘1.txt’] #if string tag used in training
    print docvec
    ```
3. Or to get most similar document with similarity scores using document-index

    ```
    similar_doc = d2v_model.docvecs.most_similar(14) 
    print similar_doc
    ```

In [63]:
# print the first new group's item #1
inspect_item = next(iter(it))
print(type(inspect_item))
print(inspect_item)
print(len(inspect_item.tags), len(inspect_item.words))
print(inspect_item.tags[:10], inspect_item.words[:10])

len of sources is 8
sci.space
<class 'gensim.models.deprecated.doc2vec.LabeledSentence'>
LabeledSentence(['lunar', 'satellite', 'needs', 'fuel', 'regular', 'orbit', 'corrections', 'fuel', 'runs', 'crash', 'months', 'orbits', 'apollo', 'motherships', 'changed', 'noticeably', 'lunar', 'missions', 'lasting', 'days', 'possible', 'stable', 'orbits', 'moons', 'gravitational', 'field', 'poorly', 'mapped', 'know', 'perturbations', 'sun', 'earth', 'relatively', 'minor', 'issues', 'low', 'altitudes', 'big', 'problem', 'moons', 'gravitational', 'field', 'quite', 'lumpy', 'irregular', 'distribution', 'mass', 'moon'], ['sci_space_0', 'topic_5', 'topic_5', 'topic_5', 'topic_7', 'topic_5', 'topic_5', 'topic_1', 'topic_5', 'topic_5', 'topic_1', 'topic_1', 'topic_5', 'topic_5', 'topic_1', 'topic_1', 'topic_5', 'topic_5', 'topic_1', 'topic_7', 'topic_1', 'topic_1', 'topic_1', 'topic_5', 'topic_5', 'topic_1', 'topic_5', 'topic_5', 'topic_5', 'topic_5'])
30 48
['sci_space_0', 'topic_5', 'topic_5', 'topic_



In [64]:
# type(models.Doc2Vec)
model = models.Doc2Vec(size=100, window=10, min_count=4, dm=1, dbow_words=1,
                              workers=50, alpha=0.025, min_alpha=0.025) # use fixed learning rate
model.build_vocab(it.to_array())



len of sources is 8




In [65]:
from tqdm import tqdm
for epoch in tqdm(range(20)):
    model.train(it.sentences_perm(), total_examples=model.corpus_count, epochs=1)
    model.alpha -= 0.002 # decrease the learning rate
    model.min_alpha = model.alpha # fix the learning rate, no decay

100%|██████████| 20/20 [04:39<00:00, 13.99s/it]


In [66]:
fname =  os.getcwd() # Prints the working directory
fname = fname + '/topic2vec_20NG_2_ndoc' + str(n_docs) + 'n_topic' + str(n_topics) + '.model'
model.save(fname)

### Show results
A quick info about [how to use gensim doc2vec model to query words by label vector or vice versa](https://github.com/RaRe-Technologies/gensim/issues/1397)

1. search words using word

    ```
    model.most_similar('word')
    # only similar words were returned but not labels
    ```
2. search label by label
    - use model.docvecs.most_similar to search for similar labels using labels
3. search words by label

    ```
    model.docvecs['label']
    model.similar_by_vector(label_vec)
    # only similar words were returned
    ```
4. search labels by word

    ```
    word_vec = model['word']
    model.docvecs.most_similar([word_vec])
    # returns similar labels
    ```

In [67]:
from gensim import corpora, models, similarities

# load the model back
fname = fname if fname is not None else 'topic2vec_20NG_2_ndoc4744n_topic8.model'
print('loading model from {}'.format(fname))
d2v_model = models.doc2vec.Doc2Vec.load(fname)

loading model from /home/aimladmin/notebooks/home/ksong/Topic2Vec/topic2vec_20NG_2_ndoc4744n_topic8.model


In [68]:
# list the top 5 tags in the model
from itertools import islice

paragraphs_tag = d2v_model.docvecs.doctags
type(paragraphs_tag)
print(len(paragraphs_tag), list(islice(paragraphs_tag.items(),5)))

4752 [('comp_sys_ibm_pc_hardware_354', Doctag(offset=2148, word_count=224, doc_count=1)), ('comp_sys_ibm_pc_hardware_326', Doctag(offset=2120, word_count=13, doc_count=1)), ('comp_windows_x_442', Doctag(offset=4601, word_count=72, doc_count=1)), ('soc_religion_christian_326', Doctag(offset=1521, word_count=10, doc_count=1)), ('comp_sys_ibm_pc_hardware_291', Doctag(offset=2085, word_count=58, doc_count=1))]


In [69]:
ragraphs_vector = d2v_model.docvecs.doctag_syn0
ragraphs_vector.shape

  """Entry point for launching an IPython kernel.


(4752, 100)

In [105]:
d2v_model.docvecs.most_similar(positive = ['sci_space_96'])

[('rec_sport_baseball_389', 0.35365036129951477),
 ('rec_sport_baseball_378', 0.34690025448799133),
 ('comp_sys_ibm_pc_hardware_563', 0.3393045663833618),
 ('sci_med_524', 0.3304837942123413),
 ('comp_sys_mac_hardware_518', 0.32284557819366455),
 ('sci_med_125', 0.3165658116340637),
 ('rec_sport_hockey_94', 0.28935739398002625),
 ('comp_sys_mac_hardware_292', 0.289227157831192),
 ('rec_sport_hockey_203', 0.285552978515625),
 ('rec_sport_baseball_449', 0.28499162197113037)]

In [115]:
label_vec = d2v_model.docvecs['sci_space_96']
d2v_model.wv.similar_by_vector(label_vec)

[('lords', 0.39834630489349365),
 ('doesnt', 0.3908179998397827),
 ('darling', 0.3805291950702667),
 ('ahola', 0.35309669375419617),
 ('worry', 0.3530368208885193),
 ('destroyed', 0.3495197892189026),
 ('tale', 0.33816206455230713),
 ('cpus', 0.33736613392829895),
 ('lzone', 0.325408935546875),
 ('ears', 0.3231354355812073)]

In [70]:
for topic_idx in range(8):
    print('>>> top 10 relevant words of topic {}'.format(topic_idx))
    topic_vec = d2v_model.docvecs['topic_{}'.format(topic_idx)]
    print(d2v_model.wv.similar_by_vector(topic_vec))

>>> top 10 relevant words of topic 0
[('al', 0.8068424463272095), ('holds', 0.7901829481124878), ('rd3', 0.7826679944992065), ('percentage', 0.7784439921379089), ('behalf', 0.7727761268615723), ('spouse', 0.7723900079727173), ('intensive', 0.7632749080657959), ('crime', 0.7625982165336609), ('molecular', 0.7587553858757019), ('experimental', 0.7542406320571899)]
>>> top 10 relevant words of topic 1
[('echohostname', 0.922170877456665), ('hank', 0.9113080501556396), ('echo', 0.9076107144355774), ('set', 0.9072037935256958), ('woof', 0.8928797841072083), ('aaron', 0.8849927186965942), ('tail', 0.8846719264984131), ('iivx', 0.8805248141288757), ('finished', 0.8791848421096802), ('cdrom', 0.8782916069030762)]
>>> top 10 relevant words of topic 2
[('decs', 0.8631792068481445), ('create', 0.8595727682113647), ('exposuremask', 0.8494325876235962), ('waking', 0.8476630449295044), ('spoke', 0.8470355272293091), ('event', 0.8446255922317505), ('program', 0.8435776829719543), ('meditating', 0.842

In [15]:
word_vec = d2v_model['nasa']
d2v_model.docvecs.most_similar([word_vec])

[('sci_space_411', 0.6604948043823242),
 ('comp_sys_mac_hardware_42', 0.6437797546386719),
 ('soc_religion_christian_278', 0.6072692275047302),
 ('rec_sport_baseball_549', 0.5811659097671509),
 ('comp_windows_x_404', 0.5668190717697144),
 ('rec_sport_hockey_233', 0.5499863028526306),
 ('soc_religion_christian_28', 0.5372079610824585),
 ('comp_sys_mac_hardware_476', 0.47990018129348755),
 ('comp_sys_mac_hardware_125', 0.47634610533714294),
 ('soc_religion_christian_226', 0.4659426808357239)]

In [107]:
d2v_model.docvecs.n_similarity(['topic_0', 'topic_2'], ['topic_3', 'topic_4'])

0.644150725372538

In [108]:
d2v_model.docvecs.similarity('topic_0', 'topic_2')

0.3563553612509076