# Topic2Vec_20newsgroups on Gensim

In [51]:
# !pip install pyorient

In [7]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
from tqdm import tqdm

## 1. IMPORTING DOCS FROM 20 NEWSGROUPS DATASET

In [8]:
from sklearn.datasets import fetch_20newsgroups
categories = ['comp.sys.ibm.pc.hardware',
'comp.sys.mac.hardware',
'comp.windows.x',
'rec.sport.baseball',
'rec.sport.hockey',
'sci.med',
'sci.space',
'soc.religion.christian']

n_topics = len(categories)

categories_source = {}

for cat in categories:
    categories_source[cat] = cat.replace('.', '_')

In [9]:
categories_source

{'comp.sys.ibm.pc.hardware': 'comp_sys_ibm_pc_hardware',
 'comp.sys.mac.hardware': 'comp_sys_mac_hardware',
 'comp.windows.x': 'comp_windows_x',
 'rec.sport.baseball': 'rec_sport_baseball',
 'rec.sport.hockey': 'rec_sport_hockey',
 'sci.med': 'sci_med',
 'sci.space': 'sci_space',
 'soc.religion.christian': 'soc_religion_christian'}

In [10]:
newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=categories)

In [11]:
for i,j in categories_source.items():
    print(i,j)

comp.sys.ibm.pc.hardware comp_sys_ibm_pc_hardware
comp.sys.mac.hardware comp_sys_mac_hardware
comp.windows.x comp_windows_x
rec.sport.baseball rec_sport_baseball
rec.sport.hockey rec_sport_hockey
sci.med sci_med
sci.space sci_space
soc.religion.christian soc_religion_christian


#### TOTAL NUMBER OF DOC

In [8]:
n_docs = newsgroups_train.filenames.shape[0]
n_docs

4744

In [9]:
type(newsgroups_train)

sklearn.utils.Bunch

In [10]:
newsgroups_train.filenames

array(['/home/aimladmin/scikit_learn_data/20news_home/20news-bydate-train/sci.space/61065',
       '/home/aimladmin/scikit_learn_data/20news_home/20news-bydate-train/rec.sport.hockey/52618',
       '/home/aimladmin/scikit_learn_data/20news_home/20news-bydate-train/comp.windows.x/67032',
       ...,
       '/home/aimladmin/scikit_learn_data/20news_home/20news-bydate-train/rec.sport.hockey/52576',
       '/home/aimladmin/scikit_learn_data/20news_home/20news-bydate-train/soc.religion.christian/20809',
       '/home/aimladmin/scikit_learn_data/20news_home/20news-bydate-train/soc.religion.christian/20733'],
      dtype='<U96')

In [11]:
len(newsgroups_train.data)

4744

In [28]:
newsgroups_train.data[0]

"Hmmm. I seem to recall that the attraction of solid state record-\nplayers and radios in the 1960s wasn't better performance but lower\nper-unit cost than vacuum-tube systems.\n\n\tMind you, my father was a vacuum-tube fan in the 60s (Switched\nto solid-state in the mid-seventies and then abruptly died; no doubt\nthere's a lesson in that) and his account could have been biased."

### Spacy Playground

In [76]:
import spacy
import pandas as pd

# for quick reference goes to https://github.com/hailusong/lda2vec/blob/master/lda2vec/preprocess.py
# nlp = spacy.load('en')
nlp = spacy.load('en_core_web_sm')

if 1 == 1:
    parsed_review = nlp('Hello the > < wolrd!\nWhere we Are are Be become never today unfortunately?\nWill that The Walnurs or they or he or she join us as a lovely team\'s party')
    token_attributes = [(token.orth_,
                     token.lemma_,
                     token.prob,
                     token.is_stop,
                     token.is_alpha,
                     token.is_punct,
                     token.is_space,
                     token.like_num,
                     token.is_oov)
                    for token in parsed_review]

    df = pd.DataFrame(token_attributes,
                      columns=['text',
                               'lemma',
                               'log_probability',
                               'stop?',
                               'isalpha?',
                               'punctuation?',
                               'whitespace?',
                               'number?',
                               'out of vocab.?'])

    # df.loc[:, 'stop?':'out of vocab.?'] = (df.loc[:, 'stop?':'out of vocab.?']
    #                                       .applymap(lambda x: 'Yes' if x else ''))

    print(df)

             text          lemma  log_probability  stop?  isalpha?  \
0           Hello          hello            -20.0  False      True   
1             the            the            -20.0   True      True   
2               >              >            -20.0  False     False   
3               <              <            -20.0  False     False   
4           wolrd          wolrd            -20.0  False      True   
5               !              !            -20.0  False     False   
6              \n             \n            -20.0  False     False   
7           Where          where            -20.0  False      True   
8              we         -PRON-            -20.0   True      True   
9             Are             be            -20.0  False      True   
10            are             be            -20.0   True      True   
11             Be             be            -20.0  False      True   
12         become         become            -20.0   True      True   
13          never   

## Phrase Modeling
Code from [Modern NLP in Python](http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb)

In [79]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
import spacy
import pandas as pd
import itertools as it

In [71]:
removed_tokens = set()

def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuation or whitespace
    """
    
    result = token.is_punct or token.is_space or token.is_stop or not token.is_alpha
    if result:
       removed_tokens.add(token.text)

    return result

def line_review(filename):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text
    """
    
    with codecs.open(filename, encoding='utf_8') as f:
        for review in f:
            yield review.replace('\\n', '\n')

def line_sklearn_data(sklearn_data):
    """
    generator function to read in sklearn data from the list
    and un-escape the original line breaks in the text
    """
    
    for data in sklearn_data:
        yield data.replace('\\n', '\n').lower()
    
def lemmatized_sentence_corpus(sentences):
    """
    generator function to use spaCy to parse reviews,
    lemmatize the text, and yield sentences
    """
    
    for parsed_review in nlp.pipe(sentences,
                                  batch_size=10000, n_threads=4):
        
        for sent in parsed_review.sents:
            yield ' '.join([token.lemma_ for token in sent if not punct_space(token)])

### Unigram

In [72]:
import os
import codecs

In [73]:
intermediate_directory = './'

In [74]:
unigram_sentences_filepath = os.path.join(intermediate_directory,
                                          'unigram_sentences_all.txt')

In [70]:
%%time
from tqdm import tqdm

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:
    removed_tokens = set()
    print('Original >>>')
    print(newsgroups_train.data[1:2])

    print('New >>>')
    for sentence in tqdm(lemmatized_sentence_corpus(line_sklearn_data(newsgroups_train.data[1:2]))):
        print(sentence + '\n')

    print(removed_tokens)



0it [00:00, ?it/s][A[A

16it [00:00, 219.35it/s][A[A

Original >>>
["Well, here it is, NHL in the year 2000.\nI got these from a very reliable source in a dream some years ago and \nalthough I initially thought I had just been taking too many too strong \ndrugs now it seems the realization has really begun...  You can see the \nleague has already started to move to this direction.\n\n   *The Walt Disney Conference*\nAnaheim Mighty Chipmunks    -Franchise name to be changed after each new \nLA Kings                      hockey movie         \nLA Flames                   -We've seen some of that\nSan Jose Sharks\nSan Diego Bruins\nTijuana Red Wings   -Detroit's hockey team will follow its car industry...\nDallas Stars           \nHouston Oilers\nTexas Rangers\nSeattle Canucks\n\n   *The Norm Green Conference*\nAlabama White Hawks\nBiloxi Blues\nTampa Bay Lightning\nMiami Blades\nHelsinki Jets        -You've heard them starting getting used to the anthem\nMontreal Quebecois (sp?)                 -There will be no 'Canada'\nAtlanta Devils\nOr

In [75]:
%%time
from tqdm import tqdm

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:
    with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for sentence in tqdm(lemmatized_sentence_corpus(line_sklearn_data(newsgroups_train.data))):
            f.write(sentence + '\n')



0it [00:00, ?it/s][A[A

1it [03:59, 239.39s/it][A[A

3808it [03:59, 15.90it/s][A[A

7049it [03:59, 29.42it/s][A[A

10410it [03:59, 43.43it/s][A[A

13490it [03:59, 56.26it/s][A[A

16557it [03:59, 69.02it/s][A[A

20582it [03:59, 85.76it/s][A[A

23888it [04:00, 99.49it/s][A[A

27027it [04:00, 112.52it/s][A[A

30535it [04:00, 127.07it/s][A[A

34237it [04:00, 142.42it/s][A[A

37924it [04:00, 157.69it/s][A[A

41395it [04:00, 172.04it/s][A[A

44794it [04:00, 186.09it/s][A[A

48137it [04:00, 199.88it/s][A[A

51430it [04:00, 213.47it/s][A[A

54673it [04:01, 226.82it/s][A[A

58015it [04:01, 240.59it/s][A[A

61372it [04:01, 254.41it/s][A[A

64936it [04:01, 269.07it/s][A[A

68285it [04:01, 282.83it/s][A[A

70883it [04:01, 293.46it/s][A[A

CPU times: user 6min 22s, sys: 33 s, total: 6min 55s
Wall time: 4min 1s


In [80]:
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [82]:
# print a few example lemmatized sentences
for unigram_sentence in it.islice(unigram_sentences, 230, 240):
    print(' '.join(unigram_sentence))
    print('')

child able read endeavor inculcate child right reading scripture concentrate pleasant reading gloss bad one explain away unexplainable mystery

circular argument self evdent fact truth unreason belief fear hell meat religion child eat day

doubt course mean wrath sort child learn away brain matter concern god

considerable effect child adult superstition teach nearly impossible remove

lead ask theist truly objective question god hell heaven angel soul rest

moment aside notion god

exist look unbiased point view

obviously theist somewhat especially present mythical god homeric roman egyptian etc

aside assumption god existence question impartially

stephen



### Bigram

In [83]:
bigram_model_filepath = os.path.join(intermediate_directory, 'bigram_model_all')

In [84]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 1 == 1:
    # gensim Phrases
    # automatically detect common phrases – multi-word expressions / word n-grams – from a stream of sentences.
    #
    # example
    # >>> sentences = Text8Corpus(datapath('testcorpus.txt'))     <- load training example data
    # >>> phrases = Phrases(sentences, min_count=1, threshold=1)  <- train the Phrases model
    # >>> phrases[[u'trees', u'graph', u'minors']]                <- apply trained model to sentence
    # [u'trees_graph', u'minors']                                 <- identify phrases 'trees_graph'
    #
    # >>> phrases.add_vocab([["hello", "world"], ["meow"]])       <- update model with new sentences
    # 
    # >>> bigram = Phraser(phrases)                               <- construct faster model (this is only an wrapper)
    # >>> bigram[[u'trees', u'graph', u'minors']]                 <- apply model to sentence
    # [u'trees_graph', u'minors']
    #
    bigram_model = Phrases(unigram_sentences)
    bigram_model.save(bigram_model_filepath)

CPU times: user 1.62 s, sys: 27.8 ms, total: 1.65 s
Wall time: 1.7 s


In [85]:
# load the finished model from disk
bigram_model = Phrases.load(bigram_model_filepath)

In [86]:
# Prepare to apply the trained Phrases model to the unigram lemmatized sentences
bigram_sentences_filepath = os.path.join(intermediate_directory,
                                         'bigram_sentences_all.txt')

In [87]:
%%time
from tqdm import tqdm

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:
    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for unigram_sentence in tqdm(unigram_sentences):
            bigram_sentence = ' '.join(bigram_model[unigram_sentence])
            f.write(bigram_sentence + '\n')





1955it [00:00, 19420.56it/s][A[A

3848it [00:00, 19152.32it/s][A[A

5593it [00:00, 18596.84it/s][A[A

7311it [00:00, 18221.12it/s][A[A

9132it [00:00, 18223.97it/s][A[A

10786it [00:00, 17944.58it/s][A[A

12556it [00:00, 17908.66it/s][A[A

14169it [00:00, 17639.93it/s][A[A

16100it [00:00, 17825.19it/s][A[A

18280it [00:01, 18223.71it/s][A[A

20159it [00:01, 18258.21it/s][A[A

22015it [00:01, 17862.26it/s][A[A

23884it [00:01, 17924.26it/s][A[A

25671it [00:01, 17926.11it/s][A[A

27596it [00:01, 18011.75it/s][A[A

29617it [00:01, 18147.75it/s][A[A

31498it [00:01, 18143.00it/s][A[A

33358it [00:01, 18066.74it/s][A[A

35187it [00:01, 18078.39it/s][A[A

37048it [00:02, 18100.96it/s][A[A

38876it [00:02, 18087.58it/s][A[A

40709it [00:02, 18101.68it/s][A[A

42529it [00:02, 18100.82it/s][A[A

44345it [00:02, 18066.47it/s][A[A

46138it [00:02, 17954.42it/s][A[A

47873it [00:02, 17772.16it/s][A[A

49496it [00:02, 17450.01it/s][A[A

50

CPU times: user 4.04 s, sys: 57.2 ms, total: 4.1 s
Wall time: 4.15 s


In [88]:
# print out a few examples of bigram on lemmatized sentences.
# note that vice_versa, original_question, etc. are now considered as phrases
bigram_sentences = LineSentence(bigram_sentences_filepath)

for bigram_sentence in it.islice(bigram_sentences, 230, 240):
    print(' '.join(bigram_sentence))
    print('')

child able read endeavor inculcate child right reading scripture concentrate pleasant reading gloss bad one explain away unexplainable mystery

circular argument self evdent fact truth unreason belief fear hell meat religion child eat day

doubt course mean wrath sort child learn away brain matter concern god

considerable effect child adult superstition teach nearly impossible remove

lead ask theist truly objective question god hell heaven angel soul rest

moment aside notion god

exist look unbiased point_view

obviously theist somewhat especially present mythical god homeric roman egyptian etc

aside assumption god existence question impartially

stephen



### Trigram

In [89]:
trigram_model_filepath = os.path.join(intermediate_directory,
                                      'trigram_model_all')

In [90]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 1 == 1:
    trigram_model = Phrases(bigram_sentences)
    trigram_model.save(trigram_model_filepath)

CPU times: user 1.66 s, sys: 20.8 ms, total: 1.68 s
Wall time: 1.68 s


In [91]:
# load the finished model from disk
trigram_model = Phrases.load(trigram_model_filepath)

In [92]:
type(trigram_model)

gensim.models.phrases.Phrases

In [93]:
trigram_sentences_filepath = os.path.join(intermediate_directory,
                                          'trigram_sentences_all.txt')

In [94]:
%%time
from tqdm import tqdm

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:
    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for bigram_sentence in tqdm(bigram_sentences):
            trigram_sentence = ' '.join(trigram_model[bigram_sentence])
            f.write(trigram_sentence + '\n')





1838it [00:00, 18131.32it/s][A[A

3660it [00:00, 18231.10it/s][A[A

5447it [00:00, 18094.32it/s][A[A

7228it [00:00, 18031.71it/s][A[A

9192it [00:00, 18331.69it/s][A[A

11099it [00:00, 18472.01it/s][A[A

12878it [00:00, 18367.26it/s][A[A

14569it [00:00, 17966.51it/s][A[A

16323it [00:00, 17919.62it/s][A[A

18623it [00:01, 18424.51it/s][A[A

20621it [00:01, 18562.28it/s][A[A

22526it [00:01, 18536.49it/s][A[A

24408it [00:01, 18463.20it/s][A[A

26253it [00:01, 18454.96it/s][A[A

28248it [00:01, 18552.67it/s][A[A

30326it [00:01, 18690.04it/s][A[A

32271it [00:01, 18724.68it/s][A[A

34212it [00:01, 18661.49it/s][A[A

36105it [00:01, 18632.24it/s][A[A

38062it [00:02, 18680.24it/s][A[A

39955it [00:02, 18660.88it/s][A[A

41829it [00:02, 18661.41it/s][A[A

43702it [00:02, 18655.53it/s][A[A

45700it [00:02, 18709.15it/s][A[A

47605it [00:02, 18648.81it/s][A[A

49466it [00:02, 18609.50it/s][A[A

51291it [00:02, 18558.05it/s][A[A

53

CPU times: user 3.34 s, sys: 84.8 ms, total: 3.43 s
Wall time: 3.38 s


In [95]:
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [96]:
# print a few trigram examples on lemmatized sentences
# note that bigram words are still there but we see a few trigrams now.
# like san_jose_sharks, learn_how_to
for trigram_sentence in it.islice(trigram_sentences, 230, 240):
    print(' '.join(trigram_sentence))
    print('')

child able read endeavor inculcate child right reading scripture concentrate pleasant reading gloss bad one explain away unexplainable mystery

circular argument self evdent fact truth unreason belief fear hell meat religion child eat day

doubt course mean wrath sort child learn away brain matter concern god

considerable effect child adult superstition teach nearly impossible remove

lead ask theist truly objective question god hell heaven angel soul rest

moment aside notion god

exist look unbiased point_view

obviously theist somewhat especially present mythical god homeric roman egyptian etc

aside assumption god existence question impartially

stephen



In [97]:
trigram_reviews_filepath = os.path.join(intermediate_directory,
                                        'trigram_transformed_reviews_all.txt')

In [98]:
%%time

# NOT NEED TO RUN THIS AS WE HAVE DONE ALL STEPS HERE BEFORE
# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:
    with codecs.open(trigram_reviews_filepath, 'w', encoding='utf_8') as f:
        for parsed_review in nlp.pipe(line_review(review_txt_filepath),
                                      batch_size=10000, n_threads=4):
            # lemmatize the text, removing punctuation and whitespace
            unigram_review = [token.lemma_ for token in parsed_review
                              if not punct_space(token)]
            
            # apply the first-order and second-order phrase models
            bigram_review = bigram_model[unigram_review]
            trigram_review = trigram_model[bigram_review]
            
            # remove any remaining stopwords
            trigram_review = [term for term in trigram_review
                              if term not in spacy.en.STOPWORDS]
            
            # write the transformed review as a line in the new file
            trigram_review = u' '.join(trigram_review)
            f.write(trigram_review + '\n')

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.87 µs


In [99]:
# print('Original:' + u'\n')
#
# for review in it.islice(line_review(review_txt_filepath), 11, 12):
#    print(review)
trigram_reviews_filepath = trigram_sentences_filepath

print('----' + u'\n')
print('Transformed:' + u'\n')

with codecs.open(trigram_reviews_filepath, encoding='utf_8') as f:
    for review in it.islice(f, 11, 12):
        print(review)

----

Transformed:

norm green conference alabama white hawk biloxi blue tampa_bay_lightning miami blade helsinki jet hear start get anthem montreal quebecois sp



# 2. LDA to find the topic most-associated with each word

### Create Vocabulary

In [100]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings

In [101]:
trigram_dictionary_filepath = os.path.join(intermediate_directory,
                                           'trigram_dict_all.dict')

In [102]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to learn the dictionary yourself.
if 1 == 1:
    print('loading data from {}'.format(trigram_reviews_filepath))
    trigram_reviews = LineSentence(trigram_reviews_filepath)

    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary(tqdm(trigram_reviews))
    
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()

    trigram_dictionary.save(trigram_dictionary_filepath)



0it [00:00, ?it/s][A[A

4671it [00:00, 46096.13it/s][A[A

loading data from ./trigram_sentences_all.txt




8946it [00:00, 44515.76it/s][A[A

12994it [00:00, 43165.21it/s][A[A

17221it [00:00, 42632.22it/s][A[A

21418it [00:00, 42762.15it/s][A[A

25709it [00:00, 42794.06it/s][A[A

30181it [00:00, 43063.42it/s][A[A

34660it [00:00, 43254.59it/s][A[A

39398it [00:00, 43739.67it/s][A[A

44234it [00:01, 44192.02it/s][A[A

49083it [00:01, 44564.95it/s][A[A

53788it [00:01, 44782.83it/s][A[A

58405it [00:01, 44546.54it/s][A[A

61612it [00:01, 44482.13it/s][A[A

CPU times: user 1.46 s, sys: 13.1 ms, total: 1.48 s
Wall time: 1.45 s


In [103]:
# load the finished dictionary from disk
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

In [104]:
# This module implements the concept of a Dictionary – a mapping between words and their integer ids.
# ops supported: 
# - doc2bow
# - doc2idx
# - filter_extremes
# - filter_n_most_frequent
# - compactify
#
# compactify: Assign new word ids to all words, shrinking any gaps.
print(type(trigram_dictionary), trigram_dictionary.num_nnz, trigram_dictionary.num_docs, trigram_dictionary.num_pos)

<class 'gensim.corpora.dictionary.Dictionary'> 348201 61612 364908


In [105]:
trigram_bow_filepath = os.path.join(intermediate_directory,
                                    'trigram_bow_corpus_all.mm')

In [106]:
def trigram_bow_generator(filepath):
    """
    generator function to read reviews from a file
    and yield a bag-of-words representation
    """
    
    for review in LineSentence(filepath):
        yield trigram_dictionary.doc2bow(review)

In [107]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to build the bag-of-words corpus yourself.
if 1 == 1:
    # generate bag-of-words representations for
    # all reviews and save them as a matrix
    MmCorpus.serialize(trigram_bow_filepath,
                       trigram_bow_generator(trigram_reviews_filepath))

CPU times: user 2.24 s, sys: 58.9 ms, total: 2.3 s
Wall time: 2.3 s


In [108]:
# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

In [109]:
# Corpus in the Matrix Market format.
print(type(trigram_bow_corpus))

<class 'gensim.corpora.mmcorpus.MmCorpus'>


In [110]:
document_index = 0
[trigram_dictionary[id] for (id, bow_count) in trigram_bow_corpus[document_index]]

['hmmm']

In [111]:
document_index = 1
[trigram_dictionary.id2token[id] for (id, bow_count) in trigram_bow_corpus[document_index]]

['cost',
 'good',
 'low',
 'performance',
 'player',
 'radio',
 'recall',
 'system',
 'tube',
 'unit',
 'vacuum']

## 2.2 LDA implementation

In [112]:
lda_model_filepath = os.path.join(intermediate_directory, 'lda_model_all')

In [113]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to train the LDA model yourself.
if 1 == 1:
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=7,
                           id2word=trigram_dictionary,
                           workers=3)

    lda.save(lda_model_filepath)

CPU times: user 15.9 s, sys: 734 ms, total: 16.7 s
Wall time: 20.5 s


In [114]:
# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

In [136]:
# find out topics by word
for test_word in ['nasa', 'space', 'good', 'go']:
    test_id = trigram_dictionary.token2id[test_word]
    test_topics = lda.get_term_topics(test_id)
    print('{} id is {}, topics is {}'.format(trigram_dictionary[test_id], test_id, test_topics))

nasa id is 1827, topics is []
space id is 172, topics is []
good id is 2, topics is [(5, 0.012163425)]


In [130]:
def explore_topic(topic_number, topn=25):
    """
    accept a user-supplied topic number and
    print out a formatted list of the top terms
    """
        
    print('{:20} {}'.format('term', 'frequency') + '\n')

    for term, frequency in lda.show_topic(topic_number, topn=50):
        print('{:20} {:.3f}'.format(term, round(frequency, 3)))

In [131]:
explore_topic(6, topn=50)

term                 frequency

go                   0.011
file                 0.009
new                  0.008
say                  0.007
work                 0.007
god                  0.007
run                  0.006
like                 0.006
question             0.005
help                 0.005
john                 0.005
good                 0.005
people               0.004
great                0.004
send                 0.004
lead                 0.004
time                 0.004
man                  0.004
x                    0.004
subject              0.004
right                0.004
start                0.004
use                  0.004
mac                  0.004
appear               0.004
keyboard             0.003
able                 0.003
look                 0.003
line                 0.003
try                  0.003
come                 0.003
change               0.003
post                 0.003
couple               0.003
know                 0.003
application          0.

### 2.3 pyLDAVis

In [118]:
LDAvis_data_filepath = os.path.join(intermediate_directory, 'ldavis_prepared')

In [128]:
%%time
import pickle

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                              trigram_dictionary)

    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

CPU times: user 20 s, sys: 181 ms, total: 20.2 s
Wall time: 22.3 s


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [129]:
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

pyLDAvis.display(LDAvis_prepared)

# 3. TOPIC2VEC

In [51]:
# argmax example:
# >>> kkk
# array([[1, 2, 3],
#       [0, 4, 2]])
# >>> np.argmax(kkk, 0)
# array([0, 1, 0])
# >>> np.argmax(kkk, 1)
# array([2, 1])
#
# this will select the topic with the most word weight for each word in the vocabulary
# after this, we can easily lookup the best topic of each vocabulary word by 
# most_p_topic[word_voca_index] -> word's topic (0 -7 in this case) 
most_p_topic = np.argmax(per_topic_distr_LDA, axis=0)

In [52]:
# per_topic_distr_LDA

In [53]:
most_p_topic.shape

(17197,)

In [54]:
word_and_topic = zip(tf_feature_names, most_p_topic)
# word2topic_dict = {word : 'topic_' + np.array_str(topic) for word, topic in word_and_topic}
word2topic_dict = {word : 'topic_{}'.format(topic) for word, topic in word_and_topic}

In [55]:
# list the top 5 words and their belonged topics
from itertools import islice
list(islice(word2topic_dict.items(), 5))

[('disrespectful', 'topic_4'),
 ('closed', 'topic_4'),
 ('fraud', 'topic_5'),
 ('ventura', 'topic_7'),
 ('powerful', 'topic_1')]

## 3.1 Tokenization

In [56]:
def tokenizer(document):
    text = "".join([ch for ch in document if ch not in string.punctuation])
    text_list = text.split()
    normalized_text = [x.lower() for x in text_list]
    # Define an empty list
    nostopwords_text = []
    # Scan the words
    for word in normalized_text:
        # Determine if the word is contained in the stop words list
        if word not in ENGLISH_STOP_WORDS:
            # If the word is not contained I append it
            nostopwords_text.append(word)
    tokenized_text = [word for word in nostopwords_text if re.search('[a-zA-Z]{2,}', word)]
            
    return tokenized_text

In [57]:
def map_doc_to_topic(tokenized_text, prefix, doc_id_number, word2topic_dict):
    doc_to_topic_list = [prefix + '_' + str(doc_id_number)]
    # print('adding doc_to_topic header element {}'.format(doc_to_topic_list[0]))

    for word in tokenized_text:
        if word in word2topic_dict.keys():
            doc_to_topic_list.append(word2topic_dict[word])
        # else:
        #    print('{} not found in word2topic_dict.keys'.format(word))

    return doc_to_topic_list

In [58]:
from gensim.models.deprecated.doc2vec import LabeledSentence

In [59]:
class LabeledLineSentence_training(object):
    def __init__(self, sources, word2topic_dict):
        self.labels_list = word2topic_dict
        self.sources = sources
        flipped = {}
        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')
    
    def __iter__(self):
        print('len of sources is {}'.format(len(self.sources)))
        for source, prefix in self.sources.items():
            print(source)
            newsgroups_train_cat = fetch_20newsgroups(subset='train',
                                                      remove=('headers', 'footers', 'quotes'),
                                                      categories=[source])
            # print('len of newsgroups_train_cat is {}'.format(len(newsgroups_train_cat)))
            # (Pdb) newsgroups_train_cat.keys() -> 
            # dict_keys(['data', 'filenames', 'target', 'description', 'DESCR', 'target_names'])
            # import pdb; pdb.set_trace()
            for idx, doc in enumerate(newsgroups_train_cat.data):
                words_doc=tokenizer(doc)
                tags_doc = map_doc_to_topic(words_doc, prefix, idx, word2topic_dict)
                yield LabeledSentence(words = words_doc,
                                                     tags = tags_doc)
                
    def to_array(self):
        self.sentences = []
        print('len of sources is {}'.format(len(self.sources)))
        for source, prefix in self.sources.items():
            newsgroups_train_cat = fetch_20newsgroups(subset='train',
                                                      remove=('headers', 'footers', 'quotes'),
                                                      categories=[source])
            # print('len of newsgroups_train_cat is {}'.format(len(newsgroups_train_cat)))
            # import pdb; pdb.set_trace()
            # (Pdb) type(newsgroups_train_cat) -> <class 'sklearn.utils.Bunch'> => len is 6
            # (Pdb) type(newsgroups_train_cat.data) -> <class 'list'>
            # (Pdb) len(newsgroups_train_cat.data) -> 593
            # (Pdb) newsgroups_train_cat.data[0] -> document 1 strings, with newlines inside
            # (Pdb) newsgroups_train_cat.data[1] -> document 2 strings, with newlines inside
            # (Pdb) newsgroups_train_cat.target.shape -> (593,)
            # (Pdb) newsgroups_train_cat.target.max() -> 0
            for idx, doc in enumerate(newsgroups_train_cat.data):
                words_doc=tokenizer(doc)
                tags_doc = map_doc_to_topic(words_doc, prefix, idx, word2topic_dict)
                self.sentences.append(LabeledSentence(words = words_doc,
                                                     tags = tags_doc))
        return self.sentences
            
    def sentences_perm(self):
        shuffle(self.sentences)
        return self.sentences

## 3.1 Training

### Revisit parameters before training

In [60]:
categories_source

{'comp.sys.ibm.pc.hardware': 'comp_sys_ibm_pc_hardware',
 'comp.sys.mac.hardware': 'comp_sys_mac_hardware',
 'comp.windows.x': 'comp_windows_x',
 'rec.sport.baseball': 'rec_sport_baseball',
 'rec.sport.hockey': 'rec_sport_hockey',
 'sci.med': 'sci_med',
 'sci.space': 'sci_space',
 'soc.religion.christian': 'soc_religion_christian'}

In [61]:
list(islice(word2topic_dict.items(), 5))

[('disrespectful', 'topic_4'),
 ('closed', 'topic_4'),
 ('fraud', 'topic_5'),
 ('ventura', 'topic_7'),
 ('powerful', 'topic_1')]

### Tokenization

In [62]:
# For all input news group documents
#    For all sentences in that document
#        Generate gensim.models.deprecated.doc2vec.LabeledSentence
#            (words, tags with group name and word's topics)
it = LabeledLineSentence_training(categories_source, word2topic_dict)

#### Quote notes about LabeledSentence and TaggedDocument
1. LabeledSentence is an older, deprecated name for the same simple object-type to encapsulate a text-example that is now called TaggedDocument. 
2. Any objects that have words and tags properties, each a list, will do.
    - words is always a list of strings
    - tags can be a mix of integers and strings, but in the common and most-efficient case, is just a list with a single id integer, starting at 0.)

#### [Info about how to use Gensim doc2vec](https://medium.com/@mishra.thedeepak/doc2vec-in-a-simple-way-fa80bfe81104)
1. In this example it uses filename and doc label
2. And after the training it can print the vector of the file using its name

    ```
    docvec = d2v_model.docvecs[‘1.txt’] #if string tag used in training
    print docvec
    ```
3. Or to get most similar document with similarity scores using document-index

    ```
    similar_doc = d2v_model.docvecs.most_similar(14) 
    print similar_doc
    ```

In [63]:
# print the first new group's item #1
inspect_item = next(iter(it))
print(type(inspect_item))
print(inspect_item)
print(len(inspect_item.tags), len(inspect_item.words))
print(inspect_item.tags[:10], inspect_item.words[:10])

len of sources is 8
sci.space
<class 'gensim.models.deprecated.doc2vec.LabeledSentence'>
LabeledSentence(['lunar', 'satellite', 'needs', 'fuel', 'regular', 'orbit', 'corrections', 'fuel', 'runs', 'crash', 'months', 'orbits', 'apollo', 'motherships', 'changed', 'noticeably', 'lunar', 'missions', 'lasting', 'days', 'possible', 'stable', 'orbits', 'moons', 'gravitational', 'field', 'poorly', 'mapped', 'know', 'perturbations', 'sun', 'earth', 'relatively', 'minor', 'issues', 'low', 'altitudes', 'big', 'problem', 'moons', 'gravitational', 'field', 'quite', 'lumpy', 'irregular', 'distribution', 'mass', 'moon'], ['sci_space_0', 'topic_5', 'topic_5', 'topic_5', 'topic_7', 'topic_5', 'topic_5', 'topic_1', 'topic_5', 'topic_5', 'topic_1', 'topic_1', 'topic_5', 'topic_5', 'topic_1', 'topic_1', 'topic_5', 'topic_5', 'topic_1', 'topic_7', 'topic_1', 'topic_1', 'topic_1', 'topic_5', 'topic_5', 'topic_1', 'topic_5', 'topic_5', 'topic_5', 'topic_5'])
30 48
['sci_space_0', 'topic_5', 'topic_5', 'topic_



In [64]:
# type(models.Doc2Vec)
model = models.Doc2Vec(size=100, window=10, min_count=4, dm=1, dbow_words=1,
                              workers=50, alpha=0.025, min_alpha=0.025) # use fixed learning rate
model.build_vocab(it.to_array())



len of sources is 8




In [65]:
from tqdm import tqdm
for epoch in tqdm(range(20)):
    model.train(it.sentences_perm(), total_examples=model.corpus_count, epochs=1)
    model.alpha -= 0.002 # decrease the learning rate
    model.min_alpha = model.alpha # fix the learning rate, no decay

100%|██████████| 20/20 [04:39<00:00, 13.99s/it]


In [66]:
fname =  os.getcwd() # Prints the working directory
fname = fname + '/topic2vec_20NG_2_ndoc' + str(n_docs) + 'n_topic' + str(n_topics) + '.model'
model.save(fname)

### Show results
A quick info about [how to use gensim doc2vec model to query words by label vector or vice versa](https://github.com/RaRe-Technologies/gensim/issues/1397)

1. search words using word

    ```
    model.most_similar('word')
    # only similar words were returned but not labels
    ```
2. search label by label
    - use model.docvecs.most_similar to search for similar labels using labels
3. search words by label

    ```
    model.docvecs['label']
    model.similar_by_vector(label_vec)
    # only similar words were returned
    ```
4. search labels by word

    ```
    word_vec = model['word']
    model.docvecs.most_similar([word_vec])
    # returns similar labels
    ```

In [67]:
from gensim import corpora, models, similarities

# load the model back
fname = fname if fname is not None else 'topic2vec_20NG_2_ndoc4744n_topic8.model'
print('loading model from {}'.format(fname))
d2v_model = models.doc2vec.Doc2Vec.load(fname)

loading model from /home/aimladmin/notebooks/home/ksong/Topic2Vec/topic2vec_20NG_2_ndoc4744n_topic8.model


In [68]:
# list the top 5 tags in the model
from itertools import islice

paragraphs_tag = d2v_model.docvecs.doctags
type(paragraphs_tag)
print(len(paragraphs_tag), list(islice(paragraphs_tag.items(),5)))

4752 [('comp_sys_ibm_pc_hardware_354', Doctag(offset=2148, word_count=224, doc_count=1)), ('comp_sys_ibm_pc_hardware_326', Doctag(offset=2120, word_count=13, doc_count=1)), ('comp_windows_x_442', Doctag(offset=4601, word_count=72, doc_count=1)), ('soc_religion_christian_326', Doctag(offset=1521, word_count=10, doc_count=1)), ('comp_sys_ibm_pc_hardware_291', Doctag(offset=2085, word_count=58, doc_count=1))]


In [69]:
ragraphs_vector = d2v_model.docvecs.doctag_syn0
ragraphs_vector.shape

  """Entry point for launching an IPython kernel.


(4752, 100)

In [105]:
d2v_model.docvecs.most_similar(positive = ['sci_space_96'])

[('rec_sport_baseball_389', 0.35365036129951477),
 ('rec_sport_baseball_378', 0.34690025448799133),
 ('comp_sys_ibm_pc_hardware_563', 0.3393045663833618),
 ('sci_med_524', 0.3304837942123413),
 ('comp_sys_mac_hardware_518', 0.32284557819366455),
 ('sci_med_125', 0.3165658116340637),
 ('rec_sport_hockey_94', 0.28935739398002625),
 ('comp_sys_mac_hardware_292', 0.289227157831192),
 ('rec_sport_hockey_203', 0.285552978515625),
 ('rec_sport_baseball_449', 0.28499162197113037)]

In [115]:
label_vec = d2v_model.docvecs['sci_space_96']
d2v_model.wv.similar_by_vector(label_vec)

[('lords', 0.39834630489349365),
 ('doesnt', 0.3908179998397827),
 ('darling', 0.3805291950702667),
 ('ahola', 0.35309669375419617),
 ('worry', 0.3530368208885193),
 ('destroyed', 0.3495197892189026),
 ('tale', 0.33816206455230713),
 ('cpus', 0.33736613392829895),
 ('lzone', 0.325408935546875),
 ('ears', 0.3231354355812073)]

In [70]:
for topic_idx in range(8):
    print('>>> top 10 relevant words of topic {}'.format(topic_idx))
    topic_vec = d2v_model.docvecs['topic_{}'.format(topic_idx)]
    print(d2v_model.wv.similar_by_vector(topic_vec))

>>> top 10 relevant words of topic 0
[('al', 0.8068424463272095), ('holds', 0.7901829481124878), ('rd3', 0.7826679944992065), ('percentage', 0.7784439921379089), ('behalf', 0.7727761268615723), ('spouse', 0.7723900079727173), ('intensive', 0.7632749080657959), ('crime', 0.7625982165336609), ('molecular', 0.7587553858757019), ('experimental', 0.7542406320571899)]
>>> top 10 relevant words of topic 1
[('echohostname', 0.922170877456665), ('hank', 0.9113080501556396), ('echo', 0.9076107144355774), ('set', 0.9072037935256958), ('woof', 0.8928797841072083), ('aaron', 0.8849927186965942), ('tail', 0.8846719264984131), ('iivx', 0.8805248141288757), ('finished', 0.8791848421096802), ('cdrom', 0.8782916069030762)]
>>> top 10 relevant words of topic 2
[('decs', 0.8631792068481445), ('create', 0.8595727682113647), ('exposuremask', 0.8494325876235962), ('waking', 0.8476630449295044), ('spoke', 0.8470355272293091), ('event', 0.8446255922317505), ('program', 0.8435776829719543), ('meditating', 0.842

In [15]:
word_vec = d2v_model['nasa']
d2v_model.docvecs.most_similar([word_vec])

[('sci_space_411', 0.6604948043823242),
 ('comp_sys_mac_hardware_42', 0.6437797546386719),
 ('soc_religion_christian_278', 0.6072692275047302),
 ('rec_sport_baseball_549', 0.5811659097671509),
 ('comp_windows_x_404', 0.5668190717697144),
 ('rec_sport_hockey_233', 0.5499863028526306),
 ('soc_religion_christian_28', 0.5372079610824585),
 ('comp_sys_mac_hardware_476', 0.47990018129348755),
 ('comp_sys_mac_hardware_125', 0.47634610533714294),
 ('soc_religion_christian_226', 0.4659426808357239)]

In [107]:
d2v_model.docvecs.n_similarity(['topic_0', 'topic_2'], ['topic_3', 'topic_4'])

0.644150725372538

In [108]:
d2v_model.docvecs.similarity('topic_0', 'topic_2')

0.3563553612509076