### source: https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [1]:
import pandas as pd

### Load data

In [10]:
# Load the 1M+ newspaper headlines dataset from Kaggle; malformed rows are skipped.
data = pd.read_csv('../data/abcnews-date-text.csv', on_bad_lines='skip')
# Take an explicit copy so that adding a column below does not write into a
# selection of `data` (which raises pandas' SettingWithCopyWarning).
data_text = data[['headline_text']].copy()
# Keep each row's index label as a regular column so documents can be looked up by it.
data_text['index'] = data_text.index
documents = data_text

### Data pre-processing

In [15]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

import nltk

In [18]:
# Stemmer and lemmatizer are created on first use and then reused, instead of
# being rebuilt for every single token.
_STEMMER = None
_LEMMATIZER = None


def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then Porter-stem the result.

    Args:
        text: the token to normalize.

    Returns:
        The stem of the verb-lemmatized token.
    """
    global _STEMMER, _LEMMATIZER
    if _STEMMER is None or _LEMMATIZER is None:
        _STEMMER = PorterStemmer()
        _LEMMATIZER = WordNetLemmatizer()
    return _STEMMER.stem(_LEMMATIZER.lemmatize(text, pos='v'))

def preprocess(text):
    """Tokenize a raw text and return its filtered, normalized tokens.

    Tokens produced by gensim's ``simple_preprocess`` are kept only when they
    are not in gensim's ``STOPWORDS`` and are longer than three characters;
    each kept token is passed through ``lemmatize_stemming``.

    Args:
        text: the raw document text.

    Returns:
        A list of normalized tokens, in their original order.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(tok)
        for tok in gensim.utils.simple_preprocess(text)
        if tok not in stopwords and len(tok) > 3
    ]

In [19]:
# Pick the row whose 'index' column equals 4310 and take its first column (the headline text).
doc_sample = documents[documents['index'] == 4310].values[0][0]

# Show the raw space-separated words next to the preprocessed tokens.
words = doc_sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

['ratepayers', 'group', 'wants', 'compulsory', 'local', 'govt', 'voting']


 tokenized and lemmatized document: 
['ratepay', 'group', 'want', 'compulsori', 'local', 'govt', 'vote']


In [21]:
# Apply preprocess() to every headline, then preview the first ten token lists.
processed_docs = documents['headline_text'].map(preprocess)
processed_docs.head(10)

0               [decid, commun, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

### Bag-of-Words (BOW) on the dataset

In [26]:
# Build the dictionary of tokens from the tokenized headlines.
dictionary = gensim.corpora.Dictionary(processed_docs)

# Preview the first eleven (id, token) pairs.
for shown, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if shown > 10:
        break

0 broadcast
1 commun
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit


In [27]:
## filter out tokens that appear in
##   fewer than 15 documents, or
##   more than 50% of all documents;
## then keep only the 100K most frequent remaining tokens
dictionary.filter_extremes(
    no_below=15,     # minimum number of documents a token must appear in
    no_above=0.5,    # maximum fraction of documents a token may appear in
    keep_n=100000,   # cap on the number of tokens kept
)

In [28]:
## gensim doc2bow: turn each tokenized headline into (token id, count) pairs
bow_corpus = [dictionary.doc2bow(tokens) for tokens in processed_docs]
bow_corpus[4310]

[(162, 1), (240, 1), (292, 1), (589, 1), (839, 1), (3578, 1), (3579, 1)]

In [30]:
## display the BOW of example document 4310: each token id, its word and its count
bow_doc_4310 = bow_corpus[4310]
for token_id, token_count in bow_doc_4310:
    print("Word {} (\"{}\") appears {} time.".format(token_id,
                                                     dictionary[token_id],
                                                     token_count))

Word 162 ("govt") appears 1 time.
Word 240 ("group") appears 1 time.
Word 292 ("vote") appears 1 time.
Word 589 ("local") appears 1 time.
Word 839 ("want") appears 1 time.
Word 3578 ("compulsori") appears 1 time.
Word 3579 ("ratepay") appears 1 time.


### TF-IDF

In [35]:
from gensim import corpora, models

# Build a TF-IDF model from the bag-of-words corpus.
tfidf = models.TfidfModel(bow_corpus)
# Index the model with the whole corpus to obtain its TF-IDF-transformed version.
corpus_tfidf = tfidf[bow_corpus]

# Preview the transformed representation of the first document.
corpus_tfidf[0]

In [39]:
# Raw (token id, count) pairs of the first document, for comparison with corpus_tfidf[0].
bow_corpus[0]

[(0, 1), (1, 1), (2, 1), (3, 1)]

### LDA model using BOW

In [40]:
# Train a 10-topic LDA model on the raw bag-of-words corpus.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=8,
)

In [42]:
# For every topic of the BOW model, show its words and their relative weights.
for topic_id, topic_terms in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

Topic: 0 
Words: 0.031*"interview" + 0.026*"market" + 0.022*"kill" + 0.016*"donald" + 0.014*"hous" + 0.013*"share" + 0.012*"bank" + 0.011*"rise" + 0.009*"india" + 0.008*"drop"
Topic: 1 
Words: 0.054*"polic" + 0.031*"sydney" + 0.016*"shoot" + 0.016*"investig" + 0.015*"price" + 0.014*"melbourn" + 0.014*"break" + 0.013*"protest" + 0.011*"arrest" + 0.010*"fall"
Topic: 2 
Words: 0.022*"miss" + 0.019*"countri" + 0.015*"speak" + 0.014*"search" + 0.014*"hour" + 0.011*"parti" + 0.011*"bodi" + 0.011*"media" + 0.011*"busi" + 0.010*"final"
Topic: 3 
Words: 0.029*"australia" + 0.020*"world" + 0.015*"open" + 0.015*"australian" + 0.014*"win" + 0.011*"take" + 0.009*"beat" + 0.007*"season" + 0.007*"farmer" + 0.007*"final"
Topic: 4 
Words: 0.020*"trump" + 0.019*"school" + 0.013*"bushfir" + 0.011*"rural" + 0.011*"green" + 0.010*"guilti" + 0.010*"student" + 0.010*"report" + 0.010*"centr" + 0.009*"fund"
Topic: 5 
Words: 0.029*"elect" + 0.019*"live" + 0.015*"news" + 0.014*"farm" + 0.013*"nation" + 0.011*"co

### LDA model using TF-IDF

In [48]:
# Train a second 10-topic LDA model, this time on the TF-IDF-transformed corpus.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=8,
)

In [49]:
# For every topic of the TF-IDF model, show its words and their relative weights.
for topic_id, topic_terms in lda_model_tfidf.print_topics(-1):
    print('Topic {} Word: {}'.format(topic_id, topic_terms))

Topic 0 Word: 0.019*"news" + 0.012*"rural" + 0.012*"weather" + 0.009*"climat" + 0.008*"monday" + 0.007*"nation" + 0.007*"turnbul" + 0.006*"farm" + 0.006*"chang" + 0.006*"august"
Topic 1 Word: 0.011*"australia" + 0.008*"world" + 0.008*"coronaviru" + 0.007*"covid" + 0.007*"test" + 0.007*"live" + 0.007*"australian" + 0.006*"thursday" + 0.006*"market" + 0.005*"cricket"
Topic 2 Word: 0.019*"charg" + 0.016*"murder" + 0.015*"court" + 0.011*"polic" + 0.010*"jail" + 0.009*"accus" + 0.009*"assault" + 0.009*"guilti" + 0.009*"alleg" + 0.008*"sentenc"
Topic 3 Word: 0.016*"countri" + 0.013*"hour" + 0.009*"price" + 0.007*"rise" + 0.007*"wall" + 0.006*"dairi" + 0.006*"dollar" + 0.005*"street" + 0.005*"rat" + 0.005*"lockdown"
Topic 4 Word: 0.009*"health" + 0.007*"drought" + 0.006*"fund" + 0.006*"queensland" + 0.006*"bushfir" + 0.006*"christma" + 0.006*"tuesday" + 0.006*"coronaviru" + 0.006*"farmer" + 0.006*"mental"
Topic 5 Word: 0.008*"sport" + 0.008*"elect" + 0.007*"friday" + 0.007*"wednesday" + 0.005

In [50]:
# Preprocessed tokens of example document 4310, which is scored against the topics next.
processed_docs[4310]

['ratepay', 'group', 'want', 'compulsori', 'local', 'govt', 'vote']

In [53]:
# Rank the TF-IDF model's topics for document 4310, highest score first.
# NOTE(review): lda_model_tfidf was trained on corpus_tfidf but is queried here
# with raw counts from bow_corpus — confirm this is intended rather than
# corpus_tfidf[4310].
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]],
                           key=lambda pair: pair[1], reverse=True):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.4588412940502167	 
Topic: 0.008*"sport" + 0.008*"elect" + 0.007*"friday" + 0.007*"wednesday" + 0.005*"june" + 0.005*"minist" + 0.004*"govern" + 0.004*"council" + 0.004*"plan" + 0.004*"remov"

Score: 0.25465720891952515	 
Topic: 0.016*"countri" + 0.013*"hour" + 0.009*"price" + 0.007*"rise" + 0.007*"wall" + 0.006*"dairi" + 0.006*"dollar" + 0.005*"street" + 0.005*"rat" + 0.005*"lockdown"

Score: 0.19895096123218536	 
Topic: 0.011*"australia" + 0.008*"world" + 0.008*"coronaviru" + 0.007*"covid" + 0.007*"test" + 0.007*"live" + 0.007*"australian" + 0.006*"thursday" + 0.006*"market" + 0.005*"cricket"

Score: 0.012509157881140709	 
Topic: 0.012*"drum" + 0.008*"royal" + 0.008*"commiss" + 0.007*"michael" + 0.006*"liber" + 0.005*"abbott" + 0.005*"octob" + 0.005*"say" + 0.004*"georg" + 0.004*"mount"

Score: 0.012508533895015717	 
Topic: 0.009*"health" + 0.007*"drought" + 0.006*"fund" + 0.006*"queensland" + 0.006*"bushfir" + 0.006*"christma" + 0.006*"tuesday" + 0.006*"coronaviru" + 0.006*

### Testing model on unseen document

In [54]:
# A headline used as a new example; presumably not in the training data — confirm.
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
# Run it through the same preprocess() and dictionary used to build bow_corpus.
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

In [55]:
# Rank the BOW model's topics for the unseen document, highest score first,
# showing the top five words of each topic.
for index, score in sorted(lda_model[bow_vector],
                           key=lambda pair: pair[1], reverse=True):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.3395603895187378	 Topic: 0.014*"fight" + 0.013*"say" + 0.013*"talk" + 0.011*"train" + 0.010*"deal"
Score: 0.19379742443561554	 Topic: 0.022*"miss" + 0.019*"countri" + 0.015*"speak" + 0.014*"search" + 0.014*"hour"
Score: 0.18580032885074615	 Topic: 0.024*"queensland" + 0.019*"coast" + 0.017*"coronaviru" + 0.017*"victoria" + 0.016*"south"
Score: 0.18076707422733307	 Topic: 0.031*"interview" + 0.026*"market" + 0.022*"kill" + 0.016*"donald" + 0.014*"hous"
Score: 0.01667989231646061	 Topic: 0.020*"trump" + 0.019*"school" + 0.013*"bushfir" + 0.011*"rural" + 0.011*"green"
Score: 0.016679152846336365	 Topic: 0.019*"council" + 0.018*"chang" + 0.018*"crash" + 0.017*"plan" + 0.015*"health"
Score: 0.016679013147950172	 Topic: 0.054*"polic" + 0.031*"sydney" + 0.016*"shoot" + 0.016*"investig" + 0.015*"price"
Score: 0.01667892374098301	 Topic: 0.029*"australia" + 0.020*"world" + 0.015*"open" + 0.015*"australian" + 0.014*"win"
Score: 0.01667892187833786	 Topic: 0.029*"elect" + 0.019*"live" + 