# Topic Modeling Using Latent Dirichlet Allocation 

## We are going to use the `nltk` and `gensim` Python packages.

### Import necessary packages

In [1]:
import pandas as pd
from utilities import *
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import nltk

nltk.download('wordnet') # fetch the WordNet corpus used by WordNetLemmatizer (no-op when already up to date)
np.random.seed(42) # seed NumPy's global random number generator

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/imsrgadich/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Get the data. We are going to use the Million Headlines dataset from Kaggle

https://www.kaggle.com/therohk/million-headlines/data

In [2]:
# Load the ABC News headlines. `on_bad_lines='skip'` drops malformed rows; it is the
# replacement (pandas >= 1.3) for `error_bad_lines=False`, which was removed in pandas 2.0.
documents = pd.read_csv('abcnews-date-text.csv', on_bad_lines='skip')
# Replace every character outside the whitelist with a space. `regex=True` must be
# explicit: since pandas 2.0 `str.replace` treats the pattern as a literal string by
# default, which would leave the headlines unchanged.
documents['headline_text'] = documents['headline_text'].str.replace(
    r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ", regex=True
)
documents['index'] = documents.index # keep the row position as an explicit column

### Sanitize the text (remove http, @handle, @mail.com)

In [3]:
# `standardize_text` is not defined in this notebook; presumably it comes from the
# star-imported local `utilities` module and strips URLs / @-mentions from the given
# column -- confirm against utilities.py.
# NOTE(review): this rebinds `documents`, so re-running the cell re-applies the cleanup.
documents = standardize_text(documents, 'headline_text')

### Data Preprocessing

In [7]:
# Build the stemmer and lemmatizer once rather than once per token:
# preprocess() calls lemmatize_stemming() for every kept token of every headline.
_STEMMER = SnowballStemmer('english')
_LEMMATIZER = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Reduce a single token to its stemmed, verb-lemmatized form.

    The token is first lemmatized as a verb (``pos='v'``), which maps inflected
    verb forms to a common base form; the result is then stemmed with the
    English Snowball stemmer.

    Args:
        text: A single token (str).

    Returns:
        The stemmed lemma (str).
    """
    return _STEMMER.stem(_LEMMATIZER.lemmatize(text, pos='v'))

def preprocess(text):
    """Tokenize a document and return its filtered, normalized tokens.

    The text is tokenized with gensim's ``simple_preprocess``. Tokens that are
    gensim stopwords or have three characters or fewer are dropped; each
    remaining token is passed through ``lemmatize_stemming``.

    Args:
        text: The raw document text (str).

    Returns:
        A list of normalized tokens, in document order.
    """
    tokens = gensim.utils.simple_preprocess(text)
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in tokens
        if len(token) > 3 and token not in stopwords
    ]

In [8]:
# Apply preprocess() to each headline; the result is a Series of token lists.
processed_docs = documents['headline_text'].map(preprocess)

In [14]:
# Show only the number of processed headlines instead of echoing the whole Series.
len(processed_docs)

1103665

### Bag of words

In [15]:
# Build the integer-id <-> token mapping from all preprocessed headlines.
dictionary = gensim.corpora.Dictionary(processed_docs)

In [18]:
# Peek at the first 11 (id, token) pairs stored in the dictionary.
for position, (token_id, token) in enumerate(dictionary.iteritems()):
    if position > 10:
        break
    print(token_id, token)

0 broadcast
1 communiti
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit


In [19]:
# Prune the vocabulary in place:
#   no_below=15   -> drop tokens that occur in fewer than 15 documents
#   no_above=0.5  -> drop tokens that occur in more than 50% of all documents
#   keep_n=100000 -> of what remains, keep at most the 100,000 most frequent tokens
dictionary.filter_extremes(no_below=15,no_above=0.5,keep_n=100000)


In [20]:
# Number of tokens currently in the dictionary.
len(dictionary)

14142

### Create Bag of Words for each document

In [21]:
# Convert every token list into its sparse bag-of-words form: a list of (token_id, count) pairs.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [22]:
# Inspect the bag-of-words vector of one sample document (position 4310).
bow_corpus[4310]

[(76, 1), (112, 1), (483, 1), (4014, 1)]

In [25]:
bow_doc_4310 = bow_corpus[4310]

# Each bag-of-words entry is a (token_id, count) pair; look the token up for readability.
for token_id, frequency in bow_doc_4310:
    print(f"Word {token_id} (\"{dictionary[token_id]}\") appears {frequency} times.")

Word 76 ("bushfir") appears 1 times.
Word 112 ("help") appears 1 times.
Word 483 ("rain") appears 1 times.
Word 4014 ("dampen") appears 1 times.


### TF-IDF

In [38]:
from gensim import corpora, models

# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(bow_corpus)
# Indexing the fitted model with a corpus yields the TF-IDF-weighted view of that corpus.
tfidf_corpus = tfidf[bow_corpus]

In [40]:
# Check what kind of object the TF-IDF transformation returned.
type(tfidf_corpus)

gensim.interfaces.TransformedCorpus

### Running LDA with BOW

In [28]:
# Train a 10-topic LDA model on the raw bag-of-words counts.
# `random_state` pins the model's RNG explicitly instead of relying on the global
# NumPy seed, whose state depends on which cells ran before this one. With several
# worker processes the result can still vary between runs.
lda_bow = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=42,
)

In [31]:
# List every topic of the bag-of-words model with its top weighted words.
for topic_id, topic_terms in lda_bow.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

Topic: 0 
Words: 0.020*"council" + 0.018*"live" + 0.015*"water" + 0.014*"deal" + 0.012*"plan" + 0.011*"budget" + 0.011*"want" + 0.011*"royal" + 0.011*"vote" + 0.011*"push"
Topic: 1 
Words: 0.020*"help" + 0.017*"hospit" + 0.014*"work" + 0.013*"worker" + 0.012*"power" + 0.011*"urg" + 0.011*"farm" + 0.010*"fund" + 0.010*"feder" + 0.010*"health"
Topic: 2 
Words: 0.023*"adelaid" + 0.020*"perth" + 0.020*"open" + 0.017*"final" + 0.015*"tasmanian" + 0.013*"interview" + 0.010*"centr" + 0.009*"announc" + 0.009*"research" + 0.009*"drum"
Topic: 3 
Words: 0.021*"market" + 0.019*"south" + 0.016*"west" + 0.015*"share" + 0.013*"train" + 0.012*"bank" + 0.011*"island" + 0.010*"build" + 0.010*"life" + 0.009*"take"
Topic: 4 
Words: 0.032*"charg" + 0.031*"court" + 0.024*"murder" + 0.020*"face" + 0.018*"alleg" + 0.017*"jail" + 0.016*"accus" + 0.016*"woman" + 0.015*"sydney" + 0.014*"child"
Topic: 5 
Words: 0.017*"countri" + 0.015*"nation" + 0.015*"rural" + 0.015*"chang" + 0.014*"hour" + 0.012*"servic" + 0.01

In [41]:
# Train a 10-topic LDA model on the TF-IDF-weighted corpus.
# `random_state` pins the model's RNG explicitly instead of relying on the global
# NumPy seed, whose state depends on which cells ran before this one. With several
# worker processes the result can still vary between runs.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus=tfidf_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=42,
)

In [42]:
# List every topic of the TF-IDF model with its top weighted words.
for topic_id, topic_terms in lda_model_tfidf.print_topics(num_topics=-1):
    print('Topic: {} Word: {}'.format(topic_id, topic_terms))

Topic: 0 Word: 0.007*"juli" + 0.007*"friday" + 0.007*"march" + 0.007*"septemb" + 0.006*"australian" + 0.006*"toni" + 0.006*"outback" + 0.006*"dollar" + 0.005*"music" + 0.005*"young"
Topic: 1 Word: 0.010*"australia" + 0.007*"ash" + 0.007*"test" + 0.006*"coast" + 0.005*"cricket" + 0.005*"india" + 0.005*"west" + 0.005*"england" + 0.004*"billion" + 0.004*"southern"
Topic: 2 Word: 0.012*"crash" + 0.010*"kill" + 0.008*"dead" + 0.006*"michael" + 0.006*"die" + 0.006*"malcolm" + 0.005*"thursday" + 0.005*"stori" + 0.005*"mount" + 0.005*"plane"
Topic: 3 Word: 0.023*"countri" + 0.021*"hour" + 0.010*"weather" + 0.008*"farm" + 0.007*"novemb" + 0.006*"decemb" + 0.005*"jam" + 0.005*"island" + 0.005*"quiz" + 0.005*"wind"
Topic: 4 Word: 0.017*"charg" + 0.017*"trump" + 0.016*"polic" + 0.015*"murder" + 0.012*"court" + 0.010*"alleg" + 0.010*"woman" + 0.009*"death" + 0.009*"jail" + 0.009*"podcast"
Topic: 5 Word: 0.009*"leagu" + 0.008*"elect" + 0.008*"queensland" + 0.008*"final" + 0.007*"world" + 0.006*"roya