# Todo

- [ ] Write about Dirichlet distributions

## Dataset loading

In [2]:
# Location of the headlines CSV. The path is relative, so it is resolved
# against the kernel's current working directory when the file is read.
dataset_path = '../../datasets/abcnews-date-text.csv'

First, the dataset will be loaded:

In [3]:
import pandas as pd

# Read the CSV, skipping malformed rows instead of aborting the load.
# `on_bad_lines='skip'` is the replacement for `error_bad_lines=False`, which
# was deprecated in pandas 1.3 and removed in pandas 2.0.
data = pd.read_csv(dataset_path, on_bad_lines='skip')

# Keep only the headline column and expose each row's label as an explicit
# `index` column. `.assign` returns a new frame, so nothing is written into a
# slice of `data` (the original item assignment triggered
# SettingWithCopyWarning).
data_text = data[['headline_text']].assign(index=data.index)

# `documents` is the name the rest of the notebook works with.
documents = data_text

In [8]:
# Report how many headlines were loaded, then preview the first rows.
print(len(documents), documents.head(), sep='\n')

1186018
                                       headline_text  index
0  aba decides against community broadcasting lic...      0
1     act fire witnesses must be aware of defamation      1
2     a g calls for infrastructure protection summit      2
3           air nz staff in aust strike for pay rise      3
4      air nz strike to affect australian travellers      4


## Tokenization, Lemmatization and Stemming

In [19]:
import gensim
import numpy as np
import nltk

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *

# Seed NumPy's global random number generator. This does not seed the
# standard-library `random` module.
np.random.seed(2018)
# Fetch the WordNet corpus via NLTK's downloader (presumably required by the
# WordNetLemmatizer imported above — confirm).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/jean/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [20]:
# Performs lemmatization and stemming in the given text
def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then Porter-stem the lemma.

    Parameters
    ----------
    text : str
        The token to normalize.

    Returns
    -------
    str
        The stemmed form of the token's verb lemma.
    """
    # Build the lemmatizer and stemmer once and reuse them on later calls.
    # This function runs for every token of every headline, so constructing
    # fresh objects on each call is avoidable overhead. The pair is stored on
    # the function object so no extra module-level name is introduced.
    tools = getattr(lemmatize_stemming, '_tools', None)
    if tools is None:
        tools = (WordNetLemmatizer(), PorterStemmer())
        lemmatize_stemming._tools = tools
    lemmatizer, stemmer = tools

    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

# Tokenizes the given text, drops stopwords and short tokens, and
# normalizes what is left through lemmatization and stemming.
def preprocess(text):
    """Turn raw text into a list of normalized tokens.

    The text is tokenized with ``simple_preprocess``. Tokens that are in
    ``STOPWORDS`` or that have three characters or fewer are discarded, and
    every remaining token is passed through ``lemmatize_stemming``.
    """
    return [
        lemmatize_stemming(word)
        for word in simple_preprocess(text)
        if len(word) > 3 and word not in STOPWORDS
    ]

Using the previously created functions, we get:

In [42]:
# Run the preprocessing pipeline over every headline; the result is a Series
# whose elements are lists of normalized tokens.
processed_docs = documents.headline_text.map(preprocess)
processed_docs.head()

0        [decid, commun, broadcast, licenc]
1                        [wit, awar, defam]
2    [call, infrastructur, protect, summit]
3               [staff, aust, strike, rise]
4      [strike, affect, australian, travel]
Name: headline_text, dtype: object

## Create Bag of Words Model

In [43]:
# Build the token <-> integer id vocabulary from all preprocessed headlines.
dictionary = gensim.corpora.Dictionary(processed_docs)

# Prune the vocabulary: drop tokens that occur in fewer than 15 documents or
# in more than 50% of them, then keep at most the 100000 most frequent ones.
dictionary.filter_extremes(keep_n=100000, no_above=0.5, no_below=15)

# Convert every headline into its bag-of-words form: (token_id, count) pairs.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

[(3847, 1), (5968, 1), (9498, 1), (13251, 1)]


In [65]:
import random

# Pick one headline at random to illustrate the pipeline end to end.
# NOTE: this draws from the standard-library `random` module, which is not
# seeded by `np.random.seed(...)`, so a different document is chosen each run.
doc_idx = random.randint(0, len(documents) - 1)

# `bow_corpus` is addressed by position, so fetch the headline by position as
# well. The `index` column was copied from the frame's row labels, which for
# the freshly loaded CSV run 0..n-1, so this selects the same row as filtering
# on `documents['index'] == doc_idx` without scanning the whole frame.
doc_sample = documents['headline_text'].iloc[doc_idx]

print('Original Document:')
print(doc_sample)
print(doc_sample.split(' '))
print()

print('Preprocessed Document:')
print(preprocess(doc_sample))
print()

# Show each (token_id, count) pair of the sample next to the token it encodes.
bow_doc_sample = bow_corpus[doc_idx]
for token_id, token_count in bow_doc_sample:
    print("Word {} (\"{}\") appears {} time.".format(
        token_id, dictionary[token_id], token_count
    ))

Original Document:
national rural news
['national', 'rural', 'news']

Preprocessed Document:
['nation', 'rural', 'news']

Word 188 ("nation") appears 1 time.
Word 1253 ("rural") appears 1 time.
Word 1257 ("news") appears 1 time.


## LDA with BoW Model

Generate the LDA model:

In [66]:
# Fit a 10-topic LDA model on the bag-of-words corpus, making two passes over
# the data and training with two worker processes.
lda_model = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=10,
    passes=2,
    workers=2,
)

Check the topics' word list:

In [67]:
# List every topic (-1 means "all") with its highest-weighted words.
for topic_id, topic_terms in lda_model.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

Topic: 0 
Words: 0.026*"market" + 0.022*"school" + 0.015*"investig" + 0.015*"speak" + 0.015*"interview" + 0.013*"fall" + 0.013*"student" + 0.011*"scott" + 0.011*"share" + 0.010*"build"
Topic: 1 
Words: 0.020*"donald" + 0.019*"plan" + 0.018*"canberra" + 0.015*"water" + 0.014*"chang" + 0.013*"council" + 0.012*"feder" + 0.010*"drum" + 0.010*"industri" + 0.010*"park"
Topic: 2 
Words: 0.054*"australia" + 0.025*"world" + 0.017*"open" + 0.015*"tasmania" + 0.015*"final" + 0.013*"win" + 0.011*"australian" + 0.011*"break" + 0.011*"game" + 0.010*"record"
Topic: 3 
Words: 0.029*"queensland" + 0.021*"help" + 0.019*"report" + 0.017*"miss" + 0.016*"royal" + 0.015*"tasmanian" + 0.015*"bushfir" + 0.015*"bank" + 0.014*"street" + 0.013*"victoria"
Topic: 4 
Words: 0.023*"news" + 0.022*"women" + 0.021*"live" + 0.020*"warn" + 0.020*"coast" + 0.019*"health" + 0.016*"rural" + 0.016*"countri" + 0.013*"gold" + 0.012*"return"
Topic: 5 
Words: 0.053*"polic" + 0.050*"say" + 0.024*"kill" + 0.023*"attack" + 0.021*"c

Classifying a document:

In [69]:
# Score the sampled document against the model and list its topics from the
# highest score to the lowest, each with its ten top words.
for topic_id, weight in sorted(
    lda_model[bow_corpus[doc_idx]],
    key=lambda pair: pair[1],
    reverse=True,
):
    print("\nScore: {}\t \nTopic: {}".format(weight, lda_model.print_topic(topic_id, 10)))


Score: 0.7749938368797302	 
Topic: 0.023*"news" + 0.022*"women" + 0.021*"live" + 0.020*"warn" + 0.020*"coast" + 0.019*"health" + 0.016*"rural" + 0.016*"countri" + 0.013*"gold" + 0.012*"return"

Score: 0.025001784786581993	 
Topic: 0.034*"elect" + 0.032*"govern" + 0.025*"china" + 0.015*"rise" + 0.012*"million" + 0.011*"citi" + 0.011*"trade" + 0.011*"presid" + 0.010*"farm" + 0.010*"deal"

Score: 0.02500169351696968	 
Topic: 0.020*"donald" + 0.019*"plan" + 0.018*"canberra" + 0.015*"water" + 0.014*"chang" + 0.013*"council" + 0.012*"feder" + 0.010*"drum" + 0.010*"industri" + 0.010*"park"

Score: 0.025001127272844315	 
Topic: 0.020*"south" + 0.018*"adelaid" + 0.017*"north" + 0.014*"labor" + 0.014*"hospit" + 0.011*"protest" + 0.011*"west" + 0.011*"worker" + 0.010*"state" + 0.010*"stori"

Score: 0.02500048466026783	 
Topic: 0.054*"australia" + 0.025*"world" + 0.017*"open" + 0.015*"tasmania" + 0.015*"final" + 0.013*"win" + 0.011*"australian" + 0.011*"break" + 0.011*"game" + 0.010*"record"

Sco