In [12]:
import pandas as pd
# A Million News Headlines
# News headlines published over a period of 18 Years

data = pd.read_csv("data/LDA_tuto/abcnews-date-text.csv")
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text
documents.head()

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4


In [25]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jigna\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [26]:
# Código para procesar el texto. Se quitan las stop words
# y se quitan las palabras con largo mayor a 3. Se 
# lematizan las palabras.


def lemmatize_stemming(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and \
        len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [27]:
doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['ratepayers', 'group', 'wants', 'compulsory', 'local', 'govt', 'voting']


 tokenized and lemmatized document: 
['ratepayers', 'group', 'want', 'compulsory', 'local', 'govt', 'vote']


In [28]:
processed_docs = documents['headline_text'].map(preprocess)
processed_docs[:10]

0              [decide, community, broadcast, licence]
1                         [witness, aware, defamation]
2           [call, infrastructure, protection, summit]
3                          [staff, aust, strike, rise]
4             [strike, affect, australian, travellers]
5               [ambitious, olsson, win, triple, jump]
6               [antic, delight, record, break, barca]
7    [aussie, qualifier, stosur, waste, memphis, ma...
8             [aust, address, security, council, iraq]
9                         [australia, lock, timetable]
Name: headline_text, dtype: object

In [29]:
# Se crea un diccionario que guarda todas las palabras
# usadas en el corpus. Corresponde al vocabulario.

dictionary = gensim.corpora.Dictionary(processed_docs)

count = 0
for k,v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 broadcast
1 community
2 decide
3 licence
4 aware
5 defamation
6 witness
7 call
8 infrastructure
9 protection
10 summit


In [30]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [31]:
# Representamos cada elemento del corpus como un coleción
# de vectores (w,c) donde w corresponde al índice de la
# palabra y c es la cantidad de veces que esa palabra 
# aparece en ese documento.

bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[3636]

[(232, 1), (334, 1), (508, 1), (592, 1), (3214, 1), (3823, 1)]

In [32]:
from gensim import corpora, models
from pprint import pprint

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf(bow_corpus)

for doc in corpus_tfidf:
    pprint(doc)
    break

TypeError: 'TfidfModel' object is not callable

In [34]:
# Running LDA using Bag of Words
lda_model = gensim.models.LdaMulticore(bow_corpus, \
                                    num_topics=10,\
                                    id2word=dictionary,\
                                    passes=2, workers=2)

In [35]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.042*"australia" + 0.024*"donald" + 0.023*"sydney" + 0.021*"news" + 0.020*"world" + 0.018*"open" + 0.013*"protest" + 0.012*"win" + 0.011*"morrison" + 0.011*"final"
Topic: 1 
Words: 0.024*"market" + 0.020*"year" + 0.017*"record" + 0.013*"care" + 0.013*"price" + 0.013*"australian" + 0.012*"years" + 0.012*"business" + 0.011*"family" + 0.011*"premier"
Topic: 2 
Words: 0.047*"coronavirus" + 0.016*"rise" + 0.016*"restrictions" + 0.013*"royal" + 0.013*"scott" + 0.012*"concern" + 0.012*"water" + 0.011*"commission" + 0.011*"amid" + 0.010*"meet"
Topic: 3 
Words: 0.024*"kill" + 0.022*"australian" + 0.020*"die" + 0.017*"border" + 0.016*"shoot" + 0.016*"coast" + 0.016*"miss" + 0.014*"crash" + 0.013*"attack" + 0.013*"gold"
Topic: 4 
Words: 0.040*"police" + 0.026*"charge" + 0.026*"case" + 0.026*"court" + 0.021*"death" + 0.020*"murder" + 0.017*"face" + 0.014*"jail" + 0.013*"people" + 0.013*"arrest"
Topic: 5 
Words: 0.043*"queensland" + 0.036*"coronavirus" + 0.033*"government" + 0.027