In [1]:
import pandas as pd
import numpy as np
import gensim
import spacy

In [2]:
import gensim
# Intrinsic evaluation metrics:
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel #Sentidos de los grupos

In [3]:
# Load the headlines CSV; the path is relative to the notebook's working directory.
data = pd.read_csv("../data/abcnews-date-text.csv")

In [4]:
# Preview the first rows to confirm the columns loaded as expected.
data.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [5]:
# (rows, columns) of the loaded frame.
data.shape

(1186018, 2)

In [6]:
# Keep only the headline strings as a plain Python list.
# NOTE: this rebinds `data` from the DataFrame to a list, so re-running this cell
# on its own fails (a list has no `headline_text` attribute).
data = data["headline_text"].tolist()

In [7]:
# Show only a short preview: displaying the whole list would write every
# headline into the notebook output.
data[:20]

['aba decides against community broadcasting licence',
 'act fire witnesses must be aware of defamation',
 'a g calls for infrastructure protection summit',
 'air nz staff in aust strike for pay rise',
 'air nz strike to affect australian travellers',
 'ambitious olsson wins triple jump',
 'antic delighted with record breaking barca',
 'aussie qualifier stosur wastes four memphis match',
 'aust addresses un security council over iraq',
 'australia is locked into war timetable opp',
 'australia to contribute 10 million in aid to iraq',
 'barca take record as robson celebrates birthday in',
 'bathhouse plans move ahead',
 'big hopes for launceston cycling championship',
 'big plan to boost paroo water supplies',
 'blizzard buries united states in bills',
 'brigadier dismisses reports troops harassed in',
 'british combat troops arriving daily in kuwait',
 'bryant leads lakers to double overtime win',
 'bushfire victims urged to see centrelink',
 'businesses should prepare for terrorist a

In [8]:
def sent_to_words(sentences):
    """Lazily tokenize each sentence into a list of lowercase word tokens.

    Every item of ``sentences`` is coerced with ``str()`` and passed to
    ``gensim.utils.simple_preprocess``.  ``deacc=True`` asks it to strip accent
    marks from the tokens; punctuation is dropped by the tokenizer itself.

    Yields:
        One list of tokens per input sentence, in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    for raw_sentence in sentences:
        tokens = tokenize(str(raw_sentence), deacc=True)
        yield tokens

# Materialise the generator so the token lists can be indexed and reused later.
data_words = [tokens for tokens in sent_to_words(data)]

# Sanity check: tokens of the first headline.
print(data_words[:1])

[['aba', 'decides', 'against', 'community', 'broadcasting', 'licence']]


In [9]:
# Tokens of the first three headlines.
print(data_words[:3])

[['aba', 'decides', 'against', 'community', 'broadcasting', 'licence'], ['act', 'fire', 'witnesses', 'must', 'be', 'aware', 'of', 'defamation'], ['calls', 'for', 'infrastructure', 'protection', 'summit']]


In [10]:

def remove_stopwords(texts):
    """Return ``texts`` with every stop word removed.

    Each document is converted with ``str()`` and re-tokenized by
    ``simple_preprocess`` before filtering, exactly as before.  The module-level
    ``stop_words`` collection is read at call time, so it must be defined before
    this function is called.

    Args:
        texts: Iterable of documents.

    Returns:
        A list with one list of non-stop-word tokens per document.
    """
    # Build the lookup set once: testing membership against a list scans the
    # list for every token of every document.
    stop_set = frozenset(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]


def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only tokens with an allowed POS tag.

    See https://spacy.io/api/annotation for the tag inventory.

    The documents are streamed through ``nlp.pipe`` so spaCy can process them in
    batches instead of one call per document, and the dependency parser and
    named-entity recognizer are skipped because only ``pos_`` and ``lemma_`` are
    read here.

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        allowed_postags: Coarse POS tags (``token.pos_`` values) to keep.  The
            default list is only read, never mutated.

    Returns:
        A list with one list of lemmas per input document, in input order.
    """
    # Generator, so the joined strings are produced only as the pipeline consumes them.
    joined_docs = (" ".join(sent) for sent in texts)
    texts_out = []
    for doc in nlp.pipe(joined_docs, disable=["parser", "ner"]):
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [11]:
from nltk.corpus import stopwords
# English stop-word list from NLTK; remove_stopwords reads this name at call time.
# Assumes the NLTK "stopwords" corpus is already available locally — TODO confirm.
stop_words = stopwords.words('english')

In [12]:
import spacy
from spacy.lang.en import English
# English language object instantiated directly (not loaded from a model package).
# NOTE(review): `parser` is not referenced by any other cell visible here.
parser = English()

# Pipeline that `lemmatization` calls to obtain POS tags and lemmas.
nlp = spacy.load("en_core_web_sm")

In [13]:
# Filter the stop words out of every tokenized headline.
data_words_nostops = remove_stopwords(data_words)

In [14]:
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, and the
# later cells build the dictionary and corpus from data_words_nostops, so
# data_lemmatized is not consumed anywhere in the visible notebook.
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Sanity check: lemmas kept for the first headline.
print(data_lemmatized[:1])

KeyboardInterrupt: 

In [14]:

# Documents fed to the model: the stop-word-filtered token lists.
texts = data_words_nostops

# Map every distinct token to an integer id.
id2word = corpora.Dictionary(texts)

# Bag-of-words encoding: one list of (token_id, count) pairs per document.
corpus = list(map(id2word.doc2bow, texts))

print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]]


In [15]:
# Human-readable view of the first document's bag-of-words: (token, count) pairs.
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

[[('aba', 1),
  ('broadcasting', 1),
  ('community', 1),
  ('decides', 1),
  ('licence', 1)]]

In [16]:
# Train a 10-topic LDA model on the bag-of-words corpus; random_state pins the seed.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=10,
)

In [18]:
from pprint import pprint

In [19]:
# Show the model's topics as (topic_id, weighted-terms string) pairs.
pprint(lda_model.print_topics())
# Apply the trained model to the training corpus.
# NOTE(review): doc_lda is not used by any cell visible here.
doc_lda = lda_model[corpus]

[(0,
  '0.023*"dead" + 0.018*"attack" + 0.016*"change" + 0.015*"climate" + '
  '0.015*"scott" + 0.015*"test" + 0.013*"national" + 0.013*"minister" + '
  '0.013*"island" + 0.012*"abuse"'),
 (1,
  '0.032*"news" + 0.022*"top" + 0.017*"federal" + 0.013*"health" + '
  '0.011*"speaks" + 0.010*"wins" + 0.010*"hong" + 0.010*"wall" + 0.010*"call" '
  '+ 0.010*"kong"'),
 (2,
  '0.040*"australian" + 0.025*"bushfire" + 0.022*"people" + 0.022*"stories" + '
  '0.020*"family" + 0.017*"canberra" + 0.016*"live" + 0.014*"nsw" + '
  '0.013*"report" + 0.012*"indigenous"'),
 (3,
  '0.023*"year" + 0.021*"tasmania" + 0.016*"back" + 0.013*"drug" + 0.011*"new" '
  '+ 0.010*"bill" + 0.010*"thousands" + 0.009*"video" + 0.009*"nrl" + '
  '0.009*"tax"'),
 (4,
  '0.038*"says" + 0.025*"election" + 0.021*"queensland" + 0.012*"us" + '
  '0.011*"afl" + 0.009*"big" + 0.009*"war" + 0.008*"interview" + '
  '0.007*"premier" + 0.007*"media"'),
 (5,
  '0.023*"abc" + 0.020*"victoria" + 0.020*"sydney" + 0.015*"emergency" + '
 

In [21]:
# Score the topics with the c_v coherence measure, using the stop-word-filtered
# token lists as the reference texts.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_words_nostops,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.27878085464384955


In [22]:
# Newer pyLDAvis releases renamed the gensim adapter from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models`; try the new name first and fall back to the old one
# so this cell runs on either version.
import pyLDAvis
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()
# Build the interactive topic visualisation from the trained model and its corpus.
vis = gensimvis.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis