In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import pickle
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from nltk.stem import WordNetLemmatizer

In [2]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])  # 불용어처리

#  빈도수 높은 키워드 처리
stop_words.extend([ 'from', 'subject', 're', 'edu', 'use', 'home', 'hunter', 'help', 'time', 'national', 'build', 'end', 'bid', 'cup', 'un', 'come', 'join', 'across','Italy', 'grind', 'asian', 'sa', 'miss', 'one', 'use', 'three', 'Darwin', 'vic', 'number', 'may', 'start', 'law', 'way', 'communities', 'order', 'check', 'major', 'india', 'focus', 'form', 'journalist', 'milk', 'nz', 'rank', 'cook', 'egypt', 'New', 'year', 'force', 'fail', 'dead', 'was', 'fruit', 'philippines','nick'])
stop_words.extend(['fire', 'new', 'world', 'boat', 'turn', 'around', 'well', 'Find', 'two', 'adelaide', 'first', 'make', 'body', 'probe', 'outback', 'baby', 'David', 'street', 'mass', 'hotel', 'say', 'open', 'go', 'welcome', 'announce', 'level', 'allow', 'highest','queensland', 'kill', 'crash', 'road', 'record', 'nt', 'hit', 'plane', 'toll', 'suspend', 'peninsula', 'afghan', 'recovery','man', 'perth', 'flood', 'people', 'still', 'supply', 'siege', 'spark', 'summer', 'Michael', 'ops', 'large', 'flash', 'view', 'attack', 'back', 'mine', 'deal', 'fan', 'celebrate', 'target', 'hill', 'party', 'reveal', 'terrorism', 'video', 'pressure', 'remember', 'korea', 'indian', 'millions', 'drill', 'country', 'hour', 'podcast', 'leaders', 'thursday', 'abbott', 'tony',  'shorten', 'sach', 'day', 'years', 'show', 'teen', 'heat', 'issue', 'free', 'australias', 'asbestos', 'compete','South', 'china', 'talk', 'appeal', 'labor', 'plant', 'peter', 'allegedly', 'begin', 'try', 'ice', 'native', 'alcohol', 'Australia', 'league', 'live', 'launch',  'benefit', 'update', 'stream', 'cabinet', 'document', 'bob','Test', 'drug', 'brisbane', 'british', 'double', 'ebola', 'Wa', 'research', 'expansion', 'ready', 'old', 'release', 'paper', 'see'])

# 그 아래에서  빈도수 높은 키워드 처리
stop_words.extend(['call',  'queensland', 'melbourne', 'perth', 'thousands', 'alert', 'reveal', 'spark', 'amid', 'australian',  'brisbane', 'western', 'high', 'fan', 'prepare', 'british', 'battle', 'beach', 'wa', 'take',  'box', 'could',  'search', 'black', 'michael', 'week','man', 'day' ,'country', 'new', 'old', 'test',  'force', 'release', 'miss','say', 'south', 'was','fire', 'victoria', 'build','australia', 'court','find', 'fall','mine', 'darwin', 'break', 'record', 'david', 'reflect', 'remember','adelaide', 'show'])

#stop_words

In [3]:
import pandas as pd
data = pd.read_csv('/Users/moon-il/Downloads/abcnews-date-text.csv', error_bad_lines=False);

In [4]:

data['publish_date'] = pd.to_datetime(data['publish_date'].astype(str), format = '%Y%m%d')

In [5]:

data['publish_date'] = pd.DatetimeIndex(data['publish_date']).year

In [6]:

data2 = data['headline_text']

#data2

In [7]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data['headline_text']))

In [8]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [9]:

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

In [10]:
data_words_nostops = pd.Series(data_words_nostops)
data_lemmatized = data_words_nostops.apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])

In [11]:
temp = data
temp['lemmatize'] = data_lemmatized
for i in range(15):
    globals()['trend{}'.format(i+2003)] = temp.loc[temp.publish_date == i+2003]

In [12]:

trend_list = [trend2003, trend2004, trend2005, trend2006, trend2007, trend2008, trend2009, trend2010, trend2011, trend2012, trend2013, trend2014, trend2015, trend2016, trend2017]

In [13]:
lemmatized = list(trend2015['lemmatize'])  # 2015년 트렌드
id2word = corpora.Dictionary(lemmatized)
texts = lemmatized
corpus = [id2word.doc2bow(text) for text in texts]

In [14]:

lemmatized = remove_stopwords(lemmatized)

In [15]:
id2word = corpora.Dictionary(lemmatized)
texts = lemmatized
corpus = [id2word.doc2bow(text) for text in texts]

In [17]:
#  최적의 토픽의 수는 50으로 결정!!


# 최적의 모델은?
p = 20
ntopics = 20
r = 100
c = 1000
    
lda_model1 = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=ntopics,  # 이건 나중에 그래프로 판단해보자!!!
                                           random_state=r,
                                           update_every=1,
                                           chunksize=c,
                                           passes=p,
                                           alpha='auto',
                                           per_word_topics=True)

In [18]:
# Compute Perplexity
print('\nPerplexity: ', lda_model1.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model1, texts=lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -18.118695390842145

Coherence Score:  0.5042278412488306


In [19]:

# Print the Keyword in the 10 topics
#
pprint(lda_model1.print_topics(20))
doc_lda = lda_model1[corpus]

[(0,
  '0.063*"state" + 0.058*"set" + 0.049*"port" + 0.046*"lead" + 0.030*"bring" + '
  '0.030*"beat" + 0.029*"keep" + 0.025*"mark" + 0.024*"aboriginal" + '
  '0.024*"rally"'),
 (1,
  '0.115*"christmas" + 0.043*"star" + 0.040*"family" + 0.037*"bushfire" + '
  '0.030*"lose" + 0.029*"look" + 0.027*"trade" + 0.025*"land" + 0.023*"coach" '
  '+ 0.021*"raid"'),
 (2,
  '0.058*"accuse" + 0.050*"work" + 0.047*"urge" + 0.046*"share" + '
  '0.039*"children" + 0.034*"park" + 0.031*"run" + 0.030*"men" + '
  '0.029*"damage" + 0.026*"sea"'),
 (3,
  '0.072*"council" + 0.065*"face" + 0.062*"death" + 0.060*"november" + '
  '0.053*"hobart" + 0.040*"review" + 0.027*"drive" + 0.023*"wednesday" + '
  '0.020*"drop" + 0.020*"offer"'),
 (4,
  '0.120*"sydney" + 0.104*"plan" + 0.050*"ban" + 0.040*"newcastle" + '
  '0.031*"islamic" + 0.024*"burn" + 0.023*"central" + 0.023*"pm" + '
  '0.020*"media" + 0.020*"game"'),
 (5,
  '0.058*"murder" + 0.056*"fund" + 0.046*"car" + 0.036*"return" + '
  '0.033*"centre" + 0.032

In [20]:
#  최적의 토픽의 수는 50으로 결정!!


# 최적의 모델은?
p = 20
ntopics = 20
r = 100
c = 5000
    
lda_model2 = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=ntopics,  # 이건 나중에 그래프로 판단해보자!!!
                                           random_state=r,
                                           update_every=1,
                                           chunksize=c,
                                           passes=p,
                                           alpha='auto',
                                           per_word_topics=True)

In [21]:
# Compute Perplexity
print('\nPerplexity: ', lda_model2.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model2, texts=lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -12.894645477175882

Coherence Score:  0.4461369789148574


In [26]:

# Print the Keyword in the 10 topics
#
pprint(lda_model2.print_topics(20))
doc_lda = lda_model1[corpus]

[(0,
  '0.083*"win" + 0.033*"port" + 0.026*"beat" + 0.024*"push" + 0.020*"award" + '
  '0.018*"bring" + 0.018*"president" + 0.018*"game" + 0.017*"rescue" + '
  '0.016*"club"'),
 (1,
  '0.042*"north" + 0.033*"job" + 0.032*"cut" + 0.027*"afl" + 0.027*"lose" + '
  '0.021*"land" + 0.021*"million" + 0.021*"tas" + 0.018*"question" + '
  '0.015*"defend"'),
 (2,
  '0.061*"market" + 0.037*"share" + 0.028*"children" + 0.028*"grandstand" + '
  '0.021*"wednesday" + 0.020*"drum" + 0.020*"damage" + 0.020*"asylum" + '
  '0.019*"friday" + 0.019*"great"'),
 (3,
  '0.056*"christmas" + 0.039*"tasmania" + 0.031*"november" + 0.030*"hobart" + '
  '0.026*"women" + 0.022*"hear" + 0.019*"yo" + 0.018*"anti" + '
  '0.016*"parliament" + 0.016*"fish"'),
 (4,
  '0.066*"sydney" + 0.053*"change" + 0.037*"health" + 0.031*"ban" + '
  '0.031*"hospital" + 0.027*"climate" + 0.026*"paris" + 0.023*"turnbull" + '
  '0.020*"interview" + 0.020*"nrl"'),
 (5,
  '0.062*"us" + 0.031*"lead" + 0.030*"return" + 0.025*"island" + '
  '

In [22]:
#  최적의 토픽의 수는 50으로 결정!!


# 최적의 모델은?
p = 20
ntopics = 40
r = 100
c = 1000
    
lda_model3 = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=ntopics,  # 이건 나중에 그래프로 판단해보자!!!
                                           random_state=r,
                                           update_every=1,
                                           chunksize=c,
                                           passes=p,
                                           alpha='auto',
                                           per_word_topics=True)

In [23]:
# Compute Perplexity
print('\nPerplexity: ', lda_model3.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model3, texts=lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -28.377202973874052

Coherence Score:  0.5810693562016778


In [30]:
# Print the Keyword in the 10 topics
#
pprint(lda_model3.print_topics(40))
doc_lda = lda_model1[corpus]

[(0,
  '0.123*"state" + 0.094*"port" + 0.055*"save" + 0.053*"allege" + 0.048*"risk" '
  '+ 0.046*"aboriginal" + 0.046*"rally" + 0.041*"hand" + 0.041*"mother" + '
  '0.040*"escape"'),
 (1,
  '0.220*"christmas" + 0.058*"lose" + 0.057*"future" + 0.050*"hold" + '
  '0.047*"anti" + 0.040*"rain" + 0.037*"crop" + 0.035*"game" + '
  '0.034*"emergency" + 0.031*"harvest"'),
 (2,
  '0.142*"warn" + 0.088*"star" + 0.084*"job" + 0.073*"big" + 0.067*"children" '
  '+ 0.044*"sea" + 0.035*"capital" + 0.034*"gas" + 0.031*"tip" + '
  '0.024*"night"'),
 (3,
  '0.134*"market" + 0.089*"rise" + 0.082*"cut" + 0.078*"review" + '
  '0.051*"boost" + 0.045*"business" + 0.040*"security" + 0.038*"russian" + '
  '0.034*"remain" + 0.033*"oil"'),
 (4,
  '0.137*"health" + 0.100*"assault" + 0.059*"challenge" + 0.055*"right" + '
  '0.054*"bash" + 0.049*"jet" + 0.048*"consider" + 0.036*"confirm" + '
  '0.028*"smith" + 0.024*"champion"'),
 (5,
  '0.082*"industry" + 0.068*"cricket" + 0.053*"play" + 0.049*"delay" + '
  '0.04

In [24]:
#  최적의 토픽의 수는 50으로 결정!!


# 최적의 모델은?
p = 20
ntopics = 40
r = 100
c = 5000
    
lda_model4 = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=ntopics,  # 이건 나중에 그래프로 판단해보자!!!
                                           random_state=r,
                                           update_every=1,
                                           chunksize=c,
                                           passes=p,
                                           alpha='auto',
                                           per_word_topics=True)

In [25]:
# Compute Perplexity
print('\nPerplexity: ', lda_model4.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model4, texts=lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -17.491733861929415

Coherence Score:  0.5597804997436081


In [32]:
# Print the Keyword in the 10 topics
#
pprint(lda_model4.print_topics(40))
#doc_lda = lda_model1[corpus]

[(0,
  '0.076*"ban" + 0.075*"service" + 0.067*"port" + 0.062*"newcastle" + '
  '0.045*"allege" + 0.044*"september" + 0.034*"tax" + 0.033*"rally" + '
  '0.031*"mother" + 0.024*"clash"'),
 (1,
  '0.080*"star" + 0.077*"meet" + 0.048*"game" + 0.044*"rain" + 0.035*"harvest" '
  '+ 0.030*"threat" + 0.025*"likely" + 0.023*"night" + 0.020*"know" + '
  '0.019*"mango"'),
 (2,
  '0.065*"job" + 0.064*"cut" + 0.050*"island" + 0.039*"business" + '
  '0.034*"protest" + 0.031*"capital" + 0.029*"students" + 0.025*"indonesia" + '
  '0.025*"international" + 0.024*"bash"'),
 (3,
  '0.072*"house" + 0.062*"concern" + 0.047*"give" + 0.040*"trade" + '
  '0.037*"future" + 0.031*"raise" + 0.029*"laws" + 0.027*"inquest" + '
  '0.027*"parliament" + 0.026*"consider"'),
 (4,
  '0.103*"west" + 0.051*"victorian" + 0.047*"storm" + 0.038*"young" + '
  '0.038*"east" + 0.035*"opposition" + 0.032*"team" + 0.030*"weather" + '
  '0.024*"step" + 0.024*"confirm"'),
 (5,
  '0.106*"market" + 0.065*"share" + 0.050*"industry" + 0