# Import Data

In [1]:
import pandas as pd
data = pd.read_csv('https://raw.githubusercontent.com/franciscadias/data/master/abcnews-date-text.csv', error_bad_lines=False);
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text

In [2]:
documents

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4
5,ambitious olsson wins triple jump,5
6,antic delighted with record breaking barca,6
7,aussie qualifier stosur wastes four memphis match,7
8,aust addresses un security council over iraq,8
9,australia is locked into war timetable opp,9


In [3]:
print(len(documents))
print(documents[:5])

1103663
                                       headline_text  index
0  aba decides against community broadcasting lic...      0
1     act fire witnesses must be aware of defamation      1
2     a g calls for infrastructure protection summit      2
3           air nz staff in aust strike for pay rise      3
4      air nz strike to affect australian travellers      4


# Data Pre-processing

* Tokenization
> lower, punctuation 제거, 글자 길이 3개 미만 단어 제거, stopwords제거, 

* stopwords
> 제거

* 글자 길이 3개 미만 단어
> 제거

In [4]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MASTER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
stemmer = PorterStemmer()

In [6]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [7]:
doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['rain', 'helps', 'dampen', 'bushfires']


 tokenized and lemmatized document: 
['rain', 'help', 'dampen', 'bushfir']


In [8]:
processed_docs = documents['headline_text'].map(preprocess)
processed_docs[:10]

0               [decid, commun, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

In [9]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [10]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 broadcast
1 commun
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit


In [11]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(76, 1), (112, 1), (484, 1), (4022, 1)]

In [12]:
len(bow_corpus)

1103663

In [13]:
bow_corpus

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(4, 1), (5, 1), (6, 1)],
 [(7, 1), (8, 1), (9, 1), (10, 1)],
 [(11, 1), (12, 1), (13, 1), (14, 1)],
 [(14, 1), (15, 1), (16, 1), (17, 1)],
 [(18, 1), (19, 1), (20, 1), (21, 1)],
 [(22, 1), (23, 1), (24, 1), (25, 1), (26, 1)],
 [(27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1)],
 [(11, 1), (33, 1), (34, 1), (35, 1), (36, 1)],
 [(37, 1), (38, 1), (39, 1)],
 [(35, 1), (37, 1), (40, 1), (41, 1)],
 [(23, 1), (26, 1), (42, 1), (43, 1), (44, 1)],
 [(45, 1), (46, 1), (47, 1)],
 [(48, 1), (49, 1), (50, 1), (51, 1)],
 [(47, 1), (52, 1), (53, 1), (54, 1), (55, 1)],
 [(56, 1), (57, 1), (58, 1), (59, 1), (60, 1)],
 [(61, 1), (62, 1), (63, 1), (64, 1), (65, 1)],
 [(65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1)],
 [(71, 1), (72, 1), (73, 1), (74, 1), (75, 1)],
 [(76, 1), (77, 1), (78, 1), (79, 1)],
 [(80, 1), (81, 1), (82, 1), (83, 1)],
 [(84, 1), (85, 1), (86, 1), (87, 1)],
 [(45, 1), (88, 1), (89, 1), (90, 1)],
 [(91, 1), (92, 1), (93, 1), (94, 1), (95

# LDA

## LDA Train

테마(토픽)가 너무 많으면, 구분하기 어렵다.

프로젝트에선 적은 숫자로 하자.

In [14]:
NUM_TOPICS = 20 #20개의 토픽, k=20
# ldamodel = gensim.models.ldamodel.LdaModel(bow_corpus, num_topics = NUM_TOPICS, id2word=dictionary)
# ldamodel.save('ldamodel')
ldamodel = gensim.models.ldamodel.LdaModel.load('ldamodel')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.070*"sydney" + 0.043*"report" + 0.027*"commun" + 0.026*"interview"')
(1, '0.110*"australia" + 0.039*"school" + 0.035*"call" + 0.029*"health"')
(2, '0.051*"court" + 0.033*"woman" + 0.032*"face" + 0.027*"chang"')
(3, '0.042*"hous" + 0.038*"women" + 0.037*"test" + 0.033*"famili"')
(4, '0.039*"perth" + 0.028*"home" + 0.027*"high" + 0.025*"win"')
(5, '0.056*"north" + 0.044*"die" + 0.029*"dead" + 0.028*"korea"')
(6, '0.099*"australian" + 0.047*"south" + 0.030*"council" + 0.027*"take"')
(7, '0.035*"hospit" + 0.032*"leav" + 0.031*"work" + 0.024*"worker"')
(8, '0.028*"break" + 0.026*"meet" + 0.026*"abus" + 0.023*"park"')
(9, '0.068*"year" + 0.024*"island" + 0.023*"claim" + 0.020*"record"')
(10, '0.035*"live" + 0.032*"china" + 0.028*"fund" + 0.026*"show"')
(11, '0.045*"crash" + 0.038*"kill" + 0.034*"tasmania" + 0.023*"victim"')
(12, '0.088*"trump" + 0.058*"govern" + 0.053*"queensland" + 0.031*"turnbul"')
(13, '0.043*"donald" + 0.043*"nation" + 0.028*"labor" + 0.025*"trial"')
(14, '0.063*"

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


각 단어 앞에 붙은 수치는 단어의 해당 토픽에 대한 기여도를 보여줍니다. 

또한 맨 앞에 있는 토픽 번호는 0부터 시작하므로 총 20개의 토픽은 0부터 19까지의 번호가 할당되어져 있습니다.

passes는 알고리즘의 동작 횟수를 말하는데, 알고리즘이 결정하는 토픽의 값이 적절히 수렴할 수 있도록 충분히 적당한 횟수를 정해주면 됩니다. 

여기서는 총 15회를 수행하였습니다.

여기서는 num_words=4로 총 4개의 단어만 출력하도록 하였습니다.

## LDA Visualization

In [27]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, bow_corpus, dictionary)
pyLDAvis.display(vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [31]:
# pyLDAvis.save_html(vis, 'lda.html')
# pyLDAvis.save_json(vis, 'lda.json')

## LDA topic distribution for each documents

In [16]:
ldamodel

<gensim.models.ldamodel.LdaModel at 0x25d093856d8>

In [53]:
def make_topictable_per_doc(ldamodel, corpus, texts, indices):
    topic_table = pd.DataFrame()
    for i, topic_list in zip(indices, (ldamodel[corpus][idx] for idx in indices)):
        doc = sorted(topic_list, key=lambda x: (x[1]), reverse=True)
        # 각 문서에 대해서 비중이 높은 토픽순으로 토픽을 정렬한다.
        for j, (topic_num, prop_topic) in enumerate(doc): #  몇 번 토픽인지와 비중을 나눠서 저장한다.
            if j == 0:  # 정렬을 한 상태이므로 가장 앞에 있는 것이 가장 비중이 높은 토픽
                topic_table = topic_table.append(pd.Series([int(i), int(topic_num), round(prop_topic,4), topic_list]), ignore_index=True)
                # 가장 비중이 높은 토픽과, 가장 비중이 높은 토픽의 비중과, 전체 토픽의 비중을 저장한다.
            else:
                break
    return(topic_table)

In [59]:
topictable = make_topictable_per_doc(ldamodel, bow_corpus, processed_docs, [0,100,2,33])
topictable.columns = ['문서 번호', '가장 비중이 높은 토픽', '가장 높은 토픽의 비중', '각 토픽의 비중']
topictable

Unnamed: 0,문서 번호,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,0.0,6.0,0.41,"[(0, 0.21), (5, 0.21), (6, 0.41)]"
1,100.0,3.0,0.2625,"[(0, 0.012500001), (1, 0.012500001), (2, 0.012..."
2,2.0,4.0,0.41,"[(1, 0.21), (4, 0.41), (6, 0.21)]"
3,33.0,7.0,0.3417,"[(2, 0.175), (7, 0.34166667), (9, 0.17500001),..."
