잠재 디리클레 할당(Latent Dirichlet Allocation, LDA)

In [2]:
#토픽을 추출하지만 단서 순서는 고려하지 않음
#LDA 알고리즘은 토픽 k가 M개의 문서에 걸쳐 분포되었다고 가정

#LSA : 단어 문서 행렬을 차원 축소하여 축소 차원에서 근접 단어들을 토픽으로 묶는다
#LDA : 단어가 특정 토픽에 존재할 확률과 문서에 특정 토픽이 존재할 확률을 결합확률로 추정하여 토픽을 추출

In [4]:
import pandas as pd
data = pd.read_csv("data/abcnews-date-text.csv", error_bad_lines=False)
data.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [5]:
text = data[['headline_text']]
text.head()

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


In [6]:
#preprocessing _ tokenizing
import nltk
text['headline_text'] = text.apply(lambda row: nltk.word_tokenize(row['headline_text']), axis=1)
text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,headline_text
0,"[aba, decides, against, community, broadcastin..."
1,"[act, fire, witnesses, must, be, aware, of, de..."
2,"[a, g, calls, for, infrastructure, protection,..."
3,"[air, nz, staff, in, aust, strike, for, pay, r..."
4,"[air, nz, strike, to, affect, australian, trav..."


In [7]:
#delete stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')
text['headline_text'] = text['headline_text'].apply(lambda x: [word for word in x if word not in (stop)])
text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,headline_text
0,"[aba, decides, community, broadcasting, licence]"
1,"[act, fire, witnesses, must, aware, defamation]"
2,"[g, calls, infrastructure, protection, summit]"
3,"[air, nz, staff, aust, strike, pay, rise]"
4,"[air, nz, strike, affect, australian, travellers]"


In [8]:
#표제어 추출 : lemmatizer
from nltk.stem import WordNetLemmatizer
text['headline_text'] = text['headline_text'].apply(lambda x : [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])
text.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,headline_text
0,"[aba, decide, community, broadcast, licence]"
1,"[act, fire, witness, must, aware, defamation]"
2,"[g, call, infrastructure, protection, summit]"
3,"[air, nz, staff, aust, strike, pay, rise]"
4,"[air, nz, strike, affect, australian, travellers]"


In [9]:
#delete words less then 3
tokenized_doc = text['headline_text'].apply(lambda x : [word for word in x if len(word) > 3])
tokenized_doc[:5]

0       [decide, community, broadcast, licence]
1      [fire, witness, must, aware, defamation]
2    [call, infrastructure, protection, summit]
3                   [staff, aust, strike, rise]
4      [strike, affect, australian, travellers]
Name: headline_text, dtype: object

In [17]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(shuffle = True, random_state = 1, remove=('headers', 'footers', 'quotes'))
document = dataset.data
news_df = pd.DataFrame({'document':document})
# 데이터 전처리 과정
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z#]", " ")
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())
# 불용어 처리 및 토큰화
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
tokenized_doc = news_df['clean_doc'].apply(lambda x : x.split())
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

In [18]:
from gensim import corpora
dictionary = corpora.Dictionary(tokenized_doc)
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
corpus[1] # 수행된 결과에서 두번째 문서 출력, 첫번째 문서의 인덱스는 0

[(52, 1),
 (55, 1),
 (56, 1),
 (57, 1),
 (58, 1),
 (59, 1),
 (60, 1),
 (61, 1),
 (62, 1),
 (63, 1),
 (64, 1),
 (65, 1),
 (66, 2),
 (67, 1),
 (68, 1),
 (69, 1),
 (70, 1),
 (71, 2),
 (72, 1),
 (73, 1),
 (74, 1),
 (75, 1),
 (76, 1),
 (77, 1),
 (78, 2),
 (79, 1),
 (80, 1),
 (81, 1),
 (82, 1),
 (83, 1),
 (84, 1),
 (85, 2),
 (86, 1),
 (87, 1),
 (88, 1),
 (89, 1)]

In [20]:
dictionary[67]

'feelings'

In [21]:
len(dictionary)

65284

LDA 모델 훈련시키기

In [22]:
import gensim
NUM_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
topics = ldamodel.print_topics(num_words=4) #print 4 words 
for topic in topics:
    print(topic)

#단어 앞에 붙는 숫자는 기여도임 20개가 할당되어 있음

(0, '0.017*"docs" + 0.009*"pseudo" + 0.009*"arafat" + 0.007*"netcom"')
(1, '0.017*"jesus" + 0.010*"christian" + 0.010*"bible" + 0.008*"church"')
(2, '0.026*"would" + 0.015*"like" + 0.014*"know" + 0.014*"think"')
(3, '0.018*"patients" + 0.014*"gordon" + 0.014*"doctor" + 0.014*"medical"')
(4, '0.023*"armenian" + 0.020*"armenians" + 0.019*"said" + 0.011*"went"')
(5, '0.019*"space" + 0.008*"nasa" + 0.007*"data" + 0.007*"research"')
(6, '0.016*"evidence" + 0.013*"science" + 0.010*"argument" + 0.009*"exist"')
(7, '0.016*"plane" + 0.011*"radius" + 0.010*"bandwidth" + 0.010*"points"')
(8, '0.018*"encryption" + 0.015*"chip" + 0.014*"keys" + 0.013*"clipper"')
(9, '0.024*"game" + 0.024*"team" + 0.017*"games" + 0.017*"play"')
(10, '0.027*"mail" + 0.025*"please" + 0.021*"thanks" + 0.015*"send"')
(11, '0.018*"cars" + 0.014*"tobacco" + 0.014*"engine" + 0.010*"radar"')
(12, '0.013*"year" + 0.011*"good" + 0.009*"last" + 0.007*"like"')
(13, '0.021*"drive" + 0.016*"card" + 0.014*"disk" + 0.013*"system"')

LDA 시각화 하기

In [23]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-2.1.2.tar.gz (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 524 kB/s eta 0:00:01
Collecting numexpr
  Downloading numexpr-2.7.1-cp37-cp37m-macosx_10_6_intel.whl (186 kB)
[K     |████████████████████████████████| 186 kB 921 kB/s eta 0:00:01
[?25hCollecting pytest
  Downloading pytest-5.4.1-py3-none-any.whl (246 kB)
[K     |████████████████████████████████| 246 kB 913 kB/s eta 0:00:01
[?25hCollecting future
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 961 kB/s eta 0:00:01
[?25hCollecting funcy
  Downloading funcy-1.14.tar.gz (548 kB)
[K     |████████████████████████████████| 548 kB 2.4 MB/s eta 0:00:01
Collecting more-itertools>=4.0.0
  Downloading more_itertools-8.2.0-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 1.3 MB/s eta 0:00:01
Collecting packaging
  Downloading packaging-20.3-py2.py3-none-any.whl (37 kB)
Collecting py>=1.5.0
  Downloading py-1.

In [24]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)