# 토픽 모델링 - LDA

## 20 NewsGroup  데이터 사례

In [1]:
import numpy as np
import pandas as pd


In [5]:
from sklearn.datasets import fetch_20newsgroups
news =fetch_20newsgroups(subset = 'all', random_state=2021, remove=('headers', 'footers', 'quotes'))

In [6]:
df = pd.DataFrame({'article':news.data})
df.shape


(18846, 1)

In [7]:
df.article[0][:1000]

"\nJust in case the original poster was looking for a serious answer,\nI'll supply one.\n\nYes, even when steering no hands you do something quite similar\nto countersteering.  Basically to turn left, you to a quick wiggle\nof the bike to the right first, causing a counteracting lean to\noccur to the left.  It is a lot more difficult to do on a motorcycle\nthan a bicycle though, because of the extra weight.  (Ok, so my\nmotorcycle is heavy.  Maybe yous isn't.)"

In [8]:
#특수문자 제거
df['article'] = df.article.str.replace('[^A-Za-z]', ' ')

In [9]:
#소문자로 변환하고 3글자 이하 단어 제거
df['article'] = df.article.apply(lambda x : ' '.join(w.lower() for w in x.split() if len(w) > 3 ))
df.article[0][:1000]

'just case original poster looking serious answer supply even when steering hands something quite similar countersteering basically turn left quick wiggle bike right first causing counteracting lean occur left more difficult motorcycle than bicycle though because extra weight motorcycle heavy maybe yous'

## NLTK를 통해서 단어 토큰화

In [10]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# 불용어 처리
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
tokenized_doc = df.article.apply(lambda x : [w for w in x.split() if w not in stop_words]) #리스트 형태가 아니면 에러남

In [13]:
tokenized_doc[:5]

0    [case, original, poster, looking, serious, ans...
1    [thinking, sending, magazine, idea, parody, bo...
2    [dreamed, great, judgment, morning, dawned, tr...
3    [file, bignums, ripem, last, updated, april, r...
4    [peanut, butter, definitely, favorite, think, ...
Name: article, dtype: object

## DTM을 사용하지 않고 정수인코딩과 단어 집합 만들기  - gensim

In [15]:
from gensim import  corpora
dictionary = corpora.Dictionary(tokenized_doc)
len(dictionary)

83145

In [17]:
corpus = [dictionary.doc2bow(text) for text in tokenized_doc ]
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 2), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)]


In [21]:
dictionary[0], dictionary[1], dictionary[2], dictionary[3]

('answer', 'basically', 'bicycle', 'bike')

## LDA모델 훈련시키키

In [25]:
from gensim.models.ldamodel import LdaModel
NUM_TOPICS = 20 


In [30]:
ldamodel = LdaModel(
    corpus, num_topics=NUM_TOPICS, random_state=2021,
    id2word = dictionary, passes = 20
)

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, i

(0, '0.014*"hockey" + 0.013*"team" + 0.007*"city" + 0.007*"april"')
(1, '0.011*"bike" + 0.010*"water" + 0.009*"engine" + 0.008*"cars"')
(2, '0.005*"bosnian" + 0.005*"serbs" + 0.004*"world" + 0.004*"deleted"')
(3, '0.011*"appears" + 0.009*"candida" + 0.009*"wolverine" + 0.008*"cover"')
(4, '0.013*"israel" + 0.011*"turkish" + 0.010*"armenian" + 0.010*"jews"')
(5, '0.012*"government" + 0.008*"president" + 0.007*"public" + 0.006*"would"')
(6, '0.020*"jesus" + 0.015*"church" + 0.013*"bible" + 0.012*"christ"')
(7, '0.012*"drive" + 0.010*"would" + 0.009*"windows" + 0.009*"system"')
(8, '0.020*"space" + 0.007*"nasa" + 0.007*"research" + 0.005*"earth"')
(9, '0.020*"file" + 0.013*"window" + 0.010*"server" + 0.010*"windows"')
(10, '0.023*"mail" + 0.020*"please" + 0.014*"send" + 0.013*"list"')
(11, '0.013*"health" + 0.012*"medical" + 0.008*"disease" + 0.008*"cancer"')
(12, '0.012*"people" + 0.011*"would" + 0.007*"think" + 0.006*"believe"')
(13, '0.005*"linux" + 0.004*"yalcin" + 0.004*"onur" + 0.00

# 훈련결과 시각화

In [28]:
!pip install pyLDAvis==2.1.2 #뒤에 버전을 붙여야 잘 설치됨

Collecting pyLDAvis==2.1.2
  Downloading pyLDAvis-2.1.2.tar.gz (1.6 MB)
[?25l[K     |▏                               | 10 kB 25.7 MB/s eta 0:00:01[K     |▍                               | 20 kB 32.3 MB/s eta 0:00:01[K     |▋                               | 30 kB 17.9 MB/s eta 0:00:01[K     |▉                               | 40 kB 12.8 MB/s eta 0:00:01[K     |█                               | 51 kB 5.7 MB/s eta 0:00:01[K     |█▏                              | 61 kB 5.9 MB/s eta 0:00:01[K     |█▍                              | 71 kB 5.5 MB/s eta 0:00:01[K     |█▋                              | 81 kB 6.2 MB/s eta 0:00:01[K     |█▉                              | 92 kB 6.0 MB/s eta 0:00:01[K     |██                              | 102 kB 5.2 MB/s eta 0:00:01[K     |██▎                             | 112 kB 5.2 MB/s eta 0:00:01[K     |██▍                             | 122 kB 5.2 MB/s eta 0:00:01[K     |██▋                             | 133 kB 5.2 MB/s eta 0:00:01[K

In [31]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis) 
#PC1 은 Principal Component

In [46]:
pyLDAvis.save_html(vis, 'news_group_20.html')

In [32]:
for i, topic_list in enumerate(ldamodel[corpus]):
    if i==5:
        break
    print(i,'번째 문서의 topic 비율은',topic_list)

0 번째 문서의 topic 비율은 [(1, 0.21060997), (14, 0.76570576)]
1 번째 문서의 topic 비율은 [(1, 0.0484265), (3, 0.17474385), (5, 0.17966925), (7, 0.031786557), (11, 0.060401138), (12, 0.091925785), (14, 0.34559038), (15, 0.024157342), (19, 0.037320927)]
2 번째 문서의 topic 비율은 [(6, 0.36186185), (12, 0.12357302), (14, 0.45935187), (15, 0.027110968)]
3 번째 문서의 topic 비율은 [(5, 0.02760349), (7, 0.07531616), (8, 0.025203133), (9, 0.085529715), (10, 0.16337147), (11, 0.015387975), (12, 0.024777709), (14, 0.028632857), (18, 0.11858687), (19, 0.4116757)]
4 번째 문서의 topic 비율은 [(8, 0.31456077), (10, 0.06009591), (12, 0.047664586), (14, 0.5072697), (17, 0.052967172)]


In [39]:
def make_topictable_per_doc(ldamodel, corpus):
    topic_table = []

    # 몇 번째 문서인지를 의미하는 문서 번호와 해당 문서의 토픽 비중을 한 줄씩 꺼내온다.
    for i, topic_list in enumerate(ldamodel[corpus]):
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list            
        doc = sorted(doc, key=lambda x: (x[1]), reverse=True)
        # 각 문서에 대해서 비중이 높은 토픽순으로 토픽을 정렬한다.
        # EX) 정렬 전 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (10번 토픽, 5%), (12번 토픽, 21.5%), 
        # Ex) 정렬 후 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (12번 토픽, 21.5%), (10번 토픽, 5%)
        # 48 > 25 > 21 > 5 순으로 정렬이 된 것.

        # 모든 문서에 대해서 각각 아래를 수행
        for j, (topic_num, prop_topic) in enumerate(doc): #  몇 번 토픽인지와 비중을 나눠서 저장한다.
            if j == 0:  # 정렬을 한 상태이므로 가장 앞에 있는 것이 가장 비중이 높은 토픽
                topic_table.append([int(topic_num), round(prop_topic,4), topic_list])
                # 가장 비중이 높은 토픽과, 가장 비중이 높은 토픽의 비중과, 전체 토픽의 비중을 저장한다.
            else:
                break
    df =  pd.DataFrame(topic_table )
    return(df)

In [40]:
topictable = make_topictable_per_doc(ldamodel, corpus)
topictable = topictable.reset_index() # 문서 번호을 의미하는 열(column)로 사용하기 위해서 인덱스 열을 하나 더 만든다.
topictable.columns = ['문서 번호', '가장 비중이 높은 토픽', '가장 높은 토픽의 비중', '각 토픽의 비중']
topictable[:10]

Unnamed: 0,문서 번호,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,0,14,0.7657,"[(1, 0.21060514), (14, 0.76571065)]"
1,1,14,0.3456,"[(1, 0.04842577), (3, 0.17474261), (5, 0.17966..."
2,2,14,0.4594,"[(6, 0.36186132), (12, 0.12357243), (14, 0.459..."
3,3,19,0.4117,"[(5, 0.027603593), (7, 0.07531575), (8, 0.0252..."
4,4,14,0.5073,"[(8, 0.31456113), (10, 0.06009001), (12, 0.047..."
5,5,12,0.3768,"[(0, 0.014734543), (2, 0.011818114), (5, 0.064..."
6,6,7,0.4279,"[(1, 0.21832576), (7, 0.42792884), (14, 0.2012..."
7,7,12,0.4393,"[(6, 0.39674446), (8, 0.02327392), (12, 0.4392..."
8,8,5,0.4025,"[(0, 0.030067118), (3, 0.049999997), (5, 0.402..."
9,9,14,0.2927,"[(5, 0.07136673), (7, 0.22600251), (11, 0.1598..."


## NUM_TOPICS 24

In [41]:
ldamodel2 = LdaModel(
    corpus, num_topics=24, random_state=2021,
    id2word = dictionary, passes = 20
)



[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, i

In [42]:
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.014*"hockey" + 0.013*"team" + 0.007*"city" + 0.007*"april"')
(1, '0.011*"bike" + 0.010*"water" + 0.009*"engine" + 0.008*"cars"')
(2, '0.005*"bosnian" + 0.005*"serbs" + 0.004*"world" + 0.004*"deleted"')
(3, '0.011*"appears" + 0.009*"candida" + 0.009*"wolverine" + 0.008*"cover"')
(4, '0.013*"israel" + 0.011*"turkish" + 0.010*"armenian" + 0.010*"jews"')
(5, '0.012*"government" + 0.008*"president" + 0.007*"public" + 0.006*"would"')
(6, '0.020*"jesus" + 0.015*"church" + 0.013*"bible" + 0.012*"christ"')
(7, '0.012*"drive" + 0.010*"would" + 0.009*"windows" + 0.009*"system"')
(8, '0.020*"space" + 0.007*"nasa" + 0.007*"research" + 0.005*"earth"')
(9, '0.020*"file" + 0.013*"window" + 0.010*"server" + 0.010*"windows"')
(10, '0.023*"mail" + 0.020*"please" + 0.014*"send" + 0.013*"list"')
(11, '0.013*"health" + 0.012*"medical" + 0.008*"disease" + 0.008*"cancer"')
(12, '0.012*"people" + 0.011*"would" + 0.007*"think" + 0.006*"believe"')
(13, '0.005*"linux" + 0.004*"yalcin" + 0.004*"onur" + 0.00

In [44]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis2 = pyLDAvis.gensim.prepare(ldamodel2, corpus, dictionary)
pyLDAvis.display(vis2) 
#PC1 은 Principal Component

In [45]:
pyLDAvis.save_html(vis2, 'news_group_24.html')