In [1]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
from gensim import corpora
import gensim
import pyLDAvis.gensim_models

In [5]:
# 데이터셋 가오
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

print(f'문서의 총 개수 : {len(documents)}\n첫 번째 문서는 다음과 같습니다.\n{documents[0]}')


문서의 총 개수 : 11314
첫 번째 문서는 다음과 같습니다.
Well i'm not sure about the story nad it did seem biased. What
I disagree with is your statement that the U.S. Media is out to
ruin Israels reputation. That is rediculous. The U.S. media is
the most pro-israeli media in the world. Having lived in Europe
I realize that incidences such as the one described in the
letter have occured. The U.S. media as a whole seem to try to
ignore them. The U.S. is subsidizing Israels existance and the
Europeans are not (at least not to the same degree). So I think
that might be a reason they report more clearly on the
atrocities.
	What is a shame is that in Austria, daily reports of
the inhuman acts commited by Israeli soldiers and the blessing
received from the Government makes some of the Holocaust guilt
go away. After all, look how the Jews are treating other races
when they got power. It is unfortunate.



In [None]:
# 데이터 전처리
news_df = pd.DataFrame({'document':documents})
# 특수 문자 제거
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")
# 길이가 3이하인 단어는 제거 (길이가 짧은 단어 제거)
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
# 전체 단어에 대한 소문자 변환
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())
stop_words = stopwords.words('english') # NLTK로부터 불용어를 받아옵니다.
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split()) # 토큰화
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])
# 불용어를 제거합니다.
dictionary = corpora.Dictionary(tokenized_doc)
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
print(corpus[1]) # 수행된 결과에서 두번째 뉴스 출력. 첫번째 문서의 인덱스는 0

In [5]:
NUM_TOPICS = 20 #20개의 토픽, k=20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

(0, '0.013*"guns" + 0.012*"state" + 0.009*"police" + 0.009*"crime"')
(1, '0.042*"jesus" + 0.017*"christ" + 0.017*"church" + 0.013*"bible"')
(2, '0.018*"water" + 0.016*"gordon" + 0.015*"pitt" + 0.014*"banks"')
(3, '0.010*"people" + 0.010*"would" + 0.008*"think" + 0.006*"believe"')
(4, '0.051*"president" + 0.022*"jobs" + 0.016*"myers" + 0.016*"going"')
(5, '0.017*"available" + 0.016*"mail" + 0.013*"information" + 0.012*"please"')
(6, '0.009*"lines" + 0.008*"hanging" + 0.008*"mask" + 0.008*"specs"')
(7, '0.013*"government" + 0.009*"encryption" + 0.009*"public" + 0.008*"security"')
(8, '0.013*"drive" + 0.010*"system" + 0.010*"card" + 0.008*"scsi"')
(9, '0.024*"armenia" + 0.021*"henrik" + 0.021*"azerbaijan" + 0.020*"turkey"')
(10, '0.009*"much" + 0.006*"bike" + 0.005*"used" + 0.005*"better"')
(11, '0.012*"drugs" + 0.010*"drug" + 0.009*"smokeless" + 0.008*"cross"')
(12, '0.042*"windows" + 0.019*"files" + 0.019*"color" + 0.018*"file"')
(13, '0.018*"game" + 0.016*"team" + 0.013*"games" + 0.012

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [None]:
for i, topic_list in enumerate(ldamodel[corpus]):
    if i==5:
        break
    print(i,'번째 문서는\n', documents[i], '\n\n이 문서의 topic 비율은',topic_list)

In [19]:
topic_table = pd.DataFrame()

# 몇 번째 문서인지를 의미하는 문서 번호와 해당 문서의 토픽 비중을 한 줄씩 꺼내온다.


for i, topic_list in enumerate(ldamodel[corpus]):
    doc = topic_list[0] if ldamodel.per_word_topics else topic_list            
    doc = sorted(doc, key=lambda x: (x[1]), reverse=True)
    # 각 문서에 대해서 비중이 높은 토픽순으로 토픽을 정렬한다.
    # EX) 정렬 전 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (10번 토픽, 5%), (12번 토픽, 21.5%), 
    # Ex) 정렬 후 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (12번 토픽, 21.5%), (10번 토픽, 5%)
    # 48 > 25 > 21 > 5 순으로 정렬이 된 것.

    # 모든 문서에 대해서 각각 아래를 수행
    for j, (topic_num, prop_topic) in enumerate(doc): #  몇 번 토픽인지와 비중을 나눠서 저장한다.
        if j == 0:  # 정렬을 한 상태이므로 가장 앞에 있는 것이 가장 비중이 높은 토픽
            topic_table = topic_table.append(pd.Series([int(topic_num), round(prop_topic,4), topic_list,documents[i]]), ignore_index=True)
            # 가장 비중이 높은 토픽과, 가장 비중이 높은 토픽의 비중과, 전체 토픽의 비중을 저장한다.
        else:
            break
topic_table = topic_table.reset_index() # 문서 번호을 의미하는 열(column)로 사용하기 위해서 인덱스 열을 하나 더 만든다.
topic_table.columns = ['문서 번호', '가장 비중이 높은 토픽', '가장 높은 토픽의 비중', '각 토픽의 비중', '문서']
topic_table


Unnamed: 0,문서 번호,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중,문서
0,0,19.0,0.5766,"[(3, 0.40864384), (19, 0.57659274)]",Well i'm not sure about the story nad it did s...
1,1,3.0,0.3559,"[(1, 0.058209244), (2, 0.027612353), (3, 0.355...","\n\n\n\n\n\n\nYeah, do you expect people to re..."
2,2,15.0,0.4219,"[(3, 0.13185781), (12, 0.017875379), (15, 0.42...",Although I realize that principle is not one o...
3,3,7.0,0.6520,"[(6, 0.016035704), (7, 0.65203613), (8, 0.0147...",Notwithstanding all the legitimate fuss about ...
4,4,13.0,0.5988,"[(13, 0.59876174), (15, 0.3039635), (18, 0.065...","Well, I will have to change the scoring on my ..."
...,...,...,...,...,...
11309,11309,0.0,0.2443,"[(0, 0.24432659), (3, 0.16911833), (6, 0.10236...","Danny Rubenstein, an Israeli journalist, will ..."
11310,11310,0.0,0.0500,"[(0, 0.05), (1, 0.05), (2, 0.05), (3, 0.05), (...",\n
11311,11311,15.0,0.5255,"[(13, 0.20270091), (15, 0.5255027), (16, 0.194...",\nI agree. Home runs off Clemens are always m...
11312,11312,12.0,0.4493,"[(2, 0.05821224), (4, 0.082804665), (8, 0.3651...",I used HP DeskJet with Orange Micros Grappler ...
