https://wikidocs.net/30708

# 데이터 불러오기

In [None]:
from sklearn.datasets import fetch_20newsgroups

In [None]:
# Load the 20 Newsgroups corpus, shuffled with a fixed seed, with headers,
# footers and quoted replies stripped from every post.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
# Raw post texts and the list of newsgroup names.
documents, targets = dataset.data, dataset.target_names

In [None]:
# Peek at the first raw document to see what the text looks like before cleaning.
documents[0]

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

In [None]:
# List the newsgroup names provided by the dataset.
print(targets)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


# 데이터 전처리
구두점, 숫자, 특수문자 제거

In [None]:
import pandas as pd

In [None]:
# One row per post; the raw text is kept in 'document' and the cleaned text
# is written to 'clean_doc'.
news_df = pd.DataFrame({'document': documents})

# Cleaning pipeline, applied to every document in order:
#   1. replace every character that is not an ASCII letter with a space
#      (drops punctuation, digits and special characters),
#   2. split on whitespace, keep only words longer than 3 characters,
#      and join them back with single spaces,
#   3. lowercase the result.
news_df['clean_doc'] = (
    news_df['document']
    .str.replace("[^a-zA-Z]", " ", regex=True)
    .apply(lambda text: ' '.join(word for word in text.split() if len(word) > 3))
    .str.lower()
)

In [None]:
# Show the cleaned text of the document at index label 1.
news_df['clean_doc'][1]

'yeah expect people read actually accept hard atheism need little leap faith jimmy your logic runs steam sorry pity sorry that have these feelings denial about faith need well just pretend that will happily ever after anyway maybe start newsgroup atheist hard bummin much forget your flintstone chewables bake timmons'

불용어 제거

In [None]:
from nltk.corpus import stopwords
import nltk
# Download the NLTK 'stopwords' package so stopwords.words(...) can be used.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the English stopword list from NLTK. `stop_words` stays a list so any
# other code that relies on it keeps working.
stop_words = stopwords.words('english')

# Membership tests on a list scan it element by element; every token of every
# document is checked, so use a set for constant-time lookups instead.
stop_word_set = set(stop_words)

# Tokenize each cleaned document on whitespace and drop stopwords in one pass.
tokenized_doc = news_df['clean_doc'].apply(
    lambda doc: [token for token in doc.split() if token not in stop_word_set]
)

In [None]:
# Show the token list of the document at index label 1 after stopword removal.
print(tokenized_doc[1])

['yeah', 'expect', 'people', 'read', 'actually', 'accept', 'hard', 'atheism', 'need', 'little', 'leap', 'faith', 'jimmy', 'logic', 'runs', 'steam', 'sorry', 'pity', 'sorry', 'feelings', 'denial', 'faith', 'need', 'well', 'pretend', 'happily', 'ever', 'anyway', 'maybe', 'start', 'newsgroup', 'atheist', 'hard', 'bummin', 'much', 'forget', 'flintstone', 'chewables', 'bake', 'timmons']


# 정수 인코딩, 단어 집합 생성

In [None]:
from gensim import corpora

# Inputs required by the LDA model.

# Build the dictionary from the tokenized documents.
dictionary = corpora.Dictionary(tokenized_doc)

# Encode every document in bag-of-words form: a list of (token id, count) pairs.
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_doc]

# Print the encoding of the second document (the first document has index 0).
print(corpus[1])


[(52, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 2), (67, 1), (68, 1), (69, 1), (70, 1), (71, 2), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 2), (86, 1), (87, 1), (88, 1), (89, 1)]


In [None]:
# Inspect the fitted dictionary: the token stored under id 66 and the number of entries.
print(dictionary[66], len(dictionary))

faith 64281


# LDA 모델 훈련

In [None]:
import gensim

# Number of topics (k) to learn.
NUM_TOPICS = 20
# LDA training is stochastic. Fixing the seed makes a fresh
# Restart & Run All reproduce the same topics and document mixtures.
LDA_RANDOM_STATE = 1

# Train the model with 15 passes over the bag-of-words corpus; id2word lets
# the model report topics as words rather than integer ids.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=LDA_RANDOM_STATE,
)


토픽별 (기여도 * 단어) 출력

In [None]:
# Fetch the topics, limited to 4 words per topic, and print each one
# on its own line.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.021*"health" + 0.017*"medical" + 0.011*"disease" + 0.010*"pain"')
(1, '0.036*"scsi" + 0.023*"tobacco" + 0.014*"smokeless" + 0.009*"pointer"')
(2, '0.009*"went" + 0.009*"said" + 0.008*"like" + 0.007*"know"')
(3, '0.024*"file" + 0.016*"program" + 0.011*"files" + 0.011*"window"')
(4, '0.015*"year" + 0.015*"game" + 0.013*"team" + 0.009*"games"')
(5, '0.010*"people" + 0.010*"would" + 0.007*"jesus" + 0.006*"believe"')
(6, '0.020*"period" + 0.016*"play" + 0.012*"power" + 0.011*"pittsburgh"')
(7, '0.016*"greek" + 0.015*"church" + 0.007*"island" + 0.007*"book"')
(8, '0.008*"water" + 0.008*"ground" + 0.007*"power" + 0.007*"wire"')
(9, '0.058*"image" + 0.033*"graphics" + 0.026*"color" + 0.024*"jpeg"')
(10, '0.013*"system" + 0.007*"data" + 0.006*"used" + 0.006*"keys"')
(11, '0.022*"koresh" + 0.015*"compound" + 0.013*"batf" + 0.009*"financial"')
(12, '0.016*"armenian" + 0.014*"israel" + 0.014*"armenians" + 0.012*"jews"')
(13, '0.011*"would" + 0.010*"thanks" + 0.010*"drive" + 0.009*"know"')
(

문서별 (토픽, 기여도) 출력

In [None]:
# Apply the trained model to the corpus and print the result for the first
# five documents only, stopping as soon as the sixth is reached.
for i, topic_list in enumerate(ldamodel[corpus]):
    if i != 5:
        print(i,'번째 문서의 topic 비율은',topic_list)
    else:
        break


0 번째 문서의 topic 비율은 [(4, 0.023666443), (5, 0.10515881), (12, 0.2663743), (14, 0.5916604)]
1 번째 문서의 topic 비율은 [(4, 0.22501656), (5, 0.46248433), (8, 0.028466523), (14, 0.26295754)]
2 번째 문서의 topic 비율은 [(4, 0.035051692), (12, 0.39411533), (14, 0.5570961)]
3 번째 문서의 topic 비율은 [(2, 0.071659304), (3, 0.024593035), (5, 0.10538571), (10, 0.27763927), (13, 0.07683873), (14, 0.13667406), (19, 0.29686764)]
4 번째 문서의 topic 비율은 [(4, 0.8739692), (14, 0.09268617)]
