In [10]:
#20가지 주제의 뉴스 데이터
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
len(documents)

11314

In [11]:
documents[0] #첫번째 뉴스

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

In [25]:
#뉴스 카테고리
dataset.target_names


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [13]:
news_df = pd.DataFrame({'document':documents})
# 알파벳 이외의 문자 제거
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")
# 길이가 3 이하인 단어 제거
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
# 소문자 변환
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x:x.lower())

  news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")


In [14]:
news_df['clean_doc'][0]

'well sure about story seem biased what disagree with your statement that media ruin israels reputation that rediculous media most israeli media world having lived europe realize that incidences such described letter have occured media whole seem ignore them subsidizing israels existance europeans least same degree think that might reason they report more clearly atrocities what shame that austria daily reports inhuman acts commited israeli soldiers blessing received from government makes some holocaust guilt away after look jews treating other races when they power unfortunate'

In [15]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english') # 영어 불용어 사전
# 토큰화
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())
# 불용어 제거
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

In [16]:
print(tokenized_doc[0])

['well', 'sure', 'story', 'seem', 'biased', 'disagree', 'statement', 'media', 'ruin', 'israels', 'reputation', 'rediculous', 'media', 'israeli', 'media', 'world', 'lived', 'europe', 'realize', 'incidences', 'described', 'letter', 'occured', 'media', 'whole', 'seem', 'ignore', 'subsidizing', 'israels', 'existance', 'europeans', 'least', 'degree', 'think', 'might', 'reason', 'report', 'clearly', 'atrocities', 'shame', 'austria', 'daily', 'reports', 'inhuman', 'acts', 'commited', 'israeli', 'soldiers', 'blessing', 'received', 'government', 'makes', 'holocaust', 'guilt', 'away', 'look', 'jews', 'treating', 'races', 'power', 'unfortunate']


In [17]:
# tf-idf 행렬을 만들기 위해 다시 역토큰화
detokenized_doc = []
for i in range(len(news_df)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)
news_df['clean_doc'] = detokenized_doc

In [19]:
news_df['clean_doc'][0]


'well sure story seem biased disagree statement media ruin israels reputation rediculous media israeli media world lived europe realize incidences described letter occured media whole seem ignore subsidizing israels existance europeans least degree think might reason report clearly atrocities shame austria daily reports inhuman acts commited israeli soldiers blessing received government makes holocaust guilt away look jews treating races power unfortunate'

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
# 상위 1000개의 단어만 처리, tfidf 계산
vectorizer = TfidfVectorizer(stop_words='english', max_features= 1000)
X = vectorizer.fit_transform(news_df['clean_doc'])
X.shape # TF-IDF 행렬의 크기 확인


(11314, 1000)

In [26]:
X.toarray

<bound method _cs_matrix.toarray of <11314x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 250501 stored elements in Compressed Sparse Row format>>

In [21]:
from sklearn.decomposition import TruncatedSVD
# 행렬 특이값 분해, 11314개의 행을 20개로 축소, n_components 토픽수
svd_model = TruncatedSVD(n_components=20)
svd_model.fit(X)
len(svd_model.components_)

20

In [22]:
import numpy as np
# 토픽수 x 단어수
np.shape(svd_model.components_)

(20, 1000)

In [23]:
svd_model.components_

array([[ 0.01469448,  0.05019031,  0.02132608, ...,  0.07865964,
         0.01432355,  0.01788785],
       [-0.00534993,  0.01650851, -0.01644583, ..., -0.06347065,
        -0.01064398, -0.01905466],
       [ 0.00171528, -0.00359257, -0.01797318, ...,  0.05858313,
         0.02631841,  0.02233005],
       ...,
       [-0.01293685,  0.01375481, -0.00363862, ...,  0.01571131,
         0.00255201,  0.00475131],
       [ 0.0035336 , -0.01823163,  0.00355238, ...,  0.01771619,
        -0.010931  , -0.00216413],
       [-0.00272052,  0.00662116,  0.00443922, ..., -0.00826837,
        -0.00012804,  0.00227357]])

In [24]:
# 단어 집합 1000개의 단어
terms = vectorizer.get_feature_names()
#20개의 뉴스그룹별로 추출한 토픽 리스트 출력
def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(5)) for i
in topic.argsort()[:-n - 1:-1]])
get_topics(svd_model.components_,terms)
# 각 토픽의 핵심 키워드 추출
# LSA: 쉽고 빠르게 구현이 가능하지만 새로운 데이터가 추가되면 처음부터 다시 계산을 해야 하는 단점이 있음

Topic 1: [('like', 0.21386), ('know', 0.20046), ('people', 0.19293), ('think', 0.17805), ('good', 0.15128)]
Topic 2: [('thanks', 0.32896), ('windows', 0.29084), ('card', 0.18087), ('drive', 0.17459), ('mail', 0.15101)]
Topic 3: [('game', 0.37065), ('team', 0.32407), ('year', 0.28269), ('games', 0.25203), ('season', 0.18404)]
Topic 4: [('drive', 0.53137), ('scsi', 0.19974), ('hard', 0.15674), ('disk', 0.15624), ('card', 0.14127)]
Topic 5: [('windows', 0.40874), ('file', 0.25169), ('window', 0.19249), ('files', 0.16016), ('program', 0.13906)]
Topic 6: [('space', 0.16574), ('government', 0.15538), ('chip', 0.15259), ('mail', 0.15097), ('information', 0.1338)]
Topic 7: [('like', 0.66428), ('bike', 0.13041), ('chip', 0.11462), ('know', 0.11426), ('sounds', 0.10244)]
Topic 8: [('card', 0.46318), ('video', 0.22426), ('sale', 0.21357), ('monitor', 0.16297), ('price', 0.14893)]
Topic 9: [('know', 0.46333), ('card', 0.33018), ('chip', 0.1685), ('government', 0.15505), ('video', 0.14544)]
Topic 1

