In [1]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

# 20 Newsgroups corpus: news posts spread over 20 topics.
# Headers, footers and quoted replies are stripped so that only the body
# text of each post remains.
dataset = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=1,
)
documents = dataset.data

# Total number of documents in the loaded split
len(documents)

11314

In [2]:
dataset.target_names
# The names of the 20 topics (newsgroups)

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [11]:
# Import the NLTK corpus reader under an alias. The original cell bound the
# resulting set to the same name as the imported reader (`stopwords`), so
# running the cell a second time failed with
# "AttributeError: 'set' object has no attribute 'words'".
from nltk.corpus import stopwords as nltk_stopwords

# English stop words as a set (fast membership tests). The name `stopwords`
# is kept because later cells filter tokens against it.
stopwords = set(nltk_stopwords.words('english'))
print(len(stopwords))
print(stopwords)

179
{"hasn't", 'i', "should've", 'having', 'ain', 'aren', "won't", 'myself', "you'll", 'my', 'ours', 'where', 'up', 'wouldn', 'been', 'out', 'we', 'what', 'during', 'had', 'off', 've', "mightn't", 'shouldn', 'you', 'its', 'needn', 'itself', 'hadn', 'won', "it's", 'at', 'once', 'your', 's', 'yours', 'to', 'there', 'whom', 'here', "didn't", 'isn', 'about', 'which', 'have', "that'll", 'now', 'both', 'does', 'each', 'these', "you'd", 'under', 'them', 'themselves', 'the', 'himself', 'me', 'no', 'herself', 'hasn', 'their', 'than', "she's", 'do', 'very', 'after', 'a', 'all', 'y', 'until', 'because', 'can', "mustn't", 'doing', 'against', 'over', 'further', 'through', 'he', 'wasn', 'but', 'of', 'should', 'if', 'by', 'before', 'she', 'for', 'with', 'him', 'they', 'when', 'd', 'o', 'ma', 'yourselves', 'why', "wouldn't", "you're", 'was', 'on', 'more', 'ourselves', 'it', 'that', 'down', 'below', 'm', 'yourself', 't', 'haven', "haven't", 'being', 'not', 'while', 'weren', 'own', "don't", "shan't", "a

In [12]:
news_df = pd.DataFrame({'document': documents})

# Replace every character that is not an ASCII letter or '#' with a space.
# regex=True must be passed explicitly: older pandas treated the pattern as a
# regex by default (with a FutureWarning), but since pandas 2.0 the default is
# a literal match, which would silently leave the text uncleaned.
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z#]", " ", regex=True)

# Drop short tokens (length 3 or less)
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))

# Lower-case everything
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

# Remove stop words (done after lower-casing, since `stopwords` is lower-case)
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if w not in stopwords]))

  after removing the cwd from sys.path.


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting over the cleaned documents.
vectorizer = TfidfVectorizer(
    max_features=1000,  # keep a vocabulary of only 1,000 terms
    max_df=0.5,
    smooth_idf=True,
    stop_words='english',
)

x = vectorizer.fit_transform(news_df['clean_doc'])

# Document-term matrix (DTM): one row per document, one column per term
print(x.shape)
print(x)

(11314, 1000)
  (0, 663)	0.12579678422048154
  (0, 443)	0.16044476823657502
  (0, 495)	0.11696449109610255
  (0, 72)	0.1314390240262082
  (0, 514)	0.13329247332522626
  (0, 366)	0.12802751239278712
  (0, 716)	0.16076629236479573
  (0, 815)	0.1860578624833558
  (0, 733)	0.1650860017284942
  (0, 153)	0.15949923481234501
  (0, 731)	0.16276677998135436
  (0, 712)	0.12950552246491173
  (0, 894)	0.08872588336284767
  (0, 475)	0.16965522872699737
  (0, 230)	0.17232789867083467
  (0, 710)	0.1673880130842719
  (0, 986)	0.12271896638163997
  (0, 437)	0.34750677684180636
  (0, 530)	0.66718686100149
  (0, 842)	0.15408639160668727
  (0, 850)	0.15858098023021014
  (0, 868)	0.11258683260681689
  (1, 336)	0.21610968006327091
  (1, 580)	0.2090150657514671
  (1, 837)	0.1705873334780892
  :	:
  (11313, 162)	0.12246744316703376
  (11313, 186)	0.15901931792515023
  (11313, 911)	0.1696585245733748
  (11313, 97)	0.14464380399033505
  (11313, 946)	0.14195612945068303
  (11313, 515)	0.1445563959897329
  (11313

In [14]:
# Densify the sparse TF-IDF matrix for inspection; most entries are zero.
x.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.20185845, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.23080563, 0.        ,
        0.        ]])

In [15]:
from sklearn.decomposition import TruncatedSVD

# The corpus has 20 newsgroups, so ask for 20 latent topics (topic modelling).
svd_model = TruncatedSVD(
    n_components=20,  # keep only the top 20 singular values
    algorithm='randomized',
    n_iter=100,
    random_state=122,
).fit(x)

# Size of the V^T matrix: (20, 1000)
svd_model.components_.shape

(20, 1000)

In [16]:
svd_model.singular_values_
# The 20 singular values (in descending order)

array([16.67063607,  9.94205981,  8.2061179 ,  7.91123934,  7.62272699,
        7.31738569,  7.14730728,  6.91125279,  6.86782698,  6.73282362,
        6.62603555,  6.53236743,  6.48971321,  6.37002011,  6.22550494,
        6.18079939,  6.08149194,  5.98740576,  5.94626197,  5.90237972])

In [18]:
# The V^T matrix: one row per topic (20), one column per vocabulary term (1000).
svd_model.components_

array([[ 0.01470105,  0.05015495,  0.02134726, ...,  0.07877501,
         0.01438332,  0.01790281],
       [-0.00533386,  0.01656067, -0.01645455, ..., -0.06350291,
        -0.01065062, -0.01901778],
       [ 0.00172532, -0.00376834, -0.01802467, ...,  0.05883215,
         0.02633999,  0.02240509],
       ...,
       [-0.01120005,  0.00433952,  0.00278103, ...,  0.02077478,
        -0.00121335,  0.00046334],
       [ 0.00173909,  0.01507013,  0.01111258, ..., -0.0908733 ,
        -0.00135117, -0.00540481],
       [ 0.00191827, -0.03624131, -0.00567909, ...,  0.03611942,
        -0.01425782, -0.00352156]])

In [19]:
# Vocabulary terms of the fitted TF-IDF vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    # .tolist() turns the NumPy string array into plain Python strings so that
    # `terms` remains a list, exactly as the old API returned.
    terms = vectorizer.get_feature_names_out().tolist()
else:
    terms = vectorizer.get_feature_names()
print(len(terms))
print(terms)  # the 1000 term features

1000
['ability', 'able', 'accept', 'access', 'according', 'account', 'action', 'actions', 'actual', 'actually', 'added', 'addition', 'additional', 'address', 'administration', 'advance', 'advice', 'agencies', 'agree', 'algorithm', 'allow', 'allowed', 'allows', 'amendment', 'america', 'american', 'americans', 'analysis', 'angeles', 'anonymous', 'answer', 'answers', 'anti', 'anybody', 'apparently', 'appear', 'appears', 'apple', 'application', 'applications', 'apply', 'appreciate', 'appreciated', 'approach', 'appropriate', 'april', 'arab', 'archive', 'area', 'areas', 'argument', 'arguments', 'armenia', 'armenian', 'armenians', 'arms', 'army', 'article', 'articles', 'asked', 'asking', 'assume', 'assuming', 'atheism', 'atheists', 'attack', 'attempt', 'author', 'authority', 'available', 'average', 'avoid', 'away', 'background', 'base', 'baseball', 'based', 'basic', 'basically', 'basis', 'begin', 'beginning', 'belief', 'beliefs', 'believe', 'best', 'better', 'bible', 'bike', 'bios', 'bits', '



In [20]:
n = 8
components = svd_model.components_

# For every topic (a row of V^T) print the n terms with the largest weights.
for topic_no, weights in enumerate(components, start=1):
    # argsort is ascending, so reverse it and keep the first n indices
    top_ids = weights.argsort()[::-1][:n]
    print("Topic %d: " % topic_no, [terms[j] for j in top_ids])

Topic 1:  ['like', 'know', 'people', 'think', 'good', 'time', 'thanks', 'make']
Topic 2:  ['thanks', 'windows', 'card', 'drive', 'mail', 'file', 'advance', 'files']
Topic 3:  ['game', 'team', 'year', 'games', 'season', 'players', 'good', 'play']
Topic 4:  ['drive', 'scsi', 'disk', 'hard', 'card', 'drives', 'problem', 'controller']
Topic 5:  ['windows', 'file', 'window', 'files', 'program', 'using', 'problem', 'running']
Topic 6:  ['government', 'chip', 'mail', 'space', 'information', 'encryption', 'data', 'sale']
Topic 7:  ['like', 'bike', 'know', 'chip', 'sounds', 'looks', 'look', 'sure']
Topic 8:  ['card', 'sale', 'video', 'offer', 'monitor', 'price', 'jesus', 'condition']
Topic 9:  ['know', 'card', 'chip', 'video', 'government', 'people', 'clipper', 'drivers']
Topic 10:  ['good', 'know', 'time', 'bike', 'jesus', 'problem', 'work', 'want']
Topic 11:  ['think', 'chip', 'good', 'thanks', 'clipper', 'need', 'encryption', 'mail']
Topic 12:  ['thanks', 'right', 'problem', 'good', 'bike', 

In [23]:
import numpy as np

a = np.array([1, 2, 3, 6, 4, 9])

# argsort returns the indices that would sort the values in ascending order,
# not the sorted values themselves.
np.argsort(a)

array([0, 1, 2, 4, 3, 5], dtype=int64)

In [24]:
a.sort() # sorts the array itself in place (an ndarray method, analogous to list.sort())

In [25]:
# Display `a` again: the in-place sort in the previous cell reordered it.
a

array([1, 2, 3, 4, 6, 9])

In [26]:
# Negative-step slice: walk backwards from the last element and stop before
# index -5, i.e. take the last 4 elements in reverse order. This is the same
# `[:-n-1:-1]` idiom (with n = 4) used above to pick the top-n terms per topic.
a[:-5:-1]

array([9, 6, 4, 3])

# LSA의 한계

- LSA(절단 SVD)는 제곱 오차를 최소화하는 근사이므로, 문서-단어 행렬의 값이 "가우시안 분포"(정규분포)를 따른다고 암묵적으로 가정함 — 실제 단어 빈도가 이 가정에서 벗어나면 결과 품질이 떨어질 수 있음
- 문서가 추가되거나 변경되면 SVD를 처음부터 다시 계산해야 하므로 계산 자원이 많이 소모됨