# LSA(잠재 의미 분석) 실습 예제 - NLP

In [1]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups corpus with headers, footers and quoted replies removed
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
documents = dataset.data
# Total number of documents
len(documents)

11314

In [2]:
# Check which topics (newsgroup categories) the documents come from
dataset.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [3]:
# Light text preprocessing
news_df = pd.DataFrame({'document': documents})

# Replace every character other than ASCII letters and '#' with a space.
# regex=True must be explicit: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would leave the text uncleaned.
news_df['clean_doc'] = news_df['document'].str.replace(r"[^a-zA-Z#]", " ", regex=True)

# Drop words that are three characters long or shorter
news_df['clean_doc'] = news_df['clean_doc'].apply(
    lambda text: ' '.join(word for word in text.split() if len(word) > 3)
)

# Convert everything to lowercase
news_df['clean_doc'] = news_df['clean_doc'].str.lower()

  """


In [4]:
# Display the DataFrame to compare the raw 'document' text with 'clean_doc'
news_df

Unnamed: 0,document,clean_doc
0,Well i'm not sure about the story nad it did s...,well sure about story seem biased what disagre...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re...",yeah expect people read actually accept hard a...
2,Although I realize that principle is not one o...,although realize that principle your strongest...
3,Notwithstanding all the legitimate fuss about ...,notwithstanding legitimate fuss about this pro...
4,"Well, I will have to change the scoring on my ...",well will have change scoring playoff pool unf...
...,...,...
11309,"Danny Rubenstein, an Israeli journalist, will ...",danny rubenstein israeli journalist will speak...
11310,\n,
11311,\nI agree. Home runs off Clemens are always m...,agree home runs clemens always memorable kinda...
11312,I used HP DeskJet with Orange Micros Grappler ...,used deskjet with orange micros grappler syste...


In [5]:
# Vectorize the cleaned documents with TF-IDF, keeping at most 1,000 terms
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,  # cap the vocabulary at 1,000 terms
    max_df=0.5,
    smooth_idf=True,
)

X = vectorizer.fit_transform(news_df['clean_doc'])

# Shape of the document-term matrix
X.shape

(11314, 1000)

In [6]:
# Display the TF-IDF matrix object returned by fit_transform
X

<11314x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 257724 stored elements in Compressed Sparse Row format>

In [7]:
# The corpus has 20 newsgroup topics, so the truncated SVD keeps only the
# top 20 singular values
from sklearn.decomposition import TruncatedSVD

# The SVD represents documents and terms as vectors
svd_model = TruncatedSVD(
    n_components=20,
    algorithm='randomized',
    n_iter=100,
    random_state=122,
)

svd_model.fit(X)

# Shape of the fitted component matrix
svd_model.components_.shape

(20, 1000)

In [8]:
# Inspect the top 20 singular values
svd_model.singular_values_

array([17.15952833,  9.93882749,  8.17139855,  7.92032011,  7.62377374,
        7.5257242 ,  7.25096862,  7.00623237,  6.88289372,  6.85602044,
        6.68476301,  6.56045782,  6.52895929,  6.42222944,  6.33939436,
        6.21686249,  6.17477882,  6.09487639,  6.00247117,  5.90654237])

In [9]:
# Fetch the 1,000 vocabulary terms selected by the vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. .tolist() converts the returned NumPy array into a plain
# list of Python str, matching what the old method returned.
terms = vectorizer.get_feature_names_out().tolist()
len(terms)



1000

In [10]:
# Show the n highest-weighted terms of each SVD component
n = 8
components = svd_model.components_
for topic_no, weights in enumerate(components, start=1):
    # argsort is ascending, so reverse it and keep the first n indices
    top_terms = [terms[j] for j in weights.argsort()[::-1][:n]]
    print('Topic %d: ' % topic_no, top_terms)

Topic 1:  ['just', 'like', 'know', 'people', 'think', 'does', 'good', 'time']
Topic 2:  ['thanks', 'windows', 'card', 'drive', 'mail', 'file', 'advance', 'files']
Topic 3:  ['game', 'team', 'year', 'games', 'drive', 'season', 'good', 'players']
Topic 4:  ['drive', 'scsi', 'disk', 'hard', 'problem', 'drives', 'just', 'card']
Topic 5:  ['drive', 'know', 'thanks', 'does', 'just', 'scsi', 'drives', 'hard']
Topic 6:  ['just', 'like', 'windows', 'know', 'does', 'window', 'file', 'think']
Topic 7:  ['just', 'like', 'mail', 'bike', 'thanks', 'chip', 'space', 'email']
Topic 8:  ['does', 'know', 'chip', 'like', 'card', 'clipper', 'encryption', 'government']
Topic 9:  ['like', 'card', 'sale', 'video', 'offer', 'jesus', 'good', 'price']
Topic 10:  ['like', 'drive', 'file', 'files', 'sounds', 'program', 'window', 'space']
Topic 11:  ['people', 'like', 'thanks', 'card', 'government', 'windows', 'right', 'think']
Topic 12:  ['think', 'good', 'thanks', 'need', 'chip', 'know', 'really', 'bike']
Topic 1

### LDA(잠재 디리클레 할당, Latent Dirichlet Allocation) - sklearn 이용

In [11]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 10-topic LDA model on X using online learning for a single iteration
lda_model = LatentDirichletAllocation(
    n_components=10,
    learning_method='online',
    random_state=777,
    max_iter=1,
)
# fit_transform fits the model and returns the transformed documents
lda_top = lda_model.fit_transform(X)

# Print the fitted component matrix and its shape
print(lda_model.components_)
print(lda_model.components_.shape)

[[ 1.63298459  7.2900724   8.54896182 ...  7.49193635  0.14374901
   1.59009138]
 [ 0.2896584   0.4117951   0.10113168 ...  0.67965637  0.10694658
   0.10144253]
 [ 4.28469739 38.81395323  4.73014492 ...  6.48742032  0.26542125
   0.1500921 ]
 ...
 [ 0.10114017  0.12831726  0.10621085 ...  0.10549643  0.10103415
   0.12082447]
 [ 0.10124113  0.10107427  0.1011614  ...  0.10302748  0.18381415
   0.10150428]
 [12.83266142 35.67976441 17.33997449 ... 96.14763995 27.25201553
  28.27229401]]
(10, 1000)


In [12]:
# Vocabulary: the 1,000 terms kept by the vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. .tolist() converts the returned NumPy array into a plain
# list of Python str, matching what the old method returned.
terms = vectorizer.get_feature_names_out().tolist()

def get_topics(components, feature_names, n=5):
    """Print the n highest-weighted terms of every topic.

    Args:
        components: 2-D NumPy array with one row of term weights per topic.
        feature_names: Sequence mapping a column index to its term.
        n: Number of top terms to print for each topic.

    Prints one line per topic, "Topic k: [(term, weight), ...]", with topics
    numbered from 1 and weights rounded to two decimals. Returns None.
    """
    for topic_no, topic in enumerate(components, start=1):
        # argsort is ascending; this slice takes the last n indices, largest first.
        top_indices = topic.argsort()[:-n - 1:-1]
        # Convert NumPy scalars to built-in str/float so the printed list reads
        # ('jesus', 75.48) rather than (np.str_('jesus'), np.float64(75.48))
        # under NumPy >= 2. Rounding is still done by NumPy, so values match.
        top_terms = [
            (str(feature_names[i]), float(topic[i].round(2)))
            for i in top_indices
        ]
        print("Topic %d:" % topic_no, top_terms)

# Print the top terms of each LDA topic
get_topics(components=lda_model.components_, feature_names=terms)

Topic 1: [('jesus', 75.48), ('people', 57.01), ('bible', 49.68), ('christian', 47.54), ('christians', 46.57)]
Topic 2: [('like', 24.43), ('engine', 21.81), ('speed', 20.07), ('just', 19.91), ('miles', 19.39)]
Topic 3: [('windows', 132.29), ('thanks', 109.8), ('card', 105.85), ('drive', 105.1), ('know', 100.25)]
Topic 4: [('sale', 60.64), ('thanks', 56.21), ('mail', 54.39), ('email', 46.98), ('offer', 42.54)]
Topic 5: [('greek', 22.42), ('steve', 20.57), ('disease', 19.66), ('patients', 18.69), ('posting', 17.17)]
Topic 6: [('satellite', 17.07), ('yeah', 16.11), ('david', 15.61), ('year', 15.17), ('article', 14.52)]
Topic 7: [('games', 25.63), ('phone', 17.55), ('soon', 14.11), ('dave', 13.42), ('cars', 10.94)]
Topic 8: [('israel', 60.09), ('israeli', 44.95), ('arab', 25.55), ('jews', 19.82), ('deleted', 18.89)]
Topic 9: [('armenians', 31.64), ('armenian', 30.42), ('turkish', 27.17), ('turkey', 25.24), ('armenia', 15.84)]
Topic 10: [('people', 175.72), ('just', 162.97), ('think', 160.29

