# Topic Modeling - LSA (Latent Semantic Analysis)

https://wikidocs.net/24949

In [3]:
import pandas as pd
import nltk

from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [6]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers','footers','quotes'))
documents = dataset.data

print(f" sample : {len(documents)}")

 sample : 11314


In [8]:
documents[1]

"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can't pity you, Jim.  And I'm sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won't be bummin' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don't forget your Flintstone's Chewables!  :) \n--\nBake Timmons, III"

In [11]:
dataset.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [12]:
dataset.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## `text preprocessing`

In [23]:
news_df = pd.DataFrame({'document':documents})
display(news_df.head(3))

Unnamed: 0,document
0,Well i'm not sure about the story nad it did s...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re..."
2,Although I realize that principle is not one o...


In [24]:
news_df['clean_doc'] = news_df['document'].str.replace('[^a-zA-Z]', ' ')
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

  news_df['clean_doc'] = news_df['document'].str.replace('[^a-zA-Z]', ' ')


In [27]:
news_df['document'][1]

"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can't pity you, Jim.  And I'm sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won't be bummin' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don't forget your Flintstone's Chewables!  :) \n--\nBake Timmons, III"

In [25]:
news_df['clean_doc'][1]

'yeah expect people read actually accept hard atheism need little leap faith jimmy your logic runs steam sorry pity sorry that have these feelings denial about faith need well just pretend that will happily ever after anyway maybe start newsgroup atheist hard bummin much forget your flintstone chewables bake timmons'

In [28]:
# 불용어 가져오기

stop_words = stopwords.words('english')
tokenized_doc= news_df['clean_doc'].apply(lambda x: x.split())
tokenzied_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])


In [30]:
print(tokenzied_doc[1])

['yeah', 'expect', 'people', 'read', 'actually', 'accept', 'hard', 'atheism', 'need', 'little', 'leap', 'faith', 'jimmy', 'logic', 'runs', 'steam', 'sorry', 'pity', 'sorry', 'feelings', 'denial', 'faith', 'need', 'well', 'pretend', 'happily', 'ever', 'anyway', 'maybe', 'start', 'newsgroup', 'atheist', 'hard', 'bummin', 'much', 'forget', 'flintstone', 'chewables', 'bake', 'timmons']


##  `TF-IDF 행렬 만들기`


    - 불용어 제거를 위해 토큰화 작업을 수행했지만, TfidfVectorizer는 기본적으로 토큰화가 되어있지 않은 텍스트 데이터를 입력으로 사용함
    - TfidfVectorizer를 사용해서 TF-IDF 행렬을 만들기 위해서 토큰화 작업을 역으로 취소하는 작업 수행 
     => 역토큰화(Detokenization)

In [31]:
detokenzied_doc = []
for i in range(len(news_df)):
    t = ' '.join(tokenized_doc[i])
    detokenzied_doc.append(t)
    
news_df['clean_doc'] = detokenzied_doc

In [34]:
news_df['clean_doc'][1]

'yeah expect people read actually accept hard atheism need little leap faith jimmy your logic runs steam sorry pity sorry that have these feelings denial about faith need well just pretend that will happily ever after anyway maybe start newsgroup atheist hard bummin much forget your flintstone chewables bake timmons'

In [35]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000,
                             max_df = 0.5, smooth_idf=True)

X = vectorizer.fit_transform(news_df['clean_doc'])

print(f"TF-IDF 행렬의 크기 : {X.shape}")

TF-IDF 행렬의 크기 : (11314, 1000)


## `Topic Modeling`

- TruncatedSVD 사용

In [38]:
import numpy as np

In [39]:
svd_model = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=100, random_state=122)
svd_model.fit(X)
len(svd_model.components_)

20

In [40]:
np.shape(svd_model.components_)

(20, 1000)

In [42]:
terms = vectorizer.get_feature_names_out()
terms

array(['ability', 'able', 'accept', 'access', 'according', 'account',
       'action', 'actions', 'actual', 'actually', 'added', 'addition',
       'additional', 'address', 'administration', 'advance', 'advice',
       'agencies', 'agree', 'algorithm', 'allow', 'allowed', 'allows',
       'amendment', 'america', 'american', 'americans', 'analysis',
       'angeles', 'anonymous', 'answer', 'answers', 'anti', 'anybody',
       'apparently', 'appear', 'appears', 'apple', 'application',
       'applications', 'apply', 'appreciate', 'appreciated', 'approach',
       'appropriate', 'april', 'arab', 'archive', 'area', 'areas', 'aren',
       'argument', 'arguments', 'armenia', 'armenian', 'armenians',
       'arms', 'army', 'article', 'articles', 'asked', 'asking', 'assume',
       'assuming', 'atheism', 'atheists', 'attack', 'attempt', 'author',
       'authority', 'available', 'average', 'avoid', 'away', 'background',
       'base', 'baseball', 'based', 'basic', 'basically', 'basis',
      

In [70]:
def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print(f" topic : {idx+1} : {[(feature_names[i], topic[i].round(5)) for i in topic.argsort()[:-n-1:-1]]}")
        
    
get_topics(svd_model.components_,terms)

 topic : 1 : [('just', 0.20887), ('like', 0.20469), ('know', 0.19349), ('people', 0.18318), ('think', 0.1697)]
 topic : 2 : [('thanks', 0.32763), ('windows', 0.28786), ('card', 0.18019), ('drive', 0.16864), ('mail', 0.15261)]
 topic : 3 : [('game', 0.34011), ('team', 0.30311), ('year', 0.26894), ('games', 0.23784), ('drive', 0.17472)]
 topic : 4 : [('drive', 0.46159), ('scsi', 0.17188), ('disk', 0.14451), ('hard', 0.13805), ('problem', 0.12763)]
 topic : 5 : [('drive', 0.39993), ('know', 0.28768), ('thanks', 0.24917), ('does', 0.24678), ('just', 0.17387)]
 topic : 6 : [('just', 0.55559), ('like', 0.23559), ('windows', 0.23078), ('know', 0.15795), ('does', 0.11156)]
 topic : 7 : [('just', 0.43264), ('like', 0.22858), ('mail', 0.15052), ('bike', 0.11698), ('thanks', 0.10025)]
 topic : 8 : [('does', 0.39692), ('know', 0.25192), ('chip', 0.22492), ('like', 0.17824), ('card', 0.15695)]
 topic : 9 : [('like', 0.42065), ('card', 0.32249), ('sale', 0.20267), ('video', 0.1571), ('offer', 0.1411

In [47]:
# VT 에 해당
svd_model.components_

array([[ 0.01409818,  0.04787982,  0.02051048, ...,  0.07483329,
         0.01377606,  0.01712888],
       [-0.00521437,  0.01742255, -0.01542793, ..., -0.06266609,
        -0.01075043, -0.01888594],
       [ 0.00243598, -0.00142246, -0.01848973, ...,  0.05823359,
         0.02412148,  0.02038499],
       ...,
       [ 0.00746824, -0.00689738, -0.00963879, ..., -0.03316437,
        -0.00649624, -0.0002525 ],
       [ 0.00352545,  0.01029423, -0.01284824, ...,  0.01336511,
        -0.01588696, -0.00127575],
       [-0.00221751,  0.00855738,  0.00623428, ..., -0.05112369,
        -0.00794476, -0.00491316]])

array([[972, 971, 996, ..., 324, 677, 323],
       [677, 365, 437, ..., 896,  89, 746],
       [437, 458, 438, ..., 971, 255, 626],
       [458, 746, 677, ..., 116, 826, 365],
       [458, 442, 116, ..., 893, 437, 255]], dtype=int64)