In [1]:
import nltk

In [2]:
import nltk 
from sklearn.datasets import fetch_20newsgroups
import pandas as pd 
from nltk.corpus import stopwords 
# Download the NLTK 'stopwords' corpus; get_news() reads it via stopwords.words('english')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\이상은\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [5]:
# Download the news posts and preprocess them
def get_news():
    """Fetch the 20 Newsgroups posts and return them as cleaned strings.

    Cleaning steps applied to every post, in order:
      1. every character that is not an ASCII letter becomes a space,
      2. words of three characters or fewer are dropped,
      3. the remaining words are lowercased,
      4. NLTK English stop words are dropped.

    Returns:
        pandas.Series named ``clean_doc`` with one space-joined string of
        the surviving tokens per post (an empty string if nothing survives).
    """
    # Download 20 Newsgroups; headers, footers and quoted replies are removed
    dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
    news_df = pd.DataFrame({'document': dataset.data})

    # Strip special characters. regex=True must be explicit: pandas >= 2.0
    # treats the pattern as a literal string by default, which would leave
    # the text untouched.
    letters_only = news_df['document'].str.replace("[^a-zA-Z]", " ", regex=True)

    # NLTK stop words; a set gives constant-time membership tests per token
    stop_words = set(stopwords.words('english'))

    def _clean(text):
        # Keep words longer than three characters, lowercase them,
        # then drop the stop words.
        words = (w.lower() for w in text.split() if len(w) > 3)
        return ' '.join(w for w in words if w not in stop_words)

    news_df['clean_doc'] = letters_only.apply(_clean)
    return news_df['clean_doc']

In [6]:
# Build the cleaned corpus: one preprocessed string per news post
tokenized_docs = get_news()

In [7]:
# Inspect the cleaned posts returned by get_news()
tokenized_docs

0        well sure story seem biased disagree statement...
1        yeah expect people read actually accept hard a...
2        although realize principle strongest points wo...
3        notwithstanding legitimate fuss proposal much ...
4        well change scoring playoff pool unfortunately...
                               ...                        
11309    danny rubenstein israeli journalist speaking t...
11310                                                     
11311    agree home runs clemens always memorable kinda...
11312    used deskjet orange micros grappler system upd...
11313    argument murphy scared hell came last year han...
Name: clean_doc, Length: 11314, dtype: object

In [8]:
# Split text into tokens on whitespace
def my_tokenizer(text):
    """Return the whitespace-separated tokens of ``text``."""
    tokens = text.split()
    return tokens

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# The custom tokenizer replaces the default token regex, so token_pattern is
# set to None to avoid scikit-learn's "token_pattern will not be used" warning.
tfidf_vect = TfidfVectorizer(tokenizer=my_tokenizer, token_pattern=None)
tfidf = tfidf_vect.fit_transform(tokenized_docs)

# Declare the model. random_state is fixed so that the fitted topics are
# reproducible across kernel restarts.
lda = LatentDirichletAllocation(n_components=20, random_state=1)
lda_output = lda.fit_transform(tfidf)

In [10]:
# Inspect the array returned by lda.fit_transform(tfidf)
lda_output

array([[0.00648215, 0.00648215, 0.00648215, ..., 0.70646002, 0.00648215,
        0.00648215],
       [0.00747962, 0.00747962, 0.00747962, ..., 0.59768233, 0.00747962,
        0.00747962],
       [0.00711621, 0.00711621, 0.00711621, ..., 0.67763737, 0.00711621,
        0.00711621],
       ...,
       [0.01171825, 0.01171825, 0.37940604, ..., 0.40966547, 0.01171825,
        0.01171825],
       [0.01313165, 0.01313165, 0.01313165, ..., 0.30708252, 0.01313165,
        0.01313165],
       [0.00555874, 0.00555874, 0.00555874, ..., 0.57912157, 0.00555874,
        0.00555874]])

In [12]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-2.1.2.tar.gz (1.6 MB)
Collecting numexpr
  Downloading numexpr-2.7.1-cp36-none-win_amd64.whl (90 kB)
Collecting pytest
  Downloading pytest-6.0.1-py3-none-any.whl (270 kB)
Collecting future
  Downloading future-0.18.2.tar.gz (829 kB)
Collecting funcy
  Downloading funcy-1.14.tar.gz (548 kB)
Collecting iniconfig
  Downloading iniconfig-1.0.1-py3-none-any.whl (4.2 kB)
Collecting more-itertools>=4.0.0
  Downloading more_itertools-8.5.0-py3-none-any.whl (44 kB)
Collecting pluggy<1.0,>=0.12
  Downloading pluggy-0.13.1-py2.py3-none-any.whl (18 kB)
Collecting atomicwrites>=1.0; sys_platform == "win32"
  Downloading atomicwrites-1.4.0-py2.py3-none-any.whl (6.8 kB)
Collecting py>=1.8.2
  Downloading py-1.9.0-py2.py3-none-any.whl (99 kB)
Building wheels for collected packages: pyLDAvis, future, funcy
  Building wheel for pyLDAvis (setup.py): started
  Building wheel for pyLDAvis (setup.py): finished with status 'done'
  Created wheel for pyLDAvis: filen

In [13]:
import pyLDAvis
import pyLDAvis.sklearn
# Enable pyLDAvis output inside the notebook
pyLDAvis.enable_notebook()
# Prepare the visualization from the fitted LDA model, the TF-IDF matrix and
# the vectorizer; mds='tsne' selects t-SNE for the topic layout
vis = pyLDAvis.sklearn.prepare(lda, tfidf, tfidf_vect, mds='tsne')
# Render the prepared visualization as this cell's output
pyLDAvis.display(vis)