# 영문데이터 LDA

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Download the 20 Newsgroups posts and preprocess them.

def get_news():
    """Download the 20 Newsgroups posts and return them as cleaned text.

    Each post is stripped of non-letter characters, words of three
    characters or fewer are dropped, the text is lower-cased, and NLTK's
    English stop words are removed.

    Returns:
        pandas.Series: one string per post (series name ``clean_doc``)
        holding the surviving words joined by single spaces. A post with no
        surviving words becomes an empty string.
    """
    # Fetch the posts without headers, footers and quoted replies; the fixed
    # random_state keeps the shuffled order the same between runs.
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))
    news_df = pd.DataFrame({'document': dataset.data})

    # Replace every non-letter character with a space. regex=True must be
    # explicit: since pandas 2.0 ``Series.str.replace`` treats the pattern as
    # a literal string by default, which would leave the text unchanged.
    news_df['clean_doc'] = news_df['document'].str.replace(
        r'[^a-zA-Z]', ' ', regex=True)
    # Keep only words longer than three characters.
    news_df['clean_doc'] = news_df['clean_doc'].apply(
        lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
    # Lower-case everything so it matches the lower-case stop-word list.
    news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

    tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())

    # A set gives constant-time membership tests in the per-token filter.
    stop_words = set(stopwords.words('english'))
    return tokenized_doc.apply(
        lambda x: ' '.join([item for item in x if item not in stop_words]))

In [None]:
# Download and clean the corpus. Despite the variable name, each entry is a
# single space-joined string of kept words rather than a list of tokens.
tokenized_docs = get_news()

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [None]:
# Show the cleaned documents (the notebook echoes a cell's last expression).
tokenized_docs

0        well sure story seem biased disagree statement...
1        yeah expect people read actually accept hard a...
2        although realize principle strongest points wo...
3        notwithstanding legitimate fuss proposal much ...
4        well change scoring playoff pool unfortunately...
                               ...                        
11309    danny rubenstein israeli journalist speaking t...
11310                                                     
11311    agree home runs clemens always memorable kinda...
11312    used deskjet orange micros grappler system upd...
11313    argument murphy scared hell came last year han...
Name: clean_doc, Length: 11314, dtype: object

In [None]:
# Check which container type get_news() returned.
type(tokenized_docs)

pandas.core.series.Series

In [None]:
# Split text into tokens on whitespace.
def my_tokenizer(text):
    """Return the whitespace-separated tokens of ``text`` as a list."""
    tokens = text.split()
    return tokens

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Build TF-IDF features from the cleaned documents. The documents are already
# whitespace-delimited, so a custom tokenizer replaces the default regex;
# token_pattern=None tells scikit-learn the pattern is intentionally unused
# and avoids its "token_pattern will not be used" warning.
tfidf_vect = TfidfVectorizer(tokenizer=my_tokenizer, token_pattern=None)
tfidf = tfidf_vect.fit_transform(tokenized_docs)

# Fit a 20-topic LDA model with online variational Bayes. A fixed
# random_state makes the learned topics reproducible between runs, matching
# the seeded dataset shuffle.
lda = LatentDirichletAllocation(n_components=20, max_iter=20,
                                learning_method='online', random_state=1)

# Per-document topic weights: one row per document, one column per topic.
lda_output = lda.fit_transform(tfidf)

In [None]:
!pip install pyLDAvis

Collecting pyLDAvis
[?25l  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 2.8MB/s 
Collecting funcy
[?25l  Downloading https://files.pythonhosted.org/packages/ce/4b/6ffa76544e46614123de31574ad95758c421aae391a1764921b8a81e1eae/funcy-1.14.tar.gz (548kB)
[K     |████████████████████████████████| 552kB 16.9MB/s 
Building wheels for collected packages: pyLDAvis, funcy
  Building wheel for pyLDAvis (setup.py) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-2.1.2-py2.py3-none-any.whl size=97712 sha256=cd2601c633237d98b2cce28917d07a64b393b7ef029d575833f8feb4a7490a74
  Stored in directory: /root/.cache/pip/wheels/98/71/24/513a99e58bb6b8465bae4d2d5e9dba8f0bef8179e3051ac414
  Building wheel for funcy (setup.py) ... [?25l[?25hdone
  Created wheel for funcy: filename=funcy-1.14-py2.py3-none-any.whl size=32042 sha256=4fd299e1

In [None]:
import pyLDAvis
# pyLDAvis 3.4.0 renamed the ``pyLDAvis.sklearn`` module to
# ``pyLDAvis.lda_model``. Try the new name first and fall back to the old one
# so the cell works with whichever version the unpinned install provides.
try:
    import pyLDAvis.lda_model as lda_vis
except ImportError:
    import pyLDAvis.sklearn as lda_vis

# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()
# Prepare the interactive topic view from the fitted model, the document-term
# matrix and the vectorizer; topics are laid out in 2-D with t-SNE.
vis = lda_vis.prepare(lda, tfidf, tfidf_vect, mds='tsne')
pyLDAvis.display(vis)