<a href="https://colab.research.google.com/github/insightcampus/202008-youth-bigdata/blob/master/hnjoo/topic_modeling/T.M_2_LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## [토픽 모델링 실습2] LDA: 영어 문서 토픽 모델링

### 0. 사전준비

In [1]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### 1. 데이터 전처리

In [2]:
# Download the news corpus and preprocess it
def get_news():
  """Download the 20 Newsgroups corpus and return the cleaned documents.

  Headers, footers and quotes are stripped by the loader. Each document then
  has every non-letter character replaced by a space, words of three
  characters or fewer dropped, the text lower-cased, and NLTK English stop
  words removed.

  Returns:
    pandas.Series named 'clean_doc' holding one space-joined string of the
    remaining words per document (a document may end up as an empty string).
  """
  # Fetch 20 Newsgroups
  dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers','footers','quotes'))
  news_df = pd.DataFrame({'document': dataset.data})

  # Replace everything that is not an ASCII letter with a space.
  # regex=True must be explicit: pandas >= 2.0 defaults to regex=False, which
  # would treat the pattern as a literal string and leave the text unchanged.
  news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ", regex=True)
  # Drop short words (three characters or fewer)
  news_df['clean_doc'] = news_df['clean_doc'].apply(
      lambda doc: ' '.join(word for word in doc.split() if len(word) > 3))
  # Lower-case every word
  news_df['clean_doc'] = news_df['clean_doc'].str.lower()

  # NLTK stop words; a set gives constant-time membership checks per token
  stop_words = set(stopwords.words('english'))

  # Tokenize on whitespace, drop stop words, and join back into one string
  return news_df['clean_doc'].apply(
      lambda doc: ' '.join(word for word in doc.split() if word not in stop_words))

In [3]:
# def get_news(apply_split=True):
#   # 20newsgroup 다운로드
#   dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers','footers','quotes'))
#   documents = dataset.data

#   news_df = pd.DataFrame({'document':documents})

#   # 전처리
#     # 특수 문자 제거
#   news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ") 
#   news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
#     # 전체 단어에 대한 소문자 변환
#   news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower()) 
    
#   # 토큰화
#   tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())  
#   # NLTK 불용어 조회
#   stop_words = stopwords.words('english')  

#   if apply_split:
#     return tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])
#   else : 
#     return tokenized_doc.apply(lambda x: ' '.join([item for item in x if item not in stop_words]))

In [4]:
# Whitespace tokenizer
def my_tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of pieces."""
    tokens = text.split()
    return tokens

# NOTE: despite the name, each element is one space-joined string of cleaned
# words (get_news() joins the tokens back together), not a list of tokens.
tokenized_docs = get_news()

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [5]:
# Preview the cleaned documents returned by get_news()
tokenized_docs

0        well sure story seem biased disagree statement...
1        yeah expect people read actually accept hard a...
2        although realize principle strongest points wo...
3        notwithstanding legitimate fuss proposal much ...
4        well change scoring playoff pool unfortunately...
                               ...                        
11309    danny rubenstein israeli journalist speaking t...
11310                                                     
11311    agree home runs clemens always memorable kinda...
11312    used deskjet orange micros grappler system upd...
11313    argument murphy scared hell came last year han...
Name: clean_doc, Length: 11314, dtype: object

### 2. 토픽 모델링

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Build TF-IDF features; the documents are already cleaned, so my_tokenizer
# only has to split them on whitespace.
tfidf_vect = TfidfVectorizer(tokenizer=my_tokenizer)
tfidf = tfidf_vect.fit_transform(tokenized_docs)

# Declare and fit the model.
# random_state fixes the otherwise random initialisation so the learned
# topics are reproducible across Restart & Run All.
lda = LatentDirichletAllocation(n_components=20, max_iter=20, learning_method='online', random_state=1)
lda_output = lda.fit_transform(tfidf)

### 3. 시각화

In [None]:
!pip install pyLDAvis

In [10]:
import pyLDAvis

# pyLDAvis 3.4.0 renamed its scikit-learn adapter from `pyLDAvis.sklearn` to
# `pyLDAvis.lda_model`. Try the new module first and fall back to the old
# name so the cell works with whichever release was installed.
try:
    import pyLDAvis.lda_model as pyldavis_sklearn
except ImportError:
    import pyLDAvis.sklearn as pyldavis_sklearn

pyLDAvis.enable_notebook()
# Project the fitted topics to 2-D with t-SNE and render the interactive view
vis = pyldavis_sklearn.prepare(lda, tfidf, tfidf_vect, mds='tsne')
pyLDAvis.display(vis)