# Topic model

In [7]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer    # stemmer : 어간추출기
from gensim import corpora, models

import gensim          # LDA Model
import pandas as pd
import string

# Data loading & pre-processing

In [8]:
# Five short sample documents that make up the toy corpus.
# Long literals are split with implicit string concatenation for readability.
doc1 = ("Sugar is bad to consume. "
        "My sister likes to have sugar, but not my father.")
doc2 = ("My father spends a lot of time "
        "driving my sister around to dance practice.")
doc3 = ("Doctors suggest that driving may cause "
        "increased stress and blood pressure.")
doc4 = ("Sometimes I feel pressure to perform well at school, "
        "but my father never seems to drive my sister to do better.")
doc5 = ("Health experts say that Sugar "
        "is not good for your lifestyle.")

In [9]:
# Two-column frame: a document id alongside the document's raw text.
df = pd.DataFrame(dict(
    id=['doc1', 'doc2', 'doc3', 'doc4', 'doc5'],
    contents=[doc1, doc2, doc3, doc4, doc5],
))

In [10]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,id,contents
0,doc1,Sugar is bad to consume. My sister likes to ha...
1,doc2,My father spends a lot of time driving my sist...
2,doc3,Doctors suggest that driving may cause increas...
3,doc4,Sometimes I feel pressure to perform well at s...
4,doc5,Health experts say that Sugar is not good for ...


In [13]:
# Pull the raw document strings out of the 'contents' column as a plain list.
docs = df['contents'].tolist()
docs

['Sugar is bad to consume. My sister likes to have sugar, but not my father.',
 'My father spends a lot of time driving my sister around to dance practice.',
 'Doctors suggest that driving may cause increased stress and blood pressure.',
 'Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.',
 'Health experts say that Sugar is not good for your lifestyle.']

In [20]:
# Read every document in `docs`, preprocess it, and collect the result in `texts`.
# Pipeline per document: lowercase -> tokenize -> drop stop words -> stem.
tokenizer = RegexpTokenizer(r'\w+')   # sentence -> words: each run of word characters is one token
stop = get_stop_words('en')           # English stop-word list
stop_lookup = set(stop)               # set copy so the per-token membership test below is O(1)
stemmer = PorterStemmer()             # reduces a token to its stem
texts = []                            # one list of stemmed tokens per non-empty document

for d in docs:
    if not d:
        continue                      # skip empty documents

    raw = d.lower()                   # lowercase before tokenizing
    tokens = tokenizer.tokenize(raw)
    stopped_tokens = [tok for tok in tokens if tok not in stop_lookup]   # keep only non-stop-word tokens
    print(stopped_tokens)                                                # show the tokens that survived filtering

    stemmed_tokens = [stemmer.stem(tok) for tok in stopped_tokens]       # keep only each token's stem (e.g. spends -> spend)
    texts.append(stemmed_tokens)                                         # store the preprocessed document

['sugar', 'bad', 'consume', 'sister', 'likes', 'sugar', 'father']
['father', 'spends', 'lot', 'time', 'driving', 'sister', 'around', 'dance', 'practice']
['doctors', 'suggest', 'driving', 'may', 'cause', 'increased', 'stress', 'blood', 'pressure']
['sometimes', 'feel', 'pressure', 'perform', 'well', 'school', 'father', 'never', 'seems', 'drive', 'sister', 'better']
['health', 'experts', 'say', 'sugar', 'good', 'lifestyle']


In [22]:
print(texts)

[['sugar', 'bad', 'consum', 'sister', 'like', 'sugar', 'father'], ['father', 'spend', 'lot', 'time', 'drive', 'sister', 'around', 'danc', 'practic'], ['doctor', 'suggest', 'drive', 'may', 'caus', 'increas', 'stress', 'blood', 'pressur'], ['sometim', 'feel', 'pressur', 'perform', 'well', 'school', 'father', 'never', 'seem', 'drive', 'sister', 'better'], ['health', 'expert', 'say', 'sugar', 'good', 'lifestyl']]


# Document-term matrix 

In [27]:
# The LDA model needs the documents in document-term matrix form.
# First step: a dictionary built from every word that occurs in any document.
dictionary = corpora.Dictionary(documents=texts)
print(dictionary)

Dictionary<34 unique tokens: ['bad', 'consum', 'father', 'like', 'sister']...>


In [25]:
# Convert each preprocessed document in `texts` to its bag-of-words vector,
# giving the document-term matrix (42 non-zero entries for the five sample documents).
corpus = [dictionary.doc2bow(stemmed_doc) for stemmed_doc in texts]
corpus  # each entry is (word's index value, frequency)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2)],
 [(2, 1), (4, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(8, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1)],
 [(2, 1),
  (4, 1),
  (8, 1),
  (18, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1)],
 [(5, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1)]]

In [29]:
# Model that applies tf-idf weights in place of raw word frequencies.
tfidf = models.TfidfModel(corpus=corpus, id2word=dictionary)
print(tfidf)

TfidfModel<num_docs=5, num_nnz=42>


In [37]:
# Apply the tf-idf model to the bag-of-words corpus.
corpus_tfidf = tfidf[corpus]
# Materialise the transformed corpus as a list so it can be handed to the LDA model.
corpus_tfidf_list = list(corpus_tfidf)
corpus_tfidf_list

[[(0, 0.4715096067582428),
  (1, 0.4715096067582428),
  (2, 0.14965422842541531),
  (3, 0.4715096067582428),
  (4, 0.14965422842541531),
  (5, 0.5368829444421276)],
 [(2, 0.126430480321383),
  (4, 0.126430480321383),
  (6, 0.398339470162723),
  (7, 0.398339470162723),
  (8, 0.126430480321383),
  (9, 0.398339470162723),
  (10, 0.398339470162723),
  (11, 0.398339470162723),
  (12, 0.398339470162723)],
 [(8, 0.11648072836469117),
  (13, 0.3669911836371611),
  (14, 0.3669911836371611),
  (15, 0.3669911836371611),
  (16, 0.3669911836371611),
  (17, 0.3669911836371611),
  (18, 0.20893668382502806),
  (19, 0.3669911836371611),
  (20, 0.3669911836371611)],
 [(2, 0.1080649901332212),
  (4, 0.1080649901332212),
  (8, 0.1080649901332212),
  (18, 0.1938409983609264),
  (21, 0.3404760529532436),
  (22, 0.3404760529532436),
  (23, 0.3404760529532436),
  (24, 0.3404760529532436),
  (25, 0.3404760529532436),
  (26, 0.3404760529532436),
  (27, 0.3404760529532436),
  (28, 0.3404760529532436)],
 [(5, 0.2

# LDA Model Training

In [39]:
# LDA model training.
# LDA inference is stochastic, so a fixed `random_state` is passed to make the
# learned topics reproducible across "Restart Kernel -> Run All".
# NOTE(review): the model is fed tf-idf weights (corpus_tfidf_list) rather than
# raw term counts; LDA is usually described over counts -- confirm this is intended.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus_tfidf_list,
    num_topics=3,
    id2word=dictionary,
    passes=50,
    random_state=42,   # arbitrary fixed seed; any constant works
)

# Print the result: the top 4 words of each of the 3 topics.
print(ldamodel.print_topics(num_topics=3, num_words=4))

[(0, '0.044*"sugar" + 0.041*"consum" + 0.041*"bad" + 0.041*"like"'), (1, '0.046*"health" + 0.046*"lifestyl" + 0.046*"say" + 0.046*"expert"'), (2, '0.030*"sugar" + 0.030*"father" + 0.030*"sister" + 0.030*"drive"')]


In [51]:
import pyLDAvis.gensim_models

In [52]:
# Enable pyLDAvis rendering inside the notebook, then build and show the
# interactive view for the trained topic model.
pyLDAvis.enable_notebook()
# NOTE(review): `ldamodel` was fit on `corpus_tfidf_list`, while the raw
# bag-of-words `corpus` is passed here -- confirm that is intended.
vis = pyLDAvis.gensim_models.prepare(
    topic_model=ldamodel,
    corpus=corpus,
    dictionary=dictionary,
)
pyLDAvis.display(vis)