# LDA 토픽 모델링

## 1. 전처리

In [1]:
def clean_review(text: str) -> str:
            
    # 1. Remove HTML
    text = BeautifulSoup(text, "lxml").get_text()
    
    # 2. Remove Punctuations & lowercase
    text = "".join(v for v in text if v not in string.punctuation).lower().split()
    
    # 3. Remove Stop Word
    text = [word for word in text if word not in english_stopwords and word.islower()]
    
    # 4. lemmatization
    text = [lemmatizer.lemmatize(word) for word in text]
    
    return ' '.join(text)

In [47]:
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

english_stopwords = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

x = pd.read_csv('../tripadvisor_crawler/Seoul_URL.csv')
document_list = x.title.values
documents = []
documents_title = []

for t in document_list:
    t = "_".join(t.split())
    try:
        x = pd.read_csv('../tripadvisor_crawler/review/'+t+'-Seoul.csv')
        d = x.review_body.values
        d = d[:100]
        d = sum([clean_review(s).split() for s in d], [])
        documents.append(d)
        documents_title.append(t)
    except:
        continue

In [27]:
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from collections import Counter

dictionary = corpora.Dictionary(documents)

# 자주 등장하지 않는 단어 정리
min_count = 5
word_counter = Counter((word for words in documents for word in words))
removal_word_idxs = {
    dictionary.token2id[word] for word, count in word_counter.items()
    if count < min_count
}

dictionary.filter_tokens(removal_word_idxs)
dictionary.compactify()

corpus = [dictionary.doc2bow(text) for text in documents]

## 2. Modeling

In [32]:
from gensim.models.ldamodel import LdaModel

lda_model = LdaModel(corpus=corpus,
id2word=dictionary,
num_topics=10,
chunksize=100,
passes=30,
alpha='auto',
per_word_topics=True)

In [33]:
# Compute Perplexity
print('Perplexity: ', lda_model.log_perplexity(corpus)) # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=documents, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Perplexity:  -7.11347453764033
Coherence Score:  0.42784726890013713


## 3. 토픽 - 키워드 시각화

In [34]:
import pyLDAvis
import pyLDAvis.gensim as gensimvis

pyLDAvis.enable_notebook()
prepared_data = gensimvis.prepare(lda_model, corpus, dictionary, mds='mmds')
pyLDAvis.display(prepared_data)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [35]:
pyLDAvis.save_html(prepared_data, 'lda.html')

## 4. 문서별 토픽

In [57]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, titles=documents_title, texts=documents):
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus][0]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0: # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break

    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    titles = pd.Series(titles)
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, titles], axis=1) 
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

joint_documents = [" ".join(d) for d in documents]

df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus,
                                                  titles = documents_title, texts=joint_documents)
 
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
 
# Show
df_dominant_topic.head(15)

IndexError: index 283 is out of bounds for axis 0 with size 10

In [54]:
len(joint_documents)

424

In [44]:
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet,
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)],
                                            axis=0)

# Reset Index
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0.0,0.9721,"museum, korean, visit, history, palace, korea,...","[c, museum, displayed, paichai, hakdang, korea..."
1,1.0,0.7153,"university, street, place, area, shop, restaur...","[shop, cinema, cafe, restaurantsvery, crowded,..."
2,2.0,0.809,"place, traditional, area, house, village, shop...","[couple, hour, went, hanok, village, relaxing,..."
3,3.0,0.9813,"food, shop, street, market, place, shopping, g...","[initially, bit, confused, intimidated, setup,..."
4,4.0,0.9487,"park, place, seoul, walk, temple, beautiful, c...","[park, built, pine, forestthere, lot, huge, pi..."
5,5.0,0.5012,"park, olympic, stadium, world, game, festival,...","[world, peace, gate, built, lead, summer, olym..."
6,6.0,0.9086,"seoul, view, get, tower, time, go, station, to...","[magnificent, view, seoul, seoul, sky, observa..."
7,8.0,0.9803,"show, good, time, fun, great, performance, seo...","[nanta, show, lot, slapstick, humour, audience..."
8,9.0,0.9936,"place, museum, mall, take, fun, photo, book, l...","[trick, eye, museum, pretty, fun, definitely, ..."
