# Import Data

In [1]:
from os import listdir
from os.path import isfile, join
# Directory that holds one review CSV per attraction.
data_path = '../tripadvisor_crawler/review/'
# os.listdir() returns entries in arbitrary order; sort them so the row index
# assigned to each file in later cells is the same on every run and platform.
onlyfiles = sorted(f for f in listdir(data_path) if isfile(join(data_path, f)))

In [2]:
# Show the file names collected from data_path.
onlyfiles

['Bukchon_Hanok_Village-Seoul.csv',
 'Changdeokgung_Palace-Seoul.csv',
 'Gyeongbokgung_Palace-Seoul.csv',
 'Insadong-Seoul.csv',
 'Kwangjang_Market-Seoul.csv',
 'Myeongdong_Shopping_Street-Seoul.csv',
 'The_War_Memorial_of_Korea-Seoul.csv']

In [3]:
import pandas as pd

In [4]:
# One row per input file: 'index' receives the file name without extension and
# 'reveiw' the concatenated review text (filled in by the loading loop below).
# NOTE(review): 'reveiw' is a misspelling of 'review'; it is kept because later
# cells read this exact column name.
documents = pd.DataFrame(columns=['index', 'reveiw'])

In [5]:
# Read every review CSV and collapse it into one long text per file.
for idx, file_name in enumerate(onlyfiles):
    csv_path = join(data_path, file_name)
    try:
        # pandas >= 1.3: malformed rows are skipped with a warning.
        data = pd.read_csv(csv_path, on_bad_lines='warn')
    except TypeError:
        # Older pandas does not accept `on_bad_lines`; use the legacy flag.
        data = pd.read_csv(csv_path, error_bad_lines=False)

    # Merge the first two columns of each row into "<col0>. <col1> "
    # (presumably review title and review body -- confirm against the crawler).
    # Missing cells become empty strings so the concatenation cannot fail.
    first_col = data.iloc[:, 0].fillna('').astype(str)
    second_col = data.iloc[:, 1].fillna('').astype(str)
    merged_rows = first_col + '. ' + second_col + ' '

    # str.join is linear in the text length and yields '' for an empty file,
    # whereas Series.sum() concatenates quadratically and returns 0 when empty.
    all_reviews = ''.join(merged_rows)

    # Strip only the final extension so names containing dots stay intact.
    documents.loc[idx] = [file_name.rsplit('.', 1)[0], all_reviews]

In [6]:
# Preview the first rows of `documents` (one row per input file).
documents.head()

Unnamed: 0,index,reveiw
0,Bukchon_Hanok_Village-Seoul,Nice walk. We had a couple of hours and we wen...
1,Changdeokgung_Palace-Seoul,"Serene palace. Honestly, I prefer this palace ..."
2,Gyeongbokgung_Palace-Seoul,Definitely worth stepping out of the airport f...
3,Insadong-Seoul,Chic area. Loved this Korean neighborhood with...
4,Kwangjang_Market-Seoul,Lots of “street food” options. Was initially a...


# Data Pre-processing

* Tokenization
> lower(소문자화), punctuation 제거, 글자 길이 3개 이하 단어 제거, stopwords 제거

* stopwords
> 제거

* 글자 길이 3개 이하 단어
> 제거

In [7]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MASTER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# Single Porter stemmer instance shared by lemmatize_stemming().
# NOTE(review): PorterStemmer appears to come from the wildcard
# `from nltk.stem.porter import *` import; consider importing it by name.
stemmer = PorterStemmer()

In [9]:
def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb (``pos='v'``) with a WordNet
    lemmatizer, and the resulting lemma is then passed through the shared
    ``stemmer``.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)
def preprocess(text):
    """Tokenize ``text`` and return the list of stemmed tokens worth keeping.

    Tokens come from ``gensim.utils.simple_preprocess``. A token is kept only
    when it is not in gensim's STOPWORDS and is longer than three characters;
    every kept token is normalized with ``lemmatize_stemming``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(tok)
        for tok in gensim.utils.simple_preprocess(text)
        if tok not in stopwords and len(tok) > 3
    ]

어떻게 전처리가 되는지 보자!

In [15]:
WTS = 0  # row label of `documents` to inspect ("want to see")
REVIEW_COLUMNS = 1  # positional index of the review text within a row

# Raw review text of the selected row.
doc_sample = documents.loc[WTS].values[REVIEW_COLUMNS]

# Before: the text split on single spaces.
print('original document: ')
words = doc_sample.split(' ')
print(words[:10])

# After: the same text run through preprocess().
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample)[:10])

original document: 
['Nice', 'walk.', 'We', 'had', 'a', 'couple', 'of', 'hours', 'and', 'we']


 tokenized and lemmatized document: 
['nice', 'walk', 'coupl', 'hour', 'go', 'hanok', 'villag', 'relax', 'stroll', 'disappoint']


In [16]:
# Run preprocess() on the review text of every document.
processed_docs = documents['reveiw'].apply(preprocess)
# Preview the first five token lists.
processed_docs.head()

0    [nice, walk, coupl, hour, go, hanok, villag, r...
1    [seren, palac, honestli, prefer, palac, compar...
2    [definit, worth, step, airport, transit, place...
3    [chic, area, love, korean, neighborhood, cool,...
4    [lot, street, food, option, initi, confus, int...
Name: reveiw, dtype: object

In [17]:
# Build the token <-> integer-id mapping from the preprocessed documents.
dictionary = gensim.corpora.Dictionary(processed_docs)

In [18]:
# Vocabulary pruning is currently disabled:
# dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Print the first 11 (token_id, token) pairs of the dictionary.
for shown, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if shown > 10:
        break

0 abil
1 abit
2 abl
3 abound
4 abroad
5 absolut
6 absorb
7 abund
8 abus
9 academi
10 accent


In [19]:
# Convert every token list into its bag-of-words form via the dictionary.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [20]:
# Inspect the bag-of-words entries of the second document.
# NOTE(review): this prints the full vector; consider slicing (e.g. [:10]) to keep the output short.
bow_corpus[1]

[(0, 5),
 (1, 1),
 (2, 82),
 (3, 1),
 (5, 50),
 (6, 6),
 (7, 2),
 (8, 1),
 (10, 8),
 (11, 2),
 (12, 2),
 (13, 127),
 (15, 2),
 (16, 8),
 (17, 1),
 (18, 4),
 (19, 11),
 (20, 3),
 (21, 1),
 (25, 2),
 (26, 1),
 (27, 19),
 (28, 7),
 (29, 2),
 (30, 76),
 (32, 28),
 (34, 54),
 (35, 1),
 (36, 1),
 (37, 6),
 (39, 5),
 (42, 26),
 (43, 101),
 (44, 8),
 (45, 1),
 (47, 1),
 (48, 20),
 (49, 80),
 (50, 1),
 (51, 5),
 (52, 3),
 (53, 12),
 (54, 29),
 (55, 1),
 (56, 1),
 (59, 1),
 (62, 9),
 (64, 1),
 (66, 69),
 (67, 12),
 (70, 6),
 (71, 1),
 (72, 38),
 (74, 1),
 (82, 9),
 (89, 1),
 (92, 2),
 (93, 1),
 (94, 9),
 (96, 8),
 (99, 3),
 (101, 2),
 (102, 5),
 (103, 2),
 (104, 123),
 (107, 15),
 (109, 4),
 (110, 4),
 (113, 1),
 (114, 220),
 (115, 5),
 (116, 1),
 (117, 3),
 (118, 3),
 (119, 1),
 (121, 5),
 (123, 1),
 (124, 2),
 (126, 6),
 (128, 4),
 (130, 89),
 (134, 5),
 (136, 6),
 (138, 1),
 (139, 57),
 (140, 2),
 (143, 3),
 (146, 12),
 (148, 2),
 (149, 3),
 (150, 2),
 (151, 4),
 (152, 5),
 (154, 22),
 (156, 

In [21]:
# Number of documents in the bag-of-words corpus.
len(bow_corpus)

7

In [22]:
# NOTE(review): this displays the entire corpus; consider showing a small slice instead.
bow_corpus

[[(0, 1),
  (1, 6),
  (2, 77),
  (3, 2),
  (4, 1),
  (5, 36),
  (6, 7),
  (7, 2),
  (8, 2),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 35),
  (14, 10),
  (15, 4),
  (16, 11),
  (17, 1),
  (18, 3),
  (19, 7),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 2),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 61),
  (28, 3),
  (29, 3),
  (30, 179),
  (31, 1),
  (32, 14),
  (33, 1),
  (34, 13),
  (35, 1),
  (36, 1),
  (37, 2),
  (38, 1),
  (39, 2),
  (40, 1),
  (41, 1),
  (42, 47),
  (43, 12),
  (44, 3),
  (45, 2),
  (46, 1),
  (47, 13),
  (48, 8),
  (49, 6),
  (50, 4),
  (51, 4),
  (52, 5),
  (53, 17),
  (54, 27),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 6),
  (60, 1),
  (61, 1),
  (62, 9),
  (63, 1),
  (64, 7),
  (65, 1),
  (66, 112),
  (67, 8),
  (68, 1),
  (69, 1),
  (70, 3),
  (71, 1),
  (72, 6),
  (73, 2),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 4),
  (80, 1),
  (81, 2),
  (82, 6),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 1),
  (88, 1),
  (89, 7),
  (

# LDA

## LDA Train

현재 문서의 개수가 7개이므로 그냥 2개로 하겠다. 추후에 perplexity($\sim$ likelihood)를 이용하여 주제 개수에 대한 cross-validation 진행

In [23]:
NUM_TOPICS = 2  # number of latent topics (k = 2)
LDA_MODEL_PATH = 'ldamodel'  # file the trained model is cached in

if isfile(LDA_MODEL_PATH):
    # Reuse the cached model so the topics shown below stay stable across runs.
    # NOTE(review): gensim's load() unpickles the file -- only load model files you trust.
    ldamodel = gensim.models.ldamodel.LdaModel.load(LDA_MODEL_PATH)
else:
    # No cached model yet (e.g. fresh checkout): train once and cache the result
    # so that Restart & Run All works without manual steps.
    ldamodel = gensim.models.ldamodel.LdaModel(bow_corpus, num_topics=NUM_TOPICS, id2word=dictionary)
    ldamodel.save(LDA_MODEL_PATH)

# Show the four highest-weighted words of every topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.030*"palac" + 0.021*"place" + 0.019*"visit" + 0.019*"shop"')
(1, '0.030*"shop" + 0.020*"place" + 0.019*"street" + 0.016*"food"')


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


각 단어 앞에 붙은 수치는 단어의 해당 토픽에 대한 기여도를 보여줍니다. 

또한 맨 앞에 있는 토픽 번호는 0부터 시작하므로 총 2개의 토픽에는 0과 1의 번호가 할당되어 있습니다.

passes는 알고리즘의 동작 횟수를 말하는데, 알고리즘이 결정하는 토픽의 값이 적절히 수렴할 수 있도록 충분히 적당한 횟수를 정해주면 됩니다. 

여기서는 총 15회를 수행하였습니다.

여기서는 num_words=4로 총 4개의 단어만 출력하도록 하였습니다.

## LDA Visualization

In [24]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# `pyLDAvis.gensim_models` -- confirm against the installed version.
import pyLDAvis.gensim
# Must run before display() so the visualization renders inline in the notebook.
pyLDAvis.enable_notebook()
# Prepare the visualization data from the model, the bag-of-words corpus and the dictionary.
vis = pyLDAvis.gensim.prepare(ldamodel, bow_corpus, dictionary)
pyLDAvis.display(vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


$$r(w, k | \lambda)=\lambda \log \left(\phi_{k w}\right)+(1-\lambda) \log \left(\frac{\phi_{k w}}{p_{w}}\right)$$

lambda가 1이면 토픽이 해당 word를 뱉는 확률로 rank를 세우고

lambda가 0이면 토픽이 해당 word를 뱉는 확률을 단어가 등장하는 빈도로 조정한 것으로 rank를 세운다.

특정 반복되는 단어가 많으면 문서에 대한 토픽을 해석하는 게 어려우니 0에 가깝게 두는 게 용이하다.

$\lambda=0.1$로 둔 다음에 해석을 하면

1번 토픽에 가까운 애들은 **전통**과 관련된 단어가 나오고

2번 토픽에 가까운 애들은 **도시**에 관련된 단어가 나오는 것을 보니

가장 분산을 잘 잡는 벡터 PC1은 도시화와 관련된 것 같다.

현재 PC1에 다 쏠린 이유는 토픽을 2개로 잡아서이다.

In [25]:
# pyLDAvis.save_html(vis, 'lda.html')
# pyLDAvis.save_json(vis, 'lda.json')

## LDA topic distribution for each documents

In [33]:
def make_topictable_per_doc(ldamodel, corpus, texts, indices):
    """Build a table holding the dominant topic of each requested document.

    Parameters
    ----------
    ldamodel
        Model that is indexed as ``ldamodel[corpus][idx]`` to obtain a list of
        ``(topic_id, weight)`` pairs for document ``idx``.
    corpus
        Corpus handed to ``ldamodel``.
    texts
        Unused; kept so existing callers keep working.
    indices
        Positions of the documents to include.

    Returns
    -------
    pandas.DataFrame
        One row per document that has at least one topic, with positional
        columns 0-3: document index, dominant topic number (1-based), weight of
        that topic rounded to 4 decimals, and the full ``(topic_id, weight)`` list.
    """
    # Build the per-document topic view once instead of once per document.
    doc_topics = ldamodel[corpus]

    rows = []
    for i in indices:
        topic_list = doc_topics[i]
        if len(topic_list) == 0:
            # No topic reported for this document: nothing to record.
            continue
        # max() returns the first pair with the highest weight, i.e. the same
        # pair a stable descending sort would put first.
        topic_num, prop_topic = max(topic_list, key=lambda pair: pair[1])
        # int(topic_num) + 1 to match the plotted topic indexing
        rows.append([int(i), int(topic_num) + 1, round(float(prop_topic), 4), topic_list])

    # Create the frame in one go; DataFrame.append was removed in pandas 2.0
    # and growing a frame row by row is quadratic.
    return pd.DataFrame(rows, columns=list(range(4)))

In [34]:
# Score every document in the corpus rather than a hard-coded list of indices.
doc_indices = list(range(len(bow_corpus)))
topictable = make_topictable_per_doc(ldamodel, bow_corpus, processed_docs, doc_indices)
# Column labels (Korean): place, dominant topic, weight of the dominant topic, weight of each topic.
topictable.columns = ['여행지', '가장 비중이 높은 토픽', '가장 높은 토픽의 비중', '각 토픽의 비중']
# Replace the numeric document index with the place name; the assignment aligns on the row index.
topictable['여행지'] = documents['index']
topictable

Unnamed: 0,여행지,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,Bukchon_Hanok_Village-Seoul,2.0,0.5965,"[(0, 0.40347362), (1, 0.5965264)]"
1,Changdeokgung_Palace-Seoul,1.0,1.0,"[(0, 0.99997973)]"
2,Gyeongbokgung_Palace-Seoul,1.0,1.0,"[(0, 0.9999736)]"
3,Insadong-Seoul,2.0,0.9788,"[(0, 0.02121191), (1, 0.97878814)]"
4,Kwangjang_Market-Seoul,2.0,0.9999,"[(1, 0.9998571)]"
5,Myeongdong_Shopping_Street-Seoul,2.0,1.0,"[(1, 0.9999579)]"
6,The_War_Memorial_of_Korea-Seoul,1.0,0.8869,"[(0, 0.8868781), (1, 0.113121934)]"


1번 토픽에 가까운 애들은 **전통**
> 창덕궁, 경복궁, 전쟁기념관

2번 토픽에 가까운 애들은 **도시**
> 북촌한옥, 인사동, 광장마켓, 명동

북촌한옥이 2번 토픽인 건 조금 아쉬운 결과인데 토픽의 비중을 봤을 때 60% vs 40%인 것과 