# Topic Modeing
### Jeff Ho

This notebook shows results from topic modeling. I will use LDA to generate topics based on the corpus.

**Goal:**
Provide recommendations to specific hotels

**Deliverables:**
1. Method to identify topics within reviews
2. Method to assign scores for each topic to each hotel

**Why?**
Understand each hotel’s performance beyond “happy”/“not happy”.


In [2]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import Phrases
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

import nltk
nltk.download('wordnet')

from pprint import pprint

[nltk_data] Downloading package wordnet to /Users/Jeff/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# load data
df_happy = pd.read_csv('hotel_happy_reviews.csv')
# display(df_happy.head())
print(len(df_happy))

df_not_happy = pd.read_csv('hotel_not_happy_reviews.csv')
# display(df_not_happy.head())
print(len(df_not_happy))

#join dataframes together
df = df_happy.append(df_not_happy,ignore_index=True)
display(df.sample(5))
print(len(df))

26521
12411


Unnamed: 0,User_ID,Description,Is_Response,hotel_ID
20099,id39751,Was very well located for everything we needed...,happy,4
8495,id22878,The Carlyle Hotel in DuPont Circle was extreme...,happy,8
668,id11331,I highly recommend the Andez hotel. My wife an...,happy,1
8217,id22451,Everything was spectacular. I travel a great d...,happy,4
15869,id33584,I love the Four Seasons Seattle. The room was ...,happy,8


38932


### Data pre-processing

In [3]:
# Function to perform lemmatizing and pre-processing

# Stemming — words are reduced to their root form.
# Lemmatizizing — words in third person are changed to first person and verbs in past and future tenses are changed into present.
def lemmatize_stemming(text):
    stemmer = SnowballStemmer("english")
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Remove stop words
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS:# potentially short words too: and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [4]:
# Make sure function works

doc_sample = df.loc[4310,'Description']
print('original review: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized review: ')
print(preprocess(doc_sample))
# works!

original review: 
['My', 'partner', 'and', 'I', 'just', 'completed', 'a', 'two-night', 'stay', 'at', 'the', 'Arctic', 'Club,', 'during', 'our', 'first-ever', 'trip', 'to', 'Seattle.', 'I', "can't", 'say', 'enough', 'great', 'things', 'about', 'this', 'property.', 'Not', 'only', 'does', 'it', 'have', 'history', 'in', 'spades', '(it', 'began', 'its', 'life', 'as', 'a', 'turn-of-the-twentieth-century', "gentleman's", 'club),', 'but', 'the', 'super-friendly', 'staff', 'and', 'prime', 'Seattle', 'location', 'near', 'historic', 'Pioneer', 'Square', 'make', 'it', 'a', 'must-consider', 'hotel.\r\nThe', 'rooms,', 'as', 'noted', 'by', 'other', 'reviewers,', 'are', 'very', 'nicely', 'appointed', 'and', 'pay', 'off', 'the', 'overall', 'historic', 'theme-credibility.', 'We', 'stayed', 'on', 'points,', 'but', 'were', 'upgraded', 'to', 'a', 'corner', 'king', 'with', 'whirlpool.', 'The', 'room', 'overall', 'was', 'very', 'comfortable', '--', 'great', 'bed', 'and', 'bathroom,', 'in', 'particular.\r\nIn

In [38]:
%%time

# Pre-process all reviews
processed_docs = df['Description'].map(preprocess)
processed_docs[:10]

CPU times: user 1min 25s, sys: 1.35 s, total: 1min 27s
Wall time: 1min 33s


0    [stay, husband, son, way, alaska, cruis, love,...
1    [room, nice, clear, updat, recent, clean, bed,...
2    [wife, stay, glorious, citi, sf, expens, littl...
3    [boyfriend, stay, fairmont, recent, trip, san,...
4    [step, time, squar, nice, room, stay, night, g...
5    [wife, kid, stay, valentin, weekend, nice, hot...
6    [high, recommend, hawthorn, terrac, afford, co...
7    [hotel, clean, nice, locat, good, free, shuttl...
8    [stay, elan, th, th, octob, like, return, day,...
9    [stay, night, happi, locat, min, walk, walk, f...
Name: Description, dtype: object

In [39]:
%%time

# Add bigrams and trigrams to docs
bigram = Phrases(processed_docs)
trigram = Phrases(bigram[processed_docs])
for idx in range(len(processed_docs)):
    for token in bigram[processed_docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            processed_docs[idx].append(token)
    for token in trigram[processed_docs[idx]]:
        if '_' in token:
            # Token is a trigram, add to document.
            processed_docs[idx].append(token)

CPU times: user 54.4 s, sys: 480 ms, total: 54.9 s
Wall time: 55.2 s


In [66]:
# save pre-processed reviews.
df.join(
    pd.DataFrame(processed_docs).rename(columns={'Description':'processed_review'})
).to_csv('pre_processed_reviews.csv')

In [40]:
%%time

# Create dictionary from dataset:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 alaska
1 alaska_cruis
2 ask
3 bed
4 best
5 citi
6 cruis
7 cruis_ship
8 delici
9 dinner
10 enjoy
CPU times: user 5.17 s, sys: 34.4 ms, total: 5.2 s
Wall time: 5.29 s


In [41]:
# How many items in the dictionary?
print(len(dictionary.items()))
dictionary[523]

38288


'girlfriend'

In [42]:
# Filter out extreme values of the dictionary.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [47]:
%%time

# Create a dictionary reporting how many words and how many times those words appear in each review
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
# bow_corpus[4310]

bow_doc_4310 = bow_corpus[4310]
# print words that appear more than once
for i in range(len(bow_doc_4310)):
    if bow_doc_4310[i][1]>1:
        print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                                         dictionary[bow_doc_4310[i][0]], 
                                                         bow_doc_4310[i][1]))

Word 14 ("great") appears 2 time.
Word 61 ("nice") appears 2 time.
Word 64 ("point") appears 2 time.
Word 110 ("overal") appears 2 time.
Word 135 ("thing") appears 2 time.
Word 170 ("trip") appears 2 time.
Word 615 ("pioneer_squar") appears 3 time.
Word 621 ("seattl") appears 4 time.
Word 714 ("club") appears 3 time.
Word 825 ("properti") appears 2 time.
Word 1116 ("histor") appears 2 time.
Word 1230 ("near") appears 2 time.
Word 1828 ("futur_trip") appears 3 time.
Word 2390 ("pike_place") appears 3 time.
Word 4699 ("beat_path") appears 3 time.
Word 6243 ("oh_yeah") appears 3 time.
CPU times: user 3.36 s, sys: 749 ms, total: 4.11 s
Wall time: 4.53 s


### Train LDA model

In [44]:
# Guess 10 topics for now
%time lda_model = gensim.models.LdaModel(bow_corpus, num_topics=10, id2word=dictionary, passes=2)

CPU times: user 1min 30s, sys: 841 ms, total: 1min 30s
Wall time: 1min 8s


In [45]:
top_topics = lda_model.top_topics(bow_corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / 10
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

Average topic coherence: -1.8576.
[([(0.03463333, 'new_york'),
   (0.031360663, 'time_squar'),
   (0.02025954, 'time'),
   (0.016527908, 'holiday_inn'),
   (0.015917344, 'locat'),
   (0.014613494, 'new'),
   (0.012641918, 'squar'),
   (0.011588677, 'york'),
   (0.010841425, 'great'),
   (0.0106599415, 'staff'),
   (0.009727528, 'nyc'),
   (0.009246683, 'th'),
   (0.009007723, 'good'),
   (0.008935182, 'central_park'),
   (0.008082985, 'clean'),
   (0.007236051, 'central'),
   (0.006547909, 'night'),
   (0.0064682886, 'friend'),
   (0.00597356, 'help'),
   (0.005810652, 'walk')],
  -1.3337978825983157),
 ([(0.032495447, 'walk'),
   (0.030828588, 'walk_distanc'),
   (0.019079227, 'locat'),
   (0.016126912, 'block_away'),
   (0.01357869, 'minut_walk'),
   (0.013474562, 'distanc'),
   (0.013400719, 'free'),
   (0.012215326, 'bed_comfort'),
   (0.011115284, 'good'),
   (0.010827124, 'block'),
   (0.010722507, 'free_internet'),
   (0.010494703, 'great'),
   (0.0103773875, 'clean'),
   (0.009

In [53]:
lda_model.save('lda.model')

In [8]:
lda_copy = gensim.models.LdaModel.load('lda.model')
lda_copy.show_topics(num_words=5)

[(0,
  '0.016*"great" + 0.012*"staff" + 0.011*"servic" + 0.010*"locat" + 0.009*"chicago"'),
 (1,
  '0.035*"new_york" + 0.031*"time_squar" + 0.020*"time" + 0.017*"holiday_inn" + 0.016*"locat"'),
 (2,
  '0.039*"park" + 0.015*"car" + 0.012*"valet_park" + 0.011*"area" + 0.011*"night"'),
 (3,
  '0.011*"check" + 0.009*"san_diego" + 0.008*"credit_card" + 0.008*"day" + 0.008*"night"'),
 (4,
  '0.026*"custom_servic" + 0.024*"staff" + 0.021*"servic" + 0.014*"great" + 0.013*"busi"'),
 (5,
  '0.011*"night" + 0.010*"bed" + 0.009*"check" + 0.008*"desk" + 0.008*"tell"'),
 (6,
  '0.028*"breakfast" + 0.020*"san_francisco" + 0.015*"continent_breakfast" + 0.015*"good" + 0.013*"locat"'),
 (7,
  '0.018*"year_old" + 0.018*"suit" + 0.011*"bed" + 0.010*"area" + 0.010*"kid"'),
 (8,
  '0.032*"walk" + 0.031*"walk_distanc" + 0.019*"locat" + 0.016*"block_away" + 0.014*"minut_walk"'),
 (9,
  '0.015*"floor" + 0.013*"th_floor" + 0.009*"bathroom" + 0.008*"good" + 0.008*"nois"')]

### Results from topic modeling
Many of the topics seem to make sense (e.g., 2: parking, 4: service, 6: breakfast, 7: kid, 8: distance). Others seem to be focused on geography (e.g., 1: new york), which could be because of the imbalanced data (e.g., too many reviews for the NY hotel.

**Next steps:** Get topic scores for each review / hotel?

**Validation:** Look at strongest reviews for each topic to interpret. 

### Get topic scores for each review

In [70]:
get_document_topics = [lda_model.get_document_topics(review) for review in bow_corpus]
display(get_document_topics[4310])
# This review is mostly topic 8, but also some parts 3 and 4 (and weaky 7)

[(3, 0.23514672), (4, 0.28776655), (7, 0.10101567), (8, 0.37018186)]

In [77]:
# Did the assignment work? Let's check this one....
print(df.loc[4,'Description'])
display(get_document_topics[4])
#Topic 1 is the one about NY and Times Square, so yes.

Steps off Times Square, nice rooms, stayed - nights, great for a short visit.


[(1, 0.9249818)]

In [106]:
#Get columns representing topic affinities for assigning scores
ts = get_document_topics
review_topics = pd.DataFrame(columns=range(10))
for t in ts:
    dummy_dict = {}
    for tup in t:
        dummy_dict[tup[0]] = [tup[1]]
#     print(dummy_dict)
#     display(pd.DataFrame.from_dict(dummy_dict))
    review_topics = review_topics.append(pd.DataFrame.from_dict(dummy_dict))
display(review_topics.fillna(0))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.614366,0.000000,0.000000,0.259785,0.000000,0.000000,0.000000,0.042424,0.072890,0.000000
0,0.000000,0.146320,0.167900,0.294863,0.032901,0.000000,0.000000,0.000000,0.141913,0.210382
0,0.230113,0.000000,0.051075,0.000000,0.000000,0.000000,0.710055,0.000000,0.000000,0.000000
0,0.000000,0.000000,0.000000,0.000000,0.748277,0.000000,0.235713,0.000000,0.000000,0.000000
0,0.000000,0.924982,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
0,0.119976,0.000000,0.109693,0.519192,0.000000,0.000000,0.000000,0.012953,0.016369,0.218998
0,0.157669,0.000000,0.000000,0.288282,0.162744,0.130608,0.150564,0.000000,0.000000,0.097118
0,0.000000,0.000000,0.623791,0.000000,0.000000,0.000000,0.357593,0.000000,0.000000,0.000000
0,0.000000,0.000000,0.121528,0.135007,0.113633,0.623513,0.000000,0.000000,0.000000,0.000000


In [107]:
# Write review topics to a csv
review_topics.to_csv('review_topic_probabilities.csv')