## Topic modeling using Yelp hotel reviews

In [1]:
import warnings
# Silence every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides deprecation warnings raised by the libraries used below — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import pyLDAvis.sklearn
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Load the whole "review" table from the local SQLite file into a DataFrame,
# then show it as the cell output.
reviews_df = pd.read_sql_table(
    table_name="review",
    con="sqlite:///yelpHotelData.db",
)
reviews_df

Unnamed: 0,date,reviewID,reviewerID,reviewContent,rating,usefulCount,coolCount,funnyCount,flagged,hotelID
0,6/8/2011,MyNjnxzZVTPq,IFTr6_6NI4CgCVavIL9k5g,Let me begin by saying that there are two kind...,5,18,11,28,N,tQfLGoolUMu2J0igcWcoZg
1,8/30/2011,BdD7fsPqHQL73hwENEDT-Q,c_-hF15XgNhlyy_TqzmdaA,The only place inside the Loop that you can st...,3,0,3,4,N,tQfLGoolUMu2J0igcWcoZg
2,6/26/2009,BfhqiyfC,CiwZ6S5ZizAFL5gypf8tLA,I have walked by the Tokyo Hotel countless tim...,5,12,14,23,N,tQfLGoolUMu2J0igcWcoZg
3,9/16/2010,Ol,nf3q2h-kSQoZK2jBY92FOg,"If you are considering staying here, watch thi...",1,8,2,6,N,tQfLGoolUMu2J0igcWcoZg
4,2/5/2010,i4HIAcNTjabdpG1K4F5Q2g,Sb3DJGdZ4Rq__CqxPbae-g,"This place is disgusting, absolutely horrible,...",3,11,4,9,N,tQfLGoolUMu2J0igcWcoZg
...,...,...,...,...,...,...,...,...,...,...
688324,2/14/2008,QOb5gxPnxMDuw6uNv1siFw,e7B7IsZlRT8LbFj8FcY78w,I loved this hotel. The price was very afford...,5,2,2,0,YR,9xny0IlJqTInobC6W-UxbA
688325,2/10/2008,D32IzMjxSiFVmFGbUbXI1Q,e7B7IsZlRT8LbFj8FcY78w,"Listen, if I had to come to NYC and get a hote...",2,2,0,0,YR,PmmTXis1gCL34mg2bZ9gtw
688326,1/12/2008,amtvE4O7FdMUHs0loFOc3Q,e7B7IsZlRT8LbFj8FcY78w,"If you don't like this bar, I pretty much hate...",5,0,0,0,YR,Mr6zu_hWk2CodBdqqMWQjg
688327,1/12/2008,GxmoeUXze6U7mfJLood57w,e7B7IsZlRT8LbFj8FcY78w,I super big heart Eiji. Let me start by sayin...,5,0,0,0,YR,-zetzVfO4X0dpiiTmjdeKg


In [4]:
# Keep just the three columns used below: the review ID, the review text, and the rating
df = reviews_df.loc[:, ["reviewID", "reviewContent", "rating"]]

In [5]:
# Take smaller sample to ease computation time.
# random_state pins the draw so a fresh Restart & Run All selects the same
# 1000 reviews (same seed value the first LDA model below uses).
df = df.sample(1000, random_state=42)

In [6]:
# Steps:
    # 1: Instantiate vectorizer with parameters: vectorizer
    # 2: Vectorize column of text with fit_transform: X
    # 3: Instantiate LDA model with parameters: lda
    # 4: Fit LDA model to vectorized text: doc_topics

In [7]:
# 1 Instantiate vectorizer
vec = CountVectorizer(
    stop_words="english",
    lowercase=True,
    # (lower bound, upper bound) on n-gram size:
    # (1, 1) unigrams only, (1, 2) unigrams and bigrams, (2, 2) bigrams only, etc.
    ngram_range=(1, 1),
    # ignore rare words (appear in fewer than 5 documents)
    min_df=5,
    # ignore common words (appear in more than 70% of documents)
    max_df=0.7,
)

In [8]:
# 2 Create dtm (document-term matrix): fit the vectorizer on the sampled
# review text and transform that same text in a single call
X = vec.fit_transform(df["reviewContent"])

In [9]:
# CHECK: dtm dimensions, read as (number of documents, number of terms kept by the vectorizer)
X.shape

(1000, 2115)

In [10]:
# 3 Instantiate LDA
lda = LatentDirichletAllocation(
    n_components=10,  # number of topics to learn
    random_state=42,  # fixed seed for the model's random number generation
)

In [11]:
# 4 Fit LDA model to the vectorized text and keep the transformed result
# (presumably one row of topic weights per document — confirm against the scikit-learn docs)
doc_topics = lda.fit_transform(X)

In [12]:
# Prepare the pyLDAvis visualization data from the fitted LDA model, the dtm it
# was fitted on, and the vectorizer that produced that dtm.
# (pyLDAvis.sklearn is imported with the other imports at the top of the notebook.)
# sort_topics=False is passed through to pyLDAvis; presumably it keeps the topic
# numbering in the model's own order rather than re-sorting — confirm in the pyLDAvis docs.
lda_viz = pyLDAvis.sklearn.prepare(
    lda_model=lda,
    dtm=X,
    vectorizer=vec,
    sort_topics=False,
)

In [13]:
# Show the prepared visualization for the full-sample topic model as the cell output
pyLDAvis.display(lda_viz)

# pyLDAvis
# Left panel: 
    # global view of topic model
    # circle centers are placed by computing the distance between topics and projecting it onto two dimensions
    # area of circles is the overall prevalence of the topic in the whole topic model
    # examine how prevalent each topic is
    # examine how topics relate to each other
# Right panel:
    # Bars represent individual terms that are most useful for interpreting selected topic on left
    # Blue bar represents corpus wide frequencies
    # Red bar represents topic-specific frequencies
    # examine the meaning of each topic

## Filter data to look at only bad reviews

In [14]:
# Keep only the sampled reviews whose rating equals 1
is_one_star = df["rating"].eq(1)
small_bad = df.loc[is_one_star]

In [15]:
# 1 Instantiate vectorizer, maybe adjust parameters?
vec = CountVectorizer(
    stop_words="english",
    lowercase=True,
    # (lower bound, upper bound) on n-gram size:
    # (1, 1) unigrams only, (1, 2) unigrams and bigrams, (2, 2) bigrams only, etc.
    ngram_range=(1, 1),
    # ignore rare words (appear in fewer than 5 documents)
    min_df=5,
    # ignore common words (appear in more than 70% of documents)
    max_df=0.7,
)

In [16]:
# 2 Create dtm (document-term matrix) from the rating == 1 subset only;
# this rebinds X, replacing the full-sample matrix built earlier
X = vec.fit_transform(small_bad["reviewContent"])

In [17]:
# CHECK: dtm dimensions for the subset, read as (number of documents, number of terms kept by the vectorizer)
X.shape

(57, 134)

In [18]:
# 3 Instantiate LDA
lda = LatentDirichletAllocation(
    n_components=3,  # number of topics to learn for the rating == 1 subset
    # Fixed seed, matching the first LDA model in this notebook, so that
    # re-running the cell gives the same topics instead of a new random fit.
    random_state=42,
)

In [19]:
# 4 Fit the 3-topic LDA model to the subset dtm and keep the transformed result
# (presumably one row of topic weights per document — confirm against the scikit-learn docs)
doc_topics = lda.fit_transform(X)

In [20]:
# Prepare the pyLDAvis visualization data for the rating == 1 subset model.
# pyLDAvis.sklearn is already imported earlier in the notebook, so it is not
# imported again here.
lda_viz = pyLDAvis.sklearn.prepare(
    lda_model=lda,
    dtm=X,
    vectorizer=vec,
    sort_topics=False,
)

In [21]:
# Show the prepared visualization for the rating == 1 subset model as the cell output
pyLDAvis.display(lda_viz)