## Topic modeling using Yelp hotel reviews

In [None]:
import warnings

# Silence every warning category for the rest of the session
warnings.simplefilter("ignore")

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Load the whole "review" table from the SQLite database file into a DataFrame
db_uri = "sqlite:///yelpHotelData.db"
reviews_df = pd.read_sql_table("review", db_uri)

In [None]:
# Keep just the review identifier, the review text, and the rating
wanted_columns = ["reviewID", "reviewContent", "rating"]
df = reviews_df[wanted_columns]

In [None]:
# Take a smaller sample to ease computation time.
# The sample size is capped at the number of available rows so that sampling
# cannot fail on a small table, and the seed is fixed so that reruns draw the
# same reviews (and therefore produce comparable topics).
df = df.sample(n=min(1000, len(df)), random_state=42)

In [None]:
# Steps:
    # 1: Instantiate vectorizer with parameters: vec
    # 2: Vectorize column of text with fit_transform: X
    # 3: Instantiate LDA model with parameters: lda
    # 4: Fit LDA model to vectorized text and keep the transformed output: doc_topics

In [None]:
# 1 Build the bag-of-words vectorizer
vec = CountVectorizer(
    # drop English stop words and fold all text to lower case
    stop_words="english",
    lowercase=True,
    # (lower bound, upper bound): (1, 1) only unigrams, (1, 2) unigrams and bigrams,
    # (2, 2) only bigrams, etc.
    ngram_range=(1, 1),
    # ignore rare words (appear in fewer than 5 documents)
    min_df=5,
    # ignore common words (appear in more than 70% of documents)
    max_df=0.7,
)

In [None]:
# 2 Build the document-term matrix (dtm) from the review text column
review_texts = df["reviewContent"]
X = vec.fit_transform(review_texts)

In [None]:
# CHECK: shape of the document-term matrix, read as (documents, terms)
X.shape

In [None]:
# 3 Set up the LDA topic model
lda = LatentDirichletAllocation(
    n_components=10,  # number of topics to learn
    random_state=42,  # fixed seed
)

In [None]:
# 4 Fit the LDA model to the document-term matrix and keep the transformed output
doc_topics = lda.fit_transform(X)

In [None]:
# Prepare the pyLDAvis visualization data for the fitted model.
# pyLDAvis 3.4.0 renamed its scikit-learn adapter from `pyLDAvis.sklearn` to
# `pyLDAvis.lda_model`; try the new module first and fall back to the old one
# so this cell runs on either version.
import pyLDAvis  # keeps the top-level `pyLDAvis` name bound for pyLDAvis.display
try:
    import pyLDAvis.lda_model as pyldavis_adapter
except ImportError:
    import pyLDAvis.sklearn as pyldavis_adapter

lda_viz = pyldavis_adapter.prepare(lda_model=lda,
                                   dtm=X,
                                   vectorizer=vec,
                                   sort_topics=False)  # keep the model's own topic order

In [None]:
# Display the prepared visualization as this cell's output
pyLDAvis.display(lda_viz)

# pyLDAvis
# Left panel:
    # global view of the topic model
    # circle centers are placed according to the distances between topics, projected onto two dimensions
    # the area of each circle is the overall prevalence of that topic in the whole topic model
    # use it to examine how prevalent each topic is
    # use it to examine how topics relate to each other
# Right panel:
    # bars represent the individual terms that are most useful for interpreting the topic selected on the left
    # blue bars represent corpus-wide term frequencies
    # red bars represent topic-specific term frequencies
    # use it to examine the meaning of each topic

## Filter data to look at only bad reviews

In [None]:
# Keep only the sampled reviews whose rating equals 1
is_lowest_rating = df["rating"] == 1
small_bad = df[is_lowest_rating]

In [None]:
# 1 Build the bag-of-words vectorizer (maybe adjust parameters for this subset?)
vec = CountVectorizer(
    # drop English stop words and fold all text to lower case
    stop_words="english",
    lowercase=True,
    # (lower bound, upper bound): (1, 1) only unigrams, (1, 2) unigrams and bigrams,
    # (2, 2) only bigrams, etc.
    ngram_range=(1, 1),
    # ignore rare words (appear in fewer than 5 documents)
    min_df=5,
    # ignore common words (appear in more than 70% of documents)
    max_df=0.7,
)

In [None]:
# 2 Build the document-term matrix (dtm) from the bad-review text
bad_review_texts = small_bad["reviewContent"]
X = vec.fit_transform(bad_review_texts)

In [None]:
# CHECK: shape of the bad-review document-term matrix, read as (documents, terms)
X.shape

In [None]:
# 3 Set up the LDA topic model for the bad-review subset.
# A fixed seed is supplied so that rerunning the notebook yields the same
# topics instead of a different random initialisation each time.
lda = LatentDirichletAllocation(
    n_components=3,  # number of topics to learn
    random_state=42,  # fixed seed for reproducible topics
)

In [None]:
# 4 Fit the LDA model to the bad-review document-term matrix and keep the transformed output
doc_topics = lda.fit_transform(X)

In [None]:
# Prepare the pyLDAvis visualization data for the bad-review model.
# pyLDAvis 3.4.0 renamed its scikit-learn adapter from `pyLDAvis.sklearn` to
# `pyLDAvis.lda_model`; try the new module first and fall back to the old one
# so this cell runs on either version.
import pyLDAvis  # keeps the top-level `pyLDAvis` name bound for pyLDAvis.display
try:
    import pyLDAvis.lda_model as pyldavis_adapter
except ImportError:
    import pyLDAvis.sklearn as pyldavis_adapter

lda_viz = pyldavis_adapter.prepare(lda_model=lda,
                                   dtm=X,
                                   vectorizer=vec,
                                   sort_topics=False)  # keep the model's own topic order

In [None]:
# Display the prepared bad-review visualization as this cell's output
pyLDAvis.display(lda_viz)