In [1]:
import numpy as np
import pandas as pd

import re
import spacy
from spacy.tokens import Token
from spacy.matcher import Matcher

import pickle

In [2]:
with open('df_nbc.pickle', 'rb') as handle:
    df_nbc = pickle.load(handle)

In [4]:
df_nbc.head()

Unnamed: 0,review_dates,review_ratings,reviews
0,2020-02-15,5.0,Cute place to study ! They have individuals t...
1,2020-03-04,4.0,"Love the rose croissant here, and the drinks a..."
2,2020-03-03,4.0,Time* Around 5pm on a Tuesday. \n\nWait* None....
3,2020-02-28,4.0,"Okay, you know how there are a million artisan..."
4,2020-01-18,5.0,Everything but the parking situation is great ...


In [9]:
nlp = spacy.load('en_core_web_md')

In [10]:
# include with medium library - for some reason is_stop does not work without it
nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)

12

By identifying patterns in our descriptions, we get a clearer idea on what these descriptions are. In the modeling below, we then further identify groups of these patterns that are commonly talked about together in the reviews. Look for simple, descriptive patterns in the sentences such as the ones below.

In [179]:
# latte is (very) delicious
p_something_is = [{"POS": "NOUN"}, {"POS": "VERB"}, {"POS": "ADV", "OP": "?"},
           {"POS": "ADJ"}]

# (suitable) environment for studying
p_something_for = [{"POS": "ADJ", "OP": "?"}, {"POS": "NOUN"}, {"POS": "ADP"}, {"POS": "VERB"}]

# terrible customer (service)
p_desc = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]

# parking in (the) back / lots of plugs
p_something_of = [{"POS": "NOUN"}, {"POS": "ADP"}, {"POS": "DET", "OP": "?"}, {"POS": "NOUN"}]

# layout has seats
p_something_has = [{"POS": "NOUN"}, {"POS": "VERB"}, {"POS": "NOUN"}]

In [176]:
patterns = {'something_is': p_something_is,  'something_for': p_something_for, 'desc': p_desc, 
            'something_of': p_something_of, 'something_has': p_something_has}

In [177]:
matcher = Matcher(nlp.vocab)
for key in patterns.keys():
    matcher.add(key, None, patterns[key])

Below I am trying to capture statements that are more descriptive that we can use as inputs into the topic model. However, notice that for descriptions that are longer than 2 words, now that I've captured the descriptions, I can probably condense them into two words (the first and the last) without losing too much meaning. For example, "parking in the back" - I don't need this entire statement, or n-gram, to be included in the dataset. Instead, I can condense to the bigram, "parking back" and still understand that this person is describing parking that's in the back. 

In [178]:
matches = matcher(review_doc)
for _, start, end in matches:
    print(review_doc[start:end])

Cute place
table for studying
lots of plugs
parking in the back
back behind the store
more selection
selection of food
green tea
macchiato is amazing
level is minimal
suitable environment
suitable environment for studying
environment for studying
great experience


In [4]:
def text_preprocess(string):
    
    string = string.strip()
    string = string.lower()
    string = re.sub('\n', '', string)
    string = re.sub('[!#?,.:";]', '', string)
    
    return string

In [9]:
def create_word_vector(review, matcher, include_all=False):
    
    review = text_preprocess(review)
    review_doc = nlp(review)
    
    tokens = set()
    
    matches = matcher(review_doc)
    for _, beg, end in matches:
        
        # get lemma form of first word in match
        first_word = review_doc[beg].lemma_
        # get lemma form of last word in match
        last_word = review_doc[end-1].lemma_
        
        # don't include anything with pronouns
        # might change later
        if first_word == '-PRON-'or last_word == '-PRON-':
            continue
        
        # assign first and last word alphabetically with min/max funcs
        # e.g. we don't want both ('cute', 'place') and ('place', 'cute') in dataset, just one of them
        word1 = min(first_word, last_word)
        word2 = max(first_word, last_word)
        
        # add pair to tokens set
        tokens.add((word1, word2))
        
    # if we want to include all tokens in set    
    if include_all:
        for token in review_doc:
            if not token.is_stop and not token.lemma_.isspace():
                tokens.add((token.lemma_,))
                
    return list(tokens)

In [93]:
df_nbc['ngrams'] = df_nbc['reviews'].apply(create_word_vector)

In [94]:
df_nbc.head()

Unnamed: 0,review_dates,review_ratings,reviews,ngrams
0,2020-02-15,5.0,Cute place to study ! They have individuals t...,"[(level, minimal), (experience, great), (lot, ..."
1,2020-03-04,4.0,"Love the rose croissant here, and the drinks a...","[(high, note), (delicious, drink), (key, vibe)..."
2,2020-03-03,4.0,Time* Around 5pm on a Tuesday. \n\nWait* None....,"[(cup, handle), (charcoal, surprise), (interio..."
3,2020-02-28,4.0,"Okay, you know how there are a million artisan...","[(only, worker), (menu, scarce), (coffee, fanc..."
4,2020-01-18,5.0,Everything but the parking situation is great ...,"[(koreatown, shop), (latte, spanish), (drink, ..."


In [152]:
# create column with all tokens, not just extracted statements
token_list = [create_word_vector(review, matcher, True) for review in df_nbc['reviews']]

In [153]:
df_nbc['tokens'] = np.array(token_list)

In [154]:
df_nbc.head()

Unnamed: 0,review_dates,review_ratings,reviews,ngrams,tokens
0,2020-02-15,5.0,Cute place to study ! They have individuals t...,"[(level, minimal), (experience, great), (lot, ...","[(make,), (lot, plug), (study, table), (noise,..."
1,2020-03-04,4.0,"Love the rose croissant here, and the drinks a...","[(high, note), (delicious, drink), (key, vibe)...","[(croissant,), (lemonade,), (blue,), (dark, fl..."
2,2020-03-03,4.0,Time* Around 5pm on a Tuesday. \n\nWait* None....,"[(cup, handle), (charcoal, surprise), (interio...","[($,), (cup, handle), (wilshire,), (smooth, te..."
3,2020-02-28,4.0,"Okay, you know how there are a million artisan...","[(only, worker), (menu, scarce), (coffee, fanc...","[(menu, scarce), (okay,), (coffee, fancé), (tu..."
4,2020-01-18,5.0,Everything but the parking situation is great ...,"[(koreatown, shop), (latte, spanish), (drink, ...","[(reasonably,), (latte, spanish), (downside,),..."


In [63]:
df_nbc['ngrams'][0]

[('level', 'minimal'),
 ('experience', 'great'),
 ('lot', 'plug'),
 ('study', 'table'),
 ('food', 'selection'),
 ('back', 'parking'),
 ('environment', 'suitable'),
 ('more', 'selection'),
 ('amazing', 'macchiato'),
 ('cute', 'place'),
 ('green', 'tea'),
 ('environment', 'table')]

In [95]:
df_nbc['ngrams'][18]

[('drink', 'other'),
 ('cool', 'interior'),
 ('good', 'spot'),
 ('layout', 'seat'),
 ('barista', 'nice'),
 ('attention', 'what'),
 ('attention', 'much'),
 ('basic', 'drink')]

## Modeling - LDA

In [135]:
from sklearn.feature_extraction.text import CountVectorizer
X = df_nbc['ngrams']
count_vec = CountVectorizer(analyzer=lambda x:x)
X_train = count_vec.fit_transform(X)

In [99]:
X_train

<180x1110 sparse matrix of type '<class 'numpy.int64'>'
	with 1504 stored elements in Compressed Sparse Row format>

In [100]:
from sklearn.decomposition import LatentDirichletAllocation

In [139]:
LDA = LatentDirichletAllocation(n_components=5, random_state=42)

In [140]:
LDA.fit(X_train)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=5, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

## Some notes...

Topic modeling involves identifying topics that are latent in the text that contain them. LDA assumes that a corpus is generated by a topic, which is generated by words belonging to that topic. However, with reviews, this might be an incorrect assumption to make because a reviewer isn't necessarily keeping to a certain topic in a review; a review rather might convey a positive or negative tone about many parts of the experience. They might talk about the quality of the products, the service, the ambiance of the store - a wide range of topics. Therefore, we might not find a particular theme in each of the LDA components, but more of things that are commonly said together in comments. Something to try out later is applying this with kMeans.

I think it might be meaningful to be able to identify reviews that contain mentions of specific products and see what these reviews are saying about that product. Extract the entire review this mention is in but also take the specific sentences talking about the product and do a sentiment analysis to see if they're generally positive or negative.

Again, we won't necessarily be able to identify common themes or patterns. Rather, these are common statements that are mentioned together in groups of reviews.

For instances, reviews in the first group (topic 0) tend to talk about good coffee, good service, the parking, and the place being overall a great spot to go. Going down a little, reviews in the third group (topic 2) talk about free wifi, lots of outlets, good coffee, and the spot being a favorite of customers.

In [141]:
for i, topic in enumerate(LDA.components_):
    print(f'Top 15 words/phrases for topic #{i}')
    print([count_vec.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

Top 15 words/phrases for topic #0
[('number', 'outlet'), ('first', 'time'), ('small', 'table'), ('k', 'shop'), ('friendly', 'staff'), ('front', 'parking'), ('free', 'parking'), ('coffee', 'great'), ('back', 'lot'), ('lot', 'people'), ('good', 'service'), ('good', 'place'), ('drink', 'good'), ('great', 'place'), ('coffee', 'good')]


Top 15 words/phrases for topic #1
[('environment', 'great'), ('nice', 'place'), ('hour', 'late'), ('back', 'lot'), ('price', 'reasonable'), ('decor', 'minimal'), ('coffee', 'cup'), ('iced', 'latte'), ('coffee', 'nothing'), ('small', 'table'), ('cup', 'free'), ('ktown', 'shop'), ('barista', 'nice'), ('coffee', 'late'), ('coffee', 'great')]


Top 15 words/phrases for topic #2
[('coffee', 'nothing'), ('great', 'place'), ('favorite', 'spot'), ('instrumental', 'music'), ('menu', 'minimal'), ('drink', 'other'), ('good', 'spot'), ('friendly', 'staff'), ('side', 'sweet'), ('free', 'wifi'), ('cafe', 'cute'), ('lot', 'small'), ('lot', 'outlet'), ('back', 'lot'), ('co

In [160]:
topic_results = LDA.transform(X_train)
df_nbc['topic_statements'] = topic_results.argmax(axis=1)

In [161]:
df_nbc.head()

Unnamed: 0,review_dates,review_ratings,reviews,ngrams,tokens,topic_statements
0,2020-02-15,5.0,Cute place to study ! They have individuals t...,"[(level, minimal), (experience, great), (lot, ...","[(make,), (lot, plug), (study, table), (noise,...",3
1,2020-03-04,4.0,"Love the rose croissant here, and the drinks a...","[(high, note), (delicious, drink), (key, vibe)...","[(croissant,), (lemonade,), (blue,), (dark, fl...",1
2,2020-03-03,4.0,Time* Around 5pm on a Tuesday. \n\nWait* None....,"[(cup, handle), (charcoal, surprise), (interio...","[($,), (cup, handle), (wilshire,), (smooth, te...",3
3,2020-02-28,4.0,"Okay, you know how there are a million artisan...","[(only, worker), (menu, scarce), (coffee, fanc...","[(menu, scarce), (okay,), (coffee, fancé), (tu...",2
4,2020-01-18,5.0,Everything but the parking situation is great ...,"[(koreatown, shop), (latte, spanish), (drink, ...","[(reasonably,), (latte, spanish), (downside,),...",4


In [172]:
df_nbc['topic_statements'].value_counts()

3    50
4    39
0    37
2    30
1    24
Name: topic_statements, dtype: int64

## Some reviews belonging to each group...

In [173]:
from random import randint

def print_reviews(df, topic_col, topic_num, n=5):
    df_topic = df[df[topic_col] == topic_num]['reviews']
    max_range = len(df_topic)
    for i in range(n):
        rand_num = randint(0, max_range)
        print(rand_num)
        print(df_topic.iloc[rand_num])
        print('\n')
        print('------------------------------------------------------------------')

In [174]:
for i in range(5):
    print('REVIEWS FOR TOPIC ' + str(i) + ":")
    print('\n')
    print_reviews(df_nbc, 'topic_statements', i)

REVIEWS FOR TOPIC 0:


21
I think I'm in love with this place! The moment I walked in the first time, I felt as though the atmosphere was made for me. I don't want to describe the atmosphere as "chill," but honestly, I cannot think of a better word. The decorations are minimal, the tables are minimal, even the employees seem minimal. this place is a minimalist's dream. The service was nice, everyone seemed so patient. Parking is a private lot in the back, but it's usually full. The music they play there is just quite enough for you to block it out if you want to. 
Honey truffle latte: YUM! The flavor of the truffle was perfectly done. Usually I don't enjoy truffle because the flavor is too dominant, but this cup of latte was great. I had a sip of this cold, and it just didn't do the truffle any justice compared to the hot. If you order this, definitely opt for the hot option.
NBC latte: yum! Absolutely perfect amount of cinnamon in this! This is similar to a Spanish latte at Urth Caffe

## Identifying products in reviews

Using regex and spaCy to come up with an efficient way of identifying sentences and reviews with certain products... 

Here, we can identify reviews that mention specific products. We can take those sentences, see what kind of things are being said about the product, and do sentiment analysis and trend out positive and negative reviews.

There would be an argument that if we're generalizing the products to just ones like "mocha" or "latte", the regex patterns below are superfluous but good to have in case we do want to be more specific with products in the future.

In [7]:
mocha_p = '(earl ?gr(a|e)y ?)?(mocha)'

latte_p = '(matcha ?|charcoal ?)?(latte|matcha)'

coffee_p = '(espresso|americano|cappucino|cortado|nbc|ktown)'

tea_p = '(green ?|black ?|rose ?|chrysanthemum ?)?(tea|chai)'
            
drinks_p = '(blue ?)?(lemonade)'

food_p = '(egg ?avocado ?|penne ? basil ? pesto ?)?(salad)|(arugula ?)?(toast)|(pastrami ?cheese ?|cheese ?)?(sandwich)'


In [37]:
from textblob import TextBlob

def get_prod_sents(reviews, patterns):
    '''
    Searches each review if product type is mentioned in review. 
    Returns two nested lists - one with lists for each product type where each element of each list is either
    an empty string or the sentence(s) that contain the product mention, and another with lists for each product
    type where each element of each list is the sentiment score corresponding to the sentence(s) mentioning the 
    product
    '''
    
    item_types = len(patterns)
    prod_matches = [[] for i in range(item_types)]
    sentiment_scores = [[] for i in range(item_types)]
    
    for review in reviews:
        review = text_preprocess(review)
        review_doc = nlp(review)
        
        for i in range(item_types):
            prod_sents = None
            sentiment_score = None
            
            for match in re.finditer(patterns[i], review):
                start, end = match.span()
                span = review_doc.char_span(start, end)
                
                if span is not None:
                    if prod_sents:
                        prod_sents = prod_sents + '. ' + span.sent.text
                    else:
                        prod_sents = span.sent.text
                        
            if prod_sents:
                sent_blob = TextBlob(prod_sents)
                sentiment_score = sent_blob.sentiment.polarity
                    
            prod_matches[i].append(prod_sents)
            sentiment_scores[i].append(sentiment_score)
            
    return prod_matches, sentiment_scores

def append_prod_sents(df, pattern_names, patterns):
    '''
    Inserts columns into dataframe for each product type mention, if exists in the review,
    and the associated sentiment score
    '''
    
    reviews = df['reviews']
    prod_sents, sent_scores = get_prod_sents(reviews, patterns)
    
    for i, name in enumerate(pattern_names):
        mentions_col = 'mentions_' + name
        df[mentions_col] = prod_sents[i]
        
        sentiment_col = 'sentiment_of_' + name
        df[sentiment_col] = sent_scores[i]
        
    return df



In [38]:
pattern_names = ['mocha', 'latte', 'coffee', 'tea', 'misc_drinks', 'food']
patterns = [mocha_p, latte_p, coffee_p, tea_p, drinks_p, food_p]
df_nbc = append_prod_sents(df_nbc, pattern_names, patterns)

In [39]:
df_nbc.head()

Unnamed: 0,review_dates,review_ratings,reviews,mentions_mocha,mentions_latte,mentions_coffee,mentions_tea,mentions_misc_drinks,mentions_food,sentiment_of_mocha,sentiment_of_latte,sentiment_of_coffee,sentiment_of_tea,sentiment_of_misc_drinks,sentiment_of_food
0,2020-02-15,5.0,Cute place to study ! They have individuals t...,,,,their green tea and macchiato is amazing,,,,,,0.2,,
1,2020-03-04,4.0,"Love the rose croissant here, and the drinks a...",,it was sparkling and super unique and tasty a ...,,,so i'm curious how they make the cream i had t...,,,0.328048,,,-0.05,
2,2020-03-03,4.0,Time* Around 5pm on a Tuesday. \n\nWait* None....,,* i believe they have a free parking lot in th...,,,,,,0.319048,,,,
3,2020-02-28,4.0,"Okay, you know how there are a million artisan...",,,,,,,,,,,,
4,2020-01-18,5.0,Everything but the parking situation is great ...,,everything but the parking situation is great ...,everything but the parking situation is great ...,,,,,0.409375,0.409375,,,


In [40]:
df_nbc.describe()

Unnamed: 0,sentiment_of_mocha,sentiment_of_latte,sentiment_of_coffee,sentiment_of_tea,sentiment_of_misc_drinks
count,4.0,83.0,48.0,19.0,5.0
mean,0.14375,0.294257,0.269998,0.166816,0.131667
std,0.251143,0.274686,0.241402,0.308651,0.166082
min,-0.125,-0.25,-0.25,-0.308929,-0.05
25%,-0.03125,0.088542,0.106037,-0.075714,-0.05
50%,0.1375,0.271667,0.265,0.2,0.241667
75%,0.3125,0.447917,0.405156,0.295312,0.25
max,0.425,1.0,1.0,1.0,0.266667


In [45]:
def view_reviews(df, product_type, view_num=5, sentiment='p'):
    '''
    Print out view_num of reviews related to product type that are either contain positive sentiments('p')
    or negative sentiments('n'). If view_num is greater than number of actual reviews with sentiment type,
    will print out all reviews. 
    '''
    
    sentiment_col = 'sentiment_of_' + product_type
    mentions_col = 'mentions_' + product_type
    
    if sentiment == 'p':
        reviews = df_nbc[df_nbc[sentiment_col] >= 0][mentions_col]
        statement = 'Positive Reviews About ' + product_type
    else:
        reviews = df_nbc[df_nbc[sentiment_col] < 0][mentions_col]
        statement = 'Negative Reviews About ' + product_type
        
    num_reviews = len(reviews)    
    print_num = min(num_reviews, view_num)
    
    print(statement)
    print('-----------------------------------------------------------------')
    for i in range(print_num):
        print(reviews.iloc[i])
        print('-----------------------------------------------------------------')
        

In [46]:
view_reviews(df_nbc, 'latte')

Positive Reviews About latte
-----------------------------------------------------------------
everything but the parking situation is great about this koreatown coffee shop drinks are great and reasonably pricedtheir nbc latte is like a spanish latte good lighting for studying/working has free wifi open til 2 amclean bathroom friendly staff mix of comfy seats
-----------------------------------------------------------------
been to nbc handful of times and all times the service has been great drinks tasted exceptional everything is clean and will be back based on our experience drinkssuper solid bf. and i always get the iced nbc latte charcoal latte or london smog latte
-----------------------------------------------------------------
a frequent-customer cardbut the seating at nbc was very limitedi was stuck sitting on a bench that was very wobbly so definitely not too comfortable for a place you're going to study at for hours
----------------------------------------------------------

In [55]:
df_nbc[df_nbc['sentiment_of_coffee'] >= 0]['reviews'].iloc[0]

"Everything but the parking situation is great about this Koreatown coffee shop. \n\nDrinks are great and reasonably priced.\n\nTheir NBC latte is like a Spanish latte! \n\nGood lighting for studying/working. \n\nHas free WiFi. \n\nOpen til 2 AM!!!!\n\nClean bathroom! \n\nFriendly staff. \n\nMix of comfy seats. \n\n\nOnly downside is their parking. they have only two spots in the parking lot behind. If you Park in the other spots you may get yelled at in Korean! You have been warned!! \n\nIf you are staying for a long time, it's better if you find street parking. Good luck!"