In [8]:
import numpy as np
import pandas as pd

import re
import spacy
from spacy.tokens import Token
from spacy.matcher import Matcher

import pickle

In [2]:
# Load the pickled review DataFrame from disk.
# NOTE: unpickling can execute arbitrary code, so only open pickle files from a trusted source.
with open('df_nbc.pickle', 'rb') as pickle_file:
    df_nbc = pickle.load(pickle_file)

In [3]:
# Preview the first five rows of the loaded reviews.
df_nbc.head(5)

Unnamed: 0,review_dates,review_ratings,reviews
0,2020-02-15,5.0,Cute place to study ! They have individuals t...
1,2020-03-04,4.0,"Love the rose croissant here, and the drinks a..."
2,2020-03-03,4.0,Time* Around 5pm on a Tuesday. \n\nWait* None....
3,2020-02-28,4.0,"Okay, you know how there are a million artisan..."
4,2020-01-18,5.0,Everything but the parking situation is great ...


In [4]:
# spaCy pipeline whose POS tags and lemmas drive the rule-based matching below.
SPACY_MODEL_NAME = 'en_core_web_md'
nlp = spacy.load(SPACY_MODEL_NAME)

In [5]:
# products on their menu
# ('cappucino' is kept alongside the correct spelling so misspelled reviews still match)
products = ['espresso', 'latte', 'americano', 'cappuccino', 'cappucino', 'cortado', 'matcha', 'tea',
            'chai', 'lemonade', 'ade', 'salad', 'toast', 'sandwich']


def is_product_getter(token):
    """Return True when the token's lowercased text is one of the menu products.

    Parameters
    ----------
    token : spacy.tokens.Token
        Token whose text is looked up in ``products``.

    Returns
    -------
    bool
        Whether ``token.lower_`` appears in ``products``.
    """
    # The previous getter read `token.test`, which is not a Token attribute and
    # raised AttributeError whenever `token._.is_product` was accessed.
    # `lower_` is used so capitalised mentions ("Latte") are recognised too.
    return token.lower_ in products


# add products to be custom token attribute to be able to recognize in rule based matching
# force=True lets this cell be re-run without "extension already exists" errors
Token.set_extension('is_product', getter=is_product_getter, force=True)

In [89]:
def _pos(tag, op=None):
    """Build one Matcher token spec for a coarse POS tag, optionally with an operator."""
    spec = {"POS": tag}
    if op is not None:
        spec["OP"] = op
    return spec


# noun + verb + optional adverb + adjective, e.g. "macchiato is amazing"
p_something_is = [_pos("NOUN"), _pos("VERB"), _pos("ADV", "?"), _pos("ADJ")]

# noun + adposition + verb, e.g. "table for studying"
p_something_for = [_pos("NOUN"), _pos("ADP"), _pos("VERB")]

# adjective + noun + optional second noun, e.g. "green tea"
p_desc = [_pos("ADJ"), _pos("NOUN"), _pos("NOUN", "?")]

# noun + adposition + optional determiner + noun, e.g. "parking in the back"
p_something_of = [_pos("NOUN"), _pos("ADP"), _pos("DET", "?"), _pos("NOUN")]

# noun + verb + noun
p_something_has = [_pos("NOUN"), _pos("VERB"), _pos("NOUN")]

In [91]:
# Label -> token pattern; each label is used as the rule name when the
# patterns are registered with the Matcher.
patterns = {
    'something_is': p_something_is,
    'something_for': p_something_for,
    'desc': p_desc,
    'something_of': p_something_of,
    'something_has': p_something_has,
}

In [92]:
# Register every named pattern with a rule-based matcher built on the pipeline's vocab.
matcher = Matcher(nlp.vocab)
for rule_name, token_pattern in patterns.items():
    # The second positional argument is the on-match callback slot (unused here);
    # this is the spaCy v2 form of Matcher.add.
    matcher.add(rule_name, None, token_pattern)

Below I am trying to capture more descriptive statements that we can use as inputs to the topic model. Notice, however, that once a description longer than two words has been captured, it can probably be condensed to two words (the first and the last) without losing much meaning. For example, I don't need the entire n-gram "parking in the back" in the dataset; I can condense it to the bigram "parking back" and still understand that this person is describing parking that's in the back.

In [35]:
# Parse the review here so the cell does not rely on a `review_doc` variable left
# over from kernel state (it is not defined anywhere else and raised NameError
# on a fresh Restart & Run All).
# NOTE(review): the printed matches look like they come from the first review — confirm.
review_doc = nlp(df_nbc['reviews'].iloc[0])

# Print the text span of every pattern match found in the review.
matches = matcher(review_doc)
for _, start, end in matches:
    print(review_doc[start:end])

Cute place
table for studying
lots of plugs
parking in the back
back behind the store
more selection
selection of food
green tea
macchiato is amazing
level is minimal
suitable environment
environment for studying
great experience


In [88]:
def text_preprocess(string):
    """Normalize raw review text before it is parsed.

    Strips leading/trailing whitespace, lowercases the text, and replaces each
    line break (together with any whitespace around it) with a single space.

    Parameters
    ----------
    string : str
        Raw review text.

    Returns
    -------
    str
        The cleaned, single-line, lowercased text.
    """
    string = string.strip().lower()

    # Substitute a space rather than deleting the newline: removing it outright
    # glued the last word of one line to the first word of the next
    # ("coffee\ngreat" -> "coffeegreat").
    return re.sub(r'\s*\n\s*', ' ', string)

def create_word_vector(review, include_all=False):
    """Extract condensed descriptive word pairs from one review.

    The review is cleaned with ``text_preprocess``, parsed with ``nlp`` and
    scanned with ``matcher``. Each match is reduced to the lemmas of its first
    and last tokens, stored as a tuple in alphabetical order so that
    ('cute', 'place') and ('place', 'cute') collapse into one entry.

    Parameters
    ----------
    review : str
        Raw review text. Non-string values (e.g. NaN for a missing review)
        yield an empty list.
    include_all : bool, default False
        When True, also add the lemma of every non-stop-word token as a plain
        string alongside the tuples.

    Returns
    -------
    list
        Unique ``(word1, word2)`` tuples (plus lemma strings when
        ``include_all`` is True). Built from a set, so the order is unspecified.
    """
    # a missing review has no text to parse
    if not isinstance(review, str):
        return []

    review_doc = nlp(text_preprocess(review))

    tokens = set()

    for _, beg, end in matcher(review_doc):
        # lemma forms of the first and last word in the match
        first_word = review_doc[beg].lemma_
        last_word = review_doc[end - 1].lemma_

        # don't include anything with pronouns
        # might change later
        if '-PRON-' in (first_word, last_word):
            continue

        # order the pair alphabetically so both word orders map to the same tuple
        tokens.add((min(first_word, last_word), max(first_word, last_word)))

    # optionally include every non-stop-word lemma as well
    if include_all:
        tokens.update(token.lemma_ for token in review_doc if not token.is_stop)

    return list(tokens)

In [93]:
# Build the condensed word-pair list for every review and store it as a new column.
ngram_lists = df_nbc['reviews'].apply(create_word_vector)
df_nbc['ngrams'] = ngram_lists

In [94]:
# Preview the first five rows, now including the `ngrams` column.
df_nbc.head(5)

Unnamed: 0,review_dates,review_ratings,reviews,ngrams
0,2020-02-15,5.0,Cute place to study ! They have individuals t...,"[(level, minimal), (experience, great), (lot, ..."
1,2020-03-04,4.0,"Love the rose croissant here, and the drinks a...","[(high, note), (delicious, drink), (key, vibe)..."
2,2020-03-03,4.0,Time* Around 5pm on a Tuesday. \n\nWait* None....,"[(cup, handle), (charcoal, surprise), (interio..."
3,2020-02-28,4.0,"Okay, you know how there are a million artisan...","[(only, worker), (menu, scarce), (coffee, fanc..."
4,2020-01-18,5.0,Everything but the parking situation is great ...,"[(koreatown, shop), (latte, spanish), (drink, ..."


In [63]:
# Inspect the word pairs extracted for review 0.
df_nbc['ngrams'][0]

[('level', 'minimal'),
 ('experience', 'great'),
 ('lot', 'plug'),
 ('study', 'table'),
 ('food', 'selection'),
 ('back', 'parking'),
 ('environment', 'suitable'),
 ('more', 'selection'),
 ('amazing', 'macchiato'),
 ('cute', 'place'),
 ('green', 'tea'),
 ('environment', 'table')]

In [95]:
# Inspect the word pairs extracted for review 18.
df_nbc['ngrams'][18]

[('drink', 'other'),
 ('cool', 'interior'),
 ('good', 'spot'),
 ('layout', 'seat'),
 ('barista', 'nice'),
 ('attention', 'what'),
 ('attention', 'much'),
 ('basic', 'drink')]

In [80]:
# Show the raw text of review 18 to compare against its extracted word pairs.
df_nbc['reviews'][18]

"I like to judge a coffee shop by some of their most basic drinks - drip coffee, an americano, etc. and the ice americano I ordered did not disappoint. It was smooth and not too acidic, just how I like it. I didn't pay too much attention to what their menu has to offer as I had my dog with me (which I'm not entirely sure if they allows dogs - their Yelp page says yes but they didn't have any signs up. However, the barista was super nice and he was ok with my pup being there). \n\nThe interior was pretty cool - their layout had seats that would be a good spot to bring your friends and hang out. I'd definitely come here again to try some of their other drinks and snacks."

In [44]:
# Sanity check: the same description phrased two ways should produce two matches.
s = 'the place is cute. it is a cute place'
test_doc = nlp(s)
matches = matcher(test_doc)
for match in matches:
    # each match is (match_id, start, end); only the token span is needed
    span_start, span_end = match[1], match[2]
    print(test_doc[span_start:span_end])

place is cute
cute place


In [81]:
# Sanity check: run the matcher over a short noun + verb + adverb + adjective sentence.
s = 'the barrista was super nice'
test_doc = nlp(s)
matches = matcher(test_doc)
for match in matches:
    # each match is (match_id, start, end); only the token span is needed
    span_start, span_end = match[1], match[2]
    print(test_doc[span_start:span_end])