In [96]:
import numpy as np
import pandas as pd

import re
import spacy

import pickle

In [3]:
with open('df_nbc.pickle', 'rb') as handle:
    df_nbc = pickle.load(handle)

In [4]:
df_nbc.head()

Unnamed: 0,review_dates,review_ratings,reviews
0,2020-02-15,5.0,Cute place to study ! They have individuals t...
1,2020-03-04,4.0,"Love the rose croissant here, and the drinks a..."
2,2020-03-03,4.0,Time* Around 5pm on a Tuesday. \n\nWait* None....
3,2020-02-28,4.0,"Okay, you know how there are a million artisan..."
4,2020-01-18,5.0,Everything but the parking situation is great ...


In [10]:
type(df_nbc['reviews'][0])

str

In [71]:
nlp = spacy.load('en_core_web_md')

In [11]:
review = df_nbc['reviews'][0]

In [12]:
review

"Cute place to study  ! They have individuals table for studying! Also lots of plugs !! And parking in the back behind the store.\n\nHowever I wish they have more selection of food. :) Their green tea and macchiato is amazing! \n\nThe noise level is minimal, not too loud, which makes it's a suitable environment for studying.\n\nThe tables are a little small, so I wouldn't recommend group studying. Overall great experience here !"

In [97]:
review = re.sub('\n', '', review)

In [72]:
nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)

12

In [159]:
review_doc = nlp(review)
for token in review_doc:
    print(f"{token.text:{15}} {token.pos_:{5}} {token.head.text:{10}} {token.dep_:{5}} \
          {token.lemma_:{10}} {token.is_stop:{5}} {token.head.is_stop:{5}}")

Cute            ADJ   place      amod            cute           0     0
place           NOUN  place      ROOT            place          0     0
to              PART  study      aux             to             1     0
study           VERB  place      relcl           study          0     0
                SPACE study                                     0     0
!               PUNCT place      punct           !              0     0
They            PRON  have       nsubj           -PRON-         1     1
have            VERB  have       ROOT            have           1     1
individuals     NOUN  table      nsubj           individual     0     0
table           NOUN  have       dobj            table          0     1
for             ADP   table      prep            for            1     0
studying        VERB  for        pcomp           study          0     1
!               PUNCT have       punct           !              0     1
Also            ADV   lots       advmod           also          

In [110]:
for chunk in review_doc.noun_chunks:
    print(chunk.text)

Cute place
They
individuals
Also lots
plugs
And parking
the back
the store
I
they
more selection
food
Their green tea
macchiato
The noise level
it
a suitable environment
The tables
I
group
Overall great experience


In [23]:
from spacy import displacy

In [99]:
options = {'compact': 'True'}
displacy.render(review_doc, style='dep', jupyter=True)

In [100]:
review_doc

Cute place to study  ! They have individuals table for studying! Also lots of plugs !! And parking in the back behind the store.However I wish they have more selection of food. :) Their green tea and macchiato is amazing! The noise level is minimal, not too loud, which makes it's a suitable environment for studying.The tables are a little small, so I wouldn't recommend group studying. Overall great experience here !

In [103]:
excluded_deps = {'ccomp', 'punct', 'prep', 'pobj'}
dep = []
for token in review_doc:
    if (not token.is_stop) and (not token.head.is_stop) and (token.dep_ not in excluded_deps):
        word1 = token.lemma_
        word2 = token.head.lemma_
        
        if (word1 == word2) or word1.isspace() or word2.isspace():
            continue
            
        dep.append((word1, word2))

dep doesn't capture anything in sentences: "Also lots of plugs !! And parking in the back behind the store.However I wish they have more selection of food." 

Investigate patterns to include information from these excluded sentences. Look into how to incorporate or extract information from more meaningful phrases like "lots of plugs"

In [104]:
dep

[('cute', 'place'),
 ('study', 'place'),
 ('individual', 'table'),
 ('green', 'tea'),
 ('macchiato', 'tea'),
 ('noise', 'level'),
 ('suitable', 'environment'),
 ('environment', 'be'),
 ('little', 'small'),
 ('not', 'recommend'),
 ('group', 'study'),
 ('overall', 'experience'),
 ('great', 'experience')]

In [128]:
#create functions to apply to each review to generate word array
def valid_token(token):
    excluded_deps = {'ccomp', 'punct', 'prep', 'pobj'}
    if (not token.is_stop) and (not token.head.is_stop) and (token.dep_ not in excluded_deps):
        return True
    else:
        return False

def create_word_arr(review):
    
    review = re.sub('\n', '', review)
    review_doc = nlp(review)
    
    dep = set()
    for token in review_doc:
        if valid_token(token):
            word1 = token.lemma_
            word2 = token.head.lemma_

            if (word1 == word2) or word1.isspace() or word2.isspace():
                continue

            dep.add((word1, word2))
            
    return list(dep)

In [129]:
df_nbc['tokens'] = df_nbc['reviews'].apply(create_word_arr)

In [130]:
df_nbc.head()

Unnamed: 0,review_dates,review_ratings,reviews,tokens
0,2020-02-15,5.0,Cute place to study ! They have individuals t...,"[(cute, place), (environment, be), (great, exp..."
1,2020-03-04,4.0,"Love the rose croissant here, and the drinks a...","[(friendly, guy), (strawberry, pocky), (ugh, g..."
2,2020-03-03,4.0,Time* Around 5pm on a Tuesday. \n\nWait* None....,"[(deal, way), (parking, lot), (park, believe),..."
3,2020-02-28,4.0,"Okay, you know how there are a million artisan...","[(dull, attitude), (coffee, artisan), (4, star..."
4,2020-01-18,5.0,Everything but the parking situation is great ...,"[(long, time), (reasonably, price), (good, lig..."


In [125]:
df_nbc['reviews'][2]

"Time* Around 5pm on a Tuesday. \n\nWait* None. But around 6pm people starts coming in. \n\nParking* I believe they have a free parking lot in the rear where you can get in from wilshire, but I parked on the street on Van Ness and walked over. \n\nFood* Charcoal Latte. It is to my surprise that the charcoal latte is on the sweet side, because I was expecting more of a bitter taste. It actually suits my taste because it is very easy to drink and has a very smooth texture. But as you keep drinking it, it gets sweeter, so I probably will order less sweet or try something else next time. Also, it is interesting that they chose to use cups without a handle, makes me want to sip it slowly. \n\nPrice* $5.25, a little cheaper than the coffee shops in the area. \n\nService* The cashier/barista is smiley and polite and he seems to know the menu pretty well as I heard him explaining drinks to others. I also like how he will tell you to enjoy once he brings out the drink. He is very patient too as

In [131]:
df_nbc['tokens'][2]

[('deal', 'way'),
 ('parking', 'lot'),
 ('park', 'believe'),
 ('sweet', 'order'),
 ('free', 'lot'),
 ('slowly', 'sip'),
 ('time', 'take'),
 ('chilling~', 'study'),
 ('5.25', 'price'),
 ('order', 'get'),
 ('actually', 'suit'),
 ('time', 'try'),
 ('place', 'cram'),
 ('try', 'order'),
 ('smooth', 'texture'),
 ('cashier', 'barista'),
 ('drink', 'bring'),
 ('people', 'start'),
 ('5', 'p.m.'),
 ('enjoy', 'tell'),
 ('order', 'take'),
 ('van', 'ness'),
 ('take', 'customer'),
 ('6', 'p.m.'),
 ('taste', 'suit'),
 ('sip', 'want'),
 ('bring', 'tell'),
 ('charcoal', 'latte'),
 ('walk', 'park'),
 ('polite', 'smiley'),
 ('menu', 'know'),
 ('drink', 'explain'),
 ('use', 'choose'),
 ('rectangular', 'interior'),
 ('fact', 'love'),
 ('sweet', 'get'),
 ('little', 'cheap'),
 ('power', 'outlet'),
 ('p.m.', 'come'),
 ('probably', 'order'),
 ('place', 'recommend'),
 ('$', '5.25'),
 ('bitter', 'taste'),
 ('food', 'latte'),
 ('coffee', 'shop'),
 ('way', 'notice'),
 ('leave', 'cram'),
 ('not', 'cram'),
 ('start'

In [112]:
for sent in review_doc.sents:
    print(sent)

Cute place to study  !
They have individuals table for studying!
Also lots of plugs !!
And parking in the back behind the store.
However I wish they have more selection of food. :)
Their green tea and macchiato is amazing!
The noise level is minimal, not too loud, which makes it's a suitable environment for studying.
The tables are a little small, so I wouldn't recommend group studying.
Overall great experience here !


In [117]:
nlp.vocab['have'].is_stop

True

In [134]:
df_nbc['review_ratings'].value_counts()

5.0    123
4.0     45
3.0      6
1.0      4
2.0      2
Name: review_ratings, dtype: int64

## Matcher Testing

Testing the spaCy's Phrase Matcher capabilities
https://spacy.io/usage/rule-based-matching#phrasematcher

Want to find phrases or sentences with phrases that follow certain rules and patterns

In [135]:
from spacy.matcher import Matcher

In [142]:
test_sent = "The green tea latte is very delicious. I would get it again. The latte is awesome!"

In [144]:
# later, I add products to named entities and place into pattern rather then specific product
# can add boolean that token is specific product then add boolean into pattern
# to do so, reference https://spacy.io/usage/processing-pipelines#custom-components-attributes
pattern = [{"LOWER": "latte"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"},
           {"POS": "ADJ"}]

In [141]:
matcher = Matcher(nlp.vocab)

In [145]:
test_doc = nlp(test_sent)
matcher.add('latteIS', None, pattern)
matches = matcher(test_doc)

In [147]:
matches

[(6330969863298014687, 3, 7), (6330969863298014687, 15, 18)]

In [156]:
_, start, end = matches[0]
#start and end are the indices of the tokens, not the indices of the individual characters
span = test_doc[start:end]

In [152]:
span

latte is very delicious

In [157]:
#the sentence the match is in
span.sent

The green tea latte is very delicious.

In [158]:
#indices of the individual characters where match starts and ends
print(span.start_char)
print(span.end_char)
#indices of the individual characters where sentence of match starts and ends
print(span.sent.start_char)
print(span.sent.end_char)

14
37
0
38


In [160]:
for token in test_doc:
    print(f"{token.text:{15}} {token.pos_:{5}} {token.head.text:{10}} {token.dep_:{5}} \
          {token.lemma_:{10}} {token.is_stop:{5}} {token.head.is_stop:{5}}")

The             DET   latte      det             the            1     0
green           ADJ   latte      amod            green          0     0
tea             NOUN  latte      compound           tea            0     0
latte           NOUN  is         nsubj           latte          0     1
is              VERB  is         ROOT            be             1     1
very            ADV   delicious  advmod           very           1     0
delicious       ADJ   is         acomp           delicious      0     1
.               PUNCT is         punct           .              0     1
I               PRON  get        nsubj           -PRON-         1     1
would           VERB  get        aux             would          1     1
get             VERB  get        ROOT            get            1     1
it              PRON  get        dobj            -PRON-         1     1
again           ADV   get        advmod           again          1     1
.               PUNCT get        punct           .         

In [164]:
from spacy.tokens import Token

# test products
products = ['espresso', 'latte', 'americano', 'cappucino', 'cortado', 'matcha', 'tea', 'chai', 
            'lemonade', 'ade', 'salad', 'toast', 'sandwich']
is_product_getter = lambda token: token.test in products
Token.set_extension('is_product', getter=is_product_getter)

In [165]:
pattern2 = [{"is_product": True}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"},
           {"POS": "ADJ"}]

In [166]:
matcher2 = Matcher(nlp.vocab)
matcher2.add('latteIS', None, pattern2)
matches2 = matcher2(test_doc)

In [167]:
matches

[(6330969863298014687, 3, 7), (6330969863298014687, 15, 18)]

In [173]:
for _, start, end in matches:
    print(test_doc[start:end])

latte is very delicious
latte is awesome


In [169]:
test_sent2 = "The chicken sandwich is not delicious. I would not get it again. However, the latte is awesome!"
test_doc2 = nlp(test_sent2)
matcher3 = Matcher(nlp.vocab)
matcher3.add("productIs", None, pattern2)
matches3 = matcher3(test_doc2)

In [170]:
matches3

[(6330969863298014687, 3, 7), (6330969863298014687, 15, 18)]

In [172]:
for _, start, end in matches3:
    print(test_doc2[start:end])

sandwich is not delicious
latte is awesome


In [174]:
for token in test_doc2:
    print(f"{token.text:{15}} {token.pos_:{5}} {token.head.text:{10}} {token.dep_:{5}} \
          {token.lemma_:{10}} {token.is_stop:{5}} {token.head.is_stop:{5}}")

The             DET   sandwich   det             the            1     0
chicken         NOUN  sandwich   compound           chicken        0     0
sandwich        NOUN  is         nsubj           sandwich       0     1
is              VERB  is         ROOT            be             1     1
not             ADV   is         neg             not            1     1
delicious       ADJ   is         acomp           delicious      0     1
.               PUNCT is         punct           .              0     1
I               PRON  get        nsubj           -PRON-         1     1
would           VERB  get        aux             would          1     1
not             ADV   get        neg             not            1     1
get             VERB  get        ROOT            get            1     1
it              PRON  get        dobj            -PRON-         1     1
again           ADV   get        advmod           again          1     1
.               PUNCT get        punct           .          

In [175]:
df_nbc['reviews'][1]

"Love the rose croissant here, and the drinks are delicious! \nIt was $3.75, so a bit more pricey than I would have liked for just a croissant, but the flavor was worth it. ugh the rose petal got me. It tasted distinctly like strawberry pocky, so I'm curious how they make the cream. \n\nI had the blue  lemonade, it was sparkling and super unique and tasty. \nA bit tart, but refreshing nonetheless. \nLoved the earl grey latte, super sultry and the flavor was light and had some high notes of earthy dark flavor. \n\nThe cafe has a really low key vibe, perfect for hanging out with friends, or working on a project. the power outlets are abundant and the wifi is strong ( using it right now ) \n\nthe cashiers are friendly and well dressed korean guys  :) \n\nIndie music is a plus too! \n\nDefinitely will be coming back with friends."

Checkpoint Notes:
<br>1: add optional product types in front of patterns e.g. for product pattern, add in optional patterns in front of product like 'earl' and 'grey' to capture when earl grey latte is mentioned rather than just latte
<br>2: add these matches to ngram vector that will be used to pass into unsupervised model (LDA, kMeans)
<br>3: add adjectives to ngram vector e.g. cheap, pricey
<br>4: add rule for parking e.g. lots of parking, free parking, ...