In [2]:
import numpy as np
import pandas as pd

import re
import spacy
from spacy.tokens import Token
from spacy.matcher import Matcher

import pickle

In [3]:
with open('df_nbc.pickle', 'rb') as handle:
    df_nbc = pickle.load(handle)

In [4]:
df_nbc.head()

Unnamed: 0,review_dates,review_ratings,reviews
0,2020-02-15,5.0,Cute place to study ! They have individuals t...
1,2020-03-04,4.0,"Love the rose croissant here, and the drinks a..."
2,2020-03-03,4.0,Time* Around 5pm on a Tuesday. \n\nWait* None....
3,2020-02-28,4.0,"Okay, you know how there are a million artisan..."
4,2020-01-18,5.0,Everything but the parking situation is great ...


In [11]:
nlp = spacy.load('en_core_web_md')

In [12]:
# include with medium library - for some reason is_stop does not work without it
nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)

12

In [5]:
# products on their menu
products = ['espresso', 'latte', 'americano', 'cappucino', 'cortado', 'matcha', 'tea', 'chai', 
            'lemonade', 'ade', 'salad', 'toast', 'sandwich']

# add products to be custom token attribute to be able to recognize in rule based matching
is_product_getter = lambda token: token.test in products
Token.set_extension('is_product', getter=is_product_getter)

By identifying patterns in our descriptions, we get a clearer idea on what these descriptions are. In the modeling below, we then further identify groups of these patterns that are commonly talked about together in the reviews. 

In [179]:
# latte is (very) delicious
p_something_is = [{"POS": "NOUN"}, {"POS": "VERB"}, {"POS": "ADV", "OP": "?"},
           {"POS": "ADJ"}]

# (suitable) environment for studying
p_something_for = [{"POS": "ADJ", "OP": "?"}, {"POS": "NOUN"}, {"POS": "ADP"}, {"POS": "VERB"}]

# terrible customer (service)
p_desc = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]

# parking in (the) back / lots of plugs
p_something_of = [{"POS": "NOUN"}, {"POS": "ADP"}, {"POS": "DET", "OP": "?"}, {"POS": "NOUN"}]

# layout has seats
p_something_has = [{"POS": "NOUN"}, {"POS": "VERB"}, {"POS": "NOUN"}]

In [176]:
patterns = {'something_is': p_something_is,  'something_for': p_something_for, 'desc': p_desc, 
            'something_of': p_something_of, 'something_has': p_something_has}

In [177]:
matcher = Matcher(nlp.vocab)
for key in patterns.keys():
    matcher.add(key, None, patterns[key])

Below I am trying to capture statements that are more descriptive that we can use as inputs into the topic model. However, notice that for descriptions that are longer than 2 words, now that I've captured the descriptions, I can probably condense them into two words (the first and the last) without losing too much meaning. For example, "parking in the back" - I don't need this entire statement, or n-gram, to be included in the dataset. Instead, I can condense to the bigram, "parking back" and still understand that this person is describing parking that's in the back. 

In [178]:
matches = matcher(review_doc)
for _, start, end in matches:
    print(review_doc[start:end])

Cute place
table for studying
lots of plugs
parking in the back
back behind the store
more selection
selection of food
green tea
macchiato is amazing
level is minimal
suitable environment
suitable environment for studying
environment for studying
great experience


In [9]:
def text_preprocess(string):
    
    string = string.strip()
    string = string.lower()
    string = re.sub('\n', '', string)
    string = re.sub('[!#?,.:";]', '', string)
    
    return string

def create_word_vector(review, matcher, include_all=False):
    
    review = text_preprocess(review)
    review_doc = nlp(review)
    
    tokens = set()
    
    matches = matcher(review_doc)
    for _, beg, end in matches:
        
        # get lemma form of first word in match
        first_word = review_doc[beg].lemma_
        # get lemma form of last word in match
        last_word = review_doc[end-1].lemma_
        
        # don't include anything with pronouns
        # might change later
        if first_word == '-PRON-'or last_word == '-PRON-':
            continue
        
        # assign first and last word alphabetically with min/max funcs
        # e.g. we don't want both ('cute', 'place') and ('place', 'cute') in dataset, just one of them
        word1 = min(first_word, last_word)
        word2 = max(first_word, last_word)
        
        # add pair to tokens set
        tokens.add((word1, word2))
        
    # if we want to include all tokens in set    
    if include_all:
        for token in review_doc:
            if not token.is_stop and not token.lemma_.isspace():
                tokens.add((token.lemma_,))
                
    return list(tokens)

In [93]:
df_nbc['ngrams'] = df_nbc['reviews'].apply(create_word_vector)

In [94]:
df_nbc.head()

Unnamed: 0,review_dates,review_ratings,reviews,ngrams
0,2020-02-15,5.0,Cute place to study ! They have individuals t...,"[(level, minimal), (experience, great), (lot, ..."
1,2020-03-04,4.0,"Love the rose croissant here, and the drinks a...","[(high, note), (delicious, drink), (key, vibe)..."
2,2020-03-03,4.0,Time* Around 5pm on a Tuesday. \n\nWait* None....,"[(cup, handle), (charcoal, surprise), (interio..."
3,2020-02-28,4.0,"Okay, you know how there are a million artisan...","[(only, worker), (menu, scarce), (coffee, fanc..."
4,2020-01-18,5.0,Everything but the parking situation is great ...,"[(koreatown, shop), (latte, spanish), (drink, ..."


In [152]:
# create column with all tokens, not just extracted statements
token_list = [create_word_vector(review, matcher, True) for review in df_nbc['reviews']]

In [153]:
df_nbc['tokens'] = np.array(token_list)

In [154]:
df_nbc.head()

Unnamed: 0,review_dates,review_ratings,reviews,ngrams,tokens
0,2020-02-15,5.0,Cute place to study ! They have individuals t...,"[(level, minimal), (experience, great), (lot, ...","[(make,), (lot, plug), (study, table), (noise,..."
1,2020-03-04,4.0,"Love the rose croissant here, and the drinks a...","[(high, note), (delicious, drink), (key, vibe)...","[(croissant,), (lemonade,), (blue,), (dark, fl..."
2,2020-03-03,4.0,Time* Around 5pm on a Tuesday. \n\nWait* None....,"[(cup, handle), (charcoal, surprise), (interio...","[($,), (cup, handle), (wilshire,), (smooth, te..."
3,2020-02-28,4.0,"Okay, you know how there are a million artisan...","[(only, worker), (menu, scarce), (coffee, fanc...","[(menu, scarce), (okay,), (coffee, fancé), (tu..."
4,2020-01-18,5.0,Everything but the parking situation is great ...,"[(koreatown, shop), (latte, spanish), (drink, ...","[(reasonably,), (latte, spanish), (downside,),..."


In [155]:
df_nbc['tokens'][0]

[('make',),
 ('lot', 'plug'),
 ('study', 'table'),
 ('noise',),
 ('lot',),
 ('minimal',),
 ('environment', 'suitable'),
 ('small',),
 ('group', 'overall'),
 ('studyingthe',),
 ('wish',),
 ('level', 'minimal'),
 ('cute',),
 ('overall',),
 ('parking',),
 ('be',),
 ('study',),
 ('loud',),
 ('not',),
 ('table',),
 ('back', 'parking'),
 ('suitable',),
 ('level',),
 ('plug',),
 ('food',),
 ('storehowever',),
 ('experience',),
 ('experience', 'great'),
 ('little',),
 ('more', 'selection'),
 ('place',),
 ('amazing', 'macchiato'),
 ('green',),
 ('green', 'tea'),
 ('environment', 'table'),
 ('great',),
 ('environment',),
 (')',),
 ('tea',),
 ('recommend',),
 ('individual',),
 ('food', 'selection'),
 ('selection',),
 ('group',),
 ('amazing',),
 ('cute', 'place'),
 ('back', 'storehowever'),
 ('macchiato',)]

In [63]:
df_nbc['ngrams'][0]

[('level', 'minimal'),
 ('experience', 'great'),
 ('lot', 'plug'),
 ('study', 'table'),
 ('food', 'selection'),
 ('back', 'parking'),
 ('environment', 'suitable'),
 ('more', 'selection'),
 ('amazing', 'macchiato'),
 ('cute', 'place'),
 ('green', 'tea'),
 ('environment', 'table')]

In [95]:
df_nbc['ngrams'][18]

[('drink', 'other'),
 ('cool', 'interior'),
 ('good', 'spot'),
 ('layout', 'seat'),
 ('barista', 'nice'),
 ('attention', 'what'),
 ('attention', 'much'),
 ('basic', 'drink')]

In [80]:
df_nbc['reviews'][18]

"I like to judge a coffee shop by some of their most basic drinks - drip coffee, an americano, etc. and the ice americano I ordered did not disappoint. It was smooth and not too acidic, just how I like it. I didn't pay too much attention to what their menu has to offer as I had my dog with me (which I'm not entirely sure if they allows dogs - their Yelp page says yes but they didn't have any signs up. However, the barista was super nice and he was ok with my pup being there). \n\nThe interior was pretty cool - their layout had seats that would be a good spot to bring your friends and hang out. I'd definitely come here again to try some of their other drinks and snacks."

## Modeling - LDA

In [135]:
from sklearn.feature_extraction.text import CountVectorizer
X = df_nbc['ngrams']
count_vec = CountVectorizer(analyzer=lambda x:x)
X_train = count_vec.fit_transform(X)

In [99]:
X_train

<180x1110 sparse matrix of type '<class 'numpy.int64'>'
	with 1504 stored elements in Compressed Sparse Row format>

In [100]:
from sklearn.decomposition import LatentDirichletAllocation

In [139]:
LDA = LatentDirichletAllocation(n_components=5, random_state=42)

In [140]:
LDA.fit(X_train)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=5, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

## Some notes...

Topic modeling involves identifying topics that are latent in the text that contain them. LDA assumes that a corpus is generated by a topic, which is generated by words belonging to that topic. However, with reviews, this might be an incorrect assumption to make because a reviewer might not stick to a certain theme. The review might convey a positive or negative tone but I would say it isn't always the case that a reviewer sticks to talking only about one or two things in a review. They might talk about the quality of the products, the service, the ambiance of the store i.e. a wide range of topics. Therefore, we might not find a particular theme in each of the LDA components, but more of things that are commonly said together in comments. Something to try out later is applying this with kMeans.

I think it might be meaningful to be able to identify reviews that contain mentions of specific products and see what these reviews are saying about that product. Extract the entire review this mention is in but also take the specific sentences talking about the product and do a sentiment analysis to see if they're generally positive or negative.

In [141]:
for i, topic in enumerate(LDA.components_):
    print(f'Top 15 words/phrases for topic #{i}')
    print([count_vec.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

Top 15 words/phrases for topic #0
[('number', 'outlet'), ('first', 'time'), ('small', 'table'), ('k', 'shop'), ('friendly', 'staff'), ('front', 'parking'), ('free', 'parking'), ('coffee', 'great'), ('back', 'lot'), ('lot', 'people'), ('good', 'service'), ('good', 'place'), ('drink', 'good'), ('great', 'place'), ('coffee', 'good')]


Top 15 words/phrases for topic #1
[('environment', 'great'), ('nice', 'place'), ('hour', 'late'), ('back', 'lot'), ('price', 'reasonable'), ('decor', 'minimal'), ('coffee', 'cup'), ('iced', 'latte'), ('coffee', 'nothing'), ('small', 'table'), ('cup', 'free'), ('ktown', 'shop'), ('barista', 'nice'), ('coffee', 'late'), ('coffee', 'great')]


Top 15 words/phrases for topic #2
[('coffee', 'nothing'), ('great', 'place'), ('favorite', 'spot'), ('instrumental', 'music'), ('menu', 'minimal'), ('drink', 'other'), ('good', 'spot'), ('friendly', 'staff'), ('side', 'sweet'), ('free', 'wifi'), ('cafe', 'cute'), ('lot', 'small'), ('lot', 'outlet'), ('back', 'lot'), ('co

In [160]:
topic_results = LDA.transform(X_train)
df_nbc['topic_statements'] = topic_results.argmax(axis=1)

In [161]:
df_nbc.head()

Unnamed: 0,review_dates,review_ratings,reviews,ngrams,tokens,topic_statements
0,2020-02-15,5.0,Cute place to study ! They have individuals t...,"[(level, minimal), (experience, great), (lot, ...","[(make,), (lot, plug), (study, table), (noise,...",3
1,2020-03-04,4.0,"Love the rose croissant here, and the drinks a...","[(high, note), (delicious, drink), (key, vibe)...","[(croissant,), (lemonade,), (blue,), (dark, fl...",1
2,2020-03-03,4.0,Time* Around 5pm on a Tuesday. \n\nWait* None....,"[(cup, handle), (charcoal, surprise), (interio...","[($,), (cup, handle), (wilshire,), (smooth, te...",3
3,2020-02-28,4.0,"Okay, you know how there are a million artisan...","[(only, worker), (menu, scarce), (coffee, fanc...","[(menu, scarce), (okay,), (coffee, fancé), (tu...",2
4,2020-01-18,5.0,Everything but the parking situation is great ...,"[(koreatown, shop), (latte, spanish), (drink, ...","[(reasonably,), (latte, spanish), (downside,),...",4


In [172]:
df_nbc['topic_statements'].value_counts()

3    50
4    39
0    37
2    30
1    24
Name: topic_statements, dtype: int64

## Some reviews belonging to each "topic"...

In [173]:
from random import randint

def print_reviews(df, topic_col, topic_num, n=5):
    df_topic = df[df[topic_col] == topic_num]['reviews']
    max_range = len(df_topic)
    for i in range(n):
        rand_num = randint(0, max_range)
        print(rand_num)
        print(df_topic.iloc[rand_num])
        print('\n')
        print('------------------------------------------------------------------')

In [174]:
for i in range(5):
    print('REVIEWS FOR TOPIC ' + str(i) + ":")
    print('\n')
    print_reviews(df_nbc, 'topic_statements', i)

REVIEWS FOR TOPIC 0:


21
I think I'm in love with this place! The moment I walked in the first time, I felt as though the atmosphere was made for me. I don't want to describe the atmosphere as "chill," but honestly, I cannot think of a better word. The decorations are minimal, the tables are minimal, even the employees seem minimal. this place is a minimalist's dream. The service was nice, everyone seemed so patient. Parking is a private lot in the back, but it's usually full. The music they play there is just quite enough for you to block it out if you want to. 
Honey truffle latte: YUM! The flavor of the truffle was perfectly done. Usually I don't enjoy truffle because the flavor is too dominant, but this cup of latte was great. I had a sip of this cold, and it just didn't do the truffle any justice compared to the hot. If you order this, definitely opt for the hot option.
NBC latte: yum! Absolutely perfect amount of cinnamon in this! This is similar to a Spanish latte at Urth Caffe

### using all tokens...

In [156]:
X_tok = df_nbc['tokens']
count_vec_tok = CountVectorizer(min_df=0.1, analyzer=lambda x:x)
X_tok_train = count_vec_tok.fit_transform(X_tok)

In [157]:
LDA_tok = LatentDirichletAllocation(n_components=7, random_state=42)

In [158]:
LDA_tok.fit(X_tok_train)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=7, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [159]:
for i, topic in enumerate(LDA_tok.components_):
    print(f'Top 15 words/phrases for topic #{i}')
    print([count_vec.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

Top 15 words/phrases for topic #0
[('550ml', 'approximate'), ('amount', 'good'), ('amazing', 'jay'), ('almond', 'latte'), ('ample', 'seating'), ('anybody', 'paradise'), ('ambience', 'quiet'), ('1pm', 'afternoon'), ('acidic', 'blend'), ('alternative', 'milk'), ('anything', 'home'), ('advantage', 'good'), ('about.this', 'cafe'), ('ambience', 'prettt'), ('almond', 'granule')]


Top 15 words/phrases for topic #1
[('ambiance', 'comparable'), ('amount', 'strength'), ('almond', 'granule'), ('10th', 'drink'), ('a.m.', 'early'), ('acidic', 'blend'), ('actual', 'orange'), ('amount', 'right'), ("area.they're", 'parking'), ('amazing', 'waffle'), ('500ml', 'beaker'), ('aesthetic', 'simple'), ('alley', 'same'), ('amount', 'sweetness'), ('abundant', 'outlet')]


Top 15 words/phrases for topic #2
[('ambience', 'prettt'), ('amount', 'right'), ('accessible', 'outlet'), ('amount', 'sweetness'), ('afternoon', 'coffee'), ('ambience', 'quiet'), ('area', 'shop'), ('aftertaste', 'earthy'), ('a.m.', 'early'), 

In [44]:
s = 'the place is cute. it is a cute place'
test_doc = nlp(s)
matches = matcher(test_doc)
for _, beg, end in matches:
    print(test_doc[beg:end])

place is cute
cute place


In [81]:
s = 'the barrista was super nice'
test_doc = nlp(s)
matches = matcher(test_doc)
for _, beg, end in matches:
    print(test_doc[beg:end])

## Identifying products in reviews

This is not the most efficient way to identify products especially if this list gets long and the algorithm would have to scan through each item and check if it's in each review. 

For a later exercise: figure out how to train spaCy model to add more entities to entity recognition feature.

https://datascience.stackexchange.com/questions/57650/is-there-any-way-to-define-custom-entities-in-spacy

In [195]:
mocha = ['earl gray mocha', 'mocha']

latte = ['latte',  'matcha', 'charcoal']

coffee = ['espresso', 'americano', 'cappucino', 'cortado', 'nbc', 'ktown']

tea = ['tea', 'chai', 'green tea', 'black tea', 'rose tea', 'chrysanthemum tea']
            
drinks = ['lemonade', 'ade']

food = ['salad', 'toast', 'sandwich']

In [228]:
def check_drinks(df, items):
    
    num_item_types = len(items)
    #for each item type, have a list that will contain 0 or 1 for each review 
    #depending if that review mentions an item from that item type
    matches = [[] for i in range(num_item_types)]
    
    for review in df['reviews']:
        review = text_preprocess(review)
        
        for i in range(num_item_types):
            item_type = items[i]
            in_review = 0
            for item in item_type:
                if item in review:
                    in_review = 1
            matches[i].append(in_review)
    
    return matches

In [229]:
items = [mocha, latte, coffee, tea, drinks, food]
matches = check_drinks(df_nbc, items)

In [243]:
for i in range(6):
    print(sum(matches[i]))

4
99
49
29
16
1


In [238]:
df_nbc['has_mocha'] = matches[0]
df_nbc['has_latte'] = matches[1]
df_nbc['has_coffee'] = matches[2]
df_nbc['has_tea'] = matches[3]
df_nbc['has_drinks'] = matches[4]
df_nbc['has_food'] = matches[5]

In [242]:
df_nbc[df_nbc['has_mocha'] == 1]['reviews'].iloc[0]

"I come here every week to get some work done. It's not too loud and I like how there are outlets by every table. So far, I've tried the NBC, earl grey mocha, green tea latte, and chai tea latte, and they're all pretty good! I usually stick with the NBC with oat milk though because I need the caffeine fix - It's a velvety, smooth Spanish latte served with cinnamon sticks inside. For pastries, I tried their croissants (green tea raspberry croissant, and butter) and coffee cake, and I highly recommend the coffee cake! The croissants are fairly mediocre, but the coffee cake's rich and buttery.. it tastes perfect with your coffee. I haven't seen the cake recently though.. I think it's sold out by the time I get there."

In [244]:
df_nbc[df_nbc['has_latte'] == 1]['reviews'].iloc[80]

"I'm always looking forward to new cafes in ktwn as many tend to become too busy overtime. the vibe's really nice and quiet here (well, depends on the number of people in the cafe but we came during the quiet time.) this is definitely our new go-to. btw, I got the vanilla latte with soy milk and i loved it so much! not too sweet"

In [245]:
df_nbc[df_nbc['has_coffee'] == 1]['reviews'].iloc[18]

"So, I was in L.A. yesterday (4/27) and was looking for a pick me up. Found this little gem on google and decided to try it with my family. We came and I ordered the NBC Latte and my daughter ordered the iced Truffle Honey latte. And they were both absolutely delicious! The guy at the counter was the sweetest. I can't wait to come back and have another latte. :)"

In [246]:
df_nbc[df_nbc['has_tea'] == 1]['reviews'].iloc[25]

"They offer pu're tea! The tea actually dissolves in the water! Obsessed with their tea selection and also got a chance to try their signature NBC latte. Check in and order was easy breezy - and so friendly. This shop is an undiscovered gem and I intend to drink my way through their menu. Beverages range from $3-6 and plenty of parking in the back and on the street...unusual for ktown."

In [249]:
df_nbc[df_nbc['has_drinks'] == 1]['reviews'].iloc[0]

"Love the rose croissant here, and the drinks are delicious! \nIt was $3.75, so a bit more pricey than I would have liked for just a croissant, but the flavor was worth it. ugh the rose petal got me. It tasted distinctly like strawberry pocky, so I'm curious how they make the cream. \n\nI had the blue  lemonade, it was sparkling and super unique and tasty. \nA bit tart, but refreshing nonetheless. \nLoved the earl grey latte, super sultry and the flavor was light and had some high notes of earthy dark flavor. \n\nThe cafe has a really low key vibe, perfect for hanging out with friends, or working on a project. the power outlets are abundant and the wifi is strong ( using it right now ) \n\nthe cashiers are friendly and well dressed korean guys  :) \n\nIndie music is a plus too! \n\nDefinitely will be coming back with friends."

In [250]:
df_nbc[df_nbc['has_food'] == 1]['reviews'].iloc[0]

"I love supporting small businesses and trying unique coffee, so I stopped by Nothing But Coffee. I would rate my recent visit 3.5 stars.\n\nINTERIOR\nI was very impressed by the spacious and clean interior! Most of the new coffee shops are small and barely have any tables to accommodate their guests. Nothing But Coffee had a lot of small tables, seats on the bar, and some chairs. I also loved that they had electrical outlets for those who wish to stay to work or study. The decor is simple and very neat. The coffee shop was very empty when I went, probably because it hasn't been long since it opened. \n\nCOFFEE\nI ordered the most expensive drink on their menu, the truffle honey latte for $6. The one other unique item I saw on their menu was their NBC latte but I decided to try the drink they were advertising on their front entrance. I ordered it iced and less sweet. The barista was very friendly checked up on me to make sure that I liked the drink and the sweetness adjustment. The lat

## Notes

continue testing to see if I can apply regex and/or rule based matching to identify products faster

Using regex and spaCy to come up with more efficient way of identifying sentences and reviews with certain products... 

In [7]:
mocha_p = '(earl ?gr(a|e)y ?)?(mocha)'

latte_p = '(matcha ?|charcoal ?)?(latte|matcha)'

coffee_p = '(espresso|americano|cappucino|cortado|nbc|ktown)'

tea_p = '(green ?|black ?|rose ?|chrysanthemum ?)?(tea|chai)'
            
drinks_p = '(blue ?)?(lemonade)'

food_p = '(egg ?avocado ?|penne ? basil ? pesto ?)?(salad)|(arugula ?)?(toast)|(pastrami ?cheese ?|cheese ?)?(sandwich)'


In [20]:
def get_prod_sents(reviews, patterns):
    
    item_types = len(patterns)
    prod_matches = [[] for i in range(item_types)]
    
    for review in reviews:
        review = text_preprocess(review)
        review_doc = nlp(review)
        
        for i in range(item_types):
            prod_sents = ''
            
            for match in re.finditer(patterns[i], review):
                start, end = match.span()
                span = review_doc.char_span(start, end)
                
                if span is not None:
                    prod_sents += span.sent.text
                    
            prod_matches[i].append(prod_sents)
            
    return prod_matches

def append_prod_sents(df, pattern_names, patterns):
    
    reviews = df['reviews']
    prod_sents = get_prod_sents(reviews, patterns)
    
    for i, name in enumerate(pattern_names):
        col_name = 'mentions_' + name
        df[col_name] = prod_sents[i]
        
    return df



In [21]:
pattern_names = ['mocha', 'latte', 'coffee', 'tea', 'misc_drinks', 'food']
patterns = [mocha_p, latte_p, coffee_p, tea_p, drinks_p, food_p]
df_nbc = append_prod_sents(df_nbc, pattern_names, patterns)

In [22]:
df_nbc.head()

Unnamed: 0,review_dates,review_ratings,reviews,mentions_mocha,mentions_latte,mentions_coffee,mentions_tea,mentions_misc_drinks,mentions_food
0,2020-02-15,5.0,Cute place to study ! They have individuals t...,,,,their green tea and macchiato is amazing,,
1,2020-03-04,4.0,"Love the rose croissant here, and the drinks a...",,it was sparkling and super unique and tasty a ...,,,so i'm curious how they make the cream i had t...,
2,2020-03-03,4.0,Time* Around 5pm on a Tuesday. \n\nWait* None....,,* i believe they have a free parking lot in th...,,,,
3,2020-02-28,4.0,"Okay, you know how there are a million artisan...",,,,,,
4,2020-01-18,5.0,Everything but the parking situation is great ...,,everything but the parking situation is great ...,everything but the parking situation is great ...,,,


In [26]:
df_nbc[df_nbc['mentions_misc_drinks'] != '']

Unnamed: 0,review_dates,review_ratings,reviews,mentions_mocha,mentions_latte,mentions_coffee,mentions_tea,mentions_misc_drinks,mentions_food
1,2020-03-04,4.0,"Love the rose croissant here, and the drinks a...",,it was sparkling and super unique and tasty a ...,,,so i'm curious how they make the cream i had t...,
34,2018-08-05,5.0,Don't let the name deceive you! They have thin...,,really cheap my favorites so far are the match...,the nbc latte was pretty good too,,really cheap my favorites so far are the match...,
70,2018-03-07,5.0,There's been so many new coffee places opening...,,so its definitely a place where you can chat w...,there's been so many new coffee places opening...,,so its definitely a place where you can chat w...,
84,2018-02-27,4.0,Very comfortable spot! Bright and long stretch...,,,,,i got their hibiscus lemonade which was a deli...,
103,2018-04-19,5.0,I think I'm in love with this place! The momen...,,there is just quite enough for you to block it...,but i definitely prefer the nbcit's own parkin...,,the flavors were more mild but they melded tog...,
