# Intro to NLP
- https://www.kaggle.com/matleonard/intro-to-nlp

In [39]:
import spacy
import pandas as pd

In [3]:
nlp = spacy.load('en_core_web_sm')



In [5]:
doc = nlp("Tea is healthy and calming, don't you think?")

In [6]:
## Tokenizing
for token in doc:
    print(token)

Tea
is
healthy
and
calming
,
do
n't
you
think
?


In [7]:
## Text prepocessing
print(f"Token \t\tLemma \t\tStopword".format('Token', 'Lemma', 'Stopword'))
print("-"*40)
for token in doc:
    print(f"{str(token)}\t\t{token.lemma_}\t\t{token.is_stop}")

Token 		Lemma 		Stopword
----------------------------------------
Tea		tea		False
is		be		True
healthy		healthy		False
and		and		True
calming		calming		False
,		,		False
do		do		True
n't		not		True
you		-PRON-		True
think		think		False
?		?		False


In [18]:
## Pattern matching
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab, attr='LOWER')

In [19]:
terms = ['Galaxy Note', 'iPhone 11', 'iPhone XS', 'Google Pixel']
patterns = [nlp(text) for text in terms]

matcher.add("TerminologyList", patterns)

In [20]:
text_doc = nlp("Glowing review overall, and some really interesting side-by-side "
               "photography tests pitting the iPhone 11 Pro against the "
               "Galaxy Note 10 Plus and last year’s iPhone XS and Google Pixel 3.") 
matches = matcher(text_doc)
print(matches)

[(3766102292120407359, 17, 19), (3766102292120407359, 22, 24), (3766102292120407359, 30, 32), (3766102292120407359, 33, 35)]


In [28]:
match_id, start, end = matches[0]
print(nlp.vocab.strings[match_id], text_doc[start:end])

match_id, start, end = matches[1]
print(nlp.vocab.strings[match_id], text_doc[start:end])

match_id, start, end = matches[2]
print(nlp.vocab.strings[match_id], text_doc[start:end])

TerminologyList iPhone 11
TerminologyList Galaxy Note
TerminologyList iPhone XS


# Exercise
- find food on menu in review

In [40]:
data = pd.read_json(r'restaurant.json')
data.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
109,lDJIaF4eYRF4F7g6Zb9euw,lb0QUR5bc4O-Am4hNq9ZGg,r5PLDU-4mSbde5XekTXSCA,4,2,0,0,I used to work food service and my manager at ...,2013-01-27 17:54:54
1013,vvIzf3pr8lTqE_AOsxmgaA,MAmijW4ooUzujkufYYLMeQ,r5PLDU-4mSbde5XekTXSCA,4,0,0,0,We have been trying Eggplant sandwiches all ov...,2015-04-15 04:50:56
1204,UF-JqzMczZ8vvp_4tPK3bQ,slfi6gf_qEYTXy90Sw93sg,r5PLDU-4mSbde5XekTXSCA,5,1,0,0,Amazing Steak and Cheese... Better than any Ph...,2011-03-20 00:57:45
1251,geUJGrKhXynxDC2uvERsLw,N_-UepOzAsuDQwOUtfRFGw,r5PLDU-4mSbde5XekTXSCA,1,0,0,0,Although I have been going to DeFalco's for ye...,2018-07-17 01:48:23
1354,aPctXPeZW3kDq36TRm-CqA,139hD7gkZVzSvSzDPwhNNw,r5PLDU-4mSbde5XekTXSCA,2,0,0,0,"Highs: Ambience, value, pizza and deserts. Thi...",2018-01-21 10:52:58


In [41]:
menu = ["Cheese Steak", "Cheesesteak", "Steak and Cheese", "Italian Combo", "Tiramisu", "Cannoli",
        "Chicken Salad", "Chicken Spinach Salad", "Meatball", "Pizza", "Pizzas", "Spaghetti",
        "Bruchetta", "Eggplant", "Italian Beef", "Purista", "Pasta", "Calzones",  "Calzone",
        "Italian Sausage", "Chicken Cutlet", "Chicken Parm", "Chicken Parmesan", "Gnocchi",
        "Chicken Pesto", "Turkey Sandwich", "Turkey Breast", "Ziti", "Portobello", "Reuben",
        "Mozzarella Caprese",  "Corned Beef", "Garlic Bread", "Pastrami", "Roast Beef",
        "Tuna Salad", "Lasagna", "Artichoke Salad", "Fettuccini Alfredo", "Chicken Parmigiana",
        "Grilled Veggie", "Grilled Veggies", "Grilled Vegetable", "Mac and Cheese", "Macaroni",  
         "Prosciutto", "Salami"]

In [60]:
## Find items in single review
index_of_review_to_test_on = 14
text_to_test_on = data.text.iloc[index_of_review_to_test_on]

review_doc = nlp(text_to_test_on)

matcher = PhraseMatcher(nlp.vocab, attr='LOWER')

menu_tokens_list = [token for item in menu]

matcher.add("MENU", None, review_doc)

matches = matcher(review_doc)

matches

match_id, start, end = matches[0]
print(nlp.vocab.strings[match_id], text_to_test_on[start:end])

MENU The Il Purista sandwich has become a staple of my life. Mozzarel
