# Get menus from the Locu API

In [10]:
import json, os
import pickle
import requests
import nltk
import pickle, re
import string 
import unicodedata

from nltk import word_tokenize, sent_tokenize

In [2]:
# read from the previously generated menu file 
def readBuzs():
    """Load the cached business dictionaries from disk.

    Reads the pickle files named by the module-level constants
    ``BUZ_PICKLE_FILE`` and ``BUZ_MENU_PICKLE_FILE``. A file that does not
    exist yields an empty dict instead of an error.

    Returns:
        A ``(buzs, buzs_with_menu)`` tuple holding the unpickled objects
        (or ``{}`` for each file that is missing).
    """
    def _load_or_empty(path):
        # A missing cache file simply means nothing was saved yet.
        if not os.path.isfile(path):
            return {}
        # NOTE: pickle.load can execute arbitrary code; only point these
        # constants at files this notebook produced itself.
        with open(path, 'rb') as fh:
            return pickle.load(fh)

    buzs = _load_or_empty(BUZ_PICKLE_FILE)
    buzs_with_menu = _load_or_empty(BUZ_MENU_PICKLE_FILE)
    return (buzs, buzs_with_menu)

In [5]:
BUZ_PICKLE_FILE = 'buzs.pickle'
BUZ_MENU_PICKLE_FILE = 'buzs_with_menu.pickle'

# Resume from whatever earlier runs of this cell already cached on disk.
buzs, buzs_with_menu = readBuzs()
count = 0
url = "https://api.locu.com/v2/venue/search"
# JSON text is UTF-8; do not depend on the platform default encoding.
with open("business.json", encoding="utf-8") as f:
    for line in f:
        buz = json.loads(line)
        count += 1
        # Skip if this venue is not a restaurant. A missing or null
        # "categories" field is treated as "no categories" instead of raising.
        if "Restaurants" not in (buz.get("categories") or ()):
            continue
        # Skip if this restaurant has been read
        if buz["business_id"] in buzs:
            continue
        # Dictionary that maps "business_id" to the corresponding restaurant
        buzs[buz["business_id"]] = buz
        params = {
            "fields": ["name", "menus"],
            "venue_queries": [
                {
                    "name": buz["name"],
                    "location": {
                        "region": buz["state"],
                        "locality": buz["city"],
                        "postal_code": buz["postal_code"],
                    },
                    "menus": {"$present": True},
                }
            ],
            # NOTE(security): credential committed in source. It is left
            # unchanged here so the cell keeps working, but it should be
            # rotated and loaded from configuration instead.
            "api_key": "af31648d127d362397e9ad94c8df49665111f52c",
        }
        # json=params is equivalent to data=json.dumps(params)
        response = requests.post(url, json=params)
        if response.status_code != 200:
            print('Status code error:', response.status_code)
            # Forget this business so the next run retries it, then stop
            # issuing requests.
            del buzs[buz['business_id']]
            break
        content = json.loads(response.content)
        if not content["venues"]:
            continue
        print("New buz_with_menu count:", count)
        buz['venues'] = content['venues']
        buzs_with_menu[buz['business_id']] = buz


# Persist both dictionaries; the context managers make sure the files are
# flushed and closed.
with open(BUZ_PICKLE_FILE, 'wb') as out:
    pickle.dump(buzs, out)
with open(BUZ_MENU_PICKLE_FILE, 'wb') as out:
    pickle.dump(buzs_with_menu, out)
print('Final result')
print('Num of buzs:', len(buzs))
print('Num of buzs with menu:', len(buzs_with_menu))

Status code error: 429
Final result
Num of buzs: 0
Num of buzs with menu: 3442


# Delete venue data from buzs and append the menu

In [7]:
'''
Extract menu info from raw data.
Append menu into the buz data structure
Delete unrelated information from buz

Input file: buzs_with_menu.pickle
Output file: buzs_simple.pickle (buz_id -> business with a flat 'menu' list)
'''

with open('buzs_with_menu.pickle', 'rb') as fh:
    buzs = pickle.load(fh)
buzs_simple = {}

count = 0
for buz_id, buz in buzs.items():
    count += 1
    #print(count, buz_id)
    venues = buz.get('venues')
    if not venues:
        # Nothing to extract from; skip instead of failing on venues[0].
        print('No venues for {}'.format(buz_id))
        continue
    # Only the first venue returned for the business is used.
    venue = venues[0]
    menus = venue.get('menus')
    if not menus:
        # Keep the business, with an empty menu.
        print('No menus for {}'.format(buz_id))
        menus = []
    # Flatten menu -> section -> subsection -> contents into one list.
    food = []
    for menu in menus:
        for section in menu['sections']:
            for subsection in section['subsections']:
                food.extend(subsection['contents'])
    del buz['venues']
    buz['menu'] = food
    buzs_simple[buz_id] = buz
with open('buzs_simple.pickle', 'wb') as fh:
    pickle.dump(buzs_simple, fh)

# Tokenize reviews

In [11]:
'''
Tokenize business reviews. Each review's text becomes a list of sentences,
each sentence a list of lower-cased tokens. Sentences that have more than
50 tokens are dropped; tokens of 20 or more characters are dropped.
Input file: review.json
            buzs_simple.pickle
Output file: tokenized_reviews.pickle (buz_id -> list of reviews)
'''

REVIEW_FILE_NAME = 'review.json'
BUZ_FILE_NAME = 'buzs_simple.pickle'

with open(BUZ_FILE_NAME, 'rb') as fh:
    buzs = pickle.load(fh)
buz_ids = buzs.keys()

buz_reviews = {buz_id: [] for buz_id in buz_ids}

# tokenize reviews
total_count = 0
relevant_count = 0
# JSON text is UTF-8; do not depend on the platform default encoding.
with open(REVIEW_FILE_NAME, encoding='utf-8') as f:
    for line in f:
        total_count += 1
        review = json.loads(line)
        buz_id = review['business_id']
        if buz_id in buz_ids:
            relevant_count += 1
            # Decompose accented characters (NFD) and drop everything that
            # is not ASCII.
            review_text = review['text']
            review_text = unicodedata.normalize('NFD', review_text).encode('ascii', 'ignore').decode('utf-8')
            sents = []
            for sent in sent_tokenize(review_text):
                # Expand the "w/" abbreviation, then strip digits and
                # punctuation before word tokenization.
                sent = re.sub(r'(w/)', 'with', sent)
                sent = re.sub(r'[\d]|[^\w\s]', '', sent)
                tokens = [token.lower() for token in word_tokenize(sent) if len(token) < 20]
                if len(tokens) > 50:
                    continue
                sents.append(tokens)
            # The raw text is replaced by its tokenized form.
            review['text'] = sents
            buz_reviews[buz_id].append(review)
        if total_count % 10000 == 0:
            print('Total review processed:', total_count)
            print('Relevant review:', relevant_count)

with open('tokenized_reviews.pickle', 'wb') as fh:
    pickle.dump(buz_reviews, fh)

Total review processed: 10000
Relevant review: 645
Total review processed: 20000
Relevant review: 1059
Total review processed: 30000
Relevant review: 1786
Total review processed: 40000
Relevant review: 1974
Total review processed: 50000
Relevant review: 2251
Total review processed: 60000
Relevant review: 3204
Total review processed: 70000
Relevant review: 3702
Total review processed: 80000
Relevant review: 4153
Total review processed: 90000
Relevant review: 4431
Total review processed: 100000
Relevant review: 5262
Total review processed: 110000
Relevant review: 6013
Total review processed: 120000
Relevant review: 6348
Total review processed: 130000
Relevant review: 6779
Total review processed: 140000
Relevant review: 7129
Total review processed: 150000
Relevant review: 7828
Total review processed: 160000
Relevant review: 8668
Total review processed: 170000
Relevant review: 9139
Total review processed: 180000
Relevant review: 9611
Total review processed: 190000
Relevant review: 10259
To

Total review processed: 1530000
Relevant review: 85952
Total review processed: 1540000
Relevant review: 86835
Total review processed: 1550000
Relevant review: 87352
Total review processed: 1560000
Relevant review: 87996
Total review processed: 1570000
Relevant review: 88565
Total review processed: 1580000
Relevant review: 89112
Total review processed: 1590000
Relevant review: 89944
Total review processed: 1600000
Relevant review: 90275
Total review processed: 1610000
Relevant review: 90860
Total review processed: 1620000
Relevant review: 91766
Total review processed: 1630000
Relevant review: 92188
Total review processed: 1640000
Relevant review: 92791
Total review processed: 1650000
Relevant review: 93531
Total review processed: 1660000
Relevant review: 94721
Total review processed: 1670000
Relevant review: 95127
Total review processed: 1680000
Relevant review: 95578
Total review processed: 1690000
Relevant review: 96012
Total review processed: 1700000
Relevant review: 96743
Total revi

Total review processed: 3000000
Relevant review: 186238
Total review processed: 3010000
Relevant review: 186814
Total review processed: 3020000
Relevant review: 187609
Total review processed: 3030000
Relevant review: 188441
Total review processed: 3040000
Relevant review: 189256
Total review processed: 3050000
Relevant review: 190247
Total review processed: 3060000
Relevant review: 191282
Total review processed: 3070000
Relevant review: 192240
Total review processed: 3080000
Relevant review: 193204
Total review processed: 3090000
Relevant review: 193858
Total review processed: 3100000
Relevant review: 194701
Total review processed: 3110000
Relevant review: 195335
Total review processed: 3120000
Relevant review: 196110
Total review processed: 3130000
Relevant review: 196969
Total review processed: 3140000
Relevant review: 198034
Total review processed: 3150000
Relevant review: 198903
Total review processed: 3160000
Relevant review: 199504
Total review processed: 3170000
Relevant review:

Total review processed: 4470000
Relevant review: 280648
Total review processed: 4480000
Relevant review: 281231
Total review processed: 4490000
Relevant review: 282049
Total review processed: 4500000
Relevant review: 282462
Total review processed: 4510000
Relevant review: 283122
Total review processed: 4520000
Relevant review: 283181
Total review processed: 4530000
Relevant review: 284540
Total review processed: 4540000
Relevant review: 285325
Total review processed: 4550000
Relevant review: 285818
Total review processed: 4560000
Relevant review: 286978
Total review processed: 4570000
Relevant review: 287658
Total review processed: 4580000
Relevant review: 288484
Total review processed: 4590000
Relevant review: 288756
Total review processed: 4600000
Relevant review: 290702
Total review processed: 4610000
Relevant review: 291053
Total review processed: 4620000
Relevant review: 291703
Total review processed: 4630000
Relevant review: 292072
Total review processed: 4640000
Relevant review:

# Tokenize menus

In [12]:
'''
Tokenize menu data. Only entries whose type is 'ITEM' and that have a
'name' are kept; each kept entry's 'name' becomes a list of lower-cased
tokens.
Input file: buzs_simple.pickle
Output file: buzs_tokenized.pickle
contains a dictionary from buz_id -> business.
'''

BUZ_FILE_NAME = 'buzs_simple.pickle'
TOKENIZED_FILE_NAME = 'buzs_tokenized.pickle'

with open(BUZ_FILE_NAME, 'rb') as fh:
    buzs = pickle.load(fh)

count = 0
for buz_id, buz in buzs.items():
    count += 1
    menu = []
    for food in buz['menu']:
        if food.get('type') != 'ITEM' or 'name' not in food:
            continue
        # Expand the "w/" abbreviation, then strip digits, parenthesised
        # text and punctuation.
        name = re.sub(r'(w/)', "with", food['name'])
        name = re.sub(r'[\d]|(\([^)]*\))|[^\w\s]', '', name)
        # Decompose accented characters (NFD) and drop non-ASCII leftovers.
        name = unicodedata.normalize('NFD', name).encode('ascii', 'ignore').decode('utf-8')
        food['name'] = [token.lower() for token in word_tokenize(name)]
        menu.append(food)
    buz['menu'] = menu
    if count % 50 == 0:
        print('count:', count)
        # Show a sample item; a business may end up with no ITEM entries.
        if menu:
            print(menu[0])

with open(TOKENIZED_FILE_NAME, 'wb') as fh:
    pickle.dump(buzs, fh)

count: 50
{'description': 'seared chicken & vegetable dumplings, asian slaw, traditional soy dipping sauce', 'name': ['potstickers'], 'price': '10.25', 'type': 'ITEM'}
count: 100
{'type': 'ITEM', 'description': 'Mixed greens, tomato, red onion, cucumber, sweet bell pepper and alfalfa sprouts tossed and garnished with strips of fresh mango and mint served with our delicious homemade ginger dressing', 'name': ['bamboo', 'legend', 'salad'], 'price': '7.99', 'option_groups': [{'text': 'Meat', 'options': [{'name': 'Chicken', 'price': '+2.00'}, {'name': 'Shrimp', 'price': '+3.00'}], 'type': 'OPTION_ADD'}]}
count: 150
{'description': 'Prosciutto | basil | marinara', 'name': ['fried', 'stuffed', 'provolone'], 'price': '7.00', 'type': 'ITEM'}
count: 200
{'description': 'A third-pound* 100% Angus beef patty with its own special seasoning on a premium bakery style bun, hickory smoked bacon, two slices of melty American cheese and topped off with crisp red onions and Crinkle-cut pickles.', 'name':

count: 2100
{'price': '8.00', 'type': 'ITEM', 'name': ['espresso', 'love'], 'description': 'Patrón XO cafe, espresso & dark chocolate. Like a great coffeehouse experience while hanging out in a bar, which is way more fun.'}
count: 2150
{'price': '6.25', 'type': 'ITEM', 'name': ['stuffed', 'banana', 'peppers'], 'description': 'With bread and butter'}
count: 2200
{'price': '1.50', 'type': 'ITEM', 'name': ['miso', 'shiru'], 'description': 'soy bean soup'}
count: 2250
{'price': '4.99', 'type': 'ITEM', 'name': ['eggs', 'lots', 'of', 'homefries', 'double', 'texas', 'toast']}
count: 2300
{'type': 'ITEM', 'name': ['whopper', 'sandwich', 'meal'], 'description': 'Our WHOPPER® Sandwich Meal is a ¼ lb. of savory flame-broiled beef topped with juicy tomatoes, fresh cut lettuce, creamy mayonnaise, crunchy pickles, and sliced white onions on a soft sesame seed bun. Served with a small side of piping hot, thick cut French Fries or golden Onion Rings and a small fountain drink of your choice to make it

# Match menu items against food entities in reviews and label them using the BILOU scheme

In [13]:
def is_match(start_idx, review_tokens, food_tokens):
    """Tell whether a food name approximately occurs at a review position.

    The comparison is case-insensitive. The first token must match exactly;
    after that, mismatching positions are tolerated as long as at least two
    thirds of the food tokens still match.

    Args:
        start_idx: Index into ``review_tokens`` where the food name would
            start.
        review_tokens: Tokens of one review sentence.
        food_tokens: Tokens of one menu item name.

    Returns:
        True when ``food_tokens`` fits inside ``review_tokens`` starting at
        ``start_idx`` and matches under the rule above; False otherwise,
        including when ``food_tokens`` is empty.
    """
    food_len = len(food_tokens)
    # An empty name cannot match anything.
    if food_len == 0:
        return False
    # The whole name has to fit in the remaining tokens.
    if start_idx + food_len > len(review_tokens):
        return False
    if review_tokens[start_idx].lower() != food_tokens[0].lower():
        return False
    matched_count = food_len
    for offset, food_token in enumerate(food_tokens):
        if review_tokens[start_idx + offset].lower() != food_token.lower():
            matched_count -= 1
            # Integer form of matched_count / food_len < 2 / 3.
            if 3 * matched_count < 2 * food_len:
                return False
    return True

In [14]:
# Inputs produced by the tokenizing cells, and the output path for the
# labeled sentences.
REVIEW_FILE_NAME = 'tokenized_reviews.pickle'
BUZ_FILE_NAME = 'buzs_tokenized.pickle'
REVIEW_LABEL_FILE_NAME = 'labeled_reviews.txt'
# Load through context managers so the file handles are closed.
with open(REVIEW_FILE_NAME, 'rb') as fh:
    buz_reviews = pickle.load(fh)
with open(BUZ_FILE_NAME, 'rb') as fh:
    buzs = pickle.load(fh)

In [16]:
# Label every review sentence token by token against the menu of the
# reviewed business: 'B'/'I'/'L' for the first/inner/last token of a
# multi-token menu item, 'U' for a single-token item, 'O' for everything
# else. A sentence is written out (one "token<TAB>label" line per token,
# blank line after the sentence) only when at least 40% of its tokens
# belong to matched items or when it contains three or more matches.
review_count = 0
matched_sent_count = 0
tokenized_reviews = []
with open(REVIEW_LABEL_FILE_NAME, 'w') as f:
    for buz_id, reviews in buz_reviews.items():
        menu = buzs[buz_id]['menu']
        # is_match() rejects any candidate whose first token differs from
        # the review token, so only items sharing that first token need to
        # be tried. Items stay in menu order within each bucket, so the
        # first match found is the same one a full menu scan would find.
        foods_by_first_token = {}
        for food in menu:
            food_tokens = food['name']
            if len(food_tokens) < 1:
                continue
            foods_by_first_token.setdefault(food_tokens[0].lower(), []).append(food_tokens)

        for review in reviews:
            review_count += 1
            if review_count % 1000 == 0:
                print('review count:', review_count)
                print('sentence count:', matched_sent_count)
            for tokens in review['text']:
                if len(tokens) == 0:
                    continue
                labeled_tokens = []
                idx = 0
                matched_food_count = 0
                total_food_length = 0
                while idx < len(tokens):
                    # check match
                    match = False
                    candidates = foods_by_first_token.get(tokens[idx].lower(), ())
                    for food_tokens in candidates:
                        match = is_match(idx, tokens, food_tokens)
                        if not match:
                            continue
                        matched_food_count += 1
                        food_len = len(food_tokens)
                        # is_match() tolerates some mismatching tokens; when
                        # the final token is one of them, leave it out of
                        # the labeled span.
                        if tokens[idx + food_len - 1] == food_tokens[-1]:
                            matched_len = food_len
                        else:
                            matched_len = food_len - 1
                        if food_len == 1:
                            labeled_tokens.append((tokens[idx], 'U'))
                        else:
                            labeled_tokens.append((tokens[idx], 'B'))
                            for i in range(1, matched_len - 1):
                                labeled_tokens.append((tokens[idx + i], 'I'))
                            labeled_tokens.append((tokens[idx + matched_len - 1], 'L'))
                        # Continue scanning right after the matched span.
                        idx += matched_len
                        total_food_length += matched_len
                        break
                    if not match:
                        labeled_tokens.append((tokens[idx], 'O'))
                        idx += 1

                if total_food_length / len(tokens) >= 0.4 or matched_food_count >= 3:
                    matched_sent_count += 1
                    f.writelines(token + '\t' + label + '\n' for token, label in labeled_tokens)
                    f.write('\n')
                    tokenized_reviews.append([token for token, _ in labeled_tokens])

print('final review count:', review_count)
print('final matched sentence count:', matched_sent_count)

final review count: 301773
final matched sentence count: 21402


# Take the labeled reviews JSON file as input and generate a TXT file (deprecated)

In [120]:
# Deprecated: converts a JSON-lines file of labeled reviews (one review per
# line, with review['text'] holding (token, label) pairs) into the
# "token<TAB>label" text format, with a blank line after each review.
REVIEW_LABEL_TXT_NAME = 'labeled_reviews.txt'
review_count = 0

# Opening the output in 'w' mode truncates it immediately. If the input name
# resolves to the same path (an earlier cell sets REVIEW_LABEL_FILE_NAME to
# 'labeled_reviews.txt'), that would wipe the input before it is read.
if os.path.abspath(REVIEW_LABEL_FILE_NAME) == os.path.abspath(REVIEW_LABEL_TXT_NAME):
    raise ValueError(
        'REVIEW_LABEL_FILE_NAME and REVIEW_LABEL_TXT_NAME point to the same '
        'file; writing the output would truncate the input'
    )

with open(REVIEW_LABEL_FILE_NAME, 'r') as json_data, open(REVIEW_LABEL_TXT_NAME, 'w') as txt_data:
    for line in json_data:
        review_count += 1
        if review_count % 1000 == 0:
            print('processed reviews: ', review_count)
        review = json.loads(line)
        txt_data.writelines(token[0] + '\t' + token[1] + '\n' for token in review['text'])
        txt_data.write('\n')
    print('final count ', review_count)

final count  501
