## Efficient Yelp API Calls (Core)
- Brian Lafferty
- 7.26.2022

### Loading Libraries

In [23]:
# Imports and libraries
import numpy as np
import pandas as pd
import os, json, math, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

### Loading credentials and creating the Yelp API client

In [24]:
# Loading credentials from a JSON file kept outside the project folder.
# The location can be overridden with the YELP_API_CREDENTIALS environment
# variable; when it is not set, the original hardcoded path is used, so the
# default behavior of this cell is unchanged.
credentials_path = os.environ.get('YELP_API_CREDENTIALS',
                                  '/Users/laffe/.secret/yelp_api.json')
with open(credentials_path) as f:
    login = json.load(f)
# instantiate the YelpAPI client with the key stored under 'api-key'
# (timeout_s=5.0 is passed through to YelpAPI unchanged)
yelp_api = YelpAPI(login['api-key'], timeout_s=5.0)

### define search area and terms

In [25]:
# Search parameters passed to the Yelp API.
# Only one search is active at a time; the earlier one is kept for reference.

# search #1 (inactive)
#   LOCATION = 'Whispering Pines, NC, 28327'
#   TERM = 'Pizza'

# search #2 (active)
LOCATION, TERM = 'Apex, NC, 27502', 'Italian'

### Creating a folder and file to store the search results

In [26]:
# make sure the output folder exists (nothing happens if it is already there)
folder_name = "Data/"
if not os.path.isdir(folder_name):
    os.makedirs(folder_name, exist_ok=True)

In [27]:
# file path = output folder + "<city>-<term>.json", where <city> is the part
# of LOCATION before the first comma
JSON_FILE = f"{folder_name}{LOCATION.partition(',')[0]}-{TERM}.json"

In [28]:
# Make sure JSON_FILE exists before it is read: create it (holding an empty
# list) when it is missing, leave it untouched when it is already there.
file_exists = os.path.isfile(JSON_FILE)
if file_exists:
    # if it exists, just inform the user
    print(f"This {JSON_FILE} already exists.")
else:
    # create the parent folder when JSON_FILE includes one
    folder = os.path.dirname(JSON_FILE)
    if folder:
        os.makedirs(folder, exist_ok=True)

    # tell the user, then save an empty list as the starting point
    print(f"This {JSON_FILE} was not found! Saving empty list to file.")
    with open(JSON_FILE, 'w') as f:
        json.dump([], f)

This Data/Apex-Italian.json was not found! Saving empty list to file.


In [29]:
# read back whatever has been saved so far
with open(JSON_FILE, 'r') as f:
    previous_results = json.loads(f.read())

# the count of saved records is used as the offset of the next API call
num_results = len(previous_results)
print(f'- {num_results} previous results found.')

- 0 previous results found.


In [30]:
# first API request: search_query is called with the search parameters and
# the number of already-saved results as the offset
results = yelp_api.search_query(
    location=LOCATION,
    term=TERM,
    offset=num_results,
)


In [31]:
# the 'total' field of the response: how many results the search matched
total_results = results['total']
print(f'The total number of results from the search was {total_results}')

# size of the 'businesses' list in this response, used as the page size
results_per_page = len(results['businesses'])


The total number of results from the search was 630


In [32]:
# Number of pages needed to cover every result that is not saved yet,
# rounded up with math.ceil so a partial last page is counted.
# Guard: when the response holds no businesses (results_per_page == 0, for
# instance if the offset is already past the last result) the division would
# raise ZeroDivisionError, so no further pages are needed in that case.
if results_per_page > 0:
    num_pages = math.ceil((results['total'] - num_results) / results_per_page)
else:
    num_pages = 0
num_pages

32

In [33]:
# append the businesses from the first response to the saved list (in place),
# then write the combined list back to the file
previous_results += results['businesses']
with open(JSON_FILE, 'w') as f:
    json.dump(previous_results, f)

In [34]:
# Fetch the remaining pages of results, saving after every request so the
# file always reflects what has been downloaded so far.
# tqdm_notebook wraps the range to display a status bar.
for i in tqdm_notebook(range(1, num_pages + 1)):
    # read in the results-in-progress file; its length is the next offset
    with open(JSON_FILE, 'r') as f:
        previous_results = json.load(f)
    num_results = len(previous_results)

    # num_pages also counts the page that was fetched and saved before this
    # loop, so the final iteration would request an offset past the end:
    # stop once everything reported by the latest response has been saved
    if num_results >= results['total']:
        break

    # brief pause between requests
    time.sleep(.2)
    # using number of results as the offset
    results = yelp_api.search_query(location = LOCATION, term = TERM, offset = num_results)

    # an empty page means there is nothing more to fetch; stop instead of
    # repeating the same request on every remaining iteration
    if not results['businesses']:
        break

    # append new results and save the file
    previous_results.extend(results['businesses'])
    with open(JSON_FILE, 'w') as f:
        json.dump(previous_results, f)

  0%|          | 0/32 [00:00<?, ?it/s]

In [36]:
# load the saved JSON results into a DataFrame and preview the first rows
# search #1
#pizza_df = pd.read_json(JSON_FILE)
#pizza_df.head()
italian_df = pd.read_json(path_or_buf=JSON_FILE)
italian_df.head(n=5)

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
0,oawiDpF6mCdTML91aiR40w,tarantini-italian-restaurant-chapel-hill,Tarantini Italian Restaurant,https://s3-media3.fl.yelpcdn.com/bphoto/hIBw5n...,False,https://www.yelp.com/biz/tarantini-italian-res...,233,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",4.5,"{'latitude': 35.854042, 'longitude': -79.024952}","[restaurant_reservation, delivery]",$$,"{'address1': '50160 Governors Dr', 'address2':...",19199424240,(919) 942-4240,18405.149641
1,LjvHGYAC113ZGUs0mue43w,daniels-restaurant-apex,Daniel's Restaurant,https://s3-media2.fl.yelpcdn.com/bphoto/fl9meJ...,False,https://www.yelp.com/biz/daniels-restaurant-ap...,496,"[{'alias': 'italian', 'title': 'Italian'}]",4.0,"{'latitude': 35.75246, 'longitude': -78.87659}",[],$$,"{'address1': '1430 W Williams St', 'address2':...",19193031006,(919) 303-1006,5585.960919
2,40W1_4-Bu-RlZWRRUbQt6w,mamma-mia-italian-bistro-apex,Mamma Mia Italian Bistro,https://s3-media2.fl.yelpcdn.com/bphoto/gjNCOc...,False,https://www.yelp.com/biz/mamma-mia-italian-bis...,257,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",4.0,"{'latitude': 35.7355611601136, 'longitude': -7...",[delivery],$$,"{'address1': '708 Laura Duncan Rd', 'address2'...",19193632228,(919) 363-2228,7138.531189
3,JG30eHEEBhri9TJHcr79tw,mezza-luna-pizzeria-apex,Mezza Luna Pizzeria,https://s3-media1.fl.yelpcdn.com/bphoto/s6gPZl...,False,https://www.yelp.com/biz/mezza-luna-pizzeria-a...,71,"[{'alias': 'pizza', 'title': 'Pizza'}, {'alias...",4.5,"{'latitude': 35.759033, 'longitude': -78.877387}",[delivery],,"{'address1': '1763 W Williams St', 'address2':...",19193036686,(919) 303-6686,6124.697491
4,kwYYV0NdzLwXo8lhWKUMNw,osteria-g-apex-2,Osteria G,https://s3-media2.fl.yelpcdn.com/bphoto/E8-aHI...,False,https://www.yelp.com/biz/osteria-g-apex-2?adju...,146,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",4.0,"{'latitude': 35.66757, 'longitude': -78.79956}",[delivery],,"{'address1': '5160 Sunset Lake Rd', 'address2'...",19842297480,(984) 229-7480,11737.328646


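**Note:** checking the full DataFrame for duplicates with `duplicated()` failed, presumably because several columns hold dicts or lists, which are unhashable.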

In [40]:
# checking for duplicates
# DataFrame.duplicated() has to hash the values it compares, and several
# columns here (e.g. categories, coordinates, transactions, location) hold
# dicts/lists, which are unhashable. Comparing only the 'id' column avoids
# that and counts rows whose 'id' already appeared in an earlier row.
# search #1
#pizza_df.duplicated(subset = 'id').sum()
italian_df.duplicated(subset = 'id').sum()

In [41]:
# write the DataFrame to a gzip-compressed csv, leaving out the index
# search #1
#pizza_df.to_csv('Data/Whispering_Pines_Pizza.csv.gz', compression = 'gzip', index = False)
italian_df.to_csv(path_or_buf='Data/Apex_Italian.csv.gz', index=False, compression='gzip')