# **Yelp API Core**

_John Andrew Dixon_

---

##### **Imports**

In [178]:
import json, math, os, time
import numpy as np
import pandas as pd
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

##### **Helper Functions**

In [179]:
def create_json_file(JSON_FILE, delete_if_exists=False):
    """Make sure a JSON file holding an empty list is available at JSON_FILE.

    Parameters
    ----------
    JSON_FILE : str
        Path of the JSON file to create. Missing parent folders are created.
    delete_if_exists : bool, default False
        When True and the file is already present, it is removed and a new
        file containing an empty list is written in its place. When False,
        an existing file is left untouched.

    Returns
    -------
    None
        The function only prints status messages and touches the filesystem.
    """
    already_present = os.path.isfile(JSON_FILE)

    # Nothing to do: the file is there and the caller wants to keep it.
    if already_present and not delete_if_exists:
        print(f"[i] File {JSON_FILE} exists.")
        return

    # The caller asked for a fresh start, so drop the old file first and
    # then fall through to the creation steps below.
    if already_present:
        print(f"[i] File {JSON_FILE} exists. Deleting it now...")
        os.remove(JSON_FILE)

    print(f"[i] File does not exist: creating {JSON_FILE} now.")

    # Create the containing folder only when the path actually has one.
    parent_dir = os.path.dirname(JSON_FILE)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Seed the new file with an empty JSON list.
    with open(JSON_FILE, "w") as handle:
        json.dump([], handle)

---

## **Efficient YelpAPI Calls**

##### _API Setup_

In [180]:
# Load the YelpAPI credentials from a JSON file kept outside the project.
# The path is resolved against the current user's home directory instead of
# being hardcoded to one account, so the notebook runs for any user who has
# a ~/.secret/yelp_api.json file containing an "api-key" entry.
CREDENTIALS_PATH = os.path.join(os.path.expanduser("~"), ".secret", "yelp_api.json")
with open(CREDENTIALS_PATH) as f:
    login = json.load(f)

# Instantiate the YelpAPI object with the key, passing timeout_s=5.0
yelp_api = YelpAPI(login['api-key'], timeout_s=5.0)
yelp_api

<yelpapi.yelpapi.YelpAPI at 0x24701fed4c0>

In [181]:
# Search parameters: where to search and what to search for
LOCATION, TERM = "San Diego, CA", "beer"

# Show both values for reference
display(LOCATION)
display(TERM)

'San Diego, CA'

'beer'

In [182]:
# Build the path of the in-progress results file. The search location and
# term are embedded in the name with their spaces stripped out; any other
# characters (such as the comma in the location) are kept as-is.
JSON_FILE = f"Data/results_in_progress_{LOCATION.replace(' ', '')}_{TERM.replace(' ', '')}.json"
JSON_FILE

'Data/results_in_progress_SanDiego,CA_beer.json'

In [183]:
# Call the helper function to (re)create the results file. Because
# delete_if_exists=True, an existing file is removed and replaced by one
# holding an empty list, so results saved by an earlier run are discarded.
create_json_file(JSON_FILE, delete_if_exists=True)

[i] File Data/results_in_progress_SanDiego,CA_beer.json exists. Deleting it now...
[i] File does not exist: creating Data/results_in_progress_SanDiego,CA_beer.json now.


In [184]:
# Read back whatever has been saved to the results file so far.
with open(JSON_FILE) as saved:
    previous_results = json.load(saved)

# Count the saved results and report the number; the same count is used
# as the offset of the next API request.
n_results = len(previous_results)
print(f"There are {n_results} previous results.")

There are 0 previous results.


In [185]:
# Perform an API call, skipping the results that are already saved.
results = yelp_api.search_query(
    location=LOCATION,
    term=TERM,
    offset=n_results,
)

# Report the total number of matches for verification and future reference.
print(f"There are {results['total']} total search results.")

There are 4500 total search results.


In [186]:
# Use the number of businesses in this first response as the page size,
# and print it.
results_per_page = len(results['businesses'])
print(f"There are {results_per_page} results per page.")

There are 20 results per page.


In [187]:
# Number of pages still to fetch: the results not yet saved (never less
# than zero), divided by the page size and rounded up.
remaining_results = max(results["total"] - n_results, 0)

# When the search returned no businesses the page size is 0; report 0 pages
# in that case instead of raising a ZeroDivisionError.
n_pages = math.ceil(remaining_results / results_per_page) if results_per_page else 0
print(f"There are {n_pages} pages.")

There are 225 pages.


In [188]:
# Loop through each page, saving to disk after every request so that the
# results file always reflects everything fetched so far.
for i in tqdm_notebook(range(1, n_pages + 1)):

    # Load previous results.
    with open(JSON_FILE, "r") as f:
        previous_results = json.load(f)

    # The number of saved results is the offset of the next request.
    n_results = len(previous_results)

    # Exit out of the loop if the next page would exceed the limit
    if (n_results + results_per_page) > 1000:
        print("Exceeded result limit of 1000. Stopping loop...")
        break

    # Perform an API call with the results offset.
    results = yelp_api.search_query(location=LOCATION,
                                    term=TERM,
                                    offset=n_results)

    # An empty page means there is nothing left to fetch. Without this check
    # the offset would never advance, so every remaining iteration would
    # repeat the exact same request.
    if not results["businesses"]:
        print("No more results returned. Stopping loop...")
        break

    # Append new results
    previous_results.extend(results["businesses"])

    # Dump all results gathered so far to the target file
    with open(JSON_FILE, 'w') as f:
        json.dump(previous_results, f)

    # Add a 200 ms pause for better progress bar visualization
    time.sleep(.2)

  0%|          | 0/225 [00:00<?, ?it/s]

Exceeded result limit of 1000. Stopping loop...


In [189]:
# Read the accumulated JSON results into the final DataFrame
final_df = pd.read_json(JSON_FILE)

# Preview the first five and the last five rows
display(final_df.head(5))
display(final_df.tail(5))

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,location,phone,display_phone,distance,price
0,v1GulCBkuV31WR2K3kFQfA,the-gärten-san-diego,The Gärten,https://s3-media3.fl.yelpcdn.com/bphoto/Q-EMyi...,False,https://www.yelp.com/biz/the-g%C3%A4rten-san-d...,10,"[{'alias': 'beergardens', 'title': 'Beer Garde...",4.5,"{'latitude': 32.764478, 'longitude': -117.199122}",[],"{'address1': '5322 Banks St', 'address2': '', ...",,,11731.220811,
1,GkMleu5sddDUBmqKJ7tkdA,taproom-beer-co-san-diego,TapRoom Beer Co,https://s3-media4.fl.yelpcdn.com/bphoto/l1gRK3...,False,https://www.yelp.com/biz/taproom-beer-co-san-d...,195,"[{'alias': 'breweries', 'title': 'Breweries'},...",4.5,"{'latitude': 32.75567, 'longitude': -117.14347}","[pickup, delivery]","{'address1': '2000 El Cajon Blvd', 'address2':...",16195397738.0,(619) 539-7738,8819.092243,$$
2,zDRzccJo9qgKYLvAjuGl8w,groundswell-brewing-company-san-diego,Groundswell Brewing Company,https://s3-media4.fl.yelpcdn.com/bphoto/CC1lcc...,False,https://www.yelp.com/biz/groundswell-brewing-c...,131,"[{'alias': 'breweries', 'title': 'Breweries'}]",4.0,"{'latitude': 32.788978269559365, 'longitude': ...",[],"{'address1': '6304 Riverdale St', 'address2': ...",16197952337.0,(619) 795-2337,3867.685781,$
3,5HPGruCzXhweiDrN1KqnhQ,cork-n-brew-san-diego,Cork N Brew,https://s3-media4.fl.yelpcdn.com/bphoto/JaZ-ST...,False,https://www.yelp.com/biz/cork-n-brew-san-diego...,5,"[{'alias': 'beerbar', 'title': 'Beer Bar'}, {'...",5.0,"{'latitude': 32.71115769039964, 'longitude': -...",[],"{'address1': '550 Park Blvd', 'address2': None...",16199155881.0,(619) 915-5881,13661.455147,
4,gh1r0PvTWoTpjcj2JW4c8w,common-theory-san-diego-3,Common Theory,https://s3-media4.fl.yelpcdn.com/bphoto/u5S738...,False,https://www.yelp.com/biz/common-theory-san-die...,1435,"[{'alias': 'gastropubs', 'title': 'Gastropubs'...",4.0,"{'latitude': 32.8293223613103, 'longitude': -1...",[delivery],"{'address1': '4805 Convoy St', 'address2': '',...",16197806689.0,(619) 780-6689,5432.121827,$$


Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,location,phone,display_phone,distance,price
995,8Ku_QTiJ9kS7wdlAKyxg3Q,turf-supper-club-san-diego,Turf Supper Club,https://s3-media3.fl.yelpcdn.com/bphoto/5Fj_o-...,False,https://www.yelp.com/biz/turf-supper-club-san-...,889,"[{'alias': 'steak', 'title': 'Steakhouses'}, {...",4.0,"{'latitude': 32.7173184645089, 'longitude': -1...",[delivery],"{'address1': '1116 25th St', 'address2': '', '...",16192346363,(619) 234-6363,12551.246777,$$
996,AS--bXkiFStoRkwoLqkJEg,buds-louisiana-cafe-san-diego,Bud's Louisiana Cafe,https://s3-media2.fl.yelpcdn.com/bphoto/QnTJ9V...,False,https://www.yelp.com/biz/buds-louisiana-cafe-s...,2613,"[{'alias': 'cajun', 'title': 'Cajun/Creole'}, ...",4.5,"{'latitude': 32.821062910211, 'longitude': -11...","[delivery, restaurant_reservation]","{'address1': '4320 Viewridge Ave', 'address2':...",18585732837,(858) 573-2837,2783.084016,$$
997,VGdOmRskhXWBcFAk2iADCA,littler-lounge-rancho-santa-fe,Littler Lounge,https://s3-media3.fl.yelpcdn.com/bphoto/jjBP3G...,False,https://www.yelp.com/biz/littler-lounge-rancho...,16,"[{'alias': 'newamerican', 'title': 'American (...",4.0,"{'latitude': 32.9908297698805, 'longitude': -1...",[],"{'address1': 'Morgan Run Club & Resort', 'addr...",18587562471,(858) 756-2471,21439.208959,$$
998,mzLGnmpwtrIgUeplJtmOOw,la-puerta-san-diego,La Puerta,https://s3-media4.fl.yelpcdn.com/bphoto/GPnCtL...,False,https://www.yelp.com/biz/la-puerta-san-diego?a...,4245,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.0,"{'latitude': 32.7110023498535, 'longitude': -1...","[delivery, pickup]","{'address1': '560 4th Ave', 'address2': '', 'a...",16196963466,(619) 696-3466,13946.648675,$$
999,V6gWGlvV2ZKwooOfrnh7fg,diamond-palace-san-diego,Diamond Palace,https://s3-media2.fl.yelpcdn.com/bphoto/3KxUdi...,False,https://www.yelp.com/biz/diamond-palace-san-di...,209,"[{'alias': 'dimsum', 'title': 'Dim Sum'}, {'al...",4.0,"{'latitude': 32.74778, 'longitude': -117.078872}",[],"{'address1': '3993 54th St', 'address2': None,...",16192299998,(619) 229-9998,8557.474945,$$


In [190]:
# Defensive de-duplication: keep the first row seen for each business id,
# then count the ids that are still repeated (expected: 0).
final_df = final_df.drop_duplicates(subset=['id'], keep='first')
final_df.duplicated(subset=['id']).sum()

0

In [191]:
# Summarize the final DataFrame: column names, non-null counts and dtypes
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             1000 non-null   object 
 1   alias          1000 non-null   object 
 2   name           1000 non-null   object 
 3   image_url      1000 non-null   object 
 4   is_closed      1000 non-null   bool   
 5   url            1000 non-null   object 
 6   review_count   1000 non-null   int64  
 7   categories     1000 non-null   object 
 8   rating         1000 non-null   float64
 9   coordinates    1000 non-null   object 
 10  transactions   1000 non-null   object 
 11  location       1000 non-null   object 
 12  phone          1000 non-null   object 
 13  display_phone  1000 non-null   object 
 14  distance       1000 non-null   float64
 15  price          848 non-null    object 
dtypes: bool(1), float64(2), int64(1), object(12)
memory usage: 126.0+ KB


In [192]:
# Write the final results to a gzip-compressed CSV (row index omitted)
# for storage efficiency
final_df.to_csv(
    f"Data/final_results_{LOCATION.replace(' ', '')}_{TERM.replace(' ', '')}.csv.gz",
    compression='gzip',
    index=False,
)