# Code for efficient API extraction

## Imports

In [1]:
# Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Additional Imports
import os, json, math, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

In [2]:
# Load API Credentials
# The key file is expected at ~/.secret/yelp_api.json. Building the path from
# the current user's home directory keeps the notebook from being tied to one
# machine; change credentials_path if your key lives somewhere else.
credentials_path = os.path.join(os.path.expanduser('~'), '.secret', 'yelp_api.json')
with open(credentials_path) as f:
    login = json.load(f)
# Instantiate YelpAPI Variable (the key is read from the file, never hard-coded)
yelp_api = YelpAPI(login['api-key'], timeout_s=5.0)

To allow us to easily perform different searches in the future, we will define variables for LOCATION and TERM set for our particular search conditions. Then, when we want to use a different location or term, we can just redefine these variables. This streamlines the code and makes it more readable and reproducible. 

In [3]:
# Search conditions for the API calls; redefine these to run a different search
TERM = 'Pizza'
LOCATION = 'NY,NY'

**Create a results-in-progress JSON file, but only if it doesn't exist.**

This is the file where your results will be saved. Note: you must rename your JSON_FILE for different queries to prevent confusing results from other searches. We recommend you include your search terms in the filename. 

In [4]:
# Path of the results-in-progress JSON file (may include a folder).
# The search terms are part of the filename so results from different
# queries are not mixed together in one file.
JSON_FILE = "Data/results_in_progress_NY_pizza.json"
# Bare expression on the last line so the notebook displays the path
JSON_FILE

'Data/results_in_progress_NY_pizza.json'

 Check if our JSON_FILE already exists. This will prevent us from accidentally overwriting an existing file.

If it doesn't exist:

-   Create any folders needed for the file path.
-   Save an empty list as JSON_FILE

In [5]:
## Does the results file already exist?
file_exists = os.path.isfile(JSON_FILE)

if file_exists:
    ## Leave previously saved results alone and just tell the user
    print(f"[i] {JSON_FILE} already exists.")
else:
    ## Make sure the parent folder is in place (only when the path names one)
    folder = os.path.dirname(JSON_FILE)
    if folder:
        os.makedirs(folder, exist_ok=True)

    ## Tell the user, then start the file off as an empty JSON list
    print(f'[i] {JSON_FILE} not found. Saving empty list to file.')
    with open(JSON_FILE, 'w') as f:
        json.dump([], f)

[i] Data/results_in_progress_NY_pizza.json already exists.


**Determine how many results are already in the file**

Load the results file to determine the # of results we have previously retrieved. If you just created the file, you would expect it to be empty.

We will use this as our offset parameter for our API call. Even if this is your first API call, and the number is 0, we want to define "n_results" based on the length of "previous_results." 

In [6]:
## Read back everything that has been saved to the results file so far
with open(JSON_FILE, 'r') as f:
    previous_results = json.loads(f.read())

## The number of saved results doubles as the offset for the next API call
n_results = len(previous_results)
print(f'- {n_results} previous results found.')

- 20 previous results found.


**Figure out how many pages of results we will need**

We will perform our first query to get our first page of results and the total number of results. We will then (via code) calculate how many pages we will need to retrieve all of our results.

In [7]:
# First API call: request the page that starts right after the saved results
search_params = {'location': LOCATION, 'term': TERM, 'offset': n_results}
results = yelp_api.search_query(**search_params)
# Show which top-level keys the response contains
results.keys()

dict_keys(['businesses', 'total', 'region'])

In [8]:
## Total number of results, read from the 'total' key of the response
total_results = results['total']
total_results

15500

In [9]:
## Number of businesses actually returned in this response; this is the
## page size used below to work out how many pages are still needed
results_per_page = len(results['businesses'])
results_per_page

20

There are over 15,000 businesses to retrieve from our API (15,500 in the output above), and we can get 20 results at a time (per "page").

-   We can calculate the # of results remaining by subtracting our offset (length of our previous results) from our total.
-   Then we can determine how many pages we will need by dividing the remaining results by 20 (or whatever the value happens to be for results per page).
-   Note that we need to round up the number of pages in order to get all of the results. Even if there is only 1 result on the last page, we want to include that page! To do this we will use `math.ceil`.


In [10]:
# time and math are already imported in the imports cell at the top of the
# notebook, so they are not imported again here.

# Results still to fetch: the reported total minus what is already saved
# (never negative, in case the file holds more than the API now reports).
remaining_results = max(results['total'] - n_results, 0)

# Round up with math.ceil so a partially filled last page is still counted.
# An empty first page would make the page size 0; treat that as "no pages
# needed" instead of dividing by zero.
n_pages = math.ceil(remaining_results / results_per_page) if results_per_page else 0
n_pages


774

In [11]:
# Merge the freshly fetched page into the saved results (in place), then
# write the combined list back to the results file
previous_results += results['businesses']
with open(JSON_FILE, 'w') as f:
    json.dump(previous_results, f)

In [12]:
import time
from tqdm.notebook import tqdm_notebook

# Fetch the remaining pages one request at a time, saving after every page so
# that progress survives an interruption.
# NOTE: this version has no upper bound on the offset; the recorded output of
# this cell shows it stopping with a YelpAPIError (limit+offset must be <= 1000).
for page in tqdm_notebook(range(1, n_pages + 1)):

    ## Reload the results file; the number of saved results is the next offset
    with open(JSON_FILE, 'r') as f:
        previous_results = json.load(f)
    n_results = len(previous_results)

    ## Request the page that starts at that offset
    results = yelp_api.search_query(location=LOCATION, term=TERM, offset=n_results)

    ## Add the new businesses and write everything back to the file
    previous_results += results['businesses']
    with open(JSON_FILE, 'w') as f:
        json.dump(previous_results, f)

    # Wait 200 ms before the next request
    time.sleep(.2)

  0%|          | 0/774 [00:00<?, ?it/s]

YelpAPIError: VALIDATION_ERROR: Too many results requested, limit+offset must be <= 1000.

In [13]:
## Delete the results file, then confirm it no longer exists (expect False).
os.remove(JSON_FILE)
os.path.isfile(JSON_FILE)

False

In [14]:
def create_json_file(JSON_FILE, delete_if_exists=False):
    """Make sure JSON_FILE exists, creating it as an empty JSON list if needed.

    Args:
        JSON_FILE (str): Path of the JSON file. It may include folders, which
            are created when they are missing.
        delete_if_exists (bool): When truthy and the file already exists, the
            file is deleted and replaced by a new one holding an empty list.
            Defaults to False, which leaves an existing file untouched.

    Returns:
        None. The function prints status messages and writes to disk.
    """
    if os.path.isfile(JSON_FILE):
        if not delete_if_exists:
            ## Keep the existing file exactly as it is
            print(f"[i] {JSON_FILE} already exists.")
            return

        print(f"[!] {JSON_FILE} already exists. Deleting previous file...")
        os.remove(JSON_FILE)

    ## From here on the file does not exist (it never did, or was just deleted)
    print(f"[i] {JSON_FILE} not found. Saving empty list to new file.")

    ## Create any folders named in the path
    folder = os.path.dirname(JSON_FILE)
    if folder:
        os.makedirs(folder, exist_ok=True)

    ## Start the file off as an empty JSON list
    with open(JSON_FILE, 'w') as f:
        json.dump([], f)


In [15]:
## Start from a fresh, empty JSON file (deleting the previous one if it exists)
create_json_file(JSON_FILE, delete_if_exists=True)

## Load previous results; their count is the offset for the API call
with open(JSON_FILE, 'r') as f:
    previous_results = json.load(f)
n_results = len(previous_results)
print(f'- {n_results} previous results found.')

# First API call, starting at that offset
results = yelp_api.search_query(location=LOCATION,
                                term=TERM,
                                offset=n_results)

## Total number of results reported, and how many came back on this page
total_results = results['total']
results_per_page = len(results['businesses'])

# Results still to fetch (never negative)
remaining_results = max(total_results - n_results, 0)

# Round up with math.ceil so a partially filled last page is still counted.
# An empty first page would make the page size 0; treat that as "no pages
# needed" instead of dividing by zero.
n_pages = math.ceil(remaining_results / results_per_page) if results_per_page else 0
n_pages

[i] Data/results_in_progress_NY_pizza.json not found. Saving empty list to new file.
- 0 previous results found.


770

In [16]:
# The API rejects any request where limit + offset would exceed this value
# ("limit+offset must be <= 1000"), so the loop must stop before that point.
MAX_RESULTS = 1000

for page in tqdm_notebook(range(1, n_pages + 1)):
    ## Reload the results file; the number of saved results is the next offset
    with open(JSON_FILE, 'r') as f:
        previous_results = json.load(f)
    n_results = len(previous_results)

    ## Stop before asking for results beyond the cap. The message text says
    ## "api calls", but the limit is on results (offset + page size).
    if (n_results + results_per_page) > MAX_RESULTS:
        print('Exceeded 1000 api calls. Stopping loop.')
        break

    ## Request the page that starts at that offset
    results = yelp_api.search_query(location=LOCATION,
                                    term=TERM,
                                    offset=n_results)
    new_businesses = results['businesses']

    ## An empty page leaves the offset unchanged, so every later request would
    ## ask for the same empty page; stop instead of spending more API calls.
    if not new_businesses:
        print('[i] No more results returned. Stopping loop.')
        break

    ## Add the new businesses and write everything back to the file
    previous_results.extend(new_businesses)
    with open(JSON_FILE, 'w') as f:
        json.dump(previous_results, f)

    # Wait 200 ms before the next request
    time.sleep(.2)

  0%|          | 0/770 [00:00<?, ?it/s]

Exceeded 1000 api calls. Stopping loop.


In [17]:
# Read the saved JSON results into a DataFrame and preview both ends of it
final_df = pd.read_json(JSON_FILE)
display(final_df.head())
display(final_df.tail())

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
0,zj8Lq1T8KIC5zwFief15jg,prince-street-pizza-new-york-2,Prince Street Pizza,https://s3-media2.fl.yelpcdn.com/bphoto/I4gm7i...,False,https://www.yelp.com/biz/prince-street-pizza-n...,4786,"[{'alias': 'pizza', 'title': 'Pizza'}, {'alias...",4.5,"{'latitude': 40.72308755605564, 'longitude': -...","[delivery, pickup]",$,"{'address1': '27 Prince St', 'address2': None,...",12129664100,(212) 966-4100,2209.311618
1,ysqgdbSrezXgVwER2kQWKA,julianas-brooklyn-3,Juliana's,https://s3-media2.fl.yelpcdn.com/bphoto/NVoLFl...,False,https://www.yelp.com/biz/julianas-brooklyn-3?a...,2607,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.5,"{'latitude': 40.70274718768062, 'longitude': -...",[delivery],$$,"{'address1': '19 Old Fulton St', 'address2': '...",17185966700,(718) 596-6700,1289.857286
2,WG639VkTjmK5dzydd1BBJA,rubirosa-new-york-2,Rubirosa,https://s3-media3.fl.yelpcdn.com/bphoto/F65qqO...,False,https://www.yelp.com/biz/rubirosa-new-york-2?a...,2917,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",4.5,"{'latitude': 40.722766, 'longitude': -73.996233}",[pickup],$$,"{'address1': '235 Mulberry St', 'address2': ''...",12129650500,(212) 965-0500,2268.49195
3,uc5qQMzs96rzjK27epDCug,joes-pizza-new-york-4,Joe's Pizza,https://s3-media4.fl.yelpcdn.com/bphoto/1TvTRG...,False,https://www.yelp.com/biz/joes-pizza-new-york-4...,2975,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.0,"{'latitude': 40.73060076, 'longitude': -74.002...",[delivery],$,"{'address1': '7 Carmine St', 'address2': '', '...",12123661182,(212) 366-1182,3263.174143
4,KEEXuHCYhsQJ-zpJhu98bA,joes-pizza-new-york-148,Joe's Pizza,https://s3-media1.fl.yelpcdn.com/bphoto/VkqlwR...,False,https://www.yelp.com/biz/joes-pizza-new-york-1...,1785,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.0,"{'latitude': 40.7547, 'longitude': -73.98696}","[delivery, pickup]",$,"{'address1': '1435 Broadway', 'address2': '', ...",16465594878,(646) 559-4878,5345.192429


Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
995,aXuBZ6tBe93MHNtEIWNDCQ,calabria-pizza-oradell-2,Calabria Pizza,https://s3-media3.fl.yelpcdn.com/bphoto/5V7YbI...,False,https://www.yelp.com/biz/calabria-pizza-oradel...,124,"[{'alias': 'pizza', 'title': 'Pizza'}, {'alias...",4.0,"{'latitude': 40.948291, 'longitude': -74.0321582}","[pickup, delivery]",$,"{'address1': '239 Kinderkamack Rd', 'address2'...",12015239228,(201) 523-9228,27200.412919
996,qs5x7OfTtpgQ4xQeN7N4Iw,saraghina-caffè-brooklyn-2,Saraghina Caffè,https://s3-media1.fl.yelpcdn.com/bphoto/a2qEDF...,False,https://www.yelp.com/biz/saraghina-caff%C3%A8-...,46,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",3.0,"{'latitude': 40.6897188, 'longitude': -73.9719...",[],,"{'address1': '195 DeKalb Ave, Brooklyn, NY 112...",17183274224,(718) 327-4224,2019.440121
997,sy9YaB1czZpyodpZom3FLw,marios-pizzeria-bronx,Mario's Pizzeria,https://s3-media2.fl.yelpcdn.com/bphoto/LuMzCY...,False,https://www.yelp.com/biz/marios-pizzeria-bronx...,26,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.0,"{'latitude': 40.88864, 'longitude': -73.83104}","[pickup, delivery]",$,"{'address1': '3824 Dyre Ave', 'address2': '', ...",17183258770,(718) 325-8770,23738.050129
998,b9Qd4jaxugk7Zo7LE_lLrg,oddfellows-ice-cream-co-brooklyn-7,OddFellows Ice Cream Co.,https://s3-media2.fl.yelpcdn.com/bphoto/mo3hBD...,False,https://www.yelp.com/biz/oddfellows-ice-cream-...,134,"[{'alias': 'icecream', 'title': 'Ice Cream & F...",4.5,"{'latitude': 40.70325596349647, 'longitude': -...","[pickup, delivery]",$$,"{'address1': '44 Water St', 'address2': '', 'a...",17186835755,(718) 683-5755,1187.666496
999,eQdjZQaLnjItmn9suEEeIA,patricias-bronx,Patricia's,https://s3-media3.fl.yelpcdn.com/bphoto/PygasA...,False,https://www.yelp.com/biz/patricias-bronx?adjus...,605,"[{'alias': 'pizza', 'title': 'Pizza'}, {'alias...",4.0,"{'latitude': 40.84912, 'longitude': -73.8533099}","[pickup, delivery]",$$,"{'address1': '1082 Morris Park Ave', 'address2...",17184099069,(718) 409-9069,19031.665993


-   Because our yelp results include columns that contain lists, we cannot check every column in dataframe for duplicates.
-   Instead, we can use the subset argument for df.duplicated() and df.drop_duplicates() to only check the id column for duplicates.

In [19]:
# Check for duplicate rows across all columns.
# NOTE: on this data the call raises "TypeError: unhashable type: 'list'"
# (see the recorded output of this cell) because some columns contain lists.
final_df.duplicated().sum()

TypeError: unhashable type: 'list'

In [20]:
# Count rows whose id repeats an earlier row (only the id column is compared)
final_df['id'].duplicated().sum()

220

In [21]:
## Keep only the first row for each id, then verify no duplicate ids remain
is_repeat_id = final_df.duplicated(subset='id')
final_df = final_df[~is_repeat_id]
final_df.duplicated(subset='id').sum()

0

In [22]:
# Write the de-duplicated results to a gzip-compressed CSV, without the index
final_df.to_csv(
    'Data/final_results_NY_pizza.csv.gz',
    index=False,
    compression='gzip',
)