# Efficient API Extraction Method to Obtain Data from Yelp

## Objective

- Using the Yelp API to search my favorite city for a cuisine type of my choice.

- Extract all of the results from the search and compile them into one dataframe

- Loop through a list of queries and save the results throughout the loop

- Use the tqdm lib to make a progress bar to track the time remaining in a loop

In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#additional imports
import os, json, math, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

In [2]:
## delete file and confirm it no longer exists.
#os.remove(JSON_FILE)
#os.path.isfile(JSON_FILE)

In [3]:
def create_json_file(JSON_FILE, delete_if_exists=False):
    """Make sure JSON_FILE exists, starting it as an empty JSON list when needed.

    Args:
        JSON_FILE: Path of the JSON file. It may include a folder part;
            missing folders are created.
        delete_if_exists: When true, an existing file is deleted and
            recreated as an empty list. When false (default), an existing
            file is left untouched.

    Returns:
        None. The function prints what it did.
    """
    if os.path.isfile(JSON_FILE):
        ## Keep the existing file unless the user asked for a fresh start
        if not delete_if_exists:
            print(f"[i] {JSON_FILE} already exists.")
            return

        print(f"[!] {JSON_FILE} already exists. Deleting previous file...")
        ## After removal, fall through to the creation steps below
        ## (no recursive call needed)
        os.remove(JSON_FILE)

    ## INFORM USER AND SAVE EMPTY LIST
    print(f"[i] {JSON_FILE} not found. Saving empty list to new file.")

    ## CREATE ANY NEEDED FOLDERS
    # dirname is '' when JSON_FILE is a bare file name
    folder = os.path.dirname(JSON_FILE)
    if folder:
        os.makedirs(folder, exist_ok=True)

    ## Save empty list to start the json file
    with open(JSON_FILE, 'w') as f:
        json.dump([], f)


## Load credentials and create yelp api object 

In [4]:
# Load API credentials from the current user's home folder
# (~/.secret/yelp_api.json) so the notebook is not tied to one machine's
# absolute path.
credentials_path = os.path.join(os.path.expanduser('~'), '.secret', 'yelp_api.json')
with open(credentials_path) as f:
    login = json.load(f)
# Show which keys the credentials file provides
login.keys()

dict_keys(['client-id', 'api-key'])

In [5]:
# Create the Yelp API client from the stored API key (timeout_s of 5.0)
yelp_api = YelpAPI(
    login["api-key"],
    timeout_s=5.0,
)
yelp_api

<yelpapi.yelpapi.YelpAPI at 0x120f829d0>

## Define search and file path

In [6]:
# Search parameters passed to the API later on: the place to search in
# and the term to search for
location, term = 'Chicago, IL', 'Italian'

In [7]:
# Folder that holds the downloaded data (created when missing)
FOLDER = 'Data/'
os.makedirs(FOLDER, exist_ok=True)

# Build the JSON file name programmatically from the city part of
# `location` (the text before the first comma) and the search term
JSON_FILE = os.path.join(
    FOLDER,
    f"results_in_progress_{location.split(',')[0]}_{term}.json",
)
JSON_FILE

'Data/results_in_progress_Chicago_Italian.json'

## Check if JSON file exists

In [8]:
## Is JSON_FILE already on disk?
file_exists = os.path.isfile(JSON_FILE)

if file_exists:
    # Nothing to create; just tell the user
    print(f"[i] {JSON_FILE} already exists.")
else:
    ## Make sure the parent folder (if the path has one) is there
    folder = os.path.dirname(JSON_FILE)
    if folder:
        os.makedirs(folder, exist_ok=True)

    ## Tell the user, then start the file off as an empty list
    print(f'[i] {JSON_FILE} not found. Saving empty list to file.')
    with open(JSON_FILE, 'w') as f:
        json.dump([], f)

[i] Data/results_in_progress_Chicago_Italian.json already exists.


## Determine how many results are already in the file


In [9]:
## Start from a fresh, empty JSON file (a previous file is deleted if it exists)
create_json_file(JSON_FILE, delete_if_exists=True)

## Read back whatever the file currently holds
with open(JSON_FILE) as f:
    previous_results = json.load(f)

## The number of stored results is used as the offset of the next request
n_results = len(previous_results)
print(f'- {n_results} previous results found.')

[!] Data/results_in_progress_Chicago_Italian.json already exists. Deleting previous file...
[i] Data/results_in_progress_Chicago_Italian.json not found. Saving empty list to new file.
- 0 previous results found.


## Figure out how many pages of results we will need

- API will return results by pages
- Let's perform a query to get the first page of results and the total # of results
- Then we will calculate how many pages are needed to retrieve all results

In [10]:
# Request the first page of results, starting after the results already saved
results = yelp_api.search_query(location=location,
                                term=term,
                                offset=n_results)
## How many results total?
total_results = results['total']
## How many did we get the details for?
results_per_page = len(results['businesses'])

# Round up so a final, partially filled page is still requested.
# An empty first page would make the division raise ZeroDivisionError,
# so in that case there is simply nothing left to fetch.
if results_per_page > 0:
    n_pages = math.ceil((total_results - n_results) / results_per_page)
else:
    n_pages = 0
n_pages

230

## For Loop to call each page

I encountered an error message: "YelpAPIError: VALIDATION_ERROR: Too many results requested, limit+offset must be <= 1000."
This is a limitation of the free tier of Yelp's API: we asked for too many results, and a search can only return <= 1,000 results.

So I wrote a function that programmatically deletes the old JSON file (if one exists) and creates a new, empty JSON file.

After that, an updated progress bar was set up.



In [11]:
# Request one page per iteration, saving after every page so that the
# results collected so far are always on disk.
for i in tqdm_notebook(range(1, n_pages + 1)):

    ## Read in results in progress file and check the length
    with open(JSON_FILE, 'r') as f:
        previous_results = json.load(f)
    ## the number of saved results is the offset of the next request
    n_results = len(previous_results)

    ## The API rejects requests where limit+offset is above 1000 results,
    ## so stop before asking for a page that would cross that line
    if (n_results + results_per_page) > 1000:
        print('Reached the 1000-result limit of the Yelp API. Stopping loop.')
        break

    ## use n_results as the OFFSET
    results = yelp_api.search_query(location=location,
                                    term=term,
                                    offset=n_results)

    ## An empty page adds nothing, so the offset would never advance and
    ## every remaining iteration would repeat this same request
    if not results['businesses']:
        print('No more results returned. Stopping loop.')
        break

    ## append new results and save to file
    previous_results.extend(results['businesses'])
    with open(JSON_FILE, 'w') as f:
        json.dump(previous_results, f)

    ## brief pause before the next request
    time.sleep(.2)

  0%|          | 0/230 [00:00<?, ?it/s]

Exceeded 1000 api calls. Stopping loop.


## Open the Final JSON File with Pandas

In [16]:
# Load the saved results into a DataFrame
df = pd.read_json(JSON_FILE)
# DataFrame.info() prints its summary itself and returns None, so wrapping
# it in print() only adds a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             1000 non-null   object 
 1   alias          1000 non-null   object 
 2   name           1000 non-null   object 
 3   image_url      1000 non-null   object 
 4   is_closed      1000 non-null   bool   
 5   url            1000 non-null   object 
 6   review_count   1000 non-null   int64  
 7   categories     1000 non-null   object 
 8   rating         1000 non-null   float64
 9   coordinates    1000 non-null   object 
 10  transactions   1000 non-null   object 
 11  price          828 non-null    object 
 12  location       1000 non-null   object 
 13  phone          1000 non-null   object 
 14  display_phone  1000 non-null   object 
 15  distance       1000 non-null   float64
dtypes: bool(1), float64(2), int64(1), object(12)
memory usage: 118.3+ KB
None


Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
0,LYppbvgJlBG0SqjSKFiFGg,sapori-trattoria-chicago,Sapori Trattoria,https://s3-media4.fl.yelpcdn.com/bphoto/AicJhq...,False,https://www.yelp.com/biz/sapori-trattoria-chic...,2733,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",4.5,"{'latitude': 41.93165443680187, 'longitude': -...","[pickup, delivery]",$$,"{'address1': '2701 N Halsted St', 'address2': ...",17738329999.0,(773) 832-9999,13017.515172
1,7vsOVA4wrHP6f3DMQdD8og,volare-ristorante-italiano-chicago,Volare Ristorante Italiano,https://s3-media2.fl.yelpcdn.com/bphoto/wxlYbK...,False,https://www.yelp.com/biz/volare-ristorante-ita...,2626,"[{'alias': 'italian', 'title': 'Italian'}]",4.0,"{'latitude': 41.8915901798304, 'longitude': -8...","[restaurant_reservation, pickup, delivery]",$$,"{'address1': '201 E Grand Ave', 'address2': ''...",13124109900.0,(312) 410-9900,11184.586803
2,imArj2D-DaXLHAeQlnrzJg,ciao-ragazzi-restaurant-and-bar-chicago,Ciao Ragazzi Restaurant & Bar,https://s3-media3.fl.yelpcdn.com/bphoto/zmABRG...,False,https://www.yelp.com/biz/ciao-ragazzi-restaura...,49,"[{'alias': 'italian', 'title': 'Italian'}]",4.5,"{'latitude': 41.79295188433059, 'longitude': -...",[],,"{'address1': '5440 S Narragansett Ave', 'addre...",17733060093.0,(773) 306-0093,6051.110816
3,GZsrGq6H8CQ4YlGtE_Bm0Q,ciccio-mio-chicago-2,Ciccio Mio,https://s3-media2.fl.yelpcdn.com/bphoto/mXb0Wh...,False,https://www.yelp.com/biz/ciccio-mio-chicago-2?...,354,"[{'alias': 'italian', 'title': 'Italian'}]",4.5,"{'latitude': 41.88939, 'longitude': -87.63524}",[delivery],$$$,"{'address1': '226 W Kinzie St', 'address2': ''...",,,10192.910226
4,IX34m-olKrlEeeEIXEPh9g,francos-ristorante-chicago,Franco's Ristorante,https://s3-media3.fl.yelpcdn.com/bphoto/s1igyi...,False,https://www.yelp.com/biz/francos-ristorante-ch...,305,"[{'alias': 'italian', 'title': 'Italian'}]",4.5,"{'latitude': 41.83843, 'longitude': -87.6343}","[pickup, delivery]",$$,"{'address1': '300 W 31st St', 'address2': '', ...",13122259566.0,(312) 225-9566,8064.668317


In [19]:
# Count the rows whose business id already appeared in an earlier row
df['id'].duplicated().sum()

0

In [20]:
## Swap only the file extension for .csv.gz.
## str.replace would also rewrite a '.json' appearing elsewhere in the path,
## and would return the JSON path unchanged when the suffix is missing, so
## the CSV export would then overwrite the JSON file.
csv_file = os.path.splitext(JSON_FILE)[0] + '.csv.gz'
csv_file

'Data/results_in_progress_Chicago_Italian.csv.gz'

In [21]:
## Write the DataFrame as a gzip-compressed csv (to save space), without the index
df.to_csv(
    csv_file,
    index=False,
    compression='gzip',
)

## Compare file size

In [22]:
# Size on disk of both files, in bytes
size_json = os.path.getsize(JSON_FILE)
# Measure the file that was actually written (csv_file) rather than
# re-deriving its name from JSON_FILE
size_csv_gz = os.path.getsize(csv_file)

print(f'JSON FILE: {size_json:,} Bytes')
print(f'CSV.GZ FILE: {size_csv_gz:,} Bytes')

# Ratio of the two sizes: how many times the json size exceeds the csv.gz size
print(f'the csv.gz is {size_json/size_csv_gz} times smaller!')

JSON FILE: 981,136 Bytes
CSV.GZ FILE: 149,945 Bytes
the csv.gz is 6.543305878822235 times smaller!
