In [1]:
# Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Additional Imports
import os, json, math, time

# to make yelpapi calls
from yelpapi import YelpAPI

# progress bar from tqdm_notebook
from tqdm.notebook import tqdm_notebook

In [2]:
!pip install yelpapi
!pip install tqdm



# Load Credentials and Create Yelp API Object

In [3]:
# Load API Credentials
with open('/Users/huytran_1/.secret/yelp_api.json', 'r') as f:  
    login = json.load(f)

In [4]:
# Instantiate YelpAPI Variable
yelp = YelpAPI(login['api-key'], timeout_s=5.0)

# Define Search Terms and File Paths

In [5]:
# set our API call parameters and filename before the first call
location = 'Louisiana, LA 70508'
term = 'seafood'

In [6]:
location.split(',')[0]

'Tustin'

In [7]:
## Specify folder for saving data
FOLDER = 'Data/'

os.makedirs(FOLDER, exist_ok = True)
# Specifying JSON_FILE filename (can include a folder)
JSON_FILE = FOLDER+f"{location.split(',')[0]}-{term}.json"

In [8]:
JSON_FILE

'Data/Tustin-seafood.json'

# Check if Json File exists and Create it if it doesn't

In [9]:
## Check if JSON_FILE exists
file_exists = os.path.isfile(JSON_FILE)
## If it does not exist: 
if file_exists == False:    
    ## CREATE ANY NEEDED FOLDERS
    # Get the Folder Name only
    folder = os.path.dirname(JSON_FILE)
    
    ## If JSON folder name is not empty:
    if len(folder)>0:
        # create the folder
        os.makedirs(folder, exist_ok = True)
        
        
    ## INFORM USER AND SAVE EMPTY LIST
    print(f"[i] {JSON_FILE} not found. Saving empty list to file.")
    
    
    ## save the first page of results
    with open(JSON_FILE, 'w') as f:
          json.dump([], f)
        
## If it exists, inform user
else:
    print(f"[i] {JSON_FILE} already exists.")

[i] Data/Tustin-seafood.json already exists.


In [10]:
os.path.isfile(JSON_FILE)

True

# Load JSON FIle and account for previous results

# Make the first API call to get the first page of data

In [11]:
# use our yelp_api variable's search_query method to perform our API call
results = yelp.search_query(term = term, location = location)

In [12]:
type(results)

dict

In [13]:
len(results)

3

In [14]:
results.keys()

dict_keys(['businesses', 'total', 'region'])

In [15]:
results['total']

865

In [16]:
results['region']

{'center': {'longitude': -117.82081604003906, 'latitude': 33.73717766325585}}

In [17]:
results['businesses']

[{'id': 'cvwddy324vWwesa47hO1rQ',
  'alias': 'pacific-catch-tustin',
  'name': 'Pacific Catch',
  'image_url': 'https://s3-media1.fl.yelpcdn.com/bphoto/MVvbkepIZlrCAEvwFd1vPw/o.jpg',
  'is_closed': False,
  'url': 'https://www.yelp.com/biz/pacific-catch-tustin?adjust_creative=GAXTYfb4qtwMRsCd0avxFw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=GAXTYfb4qtwMRsCd0avxFw',
  'review_count': 300,
  'categories': [{'alias': 'seafood', 'title': 'Seafood'},
   {'alias': 'sushi', 'title': 'Sushi Bars'},
   {'alias': 'tacos', 'title': 'Tacos'}],
  'rating': 4.5,
  'coordinates': {'latitude': 33.722135808376024, 'longitude': -117.7933607},
  'transactions': ['delivery', 'pickup'],
  'price': '$$',
  'location': {'address1': '3040 El Camino Real',
   'address2': '',
   'address3': None,
   'city': 'Tustin',
   'zip_code': '92782',
   'country': 'US',
   'state': 'CA',
   'display_address': ['3040 El Camino Real', 'Tustin, CA 92782']},
  'phone': '+17146130300',
  'display_ph

In [18]:
## How many results total?
pd.DataFrame(results['businesses'])

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
0,cvwddy324vWwesa47hO1rQ,pacific-catch-tustin,Pacific Catch,https://s3-media1.fl.yelpcdn.com/bphoto/MVvbke...,False,https://www.yelp.com/biz/pacific-catch-tustin?...,300,"[{'alias': 'seafood', 'title': 'Seafood'}, {'a...",4.5,"{'latitude': 33.722135808376024, 'longitude': ...","[delivery, pickup]",$$,"{'address1': '3040 El Camino Real', 'address2'...",17146130300,(714) 613-0300,3040.390115
1,vxFKFgCf39qb_MeJBn_11Q,fresh-off-the-boat-fish-grill-tustin,Fresh Off The Boat Fish Grill,https://s3-media3.fl.yelpcdn.com/bphoto/kTOXke...,False,https://www.yelp.com/biz/fresh-off-the-boat-fi...,1222,"[{'alias': 'mediterranean', 'title': 'Mediterr...",4.5,"{'latitude': 33.7593034, 'longitude': -117.825...","[delivery, pickup]",$$,"{'address1': '17582 E 17th St', 'address2': 'S...",17145441900,(714) 544-1900,2498.256262
2,P6UaGMOGv8i0hTEXHEbUhQ,the-black-marlin-tustin,The Black Marlin,https://s3-media3.fl.yelpcdn.com/bphoto/yb-8kg...,False,https://www.yelp.com/biz/the-black-marlin-tust...,910,"[{'alias': 'seafood', 'title': 'Seafood'}, {'a...",4.0,"{'latitude': 33.7402553015911, 'longitude': -1...",[delivery],$$,"{'address1': '560 El Camino Real', 'address2':...",17149297427,(714) 929-7427,449.561824
3,3S6lKzpyZnLGRQA68pgzfA,long-hai-restaurant-tustin,Long Hai Restaurant,https://s3-media2.fl.yelpcdn.com/bphoto/AtGAAn...,False,https://www.yelp.com/biz/long-hai-restaurant-t...,397,"[{'alias': 'vietnamese', 'title': 'Vietnamese'...",4.0,"{'latitude': 33.7384103693369, 'longitude': -1...",[pickup],$$,"{'address1': '682 El Camino Real', 'address2':...",17148388118,(714) 838-8118,397.716714
4,RWAugaLWYZpoO2iUDZ40Xw,seven-grams-tustin,Seven Grams,https://s3-media1.fl.yelpcdn.com/bphoto/SVkO6I...,False,https://www.yelp.com/biz/seven-grams-tustin?ad...,375,"[{'alias': 'chinese', 'title': 'Chinese'}]",4.5,"{'latitude': 33.747672, 'longitude': -117.81093}","[pickup, delivery]",$$,"{'address1': '1086 Irvine Blvd', 'address2': N...",16576008212,(657) 600-8212,1483.58136
5,ZsNsQchXY0SG5puyDSnqgA,izakaya-habuya-okinawan-dining-tustin-5,Izakaya Habuya Okinawan Dining,https://s3-media3.fl.yelpcdn.com/bphoto/Q7w4rp...,False,https://www.yelp.com/biz/izakaya-habuya-okinaw...,825,"[{'alias': 'noodles', 'title': 'Noodles'}, {'a...",4.5,"{'latitude': 33.731655, 'longitude': -117.8184...",[],$$,"{'address1': '14215 Red Hill Ave', 'address2':...",17146797453,(714) 679-7453,652.697144
6,b6lMxCjNMS1rA9P5USG68w,water-grill-costa-mesa-2,Water Grill,https://s3-media3.fl.yelpcdn.com/bphoto/CDBAR7...,False,https://www.yelp.com/biz/water-grill-costa-mes...,3383,"[{'alias': 'seafood', 'title': 'Seafood'}]",4.5,"{'latitude': 33.6910598430702, 'longitude': -1...",[delivery],$$$,"{'address1': '3300 Bristol S', 'address2': '',...",19492087060,(949) 208-7060,7848.818769
7,T3X5GoQf6gVTxJlLsPo7uQ,roast-fish-and-bbq-house-tustin,Roast Fish & BBQ House,https://s3-media4.fl.yelpcdn.com/bphoto/T58R3J...,False,https://www.yelp.com/biz/roast-fish-and-bbq-ho...,9,"[{'alias': 'bbq', 'title': 'Barbeque'}, {'alia...",3.5,"{'latitude': 33.73147677226516, 'longitude': -...","[delivery, pickup]",,"{'address1': '1046 Walnut Ave', 'address2': No...",17144030628,(714) 403-0628,881.084842
8,9U7XQBn8F2CxxRQpWQyk_g,the-crab-cooker-tustin,The Crab Cooker,https://s3-media3.fl.yelpcdn.com/bphoto/cvxMBg...,False,https://www.yelp.com/biz/the-crab-cooker-tusti...,624,"[{'alias': 'seafood', 'title': 'Seafood'}, {'a...",3.5,"{'latitude': 33.7595939, 'longitude': -117.828...","[delivery, pickup]",$$,"{'address1': '17260 17th St', 'address2': '', ...",17145731077,(714) 573-1077,2600.528512
9,wpqWZ7YtiW5Rf7WZ2jG-QA,kings-fish-house-orange-orange,King's Fish House - Orange,https://s3-media4.fl.yelpcdn.com/bphoto/vrL1lN...,False,https://www.yelp.com/biz/kings-fish-house-oran...,1704,"[{'alias': 'seafood', 'title': 'Seafood'}]",4.0,"{'latitude': 33.808768562341, 'longitude': -11...",[delivery],$$,"{'address1': '1521 W Katella Ave', 'address2':...",17147716655,(714) 771-6655,9112.554137


- Where is the actual data we want to save?

In [19]:
## How many did we get the details for?
results_per_page = len(results['businesses'])
results_per_page

20

- Calculate how many pages of results needed to cover the total_results

In [20]:
(results['total'])/ results_per_page

43.25

In [21]:
# Use math.ceil to round up for the total number of pages of results.
n_pages = math.ceil((results['total'])/ results_per_page)
n_pages

44

In [22]:
for i in tqdm_notebook(range(1,n_pages+1)):
    ## The block of code we want to TRY to run
    try:
        
        time.sleep(.2)
        
        ## Read in results in progress file and check the length
        with open(JSON_FILE, 'r') as f:
            previous_results = json.load(f)
        
        ## save number of results to use as offset
        n_results = len(previous_results)
        
        
        ## use n_results as the OFFSET 
        results = yelp.search_query(location = location, term = term,
                                   offset = n_results+1)

        ## append new results and save to file
        previous_results.extend(results['businesses'])
        
        with open(JSON_FILE, 'w') as f:
            json.dump(previous_results, f)

            
    ## What to do if we get an error/exception.
    except Exception as e:
        print(' [!] ERROR', e)

  0%|          | 0/44 [00:00<?, ?it/s]

# Open the Final JSON File with Pandas

In [23]:
df = pd.read_json(JSON_FILE)

In [24]:
df.head()

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
0,vxFKFgCf39qb_MeJBn_11Q,fresh-off-the-boat-fish-grill-tustin,Fresh Off The Boat Fish Grill,https://s3-media3.fl.yelpcdn.com/bphoto/kTOXke...,False,https://www.yelp.com/biz/fresh-off-the-boat-fi...,1197,"[{'alias': 'mediterranean', 'title': 'Mediterr...",4.5,"{'latitude': 33.7593034, 'longitude': -117.825...","[delivery, pickup]",$$,"{'address1': '17582 E 17th St', 'address2': 'S...",17145441900,(714) 544-1900,2498.256262
1,P6UaGMOGv8i0hTEXHEbUhQ,the-black-marlin-tustin,The Black Marlin,https://s3-media3.fl.yelpcdn.com/bphoto/yb-8kg...,False,https://www.yelp.com/biz/the-black-marlin-tust...,888,"[{'alias': 'seafood', 'title': 'Seafood'}, {'a...",4.0,"{'latitude': 33.7402553015911, 'longitude': -1...",[delivery],$$,"{'address1': '560 El Camino Real', 'address2':...",17149297427,(714) 929-7427,449.561824
2,RWAugaLWYZpoO2iUDZ40Xw,seven-grams-tustin,Seven Grams,https://s3-media1.fl.yelpcdn.com/bphoto/SVkO6I...,False,https://www.yelp.com/biz/seven-grams-tustin?ad...,286,"[{'alias': 'chinese', 'title': 'Chinese'}]",4.5,"{'latitude': 33.747672, 'longitude': -117.81093}","[delivery, pickup]",$$,"{'address1': '1086 Irvine Blvd', 'address2': N...",16576008212,(657) 600-8212,1483.58136
3,3S6lKzpyZnLGRQA68pgzfA,long-hai-restaurant-tustin,Long Hai Restaurant,https://s3-media2.fl.yelpcdn.com/bphoto/AtGAAn...,False,https://www.yelp.com/biz/long-hai-restaurant-t...,394,"[{'alias': 'vietnamese', 'title': 'Vietnamese'...",4.0,"{'latitude': 33.7384103693369, 'longitude': -1...",[pickup],$$,"{'address1': '682 El Camino Real', 'address2':...",17148388118,(714) 838-8118,397.716714
4,T3X5GoQf6gVTxJlLsPo7uQ,roast-fish-and-bbq-house-tustin,Roast Fish & BBQ House,https://s3-media4.fl.yelpcdn.com/bphoto/T58R3J...,False,https://www.yelp.com/biz/roast-fish-and-bbq-ho...,6,"[{'alias': 'bbq', 'title': 'Barbeque'}, {'alia...",4.0,"{'latitude': 33.73147677226516, 'longitude': -...","[delivery, pickup]",,"{'address1': '1046 Walnut Ave', 'address2': No...",17144030628,(714) 403-0628,881.084842


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 864 entries, 0 to 863
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             864 non-null    object 
 1   alias          864 non-null    object 
 2   name           864 non-null    object 
 3   image_url      864 non-null    object 
 4   is_closed      864 non-null    bool   
 5   url            864 non-null    object 
 6   review_count   864 non-null    int64  
 7   categories     864 non-null    object 
 8   rating         864 non-null    float64
 9   coordinates    864 non-null    object 
 10  transactions   864 non-null    object 
 11  price          779 non-null    object 
 12  location       864 non-null    object 
 13  phone          864 non-null    object 
 14  display_phone  864 non-null    object 
 15  distance       864 non-null    float64
dtypes: bool(1), float64(2), int64(1), object(12)
memory usage: 102.2+ KB


# Check for duplicates

In [26]:
# check for duplicate ID's 
df.duplicated(subset='id').sum()

15

In [27]:
## convert the filename to a .csv.gz
csv_file = JSON_FILE.replace('.json','.csv.gz')
csv_file

'Data/Tustin-seafood.csv.gz'

In [28]:
## Save it as a compressed csv (to save space)
df.to_csv(csv_file, compression = 'gzip', index = False)

# Compare filesize with os module's os.path.getsize

In [29]:
size_json = os.path.getsize(JSON_FILE)
size_csv_gz = os.path.getsize(JSON_FILE.replace('.json','.csv.gz'))

print(f'JSON FILE: {size_json:,} Bytes')
print(f'CSV.GZ FILE: {size_csv_gz:,} Bytes')

print(f'the csv.gz is {size_json/size_csv_gz} times smaller!')

JSON FILE: 851,230 Bytes
CSV.GZ FILE: 123,527 Bytes
the csv.gz is 6.891044063241234 times smaller!
