# Yelp Attractions

**[Work in progress]**

This notebook queries the Yelp API for attractions in the San Diego area and writes two .csv files (businesses and their reviews) for ingestion into the Knowledge Graph.

In [352]:
import os
import json
import pandas as pd

from pathlib import Path
from yelpapi import YelpAPI
from pprint import pprint

In [268]:
# Read the Yelp API key from a local file and create the API client.
# The key is stripped because a key file saved with a trailing newline (or
# stray spaces) would otherwise hand that whitespace to YelpAPI as part of
# the credential.
with open('key.data', 'r') as key:
    api_key = key.read().strip()

yelp_api = YelpAPI(api_key)

In [269]:
def float_range(start, stop, step):
    """Yield floats from ``start`` up to, but not including, ``stop``.

    A floating-point counterpart of ``range``. Every yielded value (and the
    comparison against ``stop``) is rounded to 6 decimal places so that the
    error accumulated by repeated addition does not leak into the output.

    Args:
        start: first value to yield.
        stop: exclusive upper bound.
        step: positive increment between consecutive values.

    Yields:
        float: the next value of the sequence.

    Raises:
        ValueError: if ``step`` is not positive. Such a step can never reach
            ``stop`` and would otherwise loop forever. Because this is a
            generator, the error surfaces on the first iteration.
    """
    if step <= 0:
        raise ValueError('step must be positive, got {}'.format(step))

    while round(start, 6) < stop:
        yield float(round(start, 6))
        start += step

In [283]:
# Search term sent with every Yelp business search
term = 'attraction'

# Latitude sweep of the search grid.
# 1 degree: approximately 111 km, so a step of 0.018 degree is roughly 2 km.
# (The trailing commented values look like bounds used in earlier runs.)
start_latitude = 32.550 # 32.670 # 32.710
stop_latitude = 33.100 # 32.770 # 32.750
step_latitude = 0.018

# Longitude sweep of the search grid.
# 1 degree: approximately 93 km, so a step of 0.020 degree is roughly 1.9 km.
start_longitude = -117.260 # -117.260 # -117.175
stop_longitude = -116.900 # -117.100 # -117.140
step_longitude = 0.020

# Search radius in metres around every grid point (1000 m = 1 km)
radius = 1000

# Yelp API paging: `limit` is the page size requested per call.
# NOTE(review): `offset` is not referenced by any visible cell -- scrap_yelp
# tracks its own offset -- confirm whether it is still needed.
offset = 0
limit = 50

In [284]:
# Build the list of (latitude, longitude) search centres as two passes over
# the bounding box. The second pass starts a fraction of a step further along
# both axes, so its points fall in between the points of the first pass.
zone = []
pass_origins = [
    (start_latitude, start_longitude),
    (start_latitude + step_latitude/1.45, start_longitude + step_longitude/2),
]
for lat_origin, long_origin in pass_origins:
    for lat in float_range(lat_origin, stop_latitude, step_latitude * 1.3):
        for long in float_range(long_origin, stop_longitude, step_longitude):
            zone.append((lat, long))

print('Number of coordinates: {}'.format(len(zone)))

Number of coordinates: 846


In [285]:
import folium
import random as rnd

# Centre of the sampled area (one step short of the stop bounds on each axis),
# used as the initial map location here and in the later business map.
center_latitude = (start_latitude + stop_latitude - step_latitude) / 2
center_longitude = (start_longitude + stop_longitude - step_longitude) / 2
zone_center = [round(center_latitude, 3), round(center_longitude, 3)]
print(zone_center)

# Build map
map_zone = folium.Map(location=zone_center, zoom_start=10, tiles='cartodbpositron', width=800, height=600)

# Rectangle drawn from the full list of grid points
search_area = folium.Rectangle(zone, color='#0080bb', fill_color='#0080bb')
search_area.add_to(map_zone)

# For every search centre: a small dot at the centre and a circle showing the
# search radius around it
for point in zone:
    center_dot = folium.CircleMarker(point, radius=1, color='#FF0000', fill_color='#0080bb')
    center_dot.add_to(map_zone)
    coverage = folium.Circle(point, radius=radius, color='#FF0000', fill_color='#0080bb')
    coverage.add_to(map_zone)

map_zone

[32.816, -117.09]


In [286]:
def scrap_yelp(df_combined, latitude, longitude, radius,
               client=None, search_term=None, page_size=None, max_pages=20):
    """Page through the Yelp business search for one location.

    Requests consecutive result pages around ``(latitude, longitude)`` and
    appends the flattened businesses to ``df_combined``.

    Paging stops as soon as one of the following holds:
      * the number of businesses received reaches the ``total`` reported by
        the API,
      * a page comes back empty (the reported ``total`` can be larger than
        what the API actually returns; asking again would only spend quota),
      * ``max_pages`` pages were requested (20 pages of 50 results is the
        1000-business ceiling).

    Args:
        df_combined: DataFrame of businesses collected so far, or ``None``.
            It is not modified.
        latitude: latitude of the search centre.
        longitude: longitude of the search centre.
        radius: search radius passed to the API.
        client: object exposing ``search_query(**kwargs)``; defaults to the
            notebook-level ``yelp_api``.
        search_term: search term; defaults to the notebook-level ``term``.
        page_size: results requested per page; defaults to the notebook-level
            ``limit``.
        max_pages: upper bound on the number of requests for this location.

    Returns:
        A new DataFrame holding the rows of ``df_combined`` followed by the
        businesses found here, with a fresh 0..n-1 index. Empty if there was
        nothing to combine.
    """
    # Fall back to the notebook-level settings only when not overridden, so
    # existing four-argument calls behave as before.
    if client is None:
        client = yelp_api
    if search_term is None:
        search_term = term
    if page_size is None:
        page_size = limit

    # Collect the pages and concatenate once at the end instead of rebuilding
    # the combined frame after every page.
    frames = [] if df_combined is None else [df_combined]
    scrapped = 0
    for _ in range(max_pages):
        # Log the offset that is actually sent with the request
        print('\tScrapping offset: {}'.format(scrapped))
        response = client.search_query(term=search_term, latitude=latitude, longitude=longitude,
                                       limit=page_size, radius=radius, offset=scrapped)
        businesses = response['businesses']

        if not businesses:
            print('Stopped scrapping: empty page at offset {} (reported total: {})'.format(
                scrapped, response['total']))
            break

        frames.append(pd.json_normalize(businesses))
        scrapped += len(businesses)

        # Nothing more to fetch from this location
        if response['total'] <= scrapped:
            print('Completed scrapping. Total: {}'.format(response['total']))
            break

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, sort=False, ignore_index=True)

In [294]:
# Query every grid point on its own and combine the results once at the end.
# Passing the ever-growing frame through scrap_yelp would re-copy every row
# collected so far on each call.
location_frames = []
for latitude, longitude in zone:
    print('Scrapping coordinate: ({}, {}). Radius: {}'.format(latitude, longitude, radius))
    location_frame = scrap_yelp(None, latitude, longitude, radius)
    # Locations without any result contribute nothing
    if not location_frame.empty:
        location_frames.append(location_frame)

if location_frames:
    df_business = pd.concat(location_frames, sort=False, ignore_index=True)
else:
    df_business = pd.DataFrame()

done


In [293]:
# Row count before de-duplication (the same business can be returned for more
# than one grid point)
len(df_business)

5697

In [298]:
# Keep the first occurrence of every Yelp business id
df_business = df_business.drop_duplicates(subset=['id'])

In [299]:
# Number of distinct businesses left after de-duplication
len(df_business)

1790

In [301]:
# json_normalize flattens nested fields into dotted column names; swap the dot
# for an underscore on the coordinate and location columns listed here.
dotted_columns = [
    'coordinates.latitude',
    'coordinates.longitude',
    'location.address1',
    'location.address2',
    'location.address3',
    'location.city',
    'location.zip_code',
    'location.country',
    'location.state',
    'location.display_address',
]
columns = {dotted: dotted.replace('.', '_') for dotted in dotted_columns}
df_business_final = df_business.rename(columns=columns)

In [302]:
# Preview the first rows to check the renamed columns
df_business_final.head()

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,transactions,...,coordinates_longitude,location_address1,location_address2,location_address3,location_city,location_zip_code,location_country,location_state,location_display_address,price
0,Hilw8GTgqPkTw5Dyt_KwTg,border-field-state-park-san-diego-2,Border Field State Park,https://s3-media1.fl.yelpcdn.com/bphoto/cFlik2...,False,https://www.yelp.com/biz/border-field-state-pa...,53,"[{'alias': 'parks', 'title': 'Parks'}]",4.0,[],...,-117.097078,1500 Monument Rd,,,San Diego,92154,US,CA,"[1500 Monument Rd, San Diego, CA 92154]",
1,xTuelVRXKTFMoUmZ0oNp-A,disco-s-paddle-surf-san-diego,Disco’s Paddle Surf,https://s3-media1.fl.yelpcdn.com/bphoto/_xLg1E...,False,https://www.yelp.com/biz/disco-s-paddle-surf-s...,123,"[{'alias': 'paddleboarding', 'title': 'Paddleb...",5.0,[],...,-117.224951,5000 N Harbor Dr,Ste 100,,San Diego,92106,US,CA,"[5000 N Harbor Dr, Ste 100, San Diego, CA 92106]",$$
2,gCgRSRKAoFyT6BS5NHpINw,las-americas-premium-outlets-san-diego,Las Americas Premium Outlets,https://s3-media3.fl.yelpcdn.com/bphoto/QAX80y...,False,https://www.yelp.com/biz/las-americas-premium-...,526,"[{'alias': 'shoppingcenters', 'title': 'Shoppi...",4.0,[],...,-117.041131,4211 Camino De La Plz,,,San Diego,92173,US,CA,"[4211 Camino De La Plz, San Diego, CA 92173]",$$
3,EH_ym2Mm-lpeFCX4jOseWw,under-armour-factory-house-las-americas-san-ys...,Under Armour Factory House - Las Americas,https://s3-media3.fl.yelpcdn.com/bphoto/0Lt6xu...,False,https://www.yelp.com/biz/under-armour-factory-...,20,"[{'alias': 'sportswear', 'title': 'Sports Wear...",4.0,[],...,-117.04304,4191 Camino de La Plaza,,,San Ysidro,92173,US,CA,"[4191 Camino de La Plaza, San Ysidro, CA 92173]",$$
4,GD0pEcp8rFl4CN1cqIsJ2g,old-navy-san-ysidro,Old Navy,https://s3-media4.fl.yelpcdn.com/bphoto/qvVQt-...,False,https://www.yelp.com/biz/old-navy-san-ysidro?a...,24,"[{'alias': 'womenscloth', 'title': 'Women's Cl...",2.5,[],...,-117.042416,4201 Camino De La Plz,,,San Ysidro,92173,US,CA,"[4201 Camino De La Plz, San Ysidro, CA 92173]",$$


In [304]:
# Hand the path to pandas instead of a manually opened text handle. pandas
# then opens the file itself with the newline handling the csv writer needs
# (a handle opened without newline='' yields blank rows on Windows) and writes
# UTF-8 instead of the platform's default encoding.
df_business_final.to_csv('./../../data/yelp_attraction.csv', index=False)

In [305]:
# Read the exported file back; the following cells work from this copy
df_csv = pd.read_csv('./../../data/yelp_attraction.csv')

In [306]:
# Keep only the two coordinate columns and drop rows where either is missing
df_coordinates = df_csv[['coordinates_latitude', 'coordinates_longitude']].dropna()

In [307]:
# Build map: one small marker per business that has coordinates
map_zone = folium.Map(location=zone_center, zoom_start=10, tiles='cartodbpositron', width=800, height=600)

# itertuples(index=False, name=None) yields plain (latitude, longitude) tuples
# in column order. This replaces `df.iloc[i][0]`, a positional lookup on a
# label-indexed Series that pandas 2.x deprecates, and avoids building a
# Series for every row.
for business_latitude, business_longitude in df_coordinates.itertuples(index=False, name=None):
    folium.CircleMarker((business_latitude, business_longitude),
                        radius=1, color='#FF0000', fill_color='#0080bb').add_to(map_zone)
map_zone

In [338]:
# Number of businesses read back from the CSV
len(df_csv)

1790

In [339]:
# Download the reviews of every business, resumably.
#
# `df_review` survives in a live kernel when an earlier run of this cell was
# interrupted; on a fresh kernel it does not exist yet, so start from nothing
# instead of failing with a NameError.
try:
    df_review
except NameError:
    df_review = None

has_previous = df_review is not None and 'business_id' in df_review
review_frames = [df_review] if has_previous else []
# Businesses whose reviews are already collected are skipped. This replaces a
# hard-coded "resume after this business id" marker, which silently dropped
# every earlier business when the notebook was run from scratch.
fetched_ids = set(df_review['business_id']) if has_previous else set()

try:
    for i, business_id in enumerate(df_csv['id']):
        if i % 100 == 0:
            print('Processing business #{}'.format(i + 1))
        if business_id in fetched_ids:
            continue

        response = yelp_api.reviews_query(id=business_id)
        df = pd.json_normalize(response['reviews'])
        if df.empty:
            # No reviews for this business: nothing to add
            continue
        df.insert(0, 'business_id', business_id)
        review_frames.append(df)
finally:
    # Runs on success and on failure alike, so reviews fetched before an API
    # error are kept and re-running this cell continues where it stopped.
    # Concatenating once here also avoids re-copying all rows per business.
    if review_frames:
        df_review = pd.concat(review_frames, sort=False, ignore_index=True)
    else:
        df_review = pd.DataFrame()

Processing business #1
Processing business #101
Processing business #201
Processing business #301
Processing business #401
Processing business #501
Processing business #601
Processing business #701
Processing business #801
Processing business #901
Processing business #1001
Processing business #1101
Processing business #1201
Processing business #1301
Processing business #1401
Processing business #1501
Processing business #1601
Processing business #1701


In [350]:
# Total number of review rows collected
len(df_review)

5322

In [351]:
# Inspect the last rows collected.
# NOTE(review): the rendered output contains reviewer names and profile URLs;
# consider clearing it before sharing the notebook.
df_review.tail(5)

Unnamed: 0,business_id,id,url,text,rating,time_created,user.id,user.profile_url,user.image_url,user.name
5320,NZ1Tc5327OiVNq5gf3pz_w,38A9a-3FJFj_BrS8a4st4Q,https://www.yelp.com/biz/kit-carson-amphitheat...,Does anyone know when Helen Reddy performed he...,4,2020-09-29 21:55:38,N0cHox3p9YJqhf3LHQEYug,https://www.yelp.com/user_details?userid=N0cHo...,,Stuart T.
5321,NZ1Tc5327OiVNq5gf3pz_w,uE1LxAdMRtG1OuSUErRefQ,https://www.yelp.com/biz/kit-carson-amphitheat...,We had a good time when we attended Jesus Chri...,4,2018-08-01 19:02:21,CqTSy-8kS7A18KJJz9nigg,https://www.yelp.com/user_details?userid=CqTSy...,https://s3-media3.fl.yelpcdn.com/photo/wkxlEAg...,Alexandra B.
5322,Of9u4pxVO4jR_OczKYUeog,H_gCTGfH3mbsOAeD0dQK2Q,https://www.yelp.com/biz/kit-carson-park-escon...,"Just attended a birthday party there, it seems...",5,2019-08-31 16:38:57,kP-IYwVVqdsP2Ep9lR6XnQ,https://www.yelp.com/user_details?userid=kP-IY...,https://s3-media2.fl.yelpcdn.com/photo/emPQLMr...,Ile R.
5323,Of9u4pxVO4jR_OczKYUeog,alzgWfWXvWR1XQacz0ONeQ,https://www.yelp.com/biz/kit-carson-park-escon...,Park is huge. Free parking. Sad to see lots of...,4,2020-11-01 10:07:54,D9iEY9vYTATJKe5g7urbpg,https://www.yelp.com/user_details?userid=D9iEY...,https://s3-media2.fl.yelpcdn.com/photo/gXGHzo9...,Heci L.
5324,Of9u4pxVO4jR_OczKYUeog,fWjNyHLd6c3EGuKko3ydrw,https://www.yelp.com/biz/kit-carson-park-escon...,Pretty busy and hardly anyone wearing masks li...,2,2020-09-10 04:54:42,JbLdpXCUAO7cAWdMww28uA,https://www.yelp.com/user_details?userid=JbLdp...,https://s3-media4.fl.yelpcdn.com/photo/zsNA-PD...,Lisa M.


In [None]:
# Keep the first occurrence of every review id
df_review = df_review.drop_duplicates(subset=['id'])

In [346]:
# Swap the dot for an underscore in the flattened reviewer ("user.*") columns
dotted_columns = [
    'user.id',
    'user.profile_url',
    'user.image_url',
    'user.name',
]
columns = {dotted: dotted.replace('.', '_') for dotted in dotted_columns}
df_review_final = df_review.rename(columns=columns)

In [348]:
# Hand the path to pandas instead of a manually opened text handle. pandas
# then opens the file itself with the newline handling the csv writer needs
# (a handle opened without newline='' yields blank rows on Windows) and writes
# UTF-8 instead of the platform's default encoding -- review text routinely
# contains non-ASCII characters.
df_review_final.to_csv('./../../data/yelp_attraction_review.csv', index=False)