In [1]:
import numpy as np
import pandas as pd
from yelpapi import YelpAPI
from googleplaces import GooglePlaces, types, lang
import googlemaps 
import gmaps as maps

In [2]:
# Open Yelp API key file and read key
# The file is assumed to contain only one single line with the API key

def read_file_content(filename):
    """ Given a filename,
        returns the content of this file
    """
    try:
        with open(filename, 'r') as f:
            return f.read()
    except FileNotFoundError:
        print("'%s' file not found" % filename)

YELP_API_KEY = read_file_content("./API Keys/YELP_API_KEY.txt")
yelp_api = YelpAPI(YELP_API_KEY, timeout_s=3.0)

In [3]:
# Configure Google Places
PLACES_API_KEY = read_file_content("./API Keys/GOOGLE_PLACES_API_KEY.txt")
gmaps = googlemaps.Client(key=PLACES_API_KEY)

### Yelp Top 20 vs. Google Top 20 Restaurants in Stuttgart

Comparing the Top 20 results for Restaurants near Stuttgart from Google vs. Yelp, it seems that the Google ranking tends to prefer high-quality restaurants while the Yelp ranking tends to prefer restaurants visited frequently (e.g. 'Frittenwerk' vs. 'Hotel Royal', 'Biertargen im Schlossgarten' vs. 'Der Zauberlehrling')

In [4]:
# Lat/lng Stuttgart Hbf
lat_hbf = 48.783333
lng_hbf = 9.183333

In [5]:
# Top 20 restaurants in Stuttgart
response = yelp_api.search_query(term='restaurant', latitude=lat_hbf, longitude=lng_hbf, sort_by='best_match', limit=20)

In [6]:
def print_restaurant_names(response):
    restaurants = response['businesses']
    for rest in restaurants:
        print (rest['name'])
        
def get_restaurant_names(response):
    restaurant_list = []
    restaurants = response['businesses']
    for rest in restaurants:
        restaurant_list.append(rest['name'])
    return restaurant_list

In [7]:
print_restaurant_names(response)

Biergarten im Schlossgarten
Frittenwerk
Gaststätte Schlesinger
Poffers Café Stuttgart
Die Zirbelstube
Kleinschmeckerei
Aspendos
Takeshii's
Ramen 8
Brauhaus Schönbuch
Gravity
Cube
L'Osteria
Italiani
Injeera
Zum Becher
Mandu
Food Court
Thios Inn
Carls Brauhaus


In [8]:
# Compare Top 20 Google results with top 20 Yelp results -> there is no common element
top_20_yelp = get_restaurant_names(response)

# Google Top 20, which we recieved earlier (but saved only the IDs) 
top_20_google = ['Hotel Royal', 'Der Zauberlehrling', 'Alte Kanzlei', 'Oggi', 'Enchilada Stuttgart', 
                 '5 Bar Gourmetrestaurant', 'Valle', 'Sky Beach Stuttgart', 'BLOCK HOUSE Eberhardstraße', 
                 'Arche Weinstube', 'VAPIANO Suttgart Bolzstrasse', 'Weinstube Fröhlich', 
                 'Paulaner am alten Postplatz', 'Prince of India', 'Hotel-Restaurant Köhler', 'Ambiente Africa', 
                 'MAREDO Steakhaus Stuttgart', 'BLOCK HOUSE Arnulf-Klett-Platz', 'CUBE Restaurant', 'Amici']

list(set(top_20_yelp) & set(top_20_google))

[]

### Download Yelp IDs based on Multiple Google Places Nearby Searches

Earlier we downloaded a list of ~1300 Google Place IDs for the most popular restaurants in Stuttgart area. The list was obtained by making multiple Google Places Nearby Searches around varying search centers. This list of IDs represents a good selection of restaurants (in terms of relevancy and quality of the restaurants), and the starting point for our Restaurant database.

Since we are not allowed to save any Google Places data other than the ID, we need to gather the restaurant information from the Yelp API, matching the Google Places ID to the corresponding Yelp IDs.

To do so, we first look up name and location at Google Places for each Google ID, then try to find the corresponding Yelp business by making a Yelp search_query based on name and location.

In [9]:
# For each restaurant id in the Google list, lookup name and geolocation at Google,
# then do a matching query at Yelp

google_restaurant_ids = np.array(
    pd.read_csv('./data/google_restaurant_ids.csv')['id'])

In [11]:
# For each Google ID, look up lat, lng, and name at Google Places
# Try to find matching Yelp Business based on name and geolocation.

yelp_ids_based_on_google = []
for google_id in google_restaurant_ids:
    try:
        # Lookup name + geolocations at Google by ID 
        google_restaurant = gmaps.place(place_id=google_id)
        lat = google_restaurant['result']['geometry']['location']['lat']
        lng = google_restaurant['result']['geometry']['location']['lng']
        name = google_restaurant['result']['name']
        # Now try to find it at Yelp based on name and geolocation
        yelp_restaurant = yelp_api.search_query(term=name, latitude=lat, longitude=lng, radius=500, limit=2)
        yelp_id = yelp_restaurant['businesses'][0]['id']
        yelp_latitude = yelp_restaurant['businesses'][0]['coordinates']['latitude']
        yelp_longitude = yelp_restaurant['businesses'][0]['coordinates']['longitude']
        yelp_ids_based_on_google.append(yelp_id)
    except Exception:
        continue

In [18]:
# Write to CSV file and Excel file
df = pd.DataFrame(yelp_ids_based_on_google, columns=['yelp_id'])
df.to_csv('yelp_ids_based_on_google_search.csv')
df.to_excel('yelp_ids_based_on_google_search.xlsx')

### Download Yelp IDs based on Multiple Yelp searches 

Only ~420 of the restaurants found via Google Places could be matched to a Yelp ID based on name and geolocation search. This is probably the case when the names at Google Places vs. Yelp differ too much (even though the Yelp API does not seem to require an exact match), or when the geolocations differ too much. However, we saw the results changing across multiple executions (cf. https://github.com/Yelp/yelp-fusion/issues/197), and the number of matches ranged between 408 and 430.

So we need to supplement these matches with additional results, retrieved from Yelp directly. 

Even though the Yelp API would support to request up to 1000 results around one location (instead 3x20 ), we prefer a grid-based search around multiple search centers (similar as we did with Google Places earlier), which is supposed to provide a higher coverage.

In [12]:
# Define inner and outer quadrangle within we want to search as well as the appropriate step width.
# For the inner quardrangle, we define a smaller step width. 

step = -0.029 # ~2km
smaller_step = step / 2

# Stuttgart area
start_lat = 48.85
end_lat = 48.66
start_long= 9.38
end_long = 8.95

# Stuttgart city
start_lat_city = 48.83
end_lat_city = 48.72
start_long_city = 9.25
end_long_city = 9.10


In [13]:
# Show search centers as markers on Google Maps

maps.configure(PLACES_API_KEY)
fig = maps.figure()

# Draw red markers for the wider area of Stuttgart
marker_locations=[]
for i in np.arange(start_lat, end_lat, step):
    for j in np.arange(start_long, end_long, step):
        # draw markers only if i and j is not within the city area
        if ((i > start_lat_city) | (i < end_lat_city) | (j > start_long_city) | (j < end_long_city)):
            marker_locations.append((i, j))
            
print("Number of red markers: {}".format(len(marker_locations)))   
markers_red = maps.symbol_layer(marker_locations, fill_color='blue', stroke_color='blue')
fig.add_layer(markers_red)

# Draw blue markers for the city area of Stuttgart
marker_locations=[]
for i in np.arange(start_lat_city, end_lat_city, smaller_step):
    for j in np.arange(start_long_city, end_long_city, smaller_step):
        marker_locations.append((i, j))

print("Number of blue markers: {}".format(len(marker_locations)))   
markers_blue = maps.symbol_layer(marker_locations, fill_color='purple', stroke_color='purple')
fig.add_layer(markers_blue)

fig


Number of red markers: 85
Number of blue markers: 88


Figure(layout=FigureLayout(height='420px'))

In [41]:
# Run Yelp query for each marker

yelp_ids = []
yelp_names = []
yelp_lats = []
yelp_lngs = []
for marker in marker_locations:
    yelp_result = yelp_api.search_query(
        term='restaurant', latitude=marker[0], longitude=marker[1], radius=1500, limit=50)
    for i in range(len(yelp_result['businesses'])):
        yelp_id = yelp_result['businesses'][i]['id']
        name = yelp_result['businesses'][i]['name']
        lat = yelp_result['businesses'][i]['coordinates']['latitude']
        lng = yelp_result['businesses'][i]['coordinates']['longitude']
        yelp_ids.append(yelp_id)
        yelp_names.append(name)
        yelp_lats.append(lat)
        yelp_lngs.append(lng)


In [42]:
assert(len(yelp_names) == len(yelp_ids) & len(yelp_ids) == len(yelp_lats) & len(yelp_lats) == len(yelp_lngs))

In [43]:
# Build DataFrame
d = {'yelp_id':yelp_ids, 'name':yelp_names, 'latitude': yelp_lats, 'longitude':yelp_lngs}
yelp_restaurants = pd.DataFrame(d)
yelp_restaurants.head()

Unnamed: 0,yelp_id,name,latitude,longitude
0,-k0Ksk_FSzQAiKU6tdjayg,Restaurant Tsv Steinhaldenfeld,48.82974,9.24035
1,K6T2ZTjriU-4MYWJrPGlGg,SchwabenSpeisen,48.8304,9.25935
2,5djuZfIkqf1ykQGbpWESrg,Stadio,48.8319,9.25231
3,qzeKtnP08p-u65yBINrddw,Restaurant Schmidener Eintracht,48.834041,9.26424
4,R93PaCrS4qai9u1uiiHPMw,"Restaurant ""La Perla"" im Schießsportzentrum",48.833358,9.249487


In [44]:
yelp_restaurants['yelp_id'].nunique()

832

In [45]:
# De-dupliacte
yelp_restaurants = yelp_restaurants.drop_duplicates()

#### Yelp IDs found via Matching Google results vs. Yelp IDs found via Yelp directly

`yelp_ids_based_on_google` (IDs based on Google search): 
 * Based on the list of Google IDs, obtained by multiple Google Places Nearby Searches with varying search centers, returning 3x20 results each
 * Based on the Google ID, lookup the Google name and try to find a match at Yelp
 * For 1333 Google restaurants, only 430 matches could be identified, probalby because of name or geolocation mismatch between Yelp and Google (e.g. "Eat Drink Man Woman" vs. "Gasthaus Eat Drink Man Woman")

`yelp_ids` (IDs based on Yelp search): 
* Based on multiple Yelp Search Queries with 50 results each, using the same search grid as earlier for the Google search
* 823 results after de-duplication

`yelp_ids_based_on_google` - `yelp_ids`: 
* 97 restaurants that are in `yelp_ids_based_on_google`, but not in `yelp_ids` .
* Restaurants that were originally found via Google grid search, then mapped to the Yelp equivalents, but are not found via Yelp grid search

`yelp_ids` - `yelp_ids_based_on_google`: 
* 498 restaurants that are in `yelp_ids`, but not in `yelp_ids_based_on_google` 
* Restaurants that are found via Yelp grid search, but not included in the Yelp equivalents for our previous Google search
* Mostly due to 'name mismatch' between Google name and Yelp name 

==> We continue with a merged set (`yelp_ids_based_on_google` + `yelp_ids`) to avoid missing any relevant restaurant, wether it comes from Google or from Yelp. This gives us **823 + 97 = 920 restaurants in the area of Stuttgart**.

In [71]:
yelp_set = set(yelp_ids)
google_set = set(yelp_ids_based_on_google)
print(len(yelp_set), len(google_set))

832 403


In [72]:
# IDs that are included in Yelp-based grid search, but not in Google-based grid search
# This includes also the name mismatch cases
yelp_minus_google = yelp_set.difference(google_set)
len(yelp_minus_google)

519

In [73]:
# IDs that are included in Google-based search, but not in Yelp based search
google_minus_yelp = google_set.difference(yelp_set)
len(google_minus_yelp)

90

In [75]:
merged_ids = yelp_set.union(google_set)

In [86]:
# Save merged IDs from Google and Yelp-based search 
df = pd.DataFrame(list(merged_ids), columns=['yelp_id'])
df.to_csv('./data/yelp_ids.csv', index=False)
df.to_excel('./data/yelp_ids.xlsx', index=False)

### Enrich merged IDs with detail information

We now have a list of ~820 Yelp IDs, retrieved either by matching the Google Places to Yelp Businesses, or by doing Yelp searches directly. For these IDs, we need to gather the required information (mostly from Yelp). We perform the following steps: 
- For each Yelp ID, make a Yelp business request and enrich the given ID with detail information (such as name, categories, geolocations).
- For each Yelp ID, find the corresponding Google ID from Google Places. This is done with a Google Places find_place search based on Yelp name and location. For every Yelp ID, the corresponding Google ID can be found.

In [83]:
# Read in restaurant IDs.
df_yelp_ids = pd.read_csv('./data/yelp_ids.csv')

In [96]:
# For each Yelp ID, get detail information from Yelp

id_list = []
name_list = []
is_closed_list = []
categories_list = []
price_list = []
rating_list = []
review_count_list = []
photos_list = []
address_list = []
lat_list = []
lng_list = []
for yelp_id in df_yelp_ids['yelp_id'].values:
    try:
        restaurant = yelp_api.business_query(id=yelp_id)
        name = restaurant['name']
        is_closed = restaurant['is_closed']
        categories = restaurant['categories']
        categories_unstacked = []
        for c in categories:
            categories_unstacked.append(c['title'])
        if 'price' in restaurant:
            price_level = restaurant['price']
        else:
            price_level = ''
        rating = restaurant['rating']
        review_count = restaurant['review_count']
        if 'photos' in restaurant:
            photo_urls = restaurant['photos']
        else: photo_urls = ''
        address = restaurant['location']['display_address']
        lat = restaurant['coordinates']['latitude']
        lng = restaurant['coordinates']['longitude']
        # Append everything to the lists
        id_list.append(yelp_id)
        name_list.append(name)
        address_list.append(address)
        is_closed_list.append(is_closed)
        categories_list.append(categories_unstacked)
        price_list.append(price_level)
        rating_list.append(rating)
        review_count_list.append(review_count)
        photos_list.append(photo_urls)
        lat_list.append(lat)
        lng_list.append(lng)
    except:
        continue
    
assert(len(id_list) == len(name_list) & len(name_list) == len(address_list) & len(address_list) == len(is_closed_list)
       & len(is_closed_list) == len(categories_list) & len(categories_list) == len(price_list) & 
       len(price_list) == len(rating_list) & len(rating_list) == len(review_count_list) & 
       len(review_count_list) == len(photos_list) & len(photos_list) == len(lat_list) & 
       len(lat_list) == len(lng_list))

In [97]:
d = {
    'yelp_id' : id_list,
    'yelp_name' : name_list,
    'address' : address_list, 
    'is_closed' : is_closed_list,
    'categories' : categories_list,
    'price_level' : price_list,
    'avg_rating' : rating_list,
    'review_count' : review_count_list,
    'photo_urls' : photos_list,
    'latitude' : lat_list, 
    'longitude' : lng_list
}

df_yelp = pd.DataFrame(d)

In [98]:
#Filter out all restaurants that are  permanently closed -> before 920, afterwards 915
df_yelp.drop(df_yelp[df_yelp['is_closed']==True].index, inplace=True)

In [99]:
# Write details DataFrame to file
df_yelp.to_csv('yelp_restaurants.csv', index=False)
df_yelp.to_excel('yelp_restaurants.xlsx', index=False)

In [100]:
# For each Yelp ID, find the corresponding Google ID
# Do so by first looking up the Yelp name, then make a Google Places find_place search based on Yelp name and location

google_ids = []
google_names = []
yelp_name_list = df_yelp['yelp_name'].values

for yelp_name in yelp_name_list:
    google_restaurant = gmaps.find_place(
        input=yelp_name, input_type='textquery', 
        location_bias='point:' + str(lat_hbf) + ', ' + str(lng_hbf), 
        fields=['place_id', 'name'],
        language='de')
    if (google_restaurant['candidates']):
        google_id = google_restaurant['candidates'][0]['place_id']
        google_name = google_restaurant['candidates'][0]['name']
    else:
        google_id = ''
        google_name = ''
    google_ids.append(google_id)
    google_names.append(google_name)
        
assert(len(yelp_name_list) == len(google_ids) & len(google_ids) == len(google_names))

In [518]:
df_yelp['google_id'] = google_ids
df_yelp['google_name'] = google_names

In [528]:
# Copy and re-order columns. 
df_yelp_reordered = df_yelp[['yelp_id', 'google_id', 'yelp_name', 'google_name', 'address', 'categories', 
                  'price_level', 'avg_rating', 'review_count', 'photo_urls', 'latitude', 'longitude']]
df_yelp = df_yelp_reordered

In [529]:
df_yelp.head()

Unnamed: 0,yelp_id,google_id,yelp_name,google_name,address,categories,price_level,avg_rating,review_count,photo_urls,latitude,longitude
1,6XKiHruNrTO8jDAcHLoR3w,ChIJF5m2rSLFmUcRH9sBPHwx3kM,Bäckerhaus Veit Café,Bäckerhaus Veit Café,"[Epplestr. 1 b, 70597 Stuttgart, Germany]","[Bakeries, Cafes]",,1.5,5,[https://s3-media2.fl.yelpcdn.com/bphoto/M0vpn...,48.749109,9.16981
2,0Ep-rCF4UVrz3ZJkd_gAXQ,ChIJOWEUrcvEmUcRqRXjZal8AZY,Prince of India,Prince of India,"[Urbanstr. 38, 70182 Stuttgart, Germany]",[Indian],€€,3.0,27,[https://s3-media4.fl.yelpcdn.com/bphoto/rwxyn...,48.778664,9.187046
3,MVScl0YGy_ZkGb7iN58k-A,ChIJI_1zBlHbmUcROMi0FydapcE,Bocca Buona,Bocca Buona Restaurant,"[Hauptstätter Str. 147, 70180 Stuttgart, Germany]",[Italian],,2.5,3,[https://s3-media4.fl.yelpcdn.com/bphoto/NA4u_...,48.764643,9.169958
4,KbIoL9bMYzCZDWxpWTcx0g,ChIJ6wNLZzPbmUcRrtanMZs8deo,Weinwirtschaft | Weingut Franz Keller,Weinwirtschaft | Weingut Franz Keller,"[Schillerstr. 23, 70173 Stuttgart, Germany]","[International, German, Wine Bars]",€€,3.5,10,[https://s3-media2.fl.yelpcdn.com/bphoto/l0icD...,48.7826,9.18246
5,LlLDMhhPeCKs4oyZxFQGaw,ChIJo1Y9RTjQmUcRnsk3LcgHomE,Zum Adler Gasthaus Weinstube,Hotel & Restaurant Adler,"[Alte Stuttgarter Str. 2, 70195 Stuttgart, Ger...",[Restaurants],€,5.0,3,[https://s3-media2.fl.yelpcdn.com/bphoto/6_MBr...,48.7782,9.12474


In [None]:
# Drop column 'google_name', as it may not be saved permanently
df_yelp = df_yelp.drop(columns=['google_name'])

In [530]:
# Write to file
df_yelp.to_csv('yelp_restaurants.csv', index=False)
df_yelp.to_excel('yelp_restaurants.xlsx', index=False)