In [6]:
# imports

import requests
import os
import pandas as pd

# Foursquare

## Julie's Notes:
From Foursquare categories list (https://location.foursquare.com/places/docs/categories), I compiled the following categories (sometimes combined) which I will send with the "categories" query parameter for each bike station location:

<img src='../images/foursquare_categories.png'>

These decisions attempt to optimize the tension between API call limit (40,000) and response item limit (maxxes out at 50 items).

Send a request to Foursquare with a small radius (1000m) for all the bike stations in your city of choice. 

In [150]:
# Pull my FOURSQUARE API key into a variable
FOURSQUARE_KEY = os.getenv('FOURSQUARE_API_KEY')

In [151]:
# Function Definitions

# Define function that will make the GET request to Foursquare
def foursquare_get_request_place_search(station_latitude, station_longitude, radius, categories, API_KEY):

    base_foursquare_endpoint = 'https://api.foursquare.com/v3'
    place_search = '/places/search'

    # Default Query Parameters for all our GET requests, that aren't otherwise passed in
    limit = 50  # Always get as many as allowed
    sort_by = 'distance'  # Foursquare says 'ratings' is a valid sort option, but I haven't seen evidence in their payload that they have ratings!

    # Craft the request_url:
    request_url = base_foursquare_endpoint + place_search + \
    '?' + \
    'll=' + str(station_latitude) + ',' + str(station_longitude) + \
    '&radius=' + str(radius) + \
    '&categories=' + categories + \
    '&limit=' + str(limit) + \
    '&sort_by=' + sort_by

    header_dict = {
    'accept': 'application/json',
    'Authorization' : FOURSQUARE_KEY
    }
    
    # DEBUG
    #print(f"Inside foursquare_get_request: request_url = {request_url}, header_dict = {header_dict}")
    print(f"     Calling API: request_url = {request_url}")

    # Make the call, get response out
    response = requests.get(request_url, headers=header_dict)

    # Return the payload_dict to caller
    return response.json()
    

# Define function that will create the default fsq_dict for each GET request:
def create_default_fsqdict(fsqdict):
    fsqdict.clear()
    fsqdict['station_id']= []
    fsqdict['place_id'] = []
    fsqdict['name'] = []
    fsqdict['distance'] = []
    fsqdict['address'] = []
    fsqdict['city'] = []
    fsqdict['postal'] = []
    fsqdict['category_id'] = []
    fsqdict['category_name'] = []
    fsqdict['query_categories'] = []
    fsqdict['query_category_text'] = []


# Define function that will parse the JSON-formatted response
def fsqdict_from_response(stationid, fsqdict, jsonpayload, query_categories, query_category_text):
    # Ternary operators are required because my testing indicated some entries are missing 'postcode'.
    # Given the risk of this erroring-out on 425 * 6 = 2550 calls, adding ternary operators to check for presence, on name/distance/address/city/postcode
    results_array = jsonpayload['results']
    for result in results_array:
        fsqdict['station_id'].append(stationid)
        fsqdict['place_id'].append(result['fsq_id'])
        fsqdict['name'].append(result['name'] if 'name' in result else 'N/A')
        fsqdict['distance'].append(result['distance'] if 'distance' in result else NaN)
        fsqdict['address'].append(result['location']['address'] if 'address' in result['location'] else 'N/A')
        fsqdict['city'].append(result['location']['locality'] if 'locality' in result['location'] else 'N/A')
        fsqdict['postal'].append(result['location']['postcode'] if 'postcode' in result['location'] else 'N/A')
        
        categories_array = result['categories']
        id_string = ''
        categoryname_string = ''
        for entry in categories_array:
            id_string += str(entry['id']) + '|'
            categoryname_string += entry['name'] + '|'
        
        id_string = id_string[:-1]
        categoryname_string = categoryname_string[:-1]
        fsqdict['category_id'].append(id_string)
        fsqdict['category_name'].append(categoryname_string)
        fsqdict['query_categories'].append(query_categories)
        fsqdict['query_category_text'].append(query_category_text)

    return fsqdict

In [153]:
# Categories - 6 API calls for each station_id (each of these strings for 'categories=' in the query parameter)
# Unlike Yelp, this needs a dictionary because the category_ids are numerical and hard for humans to identify easily when looking at the eventual dataframe
categories_dict = {
    '10027,10047,10059,10069' : 'Arts and Entertainment: Museum|Public Art',
    '13032' : 'Dining and Drinking: Cafe, Coffee and Tea House',
    '16003,16020,16046' : 'Landmarks and Outdoors: Beach|Historic Site|Scenic Lookout',
    '16004' : 'Bike Trail',
    '16032' : 'Park',
    '19010,19013,19014,19019' : 'Travel and Transportation: B&B|Hostel|Hotel|Vacation Rental'
}

# Set Default Radius
radius = 1000

# Load the citybikes dataframe
stations_df = pd.read_csv('../data/citybikes_vancouver.csv')

# Generate the list of station_ids
station_ids_list = stations_df['id'].tolist()
station_ids_list = sorted(station_ids_list)

# Set up an empty dataframe 'rolling_df' which will collect each individual API call into the larger dataframe
rolling_df = pd.DataFrame()

#for station_id in station_ids_list[:1]:  # Testing only
for station_id in station_ids_list:
    print(f"************** New Station! **************")
    for entry_category, category_text in categories_dict.items():
        filt_station = (stations_df['id'] == station_id)
        station_lat = stations_df.loc[filt_station]['lat'].values[0]
        station_long = stations_df.loc[filt_station]['long'].values[0]
        print (f"Working on station_id: {station_id}, categories being sent is: {entry_category} ({category_text})")
        print (f"     station_lat, station_long = ({station_lat}, {station_long})")

        # Call the function to do Foursquare GET request from the API
        payload_dict = foursquare_get_request_place_search(station_lat, station_long, radius, entry_category, FOURSQUARE_KEY)
        
        # Define/reset the fsqdict dictionary, which holds the parsed JSON from the REST GET API call
        fsqdict = dict()
        create_default_fsqdict(fsqdict)

        # Parse the JSON from the payload from the API call
        fsqdict_from_response(station_id, fsqdict, payload_dict, entry_category, category_text)
        
        # Create a temp_df dataframe
        temp_df = pd.DataFrame(fsqdict)

        # Debug
        print(f"          number rows = {temp_df.shape[0]}")

        # Add the just-generated dataframe from this singular previous API call, to the cumulative "rolling_df" dataframe for the entirety of the station_ids
        rolling_df = pd.concat([rolling_df, temp_df], ignore_index=True)

************** New Station! **************
Working on station_id: 00fa94ad698dc4a9e4d708d6fd32f294, categories being sent is: 10027,10047,10059,10069 (Arts and Entertainment: Museum|Public Art)
     station_lat, station_long = (49.291909, -123.140713)
     Calling API: request_url = https://api.foursquare.com/v3/places/search?ll=49.291909,-123.140713&radius=1000&categories=10027,10047,10059,10069&limit=50&sort_by=distance
          number rows = 3
Working on station_id: 00fa94ad698dc4a9e4d708d6fd32f294, categories being sent is: 13032 (Dining and Drinking: Cafe, Coffee and Tea House)
     station_lat, station_long = (49.291909, -123.140713)
     Calling API: request_url = https://api.foursquare.com/v3/places/search?ll=49.291909,-123.140713&radius=1000&categories=13032&limit=50&sort_by=distance
          number rows = 32
Working on station_id: 00fa94ad698dc4a9e4d708d6fd32f294, categories being sent is: 16003,16020,16046 (Landmarks and Outdoors: Beach|Historic Site|Scenic Lookout)
     s

Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

In [1]:
# See above frame for answer.

Put your parsed results into a DataFrame

In [3]:
# See above frame for answer.

#### Julie's Notes: Housekeeping tasks related to saving and retrieving the parsed dataframe

In [157]:
# Save the dataframe as .csv file
#rolling_df.to_csv('../data/fsq_vancouver.csv', index=False)  # Saved on 2023-10-20 afternoon

In [7]:
# Load FSQ .csv files, into a singular dataframe for Foursquare
fsq_df = pd.read_csv('../data/fsq_vancouver.csv')

#### Julie's Notes: Various cells aimed at having a first look at the Foursquare dataframe

In [10]:
fsq_df.shape

(18007, 11)

In [11]:
fsq_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18007 entries, 0 to 18006
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   station_id           18007 non-null  object 
 1   place_id             18007 non-null  object 
 2   name                 18007 non-null  object 
 3   distance             18007 non-null  float64
 4   address              16338 non-null  object 
 5   city                 17729 non-null  object 
 6   postal               14364 non-null  object 
 7   category_id          18007 non-null  object 
 8   category_name        18007 non-null  object 
 9   query_categories     18007 non-null  object 
 10  query_category_text  18007 non-null  object 
dtypes: float64(1), object(10)
memory usage: 1.5+ MB


In [13]:
fsq_df.describe()

Unnamed: 0,distance
count,18007.0
mean,2533.318
std,79073.35
min,2.0
25%,465.0
50%,676.0
75%,853.0
max,3357405.0


In [16]:
fsq_df['category_id'].value_counts()

category_id
19014                      2668
16032                      2470
13035                      1411
13035|13065                 961
13034|13035|13065           892
                           ... 
13036|13099|13303             1
13022|13035|13065             1
13034|13039|13322             1
13002|13032|13065             1
11086|11124|12082|19014       1
Name: count, Length: 221, dtype: int64

In [14]:
fsq_df.groupby(['station_id', 'query_category_text']).size()

station_id                        query_category_text                                        
00fa94ad698dc4a9e4d708d6fd32f294  Arts and Entertainment: Museum|Public Art                       3
                                  Dining and Drinking: Cafe, Coffee and Tea House                32
                                  Landmarks and Outdoors: Beach|Historic Site|Scenic Lookout      5
                                  Park                                                           27
                                  Travel and Transportation: B&B|Hostel|Hotel|Vacation Rental    30
                                                                                                 ..
fef69fb400210d861107a61db954d037  Park                                                           20
                                  Travel and Transportation: B&B|Hostel|Hotel|Vacation Rental    50
fffd87607aaae16fbb1f71615cbe7d17  Arts and Entertainment: Museum|Public Art                       1
      

In [15]:
fsq_df.head(10)

Unnamed: 0,station_id,place_id,name,distance,address,city,postal,category_id,category_name,query_categories,query_category_text
0,00fa94ad698dc4a9e4d708d6fd32f294,4b5b84c0f964a5202f0429e3,The Inukshuk,869.0,1700 Beach Ave,Vancouver,,10047|13065|16016|19014,Public Art|Restaurant|Fountain|Hotel,10027100471005910069,Arts and Entertainment: Museum|Public Art
1,00fa94ad698dc4a9e4d708d6fd32f294,4ded40b4fa76b21ed97ff4db,Lord Stanley of Preston statue,747.0,900 Stanley Park Dr,Vancouver,V6G 3E2,10047|16026,Public Art|Monument,10027100471005910069,Arts and Entertainment: Museum|Public Art
2,00fa94ad698dc4a9e4d708d6fd32f294,4e5955281f6e804280c4bad1,Roedde House Museum,833.0,1415 Barclay St,Vancouver,V6G 1J6,10030,History Museum,10027100471005910069,Arts and Entertainment: Museum|Public Art
3,00fa94ad698dc4a9e4d708d6fd32f294,4aa9ac4ff964a520cc5420e3,Cardero Bottega,647.0,1016 Cardero St,Vancouver,V6G 2H1,13035|13039|13145,Coffee Shop|Deli|Fast Food Restaurant,13032,"Dining and Drinking: Cafe, Coffee and Tea House"
4,00fa94ad698dc4a9e4d708d6fd32f294,52c72fe5498edf109f363e18,Greenhorn Cafe,760.0,994 Nicola St,Vancouver,V6G 2C8,13034|13035|13065,Café|Coffee Shop|Restaurant,13032,"Dining and Drinking: Cafe, Coffee and Tea House"
5,00fa94ad698dc4a9e4d708d6fd32f294,4ad008e2f964a5205fd720e3,Delany's Coffee House,413.0,1105 Denman St,Vancouver,V6G 2M7,13035|13065,Coffee Shop|Restaurant,13032,"Dining and Drinking: Cafe, Coffee and Tea House"
6,00fa94ad698dc4a9e4d708d6fd32f294,52e19f99498ee190912a3ff0,Pappa Roti,793.0,1505 Robson St,Vancouver,V6G 1C3,13002|13034|13035,Bakery|Café|Coffee Shop,13032,"Dining and Drinking: Cafe, Coffee and Tea House"
7,00fa94ad698dc4a9e4d708d6fd32f294,531f860e498e932bf3c1ecfb,JJ Bean Coffee Roasters,637.0,1209 Bidwell St,Vancouver,V6G 2K7,13035|13065,Coffee Shop|Restaurant,13032,"Dining and Drinking: Cafe, Coffee and Tea House"
8,00fa94ad698dc4a9e4d708d6fd32f294,4acfe023f964a52099d620e3,Starbucks,518.0,1795 Davie St,Vancouver,V6G 1W5,13035,Coffee Shop,13032,"Dining and Drinking: Cafe, Coffee and Tea House"
9,00fa94ad698dc4a9e4d708d6fd32f294,4b197914f964a520f8dd23e3,Red Umbrella Cafe,625.0,1707 Davie St,Vancouver,V6G 1W5,13034|13035|13051,Café|Coffee Shop|Fish and Chips Shop,13032,"Dining and Drinking: Cafe, Coffee and Tea House"


# Yelp

## Julie's Notes:

From Yelp categories list (https://docs.developer.yelp.com/docs/resources-categories), I compiled the following categories (sometimes combined) which I will send with the "categories" query parameter for each bike station location:

<img src='../images/yelp_categories.png'></img>

These decisions attempt to optimize the tension between API call limit (500 per day) and response item limit (maxxes out at 50 items).


Send a request to Yelp with a small radius (1000m) for all the bike stations in your city of choice. 

In [5]:
# Pull my YELP API key into a variable
YELP_KEY = os.getenv('YELP_API_KEY')

In [61]:
# Function Definitions

# Define function that will make the GET request to Yelp
def yelp_get_request_business_search(station_latitude, station_longitude, radius, categories, API_KEY):

    base_yelp_endpoint = 'https://api.yelp.com/v3'
    business_search = '/businesses/search'

    # Default Query Parameters for all our GET requests, that aren't otherwise passed in
    limit = 50  # Always get as many as allowed
    sort_by = 'best_match'

    # Craft the request_url:
    request_url = base_yelp_endpoint + business_search + \
    '?' + \
    'latitude=' + str(station_latitude) + \
    '&longitude=' + str(station_longitude) + \
    '&radius=' + str(radius) + \
    '&categories=' + categories + \
    '&limit=' + str(limit) + \
    '&sort_by=' + sort_by

    header_dict = {
        'accept': 'application/json',
        'Authorization' : 'Bearer ' + API_KEY
        }
    
    # DEBUG
    #print(f"Inside yelp_get_request: request_url = {request_url}, header_dict = {header_dict}")

    # Make the call, get response out
    response = requests.get(request_url, headers=header_dict)

    # Return the payload_dict to caller
    return response.json()
    

# Define function that will create the default yelp_dict for each GET request:
def create_default_yelpdict(yelpdict):
    yelpdict.clear()
    yelpdict['station_id']= []
    yelpdict['place_id'] = []
    yelpdict['name'] = []
    yelpdict['distance'] = []
    yelpdict['address'] = []
    yelpdict['city'] = []
    yelpdict['postal'] = []
    yelpdict['review_count'] = []
    yelpdict['rating'] = []
    yelpdict['category_id'] = []
    yelpdict['category_name'] = []
    yelpdict['query_categories'] = []


# Define function that will parse the JSON-formatted response
def yelpdict_from_response(stationid, yelpdict, jsonpayload, query_categories):
    businesses_array = jsonpayload['businesses']
    for biz in businesses_array:
        yelpdict['station_id'].append(stationid)
        yelpdict['place_id'].append(biz['id'])
        yelpdict['name'].append(biz['name'])
        yelpdict['distance'].append(biz['distance'])
        yelpdict['address'].append(biz['location']['address1'])
        yelpdict['city'].append(biz['location']['city'])
        yelpdict['postal'].append(biz['location']['zip_code'])
        yelpdict['review_count'].append(biz['review_count'])
        yelpdict['rating'].append(biz['rating'])

        categories_array = biz['categories']
        alias_string = ''
        title_string = ''
        for entry in categories_array:
            alias_string += entry['alias'] + '|'
            title_string += entry['title'] + '|'
        
        alias_string = alias_string[:-1]
        title_string = title_string[:-1]
        yelpdict['category_id'].append(alias_string)
        yelpdict['category_name'].append(title_string)
        yelpdict['query_categories'].append(query_categories)

    return yelpdict

In [None]:
# Define Yelp query string categories / category names that will be sent in GET requests
# Adjusted - this was too many API calls
# category_mapping = {
#     "beaches,parks": "Beaches, Parks",
#     "bicyclepaths,mountainbiking": "Bicycle Paths, Mountain Biking",
#     "museums": "Museums",
#     "coffee": "Coffee & Tea",
#     "juicebars": "Juice Bars & Smoothies",
#     "hostels,hotels": "Hotels, Hotels"
# }

# Categories - 3 API calls for each station_id (each of these strings for categories= in the query parameter)
categories = ['beaches,parks,bicyclepaths,mountainbiking', 'museums', 'hostels,hotels']

# Set Default Radius
radius = 1000

# Load the citybikes dataframe
stations_df = pd.read_csv('../data/citybikes_vancouver.csv')

# Generate the list of station_ids
station_ids_list = stations_df['id'].tolist()
station_ids_list = sorted(station_ids_list)

# Break into partition sizes to maximize usage of Yelp's 500 daily call limit (resets at 6pm Mountain == midnight UTC) and 3 calls per station_id in 245 stations
partition_1_size = 122
partition_2_size = 36

partition_1 = station_ids_list[:partition_1_size]
partition_2 = station_ids_list[partition_1_size: partition_1_size + partition_2_size]
partition_3 = station_ids_list[partition_1_size + partition_2_size:]  # is 87 ids long

# Set up an empty dataframe 'rolling_df' which will collect each individual API call into the larger dataframe
rolling_df = pd.DataFrame()

#for station_id in station_ids_list:  # Can't use this because Yelp's daily limit will cause this to fail mid-way
#for station_id in partition_1:  # Used this on 2023-10-19
#for station_id in partition_2:  # Used this on 2023-10-19 to hit remainder of calls for the day
for station_id in partition_3:  # Used this on 2023-10-20 to hit remainder of calls for the day
    print(f"************** New Station! **************")
    for entry_category in categories:
        filt_station = (stations_df['id'] == station_id)
        station_lat = stations_df.loc[filt_station]['lat'].values[0]
        station_long = stations_df.loc[filt_station]['long'].values[0]
        print (f"Working on station_id: {station_id}, categories being sent is: {entry_category}")
        print (f"     station_lat, station_long = ({station_lat}, {station_long})")

        # Call the function to do YELP GET request from the API
        #payload_dict = yelp_get_request_business_search(station_lat, station_long, radius, entry_category, YELP_KEY)
        
        # Define/reset the yelpdict dictionary, which holds the parsed JSON from the REST GET API call
        yelpdict = dict()
        create_default_yelpdict(yelpdict)

        # Parse the JSON from the payload from the API call
        yelpdict_from_response(station_id, yelpdict, payload_dict, entry_category)
        
        # Create a temp_df dataframe
        temp_df = pd.DataFrame(yelpdict)

        # Add the just-generated dataframe from this singular previous API call, to the cumulative "rolling_df" dataframe for the entirety of the station_ids
        rolling_df = pd.concat([rolling_df, temp_df], ignore_index=True)

Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

In [17]:
# See above frame for answer.

Put your parsed results into a DataFrame

In [18]:
# See above frame for answer.

#### Julie's Notes: Housekeeping tasks related to saving and retrieving the parsed dataframe

In [84]:
# Save the individual partitioned dataframes as .csv files

rolling_df.to_csv('../data/yelp_vancouver_partition1.csv', index=False)  # Saved on 2023-10-20 evening for partition_1
rolling_df.to_csv('../data/yelp_vancouver_partition2.csv', index=False)  # Saved on 2023-10-20 evening for partition_2
rolling_df.to_csv('../data/yelp_vancouver_partition3.csv', index=False)  # Saved on 2023-10-21 for partition_3

In [76]:
# Calculations required to support breaking station_id list into partition sizes to maximize usage of Yelp's 500 daily call limit
# (resets at 6pm Mountain == midnight UTC) and 3 calls per station_id in 245 stations
partition_1_size = 122
partition_2_size = 36

partition_1 = station_ids_list[:partition_1_size]
partition_2 = station_ids_list[partition_1_size: partition_1_size + partition_2_size]
partition_3 = station_ids_list[partition_1_size + partition_2_size:]

print(len(partition_1))
print(len(partition_2))
print(len(partition_3))

total_ids_in_partitions = len(partition_1) + len(partition_2) + len(partition_3)
print(f"total ids in all partitions ({total_ids_in_partitions}) == stations_df.shape[0]: {total_ids_in_partitions == stations_df.shape[0]}")

122
36
87
total ids in all partitions (245) == stations_df.shape[0]: True


In [19]:
# Need to pull each of the 3 partitions from 3 .csv files, into a singular dataframe for Yelp
partition1_df = pd.read_csv('../data/yelp_vancouver_partition1.csv')
partition2_df = pd.read_csv('../data/yelp_vancouver_partition2.csv')
partition3_df = pd.read_csv('../data/yelp_vancouver_partition3.csv')

yelp_df = pd.concat([partition1_df, partition2_df, partition3_df], ignore_index=True)

#### Julie's Notes: Various cells aimed at having a first look at the Yelp dataframe

In [20]:
yelp_df.shape

(6684, 12)

In [21]:
yelp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6684 entries, 0 to 6683
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   station_id        6684 non-null   object 
 1   place_id          6684 non-null   object 
 2   name              6684 non-null   object 
 3   distance          6684 non-null   float64
 4   address           6644 non-null   object 
 5   city              6684 non-null   object 
 6   postal            6659 non-null   object 
 7   review_count      6684 non-null   float64
 8   rating            6684 non-null   float64
 9   category_id       6684 non-null   object 
 10  category_name     6684 non-null   object 
 11  query_categories  6684 non-null   object 
dtypes: float64(3), object(9)
memory usage: 626.8+ KB


In [22]:
yelp_df.describe()

Unnamed: 0,distance,review_count,rating
count,6684.0,6684.0,6684.0
mean,725.31579,49.743716,3.714991
std,293.413739,77.591669,0.946343
min,6.578951,1.0,1.0
25%,511.117321,3.0,3.0
50%,750.268052,19.0,4.0
75%,953.138886,72.0,4.5
max,1410.449171,1091.0,5.0


In [23]:
yelp_df['category_id'].value_counts()

category_id
hotels                               2793
parks                                1659
hotels|venues                         298
museums                               290
hostels                               264
hotels|bedbreakfast                   211
dog_parks                             151
parks|playgrounds                     147
resorts|hotels|vacation_rentals        54
beaches                                53
gardens|parks                          50
artmuseums                             50
museums|galleries                      49
playgrounds|dog_parks                  45
catering|hotels                        44
landmarks|parks                        40
skate_parks                            38
beaches|parks                          37
casinos|hotels|venues                  36
museums|galleries|venues               34
waterparks|parks|playgrounds           32
parks|theater                          27
hotels|divebars                        27
amateursportsteams|foo

In [25]:
yelp_df.groupby(['station_id', 'query_categories']).size()

station_id                        query_categories                         
00fa94ad698dc4a9e4d708d6fd32f294  beaches,parks,bicyclepaths,mountainbiking    13
                                  hostels,hotels                               16
                                  museums                                       1
012d3e06901cc222b1c2cf0a2ace3a29  beaches,parks,bicyclepaths,mountainbiking    10
                                  hostels,hotels                                1
                                                                               ..
fef69fb400210d861107a61db954d037  hostels,hotels                               49
                                  museums                                       8
fffd87607aaae16fbb1f71615cbe7d17  beaches,parks,bicyclepaths,mountainbiking    11
                                  hostels,hotels                                1
                                  museums                                       1
Length: 604, dtype: in

In [26]:
yelp_df.head(10)

Unnamed: 0,station_id,place_id,name,distance,address,city,postal,review_count,rating,category_id,category_name,query_categories
0,00fa94ad698dc4a9e4d708d6fd32f294,kajMc2fkWKdzKJ1M4pm47Q,Stanley Park,978.386841,1166 Stanley Park Drive,Vancouver,V6G,1091.0,5.0,parks,Parks,"beaches,parks,bicyclepaths,mountainbiking"
1,00fa94ad698dc4a9e4d708d6fd32f294,VoziJj_Fw67OtZtdDzrpQg,English Bay Beach Park,783.428693,1700 Beach Avenue,Vancouver,V6E 1V3,68.0,4.5,parks,Parks,"beaches,parks,bicyclepaths,mountainbiking"
2,00fa94ad698dc4a9e4d708d6fd32f294,XHJTdq8QJp6_9oCj5hU85w,Vancouver Seawall,663.404115,,Vancouver,,101.0,5.0,hiking|parks,Hiking|Parks,"beaches,parks,bicyclepaths,mountainbiking"
3,00fa94ad698dc4a9e4d708d6fd32f294,EGZABxCmlA3PNwbSYXhLbA,Morton Park,426.335401,1800 Morton Avenue,Vancouver,V6G 1Z1,14.0,4.5,parks,Parks,"beaches,parks,bicyclepaths,mountainbiking"
4,00fa94ad698dc4a9e4d708d6fd32f294,AVulOVkLG2LIRaOdOAmdlA,Lost Lagoon,328.531508,Lagoon Dr,Vancouver,V6G,18.0,4.5,parks|lakes,Parks|Lakes,"beaches,parks,bicyclepaths,mountainbiking"
5,00fa94ad698dc4a9e4d708d6fd32f294,2CxBAbnFIOfjRASbWcHC4w,Stanley Park 2nd Beach Picnic Area,715.128216,Ceperly 2nd Beach,Vancouver,V6G 3E2,8.0,3.5,beaches,Beaches,"beaches,parks,bicyclepaths,mountainbiking"
6,00fa94ad698dc4a9e4d708d6fd32f294,4563XS_PrPJivPv_R5sW3Q,Alexandra Park,721.708957,1755 Beach avenue,Vancouver,V6E 1V3,1.0,5.0,parks,Parks,"beaches,parks,bicyclepaths,mountainbiking"
7,00fa94ad698dc4a9e4d708d6fd32f294,kRl_c-eObP6vf3KbJ0fulw,Movies in the Park,701.08881,Stanley Park Dr,Vancouver,V6G,6.0,4.5,parks|arts,Parks|Arts & Entertainment,"beaches,parks,bicyclepaths,mountainbiking"
8,00fa94ad698dc4a9e4d708d6fd32f294,42Tg2jf217mRb_rqYpKAbw,Stanley Park Shuffleboard Court Area - Gated O...,395.498272,2000 W Georgia Street,Vancouver,V6G,1.0,3.0,dog_parks,Dog Parks,"beaches,parks,bicyclepaths,mountainbiking"
9,00fa94ad698dc4a9e4d708d6fd32f294,TBcn1EwTCv3EsF4SEI3s4w,Lovers Walk Trail,1287.4175,Lovers Walk,Vancouver,V6G,2.0,5.0,hiking|parks,Hiking|Parks,"beaches,parks,bicyclepaths,mountainbiking"


# Comparing Results

Which API provided you with more complete data? Provide an explanation. 

#### Upon inspecting the datasets that came back, these are my overall impressions:

- Yelp has the better, richer data set
    - Example:  Yelp's 'coffee' for station_id '00fa94ad698dc4a9e4d708d6fd32f294' came back "maxxed out" at over 50 results, whereas Foursquare '13032' ('Cafe, Coffee and Tea House') for same location returned fewer results, within the 50 limit.
- Yelp has a slightly fuller dataset in that it provides Number of Ratings and Average Rating Values for each location
    - That being said, a major limitation on this project is the use of the "free tier" for the Developer API.
    - The API limit call (500 per day) means I cannot break the categories apart for more granular data (and to stay under the max 50 items per payload), or I will run out of calls.  The need to combine categories together in one API call means I am more likely to exceed the max number of items in payload, so my dataset's completeness and therefore reliability beceomes questionable.  Running a linear regression on "number of bikes available for rent at a given station, as a function of how many POIs within 1km radius of station" becomes problematic when many of the POI counts are at 50, but we don't know how much over 50.
- Because I had more calls on Foursquare available to me, I could make more calls for individual categories (especially breaking out the "Outdoor" categories into Museum/Public Art, Landmarks, Bike Trail, and Park.  This ability to request fewer categories for a given lat/long means I am less likely to hit the max number of items returned in payload, I'm not combining multiple subcategories together).  The Foursquare dataset therefore becomes more reliable even though it doesn't have ratings.

##### **Conclusion:  I will be using the Foursquare dataset primarily for my regression/statistical model building.**

#### Looking at the stats in yelp_df.info() vs. fsq_df.info(), and my experience tweaking the code to grab the payload data from the API:
- Yelp data is more complete (fewer missing/null values)
- FSQ data less complete, not all keys are present (esp. postal code or address)
- As the data cleaning steps showed, the FSQ data also appears to be more error-prone, with inconsistent data (a hotel in northern BC showed up as located in Vancouver, a cafe that doesn't appear to exist showed up with a Vancouver lat/long but city listed as Vernon, etc.)

#### Before even pulling the data, the below were my conclusions about which Company's data would be more promising, preferences for specific endpoints to hit for each company, and what the caveats would be about the data:

1. There is quite a difference in the number and granularity of categories (eye-ball), with Yelp seeming to be more granular and numerous in number of categories, but Foursquare having a category I liked, e.g. "Vacation Rental" which Yelp does not "break out" from "Hostels" or "Hotels":
- Yelp:  https://docs.developer.yelp.com/docs/resources-categories
- Foursquare:  https://location.foursquare.com/places/docs/categories

2.  Yelp has different flavours of Endpoints, a lot more different types of information.  Look at the different types of APIs:

##### Yelp:

APIs (Yelp Fusion) Overview:  https://docs.developer.yelp.com/docs/fusion-intro

- Can get Businesses, Reviews, Events, Available Categories, Brands, and Autocomplete (typeahead search service)
- Out of the above, the "Businesses Search" (https://docs.developer.yelp.com/reference/v3_business_search) seems most useful
    - Limitation:  Won't return any businesses without reviews <-- this might be a problem and/or biased and/or disinclude data that could be useful to us
    - Accepts lat & long
    - Has Max 50 limit
 - "Businesses Reviews" (https://docs.developer.yelp.com/reference/v3_business_reviews) could be useful, but only returns up to 3 review exercepts and does not return businesses without reviews, which could be limiting.
    -The fact it returns up to 3 reviews will skew our data because businesses that are very popular with more than 3 reviews won't be differentiable in the dataset.
      - It does not seem to return review ratings/numbers so it's not that easy or accurate to easily generate a sentiment, either, for use in a model
- Events Search (https://docs.developer.yelp.com/reference/v3_events_search) could be useful to see if more events in the immediate station radius would impact how many 
    - Before deciding to use, we would need to determine if it gives all events across the year, or only within a certain timeframe around the request time (which is less useful for statistical model)
      - 50 item max limit may be limiting


##### Foursquare: 

APIs Overview:  https://location.foursquare.com/developer/reference/api-overview

- Places API:  https://location.foursquare.com/developer/reference/places-api-overview#endpoints
    - Place Search (https://location.foursquare.com/developer/reference/place-search):  "Search for places in the FSQ Places database using a location and querying by name, category name, telephone number, taste label, or chain name. For example, search for "coffee" to get back a list of recommended coffee shops ... You may pass a location with your request by using one of the following options."
    - Place Details (https://location.foursquare.com/developer/reference/place-details):  "Retrieve comprehensive information and metadata for a FSQ Place using the fsq_id."
    - Place Photos (https://location.foursquare.com/developer/reference/place-photos):  "Retrieve photos for a FSQ Place using the fsq_id."
    - Place Tips (https://location.foursquare.com/developer/reference/place-tips):  "Retrieve tips for a FSQ Place using the fsq_id."
    - Place Match (https://location.foursquare.com/developer/reference/place-match): "Return the Foursquare record of a POI (via FSQ_ID) given a Name and Location. Provide a Location by using all the Address parameters, or by LL."
- Studio Data API (geospatial assets - not useful to us for this exercise)
- Geofence API (user-configured geofences - not useful to us for this exercise)

    - The Places API and **maybe** the Place Tips would be the most useful to us.  For Place Tips, would need to understand how many categories are available.
    - Max 50 limit in return string could be a limiting factor.



Get the top 10 restaurants according to their rating

In [29]:
# Presuming this is using the Yelp dataset because Foursquare does not provide ratings.

# I did not pull restaurants using Yelp because for my locations, most API calls were "maxing out" on numbers of even "coffee shops" being returned (more than 50),
# and hence did not think it would be accurate/useful for building a statistical model.

# I will answer your question getting Top 10 Parks according to their rating!

parks_filter = yelp_df['category_id'].str.contains('parks')
yelp_df[parks_filter].nlargest(10, 'rating')[['name', 'rating', 'address', 'category_id']]

Unnamed: 0,name,rating,address,category_id
0,Stanley Park,5.0,1166 Stanley Park Drive,parks
2,Vancouver Seawall,5.0,,hiking|parks
6,Alexandra Park,5.0,1755 Beach avenue,parks
9,Lovers Walk Trail,5.0,Lovers Walk,hiking|parks
33,Mount Pleasant Park,5.0,3161 Ontario Street,parks|playgrounds
49,Vancouver Seawall,5.0,,hiking|parks
52,Alexandra Park,5.0,1755 Beach avenue,parks
87,Creekside Park,5.0,1455 Quebec Street,parks
104,Granville Park,5.0,3001 Fir Street,parks|playgrounds|tennis
108,Angus Park,5.0,3600 Angus Drive,parks
