# W3P2 - PART3 - yelp_foursquareEDA

assignment file part 2

In [112]:
from datetime import datetime
import pandas as pd
import requests
import json
import os
import warnings
warnings.filterwarnings('ignore')

# Foursquare

Send a request to Foursquare with a small radius (1000m) for all the bike stations in your city of choice. Assuming the average comfortable walking speed below, the radius of 1km should list sites available within a 15min walking distance from each bike station.

$$
1km * hour/4km 
$$

In [113]:
# Import data from file
df = pd.read_csv('../data/df_ctybks_toronto.csv')
df.shape

(655, 19)

Create lat and long series from the ctyBks table:

In [114]:
lat = df['latitude']
long = df['longitude']
print(f'{lat[5]},{long[5]}')

43.657763,-79.389165


In [115]:
numStns = len(lat)

In [116]:
def fsqStationPOI(lat, long):
  '''
  input:
  output:
  '''
  api_key = os.environ["FOURSQUARE_API_KEY"]
  fields = 'fsq_id,name,geocodes,categories,distance,rating,hours,hours_popular,popularity,features'

  headers = {
      "accept": "application/json",
      "Authorization": api_key
  }

  fsq_url = (f'https://api.foursquare.com/v3/places/search?ll={lat},{long}&fields={fields}&radius=350&&limit=50')
  
  fsqStnPOI_json = requests.get(fsq_url, headers=headers).json()

  list_of_dict = []
  for poi in fsqStnPOI_json['results']:
    categories = poi.get('categories', None)
    cat_id = categories[0]['id'] if len(categories) > 0 else None
    category_name = categories[0]['name'] if len(categories) > 0 else None

    poi_dict = {
            'lat-long': f'{lat},{long}'
          , 'fsq_id': poi['fsq_id']
          , 'cat_id': cat_id
          , 'category_name': category_name
          , 'categories': categories
          , 'name': poi.get('name', None)
          , 'distance': poi.get('distance', None)
          , 'latitude': poi.get('geocodes', {}).get('main', {}).get('latitude', None)
          , 'longitude': poi.get('geocodes', {}).get('main', {}).get('longitude', None)
          , 'address': poi.get('location', {}).get('formatted_address', None)
          , 'popularity': poi.get('popularity', None)
          , 'open_now': poi.get('hours', {}).get('open_now', None)
          , 'rating': poi.get('rating', None)
          , 'rating_count': poi.get('stats', {}).get('total_ratings', None)
          , 'my_timestamp': datetime.now()
      }
      
    list_of_dict.append(poi_dict)
  return pd.DataFrame(list_of_dict)


In [117]:
print(f'{lat[5]},{long[5]}')

43.657763,-79.389165


In [118]:
# Test
fsqStnPOI_df = fsqStationPOI(lat[5], long[5])
fsqStnPOI_df.head(1)

Unnamed: 0,lat-long,fsq_id,cat_id,category_name,categories,name,distance,latitude,longitude,address,popularity,open_now,rating,rating_count,my_timestamp
0,"43.657763,-79.389165",5187d8e9498e1f88b1f50ec7,17018,Bookstore,"[{'id': 17018, 'name': 'Bookstore', 'icon': {'...",Indigo,40,43.657694,-79.389932,,0.914967,False,6.9,,2022-12-05 09:31:42.228305


Generate a list of DataFrames now for each station site:

In [119]:
fsq_dfs_list = []
for i in range(numStns):
  poi_df = fsqStationPOI(lat[i], long[i])
  fsq_dfs_list.append(poi_df)

Concatenate the DataFrames to create one large one.

In [120]:
df_fsq = pd.DataFrame()
x = pd.DataFrame()

# concatenate full df object of all stn site data:
for poi_df in fsq_dfs_list:
    x = pd.concat([df_fsq, poi_df])
    df_fsq = x

In [121]:
df_fsq.shape

(29955, 15)

In [122]:
# df_fsq = pd.read_csv('../data/fsqPOI/df_fsq_sun00h.csv')

## Cleaning and Normalizing the nested table values:

The function which created the DataFrame made sure that each POI had at least one category name and ID, but we can see above that many have two or even three which can be normalized to flatten the hierarchy.

In [123]:
# Normalize the categories:
df_fsqcat = pd.json_normalize(df_fsq['categories'][0])
df_fsqcat.head()

Unnamed: 0,0,1,2
0,"{'id': 17065, 'name': 'Farmers' Market', 'icon...","{'id': 17069, 'name': 'Grocery Store / Superma...",
1,"{'id': 16037, 'name': 'Playground', 'icon.pref...",,
2,"{'id': 17119, 'name': 'Bicycle Store', 'icon.p...",,
3,"{'id': 13309, 'name': 'Middle Eastern Restaura...",,
4,"{'id': 13276, 'name': 'Sushi Restaurant', 'ico...",,


In [124]:
# normalize the resulting 3 columns into 3 tables in the example below:
df_fsqcat0 = pd.json_normalize(df_fsqcat[0])
df_fsqcat1 = pd.json_normalize(df_fsqcat[1])
df_fsqcat2 = pd.json_normalize(df_fsqcat[2])
df_fsqcat0.head()

Unnamed: 0,id,name,icon.prefix,icon.suffix
0,17065.0,Farmers' Market,https://ss3.4sqi.net/img/categories_v2/shops/f...,.png
1,16037.0,Playground,https://ss3.4sqi.net/img/categories_v2/parks_o...,.png
2,17119.0,Bicycle Store,https://ss3.4sqi.net/img/categories_v2/shops/b...,.png
3,13309.0,Middle Eastern Restaurant,https://ss3.4sqi.net/img/categories_v2/food/mi...,.png
4,13276.0,Sushi Restaurant,https://ss3.4sqi.net/img/categories_v2/food/su...,.png


In [125]:
# Add new category labels back onto the df: 
df_fsq['cat1_id'] = (df_fsqcat0['id'].astype('Int64'))
df_fsq['cat1_name'] = df_fsqcat0['name']
df_fsq['cat1_icon'] = df_fsqcat0['icon.prefix']
df_fsq['cat1_icon.suffix'] = df_fsqcat0['icon.suffix']

df_fsq['cat2_id'] = df_fsqcat1['id'].astype('Int64')
df_fsq['cat2_name'] = df_fsqcat1['name']
df_fsq['cat2_icon'] = df_fsqcat1['icon.prefix']
df_fsq['cat2_icon.suffix'] = df_fsqcat1['icon.suffix']

df_fsq['cat3_id'] = df_fsqcat2['id'].astype('Int64')
df_fsq['cat3_name'] = df_fsqcat2['name']
df_fsq['cat3_icon'] = df_fsqcat2['icon.prefix']
df_fsq['cat3_icon.suffix'] = df_fsqcat2['icon.suffix']

# Remove the now redundant 'category' and 'name' columns:
df_fsq = df_fsq.drop(['categories', 'category_name', 'cat_id', ], axis = 1)

df_fsq.head(1)

Unnamed: 0,lat-long,fsq_id,name,distance,latitude,longitude,address,popularity,open_now,rating,...,cat1_icon,cat1_icon.suffix,cat2_id,cat2_name,cat2_icon,cat2_icon.suffix,cat3_id,cat3_name,cat3_icon,cat3_icon.suffix
0,"43.665269,-79.319796",4deb8ba688774880e3387c0c,Leslieville Farmers Market,40,43.664679,-79.319687,,0.86858,False,8.6,...,https://ss3.4sqi.net/img/categories_v2/shops/f...,.png,17069,Grocery Store / Supermarket,https://ss3.4sqi.net/img/categories_v2/shops/f...,.png,,,,


#### SAVE IT!!

In [126]:
# df_fsq.to_csv(f'../data/fsqPOI/df_fsq{datetime.now()}.csv', index=False)
df_fsq.to_csv(f'../data/fsqPOI/df_fsq_mon1130.csv', index=False)

Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

### FOURSQUARE POI:

In [127]:
# Clone a working copy:
dff = df_fsq.copy()

# Drop columns from our copy df not currently needed:
dff = dff.drop(['fsq_id', 'cat1_icon', 'cat2_icon', 'cat3_icon', 'cat1_icon.suffix', 'cat2_icon.suffix', 'cat3_icon.suffix', 'address'], axis = 1)
dff.head(2)

Unnamed: 0,lat-long,name,distance,latitude,longitude,popularity,open_now,rating,rating_count,my_timestamp,cat1_id,cat1_name,cat2_id,cat2_name,cat3_id,cat3_name
0,"43.665269,-79.319796",Leslieville Farmers Market,40,43.664679,-79.319687,0.86858,False,8.6,,2022-12-05 09:31:42.842959,17065,Farmers' Market,17069.0,Grocery Store / Supermarket,,
1,"43.665269,-79.319796",McDonald's,62,43.664993,-79.319222,0.995501,True,7.4,,2022-12-05 09:31:42.842978,16037,Playground,,,,


Convert the timestamp data to the Toronto's time zone:

In [128]:
from datetime import datetime
import pytz

In [129]:
# create both timezone objects
old_timezone = pytz.timezone("US/Mountain")
new_timezone = pytz.timezone("US/Eastern")

# Confirm current timezone
print(dff['my_timestamp'].iloc[0])

# two-step process
for i in range(dff.shape[0]):
  localized_timestamp = old_timezone.localize(dff['my_timestamp'].iloc[i])
  dff['my_timestamp'].iloc[i] = localized_timestamp.astimezone(new_timezone)
  dff['my_timestamp'].iloc[i] = localized_timestamp.astimezone(new_timezone)


# Has converted to new timezone:
print(dff['my_timestamp'].iloc[0])

2022-12-05 09:31:42.842959
2022-12-05 11:31:42.842959-05:00


Put your parsed results into a DataFrame

In [130]:
dff.to_csv(f'../data/stats_mod/df_fsq_mon1130h.csv', index= False)

dff.head(5)

Unnamed: 0,lat-long,name,distance,latitude,longitude,popularity,open_now,rating,rating_count,my_timestamp,cat1_id,cat1_name,cat2_id,cat2_name,cat3_id,cat3_name
0,"43.665269,-79.319796",Leslieville Farmers Market,40,43.664679,-79.319687,0.86858,False,8.6,,2022-12-05 11:31:42.842959-05:00,17065,Farmers' Market,17069.0,Grocery Store / Supermarket,,
1,"43.665269,-79.319796",McDonald's,62,43.664993,-79.319222,0.995501,True,7.4,,2022-12-05 11:31:42.842978-05:00,16037,Playground,,,,
2,"43.665269,-79.319796",Jonathan Ashbridge Park,63,43.664672,-79.31978,0.952623,False,,,2022-12-05 11:31:42.842985-05:00,17119,Bicycle Store,,,,
3,"43.665269,-79.319796",Ashdale Medical Centre,73,43.665967,-79.318955,0.575147,True,,,2022-12-05 11:31:42.842992-05:00,13309,Middle Eastern Restaurant,,,,
4,"43.665269,-79.319796",Chick-N-Joy,129,43.665121,-79.321347,0.035783,True,6.9,,2022-12-05 11:31:42.842999-05:00,13276,Sushi Restaurant,,,,


# Yelp

Send a request to Yelp with a small radius (1000m) for all the bike stations in your city of choice. 

In [131]:
import requests
import os

def yelpStationPOI(lat, long):
  '''
  input:
  output:
  '''
  api_key = os.environ["YELP_API_KEY"]
  yelp_url = (f'https://api.yelp.com/v3/businesses/search?latitude={lat}&longitude={long}&radius=350&limit=50')
  payload={}
  headers = {
      "accept": "application/json",
      "Authorization": f'Bearer {api_key}'
  }

  yelpStnPOI_json = requests.get(yelp_url, headers=headers, data=payload).json()           #Perform get request
  
  list_of_dict = []
  for poi in yelpStnPOI_json['businesses']:
    poi_dict = {
          'lat-long': f'{lat},{long}'
          , 'category_name': poi['categories'][0]['alias']
          , 'categories': poi['categories']
          , 'name': poi['name']
          , 'distance': poi['distance']
          , 'latitude': poi['coordinates']['latitude']
          , 'longitude': poi['coordinates']['longitude']
          , 'address': poi['location']['display_address']
          , 'price': poi.get('price', None)
          , 'is_closed': poi.get('is_closed')
          , 'rating': poi.get('rating')
          , 'rating_count': poi.get('review_count')
          , 'my_timestamp': datetime.now()
      }
      
    list_of_dict.append(poi_dict)
  return pd.DataFrame(list_of_dict)


In [132]:
# Test
yelpStnPOI_dfTest = yelpStationPOI(lat[5], long[5])
yelpStnPOI_dfTest.head(1)

Unnamed: 0,lat-long,category_name,categories,name,distance,latitude,longitude,address,price,is_closed,rating,rating_count,my_timestamp
0,"43.657763,-79.389165",ramen,"[{'alias': 'ramen', 'title': 'Ramen'}, {'alias...",Sansotei Ramen,376.064159,43.655,-79.38643,"[179 Dundas Street W, Toronto, ON M5G 1Z8, Can...",$$,False,4.0,925,2022-12-05 09:39:10.294035


In [134]:
yelp_dfs_list = []
for i in range(numStns):
  poi_df = yelpStationPOI(lat[i], long[i])
  yelp_dfs_list.append(poi_df)

In [None]:
df_yelp = pd.DataFrame()
x = pd.DataFrame()

# concatenate full df object of all stn site data:
for poi_df in yelp_dfs_list:
    x = pd.concat([df_yelp, poi_df])
    df_yelp = x

In [None]:
df_yelp.head(2)

Unnamed: 0,lat-long,category_name,categories,name,distance,latitude,longitude,address,price,is_closed,rating,rating_count,my_timestamp
0,"43.665269,-79.319796",burgers,"[{'alias': 'burgers', 'title': 'Burgers'}]",The Burger's Priest,374.238073,43.6667,-79.315585,"[1636 Queen Street E, Toronto, ON M4L 1G3, Can...",$$,False,3.5,498,2022-12-05 04:14:43.742563
1,"43.665269,-79.319796",hotdogs,"[{'alias': 'hotdogs', 'title': 'Fast Food'}, {...",Chick-N-Joy,125.483132,43.66509,-79.32132,"[1483 Queen Street E, Toronto, ON M4L 1E2, Can...",$,False,4.0,54,2022-12-05 04:14:43.742587


In [None]:
df_yelp.shape

(18659, 13)

## Cleaning and Normalizing the nested table values:

The function which created the DataFrame made sure that each POI had at least one category name and alias, but we can see above that many have two or even three which can be normalized to flatten the hierarchy.

In [None]:
# Normalize the categories:
df_yelpcat = pd.json_normalize(df_yelp['categories'][0])
df_yelpcat.head()

Unnamed: 0,0,1,2
0,"{'alias': 'burgers', 'title': 'Burgers'}",,
1,"{'alias': 'chickenshop', 'title': 'Chicken Shop'}",,
2,"{'alias': 'burgers', 'title': 'Burgers'}",,
3,"{'alias': 'newcanadian', 'title': 'Canadian (N...",,
4,"{'alias': 'japanese', 'title': 'Japanese'}","{'alias': 'sushi', 'title': 'Sushi Bars'}","{'alias': 'tapasmallplates', 'title': 'Tapas/S..."


In [None]:
# normalize the resulting 3 columns into 3 tables in the example below:
df_yelpcat0 = pd.json_normalize(df_yelpcat[0])
df_yelpcat1 = pd.json_normalize(df_yelpcat[1])
df_yelpcat2 = pd.json_normalize(df_yelpcat[2])
df_yelpcat0.head()

Unnamed: 0,alias,title
0,burgers,Burgers
1,chickenshop,Chicken Shop
2,burgers,Burgers
3,newcanadian,Canadian (New)
4,japanese,Japanese


In [None]:
# Add new category labels back onto the df: 
df_yelp['cat1_alias'] = df_yelpcat0['alias']
df_yelp['cat1_title'] = df_yelpcat0['title']

df_yelp['cat2_alias'] = df_yelpcat1['alias']
df_yelp['cat2_title'] = df_yelpcat1['title']

df_yelp['cat3_alias'] = df_yelpcat2['alias']
df_yelp['cat3_title'] = df_yelpcat2['title']

# Remove the now redundant 'category' and 'name' columns:
df_yelp = df_yelp.drop(['categories', 'category_name'], axis = 1)

df_yelp.head(1)

Unnamed: 0,lat-long,name,distance,latitude,longitude,address,price,is_closed,rating,rating_count,my_timestamp,cat1_alias,cat1_title,cat2_alias,cat2_title,cat3_alias,cat3_title
0,"43.665269,-79.319796",The Burger's Priest,374.238073,43.6667,-79.315585,"[1636 Queen Street E, Toronto, ON M4L 1G3, Can...",$$,False,3.5,498,2022-12-05 04:14:43.742563,burgers,Burgers,,,,


### SAVE IT!!!

In [None]:
# df_yelp.to_csv(f'../data/yelpPOI/df_yelp{datetime.now()}.csv', index=False)
df_yelp.to_csv(f'../data/yelpPOI/df_yelp_mon1130h.csv', index=False)

Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

### YELP POI:

In [None]:
# Clone a working copy:
df = df_yelp.copy()

the addresses look chunky which diminishes the aesthetic of the table, and I expect this format might be useful for printing them in mailing format.  I won't be using it for my analysis at this point anyway, but I would certainly research this more before I made decisions about changing the format.  


In [None]:
# Drop columns from our copy df not currently needed:
df = df.drop(['address', 'cat1_alias', 'cat2_alias', 'cat3_alias'], axis = 1)
df.head(2)

Unnamed: 0,lat-long,name,distance,latitude,longitude,price,is_closed,rating,rating_count,my_timestamp,cat1_title,cat2_title,cat3_title
0,"43.665269,-79.319796",The Burger's Priest,374.238073,43.6667,-79.315585,$$,False,3.5,498,2022-12-05 04:14:43.742563,Burgers,,
1,"43.665269,-79.319796",Chick-N-Joy,125.483132,43.66509,-79.32132,$,False,4.0,54,2022-12-05 04:14:43.742587,Chicken Shop,,


flip the "is_closed" column to remove the confusing double negative, and enable foursquare comparison:

In [None]:
df['is_closed'] = df['is_closed'] == False
# Rename the heading to match:
df = df.rename(columns = {'is_closed':'is_open'})
df.head(2)

Unnamed: 0,lat-long,name,distance,latitude,longitude,price,is_open,rating,rating_count,my_timestamp,cat1_title,cat2_title,cat3_title
0,"43.665269,-79.319796",The Burger's Priest,374.238073,43.6667,-79.315585,$$,True,3.5,498,2022-12-05 04:14:43.742563,Burgers,,
1,"43.665269,-79.319796",Chick-N-Joy,125.483132,43.66509,-79.32132,$,True,4.0,54,2022-12-05 04:14:43.742587,Chicken Shop,,


Convert the timestamp data to the Toronto's time zone:

In [None]:
from datetime import datetime
import pytz

In [None]:
# create both timezone objects
old_timezone = pytz.timezone("US/Mountain")
new_timezone = pytz.timezone("US/Eastern")

# Confirm current timezone
print(df['my_timestamp'].iloc[0])

# two-step process
for i in range(df.shape[0]):
  localized_timestamp = old_timezone.localize(df['my_timestamp'].iloc[i])
  df['my_timestamp'].iloc[i] = localized_timestamp.astimezone(new_timezone)
  df['my_timestamp'].iloc[i] = localized_timestamp.astimezone(new_timezone)


# Has converted to new timezone:
print(df['my_timestamp'].iloc[0])

2022-12-05 04:14:43.742563
2022-12-05 06:14:43.742563-05:00


In [None]:
df['distance'] = round(df['distance'], 1)
df.head(2)

Unnamed: 0,lat-long,name,distance,latitude,longitude,price,is_open,rating,rating_count,my_timestamp,cat1_title,cat2_title,cat3_title
0,"43.665269,-79.319796",The Burger's Priest,374.2,43.6667,-79.315585,$$,True,3.5,498,2022-12-05 06:14:43.742563-05:00,Burgers,,
1,"43.665269,-79.319796",Chick-N-Joy,125.5,43.66509,-79.32132,$,True,4.0,54,2022-12-05 06:14:43.742587-05:00,Chicken Shop,,


In [None]:
test = df['my_timestamp'].iloc[0]
print(test)
print(datetime.date(test))
print(datetime.time(test))
print(len('2022-12-04 00:09'))

2022-12-05 06:14:43.742563-05:00
2022-12-05
06:14:43.742563
16


##### Categorical encoding: price


In [None]:
print(df['price'].value_counts())
print(df['price'].isna().value_counts())
df['price'].unique()

$$      7731
$       3170
$$$     1248
$$$$     269
Name: price, dtype: int64
False    12418
True      6241
Name: price, dtype: int64


array(['$$', '$', None, '$$$', '$$$$'], dtype=object)

In [None]:
# order in array above will be same as the keys,
keys = df['price'].unique()
# Setting ordinal numerical values to match the order:
values = [2, 1, 3, None, 4]
price_map = dict(zip(keys, values))
price_map

{'$$': 2, '$': 1, None: 3, '$$$': None, '$$$$': 4}

In [None]:
# Replace
df['price'] = df['price'].map(price_map)
# Validate counts unaffected:
print(df['price'].value_counts())

df.head(1)

2.0    7731
3.0    6241
1.0    3170
4.0     269
Name: price, dtype: int64


Unnamed: 0,lat-long,name,distance,latitude,longitude,price,is_open,rating,rating_count,my_timestamp,cat1_title,cat2_title,cat3_title
0,"43.665269,-79.319796",The Burger's Priest,374.2,43.6667,-79.315585,2.0,True,3.5,498,2022-12-05 06:14:43.742563-05:00,Burgers,,


Put your parsed results into a DataFrame

In [None]:
yorkUniversity = [7759, 7758, 7756, 7679, 7585, 7586, 7584, 7759, 7679, 7678, 7587, 7588, 7589, 7590]
yorkUni = (7759, 7758, 7756, 7679, 7585, 7586, 7584, 7759, 7679, 7678, 7587, 7588, 7589, 7590)

scarboroughUni = [7612, 7613, 7624, 7614, 7615, 7616, 7617, 7645, 7626]
scarUni = (7612, 7613, 7624, 7614, 7615, 7616, 7617, 7645, 7626)

uni = (7612, 7613, 7624, 7614, 7615, 7616, 7617, 7645, 7626, 7759, 7758, 7756, 7679, 7585, 7586, 7584, 7759, 7679, 7678, 7587, 7588, 7589, 7590)

In [None]:
yelpPOIcoord = df[['latitude', 'longitude', 'lat-long']]
df[['latitude', 'longitude', 'lat-long']].to_csv(f'../data/df_yelp_POIs_for_map.csv', index= False)

fsqPOIcoord = dff[['latitude', 'longitude', 'lat-long']]
dff[['latitude', 'longitude', 'lat-long']].to_csv(f'../data/df_fsq_POIs_for_map.csv', index= False)


In [None]:
fsqPOIcoord_cafe = fsqPOIcoord[dff['cat1_id']==13034]
fsqPOIcoord_cafe.sort_values('latitude', inplace=True)
fsqPOIcoord_cafe.drop_duplicates(keep=False, inplace=True)
fsqPOIcoord_cafe.to_csv(f'../data/df_fsqCafe_POIs_for_map.csv', index= False)

fsqPOIcoord_cafe.shape

(1736, 3)

In [None]:

fsqPOIcoord_parks = fsqPOIcoord[dff['cat1_id']==16032]
fsqPOIcoord_parks.sort_values('latitude', inplace=True)
fsqPOIcoord_parks.drop_duplicates(keep=False, inplace=True)
fsqPOIcoord_parks.to_csv(f'../data/df_fsqparks_POIs_for_map.csv', index= False)

fsqPOIcoord_parks.shape

(1108, 3)

In [None]:
yelpPOIcoord_lounges.to_csv(f'../data/df_yelpLounges_POIs_for_map.csv', index= False)


In [None]:
yelpPOIcoord_lounges = yelpPOIcoord[df['cat1_title']=='Lounges']
yelpPOIcoord_lounges.shape

(0, 3)

In [None]:
yelpPOIcoord_lounges.to_csv(f'../data/df_yelpLounges_POIs_for_map.csv', index= False)


In [None]:
df.to_csv(f'../data/df_yelp_mon1130h.csv', index= False)

df.head(5)

Unnamed: 0,lat-long,name,distance,latitude,longitude,price,is_open,rating,rating_count,my_timestamp,cat1_title,cat2_title,cat3_title
0,"43.665269,-79.319796",The Burger's Priest,374.2,43.6667,-79.315585,2.0,True,3.5,498,2022-12-05 06:14:43.742563-05:00,Burgers,,
1,"43.665269,-79.319796",Chick-N-Joy,125.5,43.66509,-79.32132,1.0,True,4.0,54,2022-12-05 06:14:43.742587-05:00,Chicken Shop,,
2,"43.665269,-79.319796",O Sushi,277.6,43.66654,-79.31693,2.0,True,4.0,57,2022-12-05 06:14:43.742593-05:00,Burgers,,
3,"43.665269,-79.319796",Jaclyn's,172.4,43.66627,-79.31802,3.0,True,4.5,14,2022-12-05 06:14:43.742599-05:00,Canadian (New),,
4,"43.665269,-79.319796",Burrito Bandidos,286.2,43.666561,-79.316422,2.0,True,3.5,57,2022-12-05 06:14:43.742604-05:00,Japanese,Sushi Bars,Tapas/Small Plates


In [None]:
df.to_csv(f'../data/stats_mod/df_yelp_mon1130h.csv', index=False)


# Comparing Results

Which API provided you with more complete data? Provide an explanation. 

#### Number of POIs per bike station site:

Both API get request limits were set to 50, but you can see below that yelp was less able to meet that max.  

In [None]:
df['lat-long'].value_counts().tail(10)

43.697490048103,-79.25655146462776      1
43.7777114,-79.1659711                  1
43.779353,-79.193023                    1
43.6615467,-79.498398                   1
43.720645,-79.355474                    1
43.6954486,-79.4026176                  1
43.693256,-79.383238                    1
43.69413548199171,-79.30058718775287    1
43.6750806,-79.3068697                  1
43.63855,-79.46693                      1
Name: lat-long, dtype: int64

In [None]:
dff['lat-long'].value_counts().tail(10)

43.63771,-79.458173                      8
43.7611741,-79.5114452                   8
43.7027268,-79.33368089999999            7
43.688148,-79.362101                     6
43.6314,-79.4304                         6
43.7561681,-79.2026694                   5
43.788319,-79.123505                     5
43.720233,-79.362092                     4
43.645835578756525,-79.32088910859937    1
43.720645,-79.355474                     1
Name: lat-long, dtype: int64

This is likely because the yelp API appears to restrict it's results to food or restaurant vendors, where as foursquare returned a broader range of venues:

In [None]:

print(df['cat1_title'].nunique())
df['cat1_title'].value_counts()

31


Japanese                     1655
Ramen                        1523
Burgers                      1210
Pizza                        1148
Canadian (New)                862
Cafes                         799
Chinese                       793
Seafood                       730
Mexican                       645
Barbeque                      610
Venezuelan                    609
Chicken Shop                  598
Bakeries                      504
Coffee Roasteries             501
Sandwiches                    498
Tapas/Small Plates            481
Breakfast & Brunch            471
Museums                       463
Dog Parks                     454
Vietnamese                    436
Ice Cream & Frozen Yogurt     421
Gluten-Free                   412
Caterers                      399
Sushi Bars                    359
Golf                          350
Filipino                      341
Middle Eastern                325
Comfort Food                  317
Indian                        272
Parks         

In [None]:
dff['cat1_name'].nunique()
dff['cat1_name'].value_counts()

Pizzeria                           1825
Restaurant                         1810
Café                               1750
Bookstore                          1287
Coffee Shop                        1260
Farmers' Market                    1246
Convenience Store                  1199
Sushi Restaurant                   1186
Japanese Restaurant                1184
Park                               1120
Dessert Shop                       1120
Middle Eastern Restaurant           652
Bicycle Store                       652
Playground                          652
Juice Bar                           647
BBQ Joint                           646
Italian Restaurant                  639
Other Great Outdoors                629
Bistro                              627
Car Wash and Detail                 619
Professional Cleaning Service       614
Health and Medicine                 613
Beer Bar                            607
Burrito Restaurant                  605
Bubble Tea Shop                     584


In [None]:
# Category titles:
dff2 = dff.copy()
dff2['catType'] = ((dff['cat1_id']//1000)*1000)

dff2[['catType', 'cat1_name']].groupby('catType').count()


Unnamed: 0_level_0,cat1_name
catType,Unnamed: 1_level_1
11000,2915
12000,555
13000,17409
15000,613
16000,2401
17000,5478


- 10000:	Arts and Entertainment
- 11000:  Business and Professional Services
- 13000:  Dining and Drinking
- 16000:  Landmarks and Outdoors
- 17000:  Retail


In [None]:
# Category titles:
dff2[['cat1_id', 'cat1_name', 'rating']].groupby(['cat1_id', 'cat1_name']).count().sort_values('rating', ascending=False).head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
cat1_id,cat1_name,Unnamed: 2_level_1
17065,Farmers' Market,660
13065,Restaurant,610
13064,Pizzeria,587
13034,Café,552
17018,Bookstore,547
13276,Sushi Restaurant,470
13035,Coffee Shop,428
16037,Playground,417
17029,Convenience Store,395
13263,Japanese Restaurant,382


foursquare gives a much wider range of venues to research rather than just restaurants this data set for the bike station sites from foursquare also includes parks, farmers' markets, bike shops, music venues, and other misc stores.

### Get the top 10 restaurants according to their rating

using foursquare:

In [None]:
restaurants = dff2[dff2['cat1_id'].between(13062, 13380)]
restaurants[['name', 'rating']].sort_values('rating', ascending=False).head(15)

Unnamed: 0,name,rating
3,Fiesta Farms,9.5
3,Booster Juice,9.5
9,Vdev,9.4
3,Vdev,9.4
9,Cumbrae's,9.4
49,Vdev,9.4
35,Hawk and Chick Food Inc,9.4
45,Montgomery's,9.3
4,Food & Liquor,9.3
21,Montgomery's,9.3


more time might need to be spend understanding how exactly their category id classification system works, but the range chose above references their [category taxonomy table](https://location.foursquare.com/places/docs/categories) for the id numbers categorized by:
>Dining and Drinking > Restaurant >

using the Yelp results:

In [None]:
print(df['rating'].nunique())
df[['rating', 'name']].groupby('rating').count()

9


Unnamed: 0_level_0,name
rating,Unnamed: 1_level_1
1.0,209
1.5,184
2.0,534
2.5,912
3.0,1861
3.5,4705
4.0,5890
4.5,3237
5.0,1127


this suggests any 10 of the 1057 '5.0' rated venues could be in the top ten.  So I'll account for popularity or frequncy of 5.0 ratings,

In [None]:
topRate = df[df['rating']==5.0]
topRate[['name', 'rating_count']].groupby('name').count().sort_values('rating_count', ascending=False).head(10)

Unnamed: 0_level_0,rating_count
name,Unnamed: 1_level_1
Starbucks,13
Gong Cha,10
Fahrenheit Coffee,9
Old Town Bodega,9
Gurume Sushi,9
Jamican,8
The Maker Bean Cafe,8
Super Wash N Dry Coin Laundry,8
Little Canada,8
Tim Hortons,8
