# W3P2 - PART3 - yelp_foursquareEDA

assignment file part 2

In [1]:
from datetime import datetime
import pandas as pd
import requests
import json
import os
import warnings
warnings.filterwarnings('ignore')

# Foursquare

Send a request to Foursquare with a small radius (1000m) for all the bike stations in your city of choice. Assuming the average comfortable walking speed below, the radius of 1km should list sites available within a 15min walking distance from each bike station.

$$
1km * hour/4km 
$$

In [2]:
# Import data from file
df = pd.read_csv('../data/df_ctybks_toronto.csv')
df.shape

(655, 19)

Create lat and long series from the ctyBks table:

In [3]:
lat = df['latitude']
long = df['longitude']
print(f'{lat[5]},{long[5]}')

43.657763,-79.389165


In [4]:
numStns = len(lat)

In [5]:
def fsqStationPOI(lat, long):
  '''
  input:
  output:
  '''
  api_key = os.environ["FOURSQUARE_API_KEY"]
  fields = 'fsq_id,name,geocodes,categories,distance,rating,hours,hours_popular,popularity,features'

  headers = {
      "accept": "application/json",
      "Authorization": api_key
  }

  fsq_url = (f'https://api.foursquare.com/v3/places/search?ll={lat},{long}&fields={fields}&radius=1000&&limit=50')
  
  fsqStnPOI_json = requests.get(fsq_url, headers=headers).json()

  list_of_dict = []
  for poi in fsqStnPOI_json['results']:
    categories = poi.get('categories', None)
    cat_id = categories[0]['id'] if len(categories) > 0 else None
    category_name = categories[0]['name'] if len(categories) > 0 else None

    poi_dict = {
            'lat-long': f'{lat},{long}'
          , 'fsq_id': poi['fsq_id']
          , 'cat_id': cat_id
          , 'category_name': category_name
          , 'categories': categories
          , 'name': poi.get('name', None)
          , 'distance': poi.get('distance', None)
          , 'latitude': poi.get('geocodes', {}).get('main', {}).get('latitude', None)
          , 'longitude': poi.get('geocodes', {}).get('main', {}).get('longitude', None)
          , 'address': poi.get('location', {}).get('formatted_address', None)
          , 'popularity': poi.get('popularity', None)
          , 'open_now': poi.get('hours', {}).get('open_now', None)
          , 'rating': poi.get('rating', None)
          , 'rating_count': poi.get('stats', {}).get('total_ratings', None)
          , 'my_timestamp': datetime.now()
      }
      
    list_of_dict.append(poi_dict)
  return pd.DataFrame(list_of_dict)


In [6]:
print(f'{lat[5]},{long[5]}')

43.657763,-79.389165


In [7]:
# Test
fsqStnPOI_df = fsqStationPOI(lat[5], long[5])
fsqStnPOI_df.head(1)

Unnamed: 0,lat-long,fsq_id,cat_id,category_name,categories,name,distance,latitude,longitude,address,popularity,open_now,rating,rating_count,my_timestamp
0,"43.657763,-79.389165",537d4d6d498ec171ba22e7fe,13034,Café,"[{'id': 13034, 'name': 'Café', 'icon': {'prefi...",Jimmy's Coffee,294,43.658525,-79.385436,,0.972089,False,8.5,,2022-12-04 20:59:14.714662


Generate a list of DataFrames now for each station site:

In [8]:
fsq_dfs_list = []
for i in range(numStns):
  poi_df = fsqStationPOI(lat[i], long[i])
  fsq_dfs_list.append(poi_df)

Concatenate the DataFrames to create one large one.

In [9]:
df_fsq = pd.DataFrame()
x = pd.DataFrame()

# concatenate full df object of all stn site data:
for poi_df in fsq_dfs_list:
    x = pd.concat([df_fsq, poi_df])
    df_fsq = x

In [10]:
df_fsq.shape

(32709, 15)

In [316]:
# df_fsq = pd.read_csv('../data/fsqPOI/df_fsq_sun00h.csv')

## Cleaning and Normalizing the nested table values:

The function which created the DataFrame made sure that each POI had at least one category name and ID, but we can see above that many have two or even three which can be normalized to flatten the hierarchy.

In [11]:
# Normalize the categories:
df_fsqcat = pd.json_normalize(df_fsq['categories'][0])
df_fsqcat.head()

Unnamed: 0,0,1,2
0,"{'id': 17065, 'name': 'Farmers' Market', 'icon...","{'id': 17069, 'name': 'Grocery Store / Superma...",
1,"{'id': 13035, 'name': 'Coffee Shop', 'icon.pre...",,
2,"{'id': 17119, 'name': 'Bicycle Store', 'icon.p...",,
3,"{'id': 10039, 'name': 'Music Venue', 'icon.pre...",,
4,"{'id': 13011, 'name': 'Gay Bar', 'icon.prefix'...","{'id': 13016, 'name': 'Lounge', 'icon.prefix':...","{'id': 13065, 'name': 'Restaurant', 'icon.pref..."


In [12]:
# normalize the resulting 3 columns into 3 tables in the example below:
df_fsqcat0 = pd.json_normalize(df_fsqcat[0])
df_fsqcat1 = pd.json_normalize(df_fsqcat[1])
df_fsqcat2 = pd.json_normalize(df_fsqcat[2])
df_fsqcat0.head()

Unnamed: 0,id,name,icon.prefix,icon.suffix
0,17065.0,Farmers' Market,https://ss3.4sqi.net/img/categories_v2/shops/f...,.png
1,13035.0,Coffee Shop,https://ss3.4sqi.net/img/categories_v2/food/co...,.png
2,17119.0,Bicycle Store,https://ss3.4sqi.net/img/categories_v2/shops/b...,.png
3,10039.0,Music Venue,https://ss3.4sqi.net/img/categories_v2/arts_en...,.png
4,13011.0,Gay Bar,https://ss3.4sqi.net/img/categories_v2/nightli...,.png


In [13]:
# Add new category labels back onto the df: 
df_fsq['cat1_id'] = (df_fsqcat0['id'].astype('Int64'))
df_fsq['cat1_name'] = df_fsqcat0['name']
df_fsq['cat1_icon'] = df_fsqcat0['icon.prefix']
df_fsq['cat1_icon.suffix'] = df_fsqcat0['icon.suffix']

df_fsq['cat2_id'] = df_fsqcat1['id'].astype('Int64')
df_fsq['cat2_name'] = df_fsqcat1['name']
df_fsq['cat2_icon'] = df_fsqcat1['icon.prefix']
df_fsq['cat2_icon.suffix'] = df_fsqcat1['icon.suffix']

df_fsq['cat3_id'] = df_fsqcat2['id'].astype('Int64')
df_fsq['cat3_name'] = df_fsqcat2['name']
df_fsq['cat3_icon'] = df_fsqcat2['icon.prefix']
df_fsq['cat3_icon.suffix'] = df_fsqcat2['icon.suffix']

# Remove the now redundant 'category' and 'name' columns:
df_fsq = df_fsq.drop(['categories', 'category_name', 'cat_id', ], axis = 1)

df_fsq.head(1)

Unnamed: 0,lat-long,fsq_id,name,distance,latitude,longitude,address,popularity,open_now,rating,...,cat1_icon,cat1_icon.suffix,cat2_id,cat2_name,cat2_icon,cat2_icon.suffix,cat3_id,cat3_name,cat3_icon,cat3_icon.suffix
0,"43.665269,-79.319796",4deb8ba688774880e3387c0c,Leslieville Farmers Market,40,43.664679,-79.319687,,0.86858,False,8.6,...,https://ss3.4sqi.net/img/categories_v2/shops/f...,.png,17069,Grocery Store / Supermarket,https://ss3.4sqi.net/img/categories_v2/shops/f...,.png,,,,


#### SAVE IT!!

In [14]:
# df_fsq.to_csv(f'../data/fsqPOI/df_fsq{datetime.now()}.csv', index=False)
df_fsq.to_csv(f'../data/fsqPOI/df_fsq_sun2311.csv', index=False)

Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

### FOURSQUARE POI:

In [15]:
# Clone a working copy:
dff = df_fsq.copy()

# Drop columns from our copy df not currently needed:
dff = dff.drop(['fsq_id', 'cat1_icon', 'cat2_icon', 'cat3_icon', 'cat1_icon.suffix', 'cat2_icon.suffix', 'cat3_icon.suffix', 'address'], axis = 1)
dff.head(2)

Unnamed: 0,lat-long,name,distance,latitude,longitude,popularity,open_now,rating,rating_count,my_timestamp,cat1_id,cat1_name,cat2_id,cat2_name,cat3_id,cat3_name
0,"43.665269,-79.319796",Leslieville Farmers Market,40,43.664679,-79.319687,0.86858,False,8.6,,2022-12-04 20:59:17.456850,17065,Farmers' Market,17069.0,Grocery Store / Supermarket,,
1,"43.665269,-79.319796",McDonald's,62,43.664993,-79.319222,0.995501,False,7.4,,2022-12-04 20:59:17.456868,13035,Coffee Shop,,,,


Convert the timestamp data to the Toronto's time zone:

In [16]:
from datetime import datetime
import pytz

In [17]:
# create both timezone objects
old_timezone = pytz.timezone("US/Mountain")
new_timezone = pytz.timezone("US/Eastern")

# Confirm current timezone
print(dff['my_timestamp'].iloc[0])

# two-step process
for i in range(dff.shape[0]):
  localized_timestamp = old_timezone.localize(dff['my_timestamp'].iloc[i])
  dff['my_timestamp'].iloc[i] = localized_timestamp.astimezone(new_timezone)
  dff['my_timestamp'].iloc[i] = localized_timestamp.astimezone(new_timezone)


# Has converted to new timezone:
print(dff['my_timestamp'].iloc[0])

2022-12-04 20:59:17.456850
2022-12-04 22:59:17.456850-05:00


Put your parsed results into a DataFrame

In [19]:
dff.to_csv(f'../data/stats_mod/df_fsq_sun2311h.csv', index= False)

dff.head(5)

Unnamed: 0,lat-long,name,distance,latitude,longitude,popularity,open_now,rating,rating_count,my_timestamp,cat1_id,cat1_name,cat2_id,cat2_name,cat3_id,cat3_name
0,"43.665269,-79.319796",Leslieville Farmers Market,40,43.664679,-79.319687,0.86858,False,8.6,,2022-12-04 22:59:17.456850-05:00,17065,Farmers' Market,17069.0,Grocery Store / Supermarket,,
1,"43.665269,-79.319796",McDonald's,62,43.664993,-79.319222,0.995501,False,7.4,,2022-12-04 22:59:17.456868-05:00,13035,Coffee Shop,,,,
2,"43.665269,-79.319796",Rorschach Brewing,194,43.663542,-79.319882,0.984511,False,8.3,,2022-12-04 22:59:17.456875-05:00,17119,Bicycle Store,,,,
3,"43.665269,-79.319796",Velotique,186,43.666157,-79.317863,0.968785,False,8.0,,2022-12-04 22:59:17.456881-05:00,10039,Music Venue,,,,
4,"43.665269,-79.319796",Chick-N-Joy,129,43.665121,-79.321347,0.035783,False,6.9,,2022-12-04 22:59:17.456886-05:00,13011,Gay Bar,13016.0,Lounge,13065.0,Restaurant


# Yelp

Send a request to Yelp with a small radius (1000m) for all the bike stations in your city of choice. 

In [20]:
import requests
import os

def yelpStationPOI(lat, long):
  '''
  input:
  output:
  '''
  api_key = os.environ["YELP_API_KEY"]
  yelp_url = (f'https://api.yelp.com/v3/businesses/search?latitude={lat}&longitude={long}&radius=1000&limit=50')
  payload={}
  headers = {
      "accept": "application/json",
      "Authorization": f'Bearer {api_key}'
  }

  yelpStnPOI_json = requests.get(yelp_url, headers=headers, data=payload).json()           #Perform get request
  
  list_of_dict = []
  for poi in yelpStnPOI_json['businesses']:
    poi_dict = {
          'lat-long': f'{lat},{long}'
          , 'category_name': poi['categories'][0]['alias']
          , 'categories': poi['categories']
          , 'name': poi['name']
          , 'distance': poi['distance']
          , 'latitude': poi['coordinates']['latitude']
          , 'longitude': poi['coordinates']['longitude']
          , 'address': poi['location']['display_address']
          , 'price': poi.get('price', None)
          , 'is_closed': poi.get('is_closed')
          , 'rating': poi.get('rating')
          , 'rating_count': poi.get('review_count')
          , 'my_timestamp': datetime.now()
      }
      
    list_of_dict.append(poi_dict)
  return pd.DataFrame(list_of_dict)


In [21]:
# Test
yelpStnPOI_dfTest = yelpStationPOI(lat[5], long[5])
yelpStnPOI_dfTest.head(1)

Unnamed: 0,lat-long,category_name,categories,name,distance,latitude,longitude,address,price,is_closed,rating,rating_count,my_timestamp
0,"43.657763,-79.389165",ramen,"[{'alias': 'ramen', 'title': 'Ramen'}, {'alias...",Sansotei Ramen,376.064159,43.655,-79.38643,"[179 Dundas Street W, Toronto, ON M5G 1Z8, Can...",$$,False,4.0,925,2022-12-04 21:15:09.064598


In [22]:
yelp_dfs_list = []
for i in range(numStns):
  poi_df = yelpStationPOI(lat[i], long[i])
  yelp_dfs_list.append(poi_df)

In [23]:
df_yelp = pd.DataFrame()
x = pd.DataFrame()

# concatenate full df object of all stn site data:
for poi_df in yelp_dfs_list:
    x = pd.concat([df_yelp, poi_df])
    df_yelp = x

In [24]:
df_yelp.head(2)

Unnamed: 0,lat-long,category_name,categories,name,distance,latitude,longitude,address,price,is_closed,rating,rating_count,my_timestamp
0,"43.665269,-79.319796",egyptian,"[{'alias': 'egyptian', 'title': 'Egyptian'}, {...",Maha's,1004.992087,43.67167,-79.32853,"[226 Greenwood Avenue, Toronto, ON M4L 2R2, Ca...",$$,False,4.0,573,2022-12-04 21:15:10.587013
1,"43.665269,-79.319796",burgers,"[{'alias': 'burgers', 'title': 'Burgers'}]",The Burger's Priest,374.238073,43.6667,-79.315585,"[1636 Queen Street E, Toronto, ON M4L 1G3, Can...",$$,False,3.5,498,2022-12-04 21:15:10.587024


In [25]:
df_yelp.shape

(30828, 13)

## Cleaning and Normalizing the nested table values:

The function which created the DataFrame made sure that each POI had at least one category name and alias, but we can see above that many have two or even three which can be normalized to flatten the hierarchy.

In [26]:
# Normalize the categories:
df_yelpcat = pd.json_normalize(df_yelp['categories'][0])
df_yelpcat.head()

Unnamed: 0,0,1,2
0,"{'alias': 'egyptian', 'title': 'Egyptian'}","{'alias': 'vegan', 'title': 'Vegan'}","{'alias': 'sandwiches', 'title': 'Sandwiches'}"
1,"{'alias': 'italian', 'title': 'Italian'}",,
2,"{'alias': 'burgers', 'title': 'Burgers'}",,
3,"{'alias': 'newcanadian', 'title': 'Canadian (N...",,
4,"{'alias': 'japanese', 'title': 'Japanese'}","{'alias': 'pubs', 'title': 'Pubs'}",


In [27]:
# normalize the resulting 3 columns into 3 tables in the example below:
df_yelpcat0 = pd.json_normalize(df_yelpcat[0])
df_yelpcat1 = pd.json_normalize(df_yelpcat[1])
df_yelpcat2 = pd.json_normalize(df_yelpcat[2])
df_yelpcat0.head()

Unnamed: 0,alias,title
0,egyptian,Egyptian
1,italian,Italian
2,burgers,Burgers
3,newcanadian,Canadian (New)
4,japanese,Japanese


In [28]:
# Add new category labels back onto the df: 
df_yelp['cat1_alias'] = df_yelpcat0['alias']
df_yelp['cat1_title'] = df_yelpcat0['title']

df_yelp['cat2_alias'] = df_yelpcat1['alias']
df_yelp['cat2_title'] = df_yelpcat1['title']

df_yelp['cat3_alias'] = df_yelpcat2['alias']
df_yelp['cat3_title'] = df_yelpcat2['title']

# Remove the now redundant 'category' and 'name' columns:
df_yelp = df_yelp.drop(['categories', 'category_name'], axis = 1)

df_yelp.head(1)

Unnamed: 0,lat-long,name,distance,latitude,longitude,address,price,is_closed,rating,rating_count,my_timestamp,cat1_alias,cat1_title,cat2_alias,cat2_title,cat3_alias,cat3_title
0,"43.665269,-79.319796",Maha's,1004.992087,43.67167,-79.32853,"[226 Greenwood Avenue, Toronto, ON M4L 2R2, Ca...",$$,False,4.0,573,2022-12-04 21:15:10.587013,egyptian,Egyptian,vegan,Vegan,sandwiches,Sandwiches


### SAVE IT!!!

In [29]:
# df_yelp.to_csv(f'../data/yelpPOI/df_yelp{datetime.now()}.csv', index=False)
df_yelp.to_csv(f'../data/yelpPOI/df_yelp_sun2326h.csv', index=False)

Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

### YELP POI:

In [30]:
# Clone a working copy:
df = df_yelp.copy()

the addresses look chunky which diminishes the aesthetic of the table, and I expect this format might be useful for printing them in mailing format.  I won't be using it for my analysis at this point anyway, but I would certainly research this more before I made decisions about changing the format.  


In [31]:
# Drop columns from our copy df not currently needed:
df = df.drop(['address', 'cat1_alias', 'cat2_alias', 'cat3_alias'], axis = 1)
df.head(2)

Unnamed: 0,lat-long,name,distance,latitude,longitude,price,is_closed,rating,rating_count,my_timestamp,cat1_title,cat2_title,cat3_title
0,"43.665269,-79.319796",Maha's,1004.992087,43.67167,-79.32853,$$,False,4.0,573,2022-12-04 21:15:10.587013,Egyptian,Vegan,Sandwiches
1,"43.665269,-79.319796",The Burger's Priest,374.238073,43.6667,-79.315585,$$,False,3.5,498,2022-12-04 21:15:10.587024,Italian,,


flip the "is_closed" column to remove the confusing double negative, and enable foursquare comparison:

In [32]:
df['is_closed'] = df['is_closed'] == False
# Rename the heading to match:
df = df.rename(columns = {'is_closed':'is_open'})
df.head(2)

Unnamed: 0,lat-long,name,distance,latitude,longitude,price,is_open,rating,rating_count,my_timestamp,cat1_title,cat2_title,cat3_title
0,"43.665269,-79.319796",Maha's,1004.992087,43.67167,-79.32853,$$,True,4.0,573,2022-12-04 21:15:10.587013,Egyptian,Vegan,Sandwiches
1,"43.665269,-79.319796",The Burger's Priest,374.238073,43.6667,-79.315585,$$,True,3.5,498,2022-12-04 21:15:10.587024,Italian,,


Convert the timestamp data to the Toronto's time zone:

In [33]:
from datetime import datetime
import pytz

In [34]:
# create both timezone objects
old_timezone = pytz.timezone("US/Mountain")
new_timezone = pytz.timezone("US/Eastern")

# Confirm current timezone
print(df['my_timestamp'].iloc[0])

# two-step process
for i in range(df.shape[0]):
  localized_timestamp = old_timezone.localize(df['my_timestamp'].iloc[i])
  df['my_timestamp'].iloc[i] = localized_timestamp.astimezone(new_timezone)
  df['my_timestamp'].iloc[i] = localized_timestamp.astimezone(new_timezone)


# Has converted to new timezone:
print(df['my_timestamp'].iloc[0])

2022-12-04 21:15:10.587013
2022-12-04 23:15:10.587013-05:00


In [35]:
df['distance'] = round(df['distance'], 1)
df.head(2)

Unnamed: 0,lat-long,name,distance,latitude,longitude,price,is_open,rating,rating_count,my_timestamp,cat1_title,cat2_title,cat3_title
0,"43.665269,-79.319796",Maha's,1005.0,43.67167,-79.32853,$$,True,4.0,573,2022-12-04 23:15:10.587013-05:00,Egyptian,Vegan,Sandwiches
1,"43.665269,-79.319796",The Burger's Priest,374.2,43.6667,-79.315585,$$,True,3.5,498,2022-12-04 23:15:10.587024-05:00,Italian,,


In [36]:
test = df['my_timestamp'].iloc[0]
print(test)
print(datetime.date(test))
print(datetime.time(test))
print(len('2022-12-04 00:09'))

2022-12-04 23:15:10.587013-05:00
2022-12-04
23:15:10.587013
16


##### Categorical encoding: price


In [37]:
print(df['price'].value_counts())
print(df['price'].isna().value_counts())
df['price'].unique()

$$      16243
$        5224
$$$      2891
$$$$      929
Name: price, dtype: int64
False    25287
True      5541
Name: price, dtype: int64


array(['$$', '$', '$$$', None, '$$$$'], dtype=object)

In [38]:
# order in array above will be same as the keys,
keys = df['price'].unique()
# Setting ordinal numerical values to match the order:
values = [2, 1, 3, None, 4]
price_map = dict(zip(keys, values))
price_map

{'$$': 2, '$': 1, '$$$': 3, None: None, '$$$$': 4}

In [39]:
# Replace
df['price'] = df['price'].map(price_map)
# Validate counts unaffected:
print(df['price'].value_counts())

df.head(1)

2.0    16243
1.0     5224
3.0     2891
4.0      929
Name: price, dtype: int64


Unnamed: 0,lat-long,name,distance,latitude,longitude,price,is_open,rating,rating_count,my_timestamp,cat1_title,cat2_title,cat3_title
0,"43.665269,-79.319796",Maha's,1005.0,43.67167,-79.32853,2.0,True,4.0,573,2022-12-04 23:15:10.587013-05:00,Egyptian,Vegan,Sandwiches


Put your parsed results into a DataFrame

In [40]:
df.to_csv(f'../data/df_yelp_1440h.csv', index= False)

df.head(5)

Unnamed: 0,lat-long,name,distance,latitude,longitude,price,is_open,rating,rating_count,my_timestamp,cat1_title,cat2_title,cat3_title
0,"43.665269,-79.319796",Maha's,1005.0,43.67167,-79.32853,2.0,True,4.0,573,2022-12-04 23:15:10.587013-05:00,Egyptian,Vegan,Sandwiches
1,"43.665269,-79.319796",The Burger's Priest,374.2,43.6667,-79.315585,2.0,True,3.5,498,2022-12-04 23:15:10.587024-05:00,Italian,,
2,"43.665269,-79.319796",Chino Locos Original,467.1,43.664482,-79.325501,1.0,True,4.0,190,2022-12-04 23:15:10.587028-05:00,Burgers,,
3,"43.665269,-79.319796",Lahore Tikka House,787.5,43.67148,-79.32451,2.0,True,3.5,537,2022-12-04 23:15:10.587031-05:00,Canadian (New),,
4,"43.665269,-79.319796",Gio Rana's Really Really Nice Restaurant,881.1,43.663335,-79.330419,3.0,True,4.0,227,2022-12-04 23:15:10.587034-05:00,Japanese,Pubs,


In [41]:
df.to_csv(f'../data/stats_mod/df_yelp_sun2326h.csv', index=False)


# Comparing Results

Which API provided you with more complete data? Provide an explanation. 

#### Number of POIs per bike station site:

Both API get request limits were set to 50, but you can see below that yelp was less able to meet that max.  

In [263]:
df['lat-long'].value_counts().tail(10)

43.72268,-79.37644                       9
43.6856,-79.3718                         8
43.7805505,-79.1301203                   7
43.685924,-79.376304                     7
43.6834703,-79.5108942                   7
43.747854,-79.199327                     7
43.727365488246306,-79.3814254607724     6
43.778015,-79.131911                     4
43.788319,-79.123505                     2
43.645835578756525,-79.32088910859937    1
Name: lat-long, dtype: int64

In [264]:
dff['lat-long'].value_counts().tail(10)

43.657024,-79.377257                     50
43.662862,-79.383572                     50
43.652276,-79.380701                     50
43.7063473,-79.40161                     50
43.6978701,-79.39419                     50
43.696708,-79.40045                      50
43.6476616,-79.37549                     50
43.772445,-79.511912                     50
43.788319,-79.123505                     45
43.645835578756525,-79.32088910859937    14
Name: lat-long, dtype: int64

This is likely because the yelp API appears to restrict it's results to food or restaurant vendors, where as foursquare returned a broader range of venues:

In [266]:

print(df['cat1_title'].nunique())
df['cat1_title'].value_counts()

21


Japanese                     6706
Thai                         3617
Italian                      3081
Breakfast & Brunch           1868
Pizza                        1866
Seafood                      1847
Egyptian                     1278
Burgers                      1264
Middle Eastern               1229
Mexican                      1187
Canadian (New)                653
Ramen                         652
Lounges                       647
Sandwiches                    646
Tapas/Small Plates            641
Vietnamese                    631
Ice Cream & Frozen Yogurt     626
American (Traditional)        613
French                        599
Aquariums                     595
Steakhouses                   582
Name: cat1_title, dtype: int64

In [267]:
dff['cat1_name'].nunique()
dff['cat1_name'].value_counts()

Café                               3924
Park                               2617
Japanese Restaurant                1962
Coffee Shop                        1310
Farmers' Market                    1309
Poutine Restaurant                 1308
Pizzeria                           1308
Dessert Shop                       1307
Gay Bar                             655
BBQ Joint                           655
Hair Salon                          655
Italian Restaurant                  655
Bookstore                           655
Music Venue                         655
Miscellaneous Store                 655
Bicycle Store                       655
Asian Restaurant                    654
Diner                               654
Seafood Restaurant                  654
Bar                                 654
Hot Dog Joint                       654
Vegan and Vegetarian Restaurant     654
Chocolate Store                     654
Bakery                              654
Restaurant                          654


In [288]:
# Category titles:
dff2 = dff.copy()
dff2['catType'] = ((dff['cat1_id']//1000)*1000)

dff2[['catType', 'cat1_name']].groupby('catType').count()


Unnamed: 0_level_0,cat1_name
catType,Unnamed: 1_level_1
10000,655
11000,1309
13000,21585
16000,3270
17000,5890


- 10000:	Arts and Entertainment
- 11000:  Business and Professional Services
- 13000:  Dining and Drinking
- 16000:  Landmarks and Outdoors
- 17000:  Retail


In [283]:
# Category titles:
dff2[['cat1_id', 'cat1_name', 'rating']].groupby(['cat1_id', 'cat1_name']).count().sort_values('rating', ascending=False).head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
cat1_id,cat1_name,Unnamed: 2_level_1
13034,Café,2902
16032,Park,1903
13263,Japanese Restaurant,1468
13035,Coffee Shop,1103
17065,Farmers' Market,1084
13064,Pizzeria,969
13326,Poutine Restaurant,918
13040,Dessert Shop,908
17119,Bicycle Store,556
10039,Music Venue,548


foursquare gives a much wider range of venues to research rather than just restaurants this data set for the bike station sites from foursquare also includes parks, farmers' markets, bike shops, music venues, and other misc stores.

### Get the top 10 restaurants according to their rating

using foursquare:

In [298]:
restaurants = dff2[dff2['cat1_id'].between(13062, 13380)]
restaurants[['name', 'rating']].sort_values('rating', ascending=False).head(15)

Unnamed: 0,name,rating
42,Booster Juice,9.5
21,Booster Juice,9.5
12,Paradise Grapevine,9.5
23,Paradise Grapevine,9.5
31,Paradise Grapevine,9.5
45,Booster Juice,9.5
14,Booster Juice,9.5
43,Booster Juice,9.5
38,Booster Juice,9.5
31,Booster Juice,9.5


more time might need to be spend understanding how exactly their category id classification system works, but the range chose above references their [category taxonomy table](https://location.foursquare.com/places/docs/categories) for the id numbers categorized by:
>Dining and Drinking > Restaurant >

using the Yelp results:

In [306]:
print(df['rating'].nunique())
df[['rating', 'name']].groupby('rating').count()

9


Unnamed: 0_level_0,name
rating,Unnamed: 1_level_1
1.0,103
1.5,99
2.0,220
2.5,461
3.0,1192
3.5,5862
4.0,13656
4.5,8178
5.0,1057


this suggests any 10 of the 1057 '5.0' rated venues could be in the top ten.  So I'll account for popularity or frequncy of 5.0 ratings,

In [309]:
topRate = df[df['rating']==5.0]
topRate[['name', 'rating_count']].groupby('name').count().sort_values('rating_count', ascending=False).head(10)

Unnamed: 0_level_0,rating_count
name,Unnamed: 1_level_1
Grandma Loves You,25
Mallo,24
Brewhaha At Bandit Brewery,22
The Maker Bean Cafe,22
Super Wash N Dry Coin Laundry,17
COBS Bread,17
Stock In Trade,15
Bello Pizza,15
De Floured,13
Gurume Sushi,13
