In [2]:
import os
import pandas as pd
import requests as re
import json

from statistics import mode

In [3]:
# Load the cleaned NYC neighborhood geo data.
# The first CSV column is consumed as the row index rather than as a data column.
nyc = pd.read_csv(
    '../data/data/nyc_geo_data_cleaned.csv',
    index_col=0,
)

In [4]:
# Display the neighborhood table (id, borough, neighborhood, longitude, latitude).
nyc

Unnamed: 0,id,borough,neighborhood,longitude,latitude
0,nyu_2451_34572.1,Bronx,Wakefield,-73.847201,40.894705
1,nyu_2451_34572.2,Bronx,Co-op City,-73.829939,40.874294
2,nyu_2451_34572.3,Bronx,Eastchester,-73.827806,40.887556
3,nyu_2451_34572.4,Bronx,Fieldston,-73.905643,40.895437
4,nyu_2451_34572.5,Bronx,Riverdale,-73.912585,40.890834
...,...,...,...,...,...
301,nyu_2451_34572.302,Manhattan,Hudson Yards,-74.000111,40.756658
302,nyu_2451_34572.303,Queens,Hammels,-73.805530,40.587338
303,nyu_2451_34572.304,Queens,Bayswater,-73.765968,40.611322
304,nyu_2451_34572.305,Queens,Queensbridge,-73.945631,40.756091


In [5]:
## define yelp search function

def yelpSearch(latitude, longitude, query, radius, timeout=10):
    """
    Run a Yelp business search around a coordinate and return the raw response.

    Parameters
    ----------
    latitude, longitude : float or str
        Search centre; forwarded unchanged as the ``latitude`` and
        ``longitude`` query parameters.
    query : str
        Search term, sent as the ``term`` query parameter.
    radius : int
        Search radius, sent as the ``radius`` query parameter.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (default 10).
        Without it a stalled connection would block the caller forever.

    Returns
    -------
    The response object produced by the HTTP client. The status code is NOT
    checked here; callers should verify it before reading ``.json()``.

    Raises
    ------
    KeyError
        If the ``YELP_API_KEY`` environment variable is not set.
    """

    # The key is read from the environment so it never lives in the notebook.
    yelp_key = os.environ["YELP_API_KEY"]

    yelp_headers = {
        "Accept": "application/json",
        "Authorization": "Bearer " + yelp_key
    }

    yelp_url = "https://api.yelp.com/v3/businesses/search"

    yelp_params = {
        "term": query,
        "latitude": latitude,
        "longitude": longitude,
        "radius": radius,
        "limit": 50
    }

    # NOTE: `re` is this notebook's alias for the `requests` package,
    # not the standard-library regex module.
    response = re.get(
        yelp_url,
        params=yelp_params,
        headers=yelp_headers,
        timeout=timeout,
    )

    return response

In [6]:
# Try the search helper once, using the coordinates of the first neighborhood
# (Wakefield), and show the first category alias of the first business returned.
response = yelpSearch(
    latitude=str(40.894705),
    longitude=str(-73.847201),
    query='restaurant',
    radius=500,
)
response.json()['businesses'][0]['categories'][0]['alias']

'caribbean'

In [7]:
# dict for storage: neighborhood id -> {'yelp_categories': [alias, alias, ...]}
yelp_dict = {}

for row in nyc.itertuples():
    # progress indicator: one line per neighborhood queried
    print(row.id)

    # call the API
    response = yelpSearch(str(row.latitude), str(row.longitude), 'restaurant', 500)

    # Stop with the real HTTP error (bad key, rate limit, ...) instead of an
    # opaque KeyError on 'businesses' when the payload is an error document.
    response.raise_for_status()

    # parse the category data: every alias of every business, duplicates kept
    # so that they can be counted later
    yelp_categories = [
        category['alias']
        for business in response.json()['businesses']
        for category in business['categories']
    ]

    yelp_dict[row.id] = {
        'yelp_categories': yelp_categories
    }

nyu_2451_34572.1
nyu_2451_34572.2
nyu_2451_34572.3
nyu_2451_34572.4
nyu_2451_34572.5
nyu_2451_34572.6
nyu_2451_34572.7
nyu_2451_34572.8
nyu_2451_34572.9
nyu_2451_34572.10
nyu_2451_34572.11
nyu_2451_34572.12
nyu_2451_34572.13
nyu_2451_34572.14
nyu_2451_34572.15
nyu_2451_34572.16
nyu_2451_34572.17
nyu_2451_34572.18
nyu_2451_34572.19
nyu_2451_34572.20
nyu_2451_34572.21
nyu_2451_34572.22
nyu_2451_34572.23
nyu_2451_34572.24
nyu_2451_34572.25
nyu_2451_34572.26
nyu_2451_34572.27
nyu_2451_34572.28
nyu_2451_34572.29
nyu_2451_34572.30
nyu_2451_34572.31
nyu_2451_34572.32
nyu_2451_34572.33
nyu_2451_34572.34
nyu_2451_34572.35
nyu_2451_34572.36
nyu_2451_34572.37
nyu_2451_34572.38
nyu_2451_34572.39
nyu_2451_34572.40
nyu_2451_34572.41
nyu_2451_34572.42
nyu_2451_34572.43
nyu_2451_34572.44
nyu_2451_34572.45
nyu_2451_34572.46
nyu_2451_34572.47
nyu_2451_34572.48
nyu_2451_34572.49
nyu_2451_34572.50
nyu_2451_34572.51
nyu_2451_34572.52
nyu_2451_34572.53
nyu_2451_34572.54
nyu_2451_34572.55
nyu_2451_34572.56
n

In [8]:
# Display the collected mapping of neighborhood id -> list of category aliases.
yelp_dict

{'nyu_2451_34572.1': {'yelp_categories': ['caribbean',
   'breakfast_brunch',
   'caribbean',
   'pizza',
   'breakfast_brunch',
   'burgers',
   'sandwiches',
   'caribbean',
   'comfortfood',
   'soulfood',
   'caribbean',
   'grocery',
   'cafes',
   'sandwiches',
   'pizza',
   'sandwiches']},
 'nyu_2451_34572.2': {'yelp_categories': ['chinese',
   'delis',
   'sandwiches',
   'chinese',
   'delis',
   'delis',
   'pizza',
   'coffee',
   'burgers',
   'hotdogs',
   'lounges',
   'restaurants',
   'hotdogs',
   'pizza']},
 'nyu_2451_34572.3': {'yelp_categories': ['caribbean',
   'seafood',
   'bars',
   'chinese',
   'breakfast_brunch',
   'greek',
   'caribbean',
   'tradamerican',
   'seafood',
   'tradamerican',
   'breakfast_brunch',
   'diners',
   'pizza',
   'caribbean',
   'pizza',
   'italian',
   'newamerican',
   'comfortfood',
   'danceclubs',
   'cafes',
   'hotdogs',
   'burgers',
   'coffee']},
 'nyu_2451_34572.4': {'yelp_categories': []},
 'nyu_2451_34572.5': {'yelp

In [17]:
# Write the raw per-neighborhood category lists to a JSON file
# in the current working directory.
serialized_categories = json.dumps(yelp_dict)
with open('yelp_restaurant_categories.json', 'w') as out_file:
    out_file.write(serialized_categories)

In [13]:
## Start cleaning: turn the collected dict into a DataFrame

# The outer keys (neighborhood ids) become columns first, so transpose to get
# one row per neighborhood with a single 'yelp_categories' column.
yelp_by_column = pd.DataFrame(yelp_dict)
yelp_df = yelp_by_column.transpose()
yelp_df

Unnamed: 0,yelp_categories
nyu_2451_34572.1,"[caribbean, breakfast_brunch, caribbean, pizza..."
nyu_2451_34572.2,"[chinese, delis, sandwiches, chinese, delis, d..."
nyu_2451_34572.3,"[caribbean, seafood, bars, chinese, breakfast_..."
nyu_2451_34572.4,[]
nyu_2451_34572.5,"[indpak, pizza, italian, latin, sushi, cocktai..."
...,...
nyu_2451_34572.302,"[breakfast_brunch, gluten_free, newamerican, a..."
nyu_2451_34572.303,"[beer_and_wine, burgers, breakfast_brunch, chi..."
nyu_2451_34572.304,[]
nyu_2451_34572.305,"[thai, soulfood, tradamerican, tacos, newmexic..."


In [14]:
# Replace empty lists with NaN.
# A cell counts as empty when its string form is exactly '[]'.
is_empty_list = yelp_df.applymap(str).eq('[]')
yelp_df = yelp_df.mask(is_empty_list)

In [15]:
# Check how many neighborhoods still have category data after masking empty lists.
yelp_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 306 entries, nyu_2451_34572.1 to nyu_2451_34572.306
Data columns (total 1 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   yelp_categories  292 non-null    object
dtypes: object(1)
memory usage: 4.8+ KB


In [17]:
# Replace NaN with the placeholder string 'None'.
# NOTE: the placeholder is a str, not a list, so the counting loop further down
# iterates over it one character at a time ('N', 'o', 'n', 'e').
yelp_df = yelp_df.fillna('None')
yelp_df

Unnamed: 0,yelp_categories
nyu_2451_34572.1,"[caribbean, breakfast_brunch, caribbean, pizza..."
nyu_2451_34572.2,"[chinese, delis, sandwiches, chinese, delis, d..."
nyu_2451_34572.3,"[caribbean, seafood, bars, chinese, breakfast_..."
nyu_2451_34572.4,
nyu_2451_34572.5,"[indpak, pizza, italian, latin, sushi, cocktai..."
...,...
nyu_2451_34572.302,"[breakfast_brunch, gluten_free, newamerican, a..."
nyu_2451_34572.303,"[beer_and_wine, burgers, breakfast_brunch, chi..."
nyu_2451_34572.304,
nyu_2451_34572.305,"[thai, soulfood, tradamerican, tacos, newmexic..."


In [18]:
# Move the neighborhood ids out of the index into an ordinary column.
# Not idempotent: running this cell again adds another index column.
yelp_df = yelp_df.reset_index()

In [19]:
# Give the two columns their final names (former index first, then the lists).
new_column_names = ['id', 'yelp_categories']
yelp_df.columns = new_column_names

In [20]:
# Display the frame after the index reset and column rename.
yelp_df

Unnamed: 0,id,yelp_categories
0,nyu_2451_34572.1,"[caribbean, breakfast_brunch, caribbean, pizza..."
1,nyu_2451_34572.2,"[chinese, delis, sandwiches, chinese, delis, d..."
2,nyu_2451_34572.3,"[caribbean, seafood, bars, chinese, breakfast_..."
3,nyu_2451_34572.4,
4,nyu_2451_34572.5,"[indpak, pizza, italian, latin, sushi, cocktai..."
...,...,...
301,nyu_2451_34572.302,"[breakfast_brunch, gluten_free, newamerican, a..."
302,nyu_2451_34572.303,"[beer_and_wine, burgers, breakfast_brunch, chi..."
303,nyu_2451_34572.304,
304,nyu_2451_34572.305,"[thai, soulfood, tradamerican, tacos, newmexic..."


In [21]:
# Tally how often each category alias occurs per neighborhood:
#   yelp_cleaned_dict[id] -> {alias: count}
# NOTE: rows holding the placeholder string 'None' are iterated character by
# character, so they yield the keys 'N', 'o', 'n', 'e' (each counted once).
yelp_cleaned_dict = {}

for row in yelp_df.itertuples():
    counter = {}
    for category in row.yelp_categories:
        if category not in counter:
            counter[category] = 0
        counter[category] += 1
    yelp_cleaned_dict[row.id] = counter

In [22]:
# Display the per-neighborhood category counts.
yelp_cleaned_dict

{'nyu_2451_34572.1': {'caribbean': 4,
  'breakfast_brunch': 2,
  'pizza': 2,
  'burgers': 1,
  'sandwiches': 3,
  'comfortfood': 1,
  'soulfood': 1,
  'grocery': 1,
  'cafes': 1},
 'nyu_2451_34572.2': {'chinese': 2,
  'delis': 3,
  'sandwiches': 1,
  'pizza': 2,
  'coffee': 1,
  'burgers': 1,
  'hotdogs': 2,
  'lounges': 1,
  'restaurants': 1},
 'nyu_2451_34572.3': {'caribbean': 3,
  'seafood': 2,
  'bars': 1,
  'chinese': 1,
  'breakfast_brunch': 2,
  'greek': 1,
  'tradamerican': 2,
  'diners': 1,
  'pizza': 2,
  'italian': 1,
  'newamerican': 1,
  'comfortfood': 1,
  'danceclubs': 1,
  'cafes': 1,
  'hotdogs': 1,
  'burgers': 1,
  'coffee': 1},
 'nyu_2451_34572.4': {'N': 1, 'o': 1, 'n': 1, 'e': 1},
 'nyu_2451_34572.5': {'indpak': 1,
  'pizza': 1,
  'italian': 1,
  'latin': 1,
  'sushi': 1,
  'cocktailbars': 1,
  'tapasmallplates': 1,
  'greek': 1,
  'burgers': 3,
  'vegetarian': 2,
  'themedcafes': 1,
  'sportsbars': 1,
  'sandwiches': 1,
  'kosher': 2,
  'japanese': 1,
  'chinese':

In [28]:
# One row per neighborhood, one column per category alias
# (NaN where a neighborhood has no business in that category).
yelp_count_df = pd.DataFrame(yelp_cleaned_dict).transpose()


In [38]:
# Remove null values: a missing count means the category never occurred, so use 0.
# Also drop the 'N', 'o', 'n', 'e' columns, which are artefacts of the 'None'
# placeholder string being counted character by character.
# errors='ignore' keeps this cell re-runnable: without it, a second run (or a
# dataset with no empty neighborhoods) raises KeyError because those columns
# are absent.
yelp_count_df = (
    yelp_count_df
    .fillna(value=0)
    .drop(['N', 'o', 'n', 'e'], axis=1, errors='ignore')
)

In [40]:
# Display the neighborhood x category count matrix.
yelp_count_df

Unnamed: 0,caribbean,breakfast_brunch,pizza,burgers,sandwiches,comfortfood,soulfood,grocery,cafes,chinese,...,airportlounges,surfshop,media,shavedice,pretzels,srilankan,southafrican,personalchefs,homedecor,syrian
nyu_2451_34572.1,4.0,2.0,2.0,1.0,3.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nyu_2451_34572.2,0.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nyu_2451_34572.3,3.0,2.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nyu_2451_34572.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nyu_2451_34572.5,0.0,0.0,1.0,3.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nyu_2451_34572.302,1.0,4.0,3.0,2.0,5.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
nyu_2451_34572.303,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nyu_2451_34572.304,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nyu_2451_34572.305,0.0,3.0,2.0,0.0,2.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# Save the count matrix next to the other data files (the index is written too).
yelp_count_df.to_csv(
    path_or_buf='../data/data/yelp_restaurant_type_counts.csv',
)

In [106]:
# NOTE(review): `temp_df` is not defined anywhere in the visible notebook and the
# execution count (106) is far out of sequence — presumably a leftover scratch
# cell; confirm, then define it or remove the cell so Restart & Run All succeeds.
temp_df