# Data Preparation

### Task
Parse the json **nyc_geo.json** into the dataframe with the following columns:
- Borough
- Neighborhood
- Latitude
- Longitude

In [27]:
import os
import json 
from IPython.display import JSON
import pandas as pd
import folium

import matplotlib.cm as cm # Matplotlib and associated plotting modules
import matplotlib.colors as colors # Matplotlib and associated plotting modules

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

from collections import Counter # count occurrences 

from sklearn.cluster import KMeans # import k-means from clustering stage

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

In [2]:
data = 'nyc_geo.json'

with open (data, 'r') as f:
    contents = json.loads(f.read())

In [3]:
contents['features'][1]

{'type': 'Feature',
 'id': 'nyu_2451_34572.2',
 'geometry': {'type': 'Point',
  'coordinates': [-73.82993910812398, 40.87429419303012]},
 'geometry_name': 'geom',
 'properties': {'name': 'Co-op City',
  'stacked': 2,
  'annoline1': 'Co-op',
  'annoline2': 'City',
  'annoline3': None,
  'annoangle': 0.0,
  'borough': 'Bronx',
  'bbox': [-73.82993910812398,
   40.87429419303012,
   -73.82993910812398,
   40.87429419303012]}}

In [4]:
contents['features'][0]['properties']['borough']

'Bronx'

In [5]:
contents['features'][0]['properties']['name']

'Wakefield'

In [6]:
contents['features'][0]['geometry']['coordinates']

[-73.84720052054902, 40.89470517661]

In [7]:
borough=[]
neighborhood=[]
latitude=[]
longitude=[]
for boro in contents['features']:
    borough.append(boro['properties']['borough'])
    neighborhood.append(boro['properties']['name'])
    latitude.append(boro['geometry']['coordinates'][0])
    longitude.append(boro['geometry']['coordinates'][1])

In [8]:
len(longitude)

306

In [9]:
df = pd.DataFrame({'borough':borough,
                   'neighborhood':neighborhood,
                   'latitude':longitude,
                   'longitude':latitude
                  })

In [10]:
JSON(data)

<IPython.core.display.JSON object>

In [11]:
df

Unnamed: 0,borough,neighborhood,latitude,longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585
...,...,...,...,...
301,Manhattan,Hudson Yards,40.756658,-74.000111
302,Queens,Hammels,40.587338,-73.805530
303,Queens,Bayswater,40.611322,-73.765968
304,Queens,Queensbridge,40.756091,-73.945631


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   borough       306 non-null    object 
 1   neighborhood  306 non-null    object 
 2   latitude      306 non-null    float64
 3   longitude     306 non-null    float64
dtypes: float64(2), object(2)
memory usage: 9.7+ KB


In [13]:
df.describe()

Unnamed: 0,latitude,longitude
count,306.0,306.0
mean,40.701362,-73.946254
std,0.097498,0.121202
min,40.505334,-74.246569
25%,40.617778,-74.000062
50%,40.702969,-73.932009
75%,40.766645,-73.857525
max,40.908543,-73.708847


In [21]:
address = 'New York City, NY'
location = None

# define an instance of the geocoder -> ny_explorer
while location == None:
    try:
        geolocator = Nominatim(user_agent="ny_explorer")
        location = geolocator.geocode(address)
        latitude = location.latitude
        longitude = location.longitude
    except:
        pass
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))


The geograpical coordinate of New York City are 40.7127281, -74.0060152.


In [23]:


#create a map
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['latitude'], df['longitude'], df['borough'], df['neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork) 

map_newyork

### Task
Use different data sources and APIs to collect information about the neigborhoods that can be used for segmentation.

In [25]:
CLIENT_ID = 'VAWDSOTBXMAKX3B14VIBJDHCD4WRTJBW2FX1I2LIJ0QC1HBK' # your Foursquare ID
CLIENT_SECRET = 'EOVBS3UXICVFAC3OWRNL23B20WJBYHOSXL5OGFUVXPHOK5YL' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version


In [34]:
url = 'https://api.foursquare.com/v2/venues/categories?&client_id={}&client_secret={}&v={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET,
    VERSION)
category_results = requests.get(url).json()

In [36]:
# category_results

In [37]:
for key, value in category_results['response']['categories'][0].items():
    print(key, len(str(value)))

id 24
name 20
pluralName 20
shortName 20
icon 98
categories 16373


In [38]:
category_list = category_results['response']['categories']


In [39]:
len(category_list)


10

In [40]:
for data in category_list:
    print(data['id'], data['name'])

4d4b7104d754a06370d81259 Arts & Entertainment
4d4b7105d754a06372d81259 College & University
4d4b7105d754a06373d81259 Event
4d4b7105d754a06374d81259 Food
4d4b7105d754a06376d81259 Nightlife Spot
4d4b7105d754a06377d81259 Outdoors & Recreation
4d4b7105d754a06375d81259 Professional & Other Places
4e67e38e036454776db1fb3a Residence
4d4b7105d754a06378d81259 Shop & Service
4d4b7105d754a06379d81259 Travel & Transport


In [41]:
# function to flatten a 'parent_id' category, returns all categories if checkParentID = False
def flatten_Hierarchy(category_list, checkParentID, category_dict, parent_id = ''):
    for data in category_list:
        
        if checkParentID == True and data['id'] == parent_id:
            category_dict[data['id']] = data['name']
            flatten_Hierarchy(category_list = data['categories'], checkParentID = False, category_dict = category_dict)
        
        elif checkParentID == False:
            category_dict[data['id']] = data['name']
            if len(data['categories']) != 0:
                flatten_Hierarchy(category_list = data['categories'], checkParentID = False, category_dict = category_dict)
    
    return category_dict

In [42]:
# this function takes the parent ID and returns 'name' and 'id' of all the sub-categories
category_dict = flatten_Hierarchy(category_list, checkParentID=True, category_dict = {}, parent_id = '4d4b7105d754a06374d81259')

In [44]:
neighborhood_latitude = df.loc[0, 'latitude'] # neighborhood latitude value
neighborhood_longitude = df.loc[0, 'longitude'] # neighborhood longitude value

neighborhood_name = df.loc[0, 'neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Wakefield are 40.89470517661, -73.84720052054902.


In [82]:
limit = 1 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
categoryId = '4d4b7105d754a06374d81259' # category ID for "Food"

# create URL
api_key = os.environ["FOURSQUARE_API_KEY"]
headers = {"Accept": "application/json"}
# Add key with our API KEY
headers['Authorization'] = api_key
url = 'https://api.foursquare.com/v3/places/search?&ll={},{}&radius={}&categoryId={}&limit={}'.format(
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius,
    categoryId,
    limit)
url # display URL

'https://api.foursquare.com/v3/places/search?&ll=40.89470517661,-73.84720052054902&radius=500&categoryId=4d4b7105d754a06374d81259&limit=1'

In [83]:
results = requests.get(url, headers=headers).json()


In [84]:
JSON(results)

<IPython.core.display.JSON object>

In [184]:
results['results'][0]

{'fsq_id': '4c537892fd2ea593cb077a28',
 'categories': [{'id': 13046,
   'name': 'Ice Cream Parlor',
   'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/icecream_',
    'suffix': '.png'}},
  {'id': 13097,
   'name': 'Caribbean Restaurant',
   'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/caribbean_',
    'suffix': '.png'}},
  {'id': 13236,
   'name': 'Italian Restaurant',
   'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/italian_',
    'suffix': '.png'}}],
 'chains': [],
 'distance': 127,
 'geocodes': {'main': {'latitude': 40.893585, 'longitude': -73.843692},
  'roof': {'latitude': 40.893585, 'longitude': -73.843692}},
 'location': {'address': '4120 Baychester Ave',
  'country': 'US',
  'cross_street': 'Edenwald & Bussing Ave',
  'dma': 'New York',
  'formatted_address': '4120 Baychester Ave (Edenwald & Bussing Ave), Bronx, NY 10466',
  'locality': 'Bronx',
  'neighborhood': ['Edenwald'],
  'postcode': '10466',
  'region': 'NY'},
 'name'

In [85]:
categoryId_list = []
for key, value in category_dict.items():
    categoryId_list.append(key)
categoryId = ','.join(categoryId_list)

In [100]:
df.columns

Index(['borough', 'neighborhood', 'latitude', 'longitude'], dtype='object')

In [161]:
def getNearbyFood(names, latitudes, longitudes, radius=1000, LIMIT=500):
    not_found = 0
    print('***Start ', end='')
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(' .', end='')
            
        # create the API request URL
        api_key = os.environ["FOURSQUARE_API_KEY"]
        headers = {"Accept": "application/json"}
        # Add key with our API KEY
        headers['Authorization'] = api_key
        url = 'https://api.foursquare.com/v3/places/search?&ll={},{}&radius={}&categoryId={}&limit={}'.format(
        lat, 
        lng, 
        radius,
        "4d4b7105d754a06374d81259",
        limit)
         # display URL
        
            
        try:
            # make the GET request
            results = requests.get(url, headers=headers).json()['results'][0]
            
            # return only relevant information for each nearby venue
            # for name in results:
            #     venues_list.append([(
            #                     name,
            #                    lat,
            #                    lng,
            #                     results['name'], 
            #                     results['geocodes']['main']['latitude'], 
            #                     results['geocodes']['main']['longitude'],  
            #                     results['categories'][0]['name']) 
                                
        except:
            not_found += 1


    # nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    # nearby_venues.columns = ['Neighborhood', 
    #               'Neighborhood Latitude', 
    #               'Neighborhood Longitude', 
    #               'Venue', 
    #               'Venue Latitude', 
    #               'Venue Longitude', 
    #               'Venue Category']
    print("\nDone*** with {} venues with incompelete information.".format(not_found))
    return(results)

In [162]:
 nyc_venues = getNearbyFood(names=df['neighborhood'][:1],
                                        latitudes=df['latitude'][:1],
                                        longitudes=df['longitude'][:1]
                                       )

***Start  .
Done*** with 0 venues with incompelete information.


In [164]:
test = [{'id': '4c783cef3badb1f7e4244b54',
  'name': 'Carvel Ice Cream',
  'location': {'address': '1006 E 233rd St',
   'lat': 40.890486685759605,
   'lng': -73.84856772568665,
   'labeledLatLngs': [{'label': 'display',
     'lat': 40.890486685759605,
     'lng': -73.84856772568665}],
   'distance': 483,
   'postalCode': '10466',
   'cc': 'US',
   'city': 'Bronx',
   'state': 'NY',
   'country': 'United States',
   'formattedAddress': ['1006 E 233rd St',
    'Bronx, NY 10466',
    'United States']},
  'categories': [{'id': '4bf58dd8d48988d1c9941735',
    'name': 'Ice Cream Shop',
    'pluralName': 'Ice Cream Shops',
    'shortName': 'Ice Cream',
    'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/icecream_',
     'suffix': '.png'},
    'primary': True}],
  'referralId': 'v-1571641530',
  'hasPerk': False}]

In [169]:
JSON(test)

<IPython.core.display.JSON object>

In [167]:
venues=[]

venues.append([(v['name'], 
                v['location']['lat'], 
                v['location']['lng'],  
                v['categories'][0]['name']) for v in test])

In [168]:
venues

[[('Carvel Ice Cream',
   40.890486685759605,
   -73.84856772568665,
   'Ice Cream Shop')]]

In [174]:
for i in nyc_venues:
    print(i)

fsq_id
categories
chains
distance
geocodes
location
name
related_places
timezone


In [189]:
venues_list=[]
venues_list.append([(df['neighborhood'][:1],df['latitude'][:1],df['longitude'][:1],
                                v[0], 
                     #           v['geocodes']['main']['latitude'], 
                     #            v['geocodes']['main']['longitude'],  
                     #            v['categories'][0]['name']) 
                     # for v in nyc_venues])
                                

TypeError: string indices must be integers

In [188]:
venues_list

[[(0    Wakefield
   Name: neighborhood, dtype: object,
   0    40.894705
   Name: latitude, dtype: float64,
   0   -73.847201
   Name: longitude, dtype: float64,
   'f'),
  (0    Wakefield
   Name: neighborhood, dtype: object,
   0    40.894705
   Name: latitude, dtype: float64,
   0   -73.847201
   Name: longitude, dtype: float64,
   'c'),
  (0    Wakefield
   Name: neighborhood, dtype: object,
   0    40.894705
   Name: latitude, dtype: float64,
   0   -73.847201
   Name: longitude, dtype: float64,
   'c'),
  (0    Wakefield
   Name: neighborhood, dtype: object,
   0    40.894705
   Name: latitude, dtype: float64,
   0   -73.847201
   Name: longitude, dtype: float64,
   'd'),
  (0    Wakefield
   Name: neighborhood, dtype: object,
   0    40.894705
   Name: latitude, dtype: float64,
   0   -73.847201
   Name: longitude, dtype: float64,
   'g'),
  (0    Wakefield
   Name: neighborhood, dtype: object,
   0    40.894705
   Name: latitude, dtype: float64,
   0   -73.847201
   Name: long

In [171]:
venues_list

[[(0    Wakefield
   Name: neighborhood, dtype: object,
   0    40.894705
   Name: latitude, dtype: float64,
   0   -73.847201
   Name: longitude, dtype: float64,
   'Lollipops Gelato',
   40.893585,
   -73.843692,
   'Ice Cream Parlor'),
  (0    Wakefield
   Name: neighborhood, dtype: object,
   0    40.894705
   Name: latitude, dtype: float64,
   0   -73.847201
   Name: longitude, dtype: float64,
   'Lollipops Gelato',
   40.893585,
   -73.843692,
   'Ice Cream Parlor'),
  (0    Wakefield
   Name: neighborhood, dtype: object,
   0    40.894705
   Name: latitude, dtype: float64,
   0   -73.847201
   Name: longitude, dtype: float64,
   'Lollipops Gelato',
   40.893585,
   -73.843692,
   'Ice Cream Parlor'),
  (0    Wakefield
   Name: neighborhood, dtype: object,
   0    40.894705
   Name: latitude, dtype: float64,
   0   -73.847201
   Name: longitude, dtype: float64,
   'Lollipops Gelato',
   40.893585,
   -73.843692,
   'Ice Cream Parlor'),
  (0    Wakefield
   Name: neighborhood, dty

In [92]:
 nyc_venues = getNearbyFood(names=df['neighborhood'],
                                        latitudes=df['latitude'],
                                        longitudes=df['longitude']
                                       )

***Start  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

ValueError: Length mismatch: Expected axis has 0 elements, new values have 7 elements

In [90]:
import pickle # to serialize and deserialize a Python object structure
try:
    with open('nyc_food_venues.pkl', 'rb') as f:
        nyc_venues = pickle.load(f)
    print("---Dataframe Existed and Deserialized---")
except:
    nyc_venues = getNearbyFood(names=df['neighborhood'],
                                        latitudes=df['latitude'],
                                        longitudes=df['longitude']
                                       )
    with open('nyc_food_venues.pkl', 'wb') as f:
        pickle.dump(nyc_venues, f)
    print("---Dataframe Created and Serialized---")

***Start  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

ValueError: Length mismatch: Expected axis has 0 elements, new values have 7 elements

### Task

Visualize the neigborhoods in the graph.

In [91]:
print(nyc_venues.shape)

NameError: name 'nyc_venues' is not defined