## NYC Neighborhood data clustering
## Data Preparation

### Task
Parse the json **nyc_geo.json** into the dataframe with the following columns:
- Borough
- Neighborhood
- Latitude
- Longitude

In [19]:
# install required libraries
%pip install -r requirements.txt

[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: './requirements.txt'[0m
Note: you may need to restart the kernel to use updated packages.


In [20]:
# import libraries
import os
import json 
import time
from IPython.display import JSON
import pandas as pd
import numpy as np

# map visualization
import folium
from geopy import distance
import requests as re

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [21]:
#process neighborhoods data into dataframe
file = open('data/nyc_geo.json')
jsondata = json.load(file)

In [22]:
# explore
jsondata['features']

[{'type': 'Feature',
  'id': 'nyu_2451_34572.1',
  'geometry': {'type': 'Point',
   'coordinates': [-73.84720052054902, 40.89470517661]},
  'geometry_name': 'geom',
  'properties': {'name': 'Wakefield',
   'stacked': 1,
   'annoline1': 'Wakefield',
   'annoline2': None,
   'annoline3': None,
   'annoangle': 0.0,
   'borough': 'Bronx',
   'bbox': [-73.84720052054902,
    40.89470517661,
    -73.84720052054902,
    40.89470517661]}},
 {'type': 'Feature',
  'id': 'nyu_2451_34572.2',
  'geometry': {'type': 'Point',
   'coordinates': [-73.82993910812398, 40.87429419303012]},
  'geometry_name': 'geom',
  'properties': {'name': 'Co-op City',
   'stacked': 2,
   'annoline1': 'Co-op',
   'annoline2': 'City',
   'annoline3': None,
   'annoangle': 0.0,
   'borough': 'Bronx',
   'bbox': [-73.82993910812398,
    40.87429419303012,
    -73.82993910812398,
    40.87429419303012]}},
 {'type': 'Feature',
  'id': 'nyu_2451_34572.3',
  'geometry': {'type': 'Point',
   'coordinates': [-73.82780644716412, 

In [23]:
#extract boroughs, neighborhood, and coordinates from JSON file
dictdata = []
for i in range(len(jsondata['features'])):
    borough = jsondata['features'][i]['properties']['borough']
    hood = jsondata['features'][i]['properties']['name']
    latitude = jsondata['features'][i]['geometry']['coordinates'][1]
    longitude = jsondata['features'][i]['geometry']['coordinates'][0]
    dataentry = [hood, borough, latitude, longitude]
    dictdata.append(dataentry)


In [24]:
# create a dataframe based on data
columns = ['neighborhood', 'borough', 'latitude', 'longitude']
df = pd.DataFrame(dictdata, columns=columns)

In [30]:
df

Unnamed: 0,neighborhood,borough,latitude,longitude
0,Wakefield,Bronx,40.894705,-73.847201
1,Co-op City,Bronx,40.874294,-73.829939
2,Eastchester,Bronx,40.887556,-73.827806
3,Fieldston,Bronx,40.895437,-73.905643
4,Riverdale,Bronx,40.890834,-73.912585
...,...,...,...,...
301,Hudson Yards,Manhattan,40.756658,-74.000111
302,Hammels,Queens,40.587338,-73.805530
303,Bayswater,Queens,40.611322,-73.765968
304,Queensbridge,Queens,40.756091,-73.945631


In [31]:
# save neighborhoods dataframe
df.to_csv('data/nyc_hoods_geo.csv', index=False)

In [36]:
# create a map
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add neighborhood location markers to map
for lat, lng, borough, neighborhood in zip(df['latitude'], df['longitude'], df['borough'], df['neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        tooltip=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork) 
map_newyork.save("data/map_newyork.html")
map_newyork

### Task
Use different data sources and APIs to collect information about the neigborhoods that can be used for segmentation.

#### Yelp API

In [None]:
# initialize Yelp API
yelp_api_key = os.environ['YELP_KEY']

#### FourSquare API

In [37]:
# initialize FS API
fs_api_key = os.environ['FS_API_KEY']
fs_secret = os.environ['FS_CL_SECRET']

headers = {
    'Accept': 'application/json',
    'Authorization': fs_api_key
}
url="https://api.foursquare.com/v3/places/search"
radius = "&radius=1000"
limit = "&limit=50"


In [80]:
#get venues list given a set of coordinates from FS API
def get_venues(coords):
    stripcoords = coords.replace(" ", "")
    url_keys_append = "?ll=" + stripcoords + radius + limit
    requrl = url + url_keys_append
    res = re.request("GET", requrl, headers=headers)
    dataset = res.json()
    return dataset

In [58]:
testset = get_venues("40.7128,-74.0060")

In [59]:
len(testset['results'])

50

In [78]:
testset['results'][0]

{'fsq_id': '53373f26498e940581c90985',
 'categories': [{'id': 18025,
   'name': 'Dance Studio',
   'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/arts_entertainment/performingarts_dancestudio_',
    'suffix': '.png'}}],
 'chains': [],
 'distance': 128,
 'geocodes': {'main': {'latitude': 40.714157, 'longitude': -74.005669},
  'roof': {'latitude': 40.714157, 'longitude': -74.005669}},
 'link': '/v3/places/53373f26498e940581c90985',
 'location': {'address': '280 Broadway',
  'census_block': '360610031002003',
  'country': 'US',
  'cross_street': 'at Chambers St',
  'dma': 'New York',
  'formatted_address': '280 Broadway (at Chambers St), New York, NY 10007',
  'locality': 'New York',
  'neighborhood': ['Chinatown'],
  'postcode': '10007',
  'region': 'NY'},
 'name': 'Gibney Dance',
 'related_places': {'children': [{'fsq_id': '5e1a1205a97b280008a9e012',
    'name': 'Saltdrop'}]},
 'timezone': 'America/New_York'}

In [77]:
print(testset['results'][0]['name'])
print(testset['results'][0]['categories'][0]['name'])

Gibney Dance
Dance Studio


In [81]:
#empty dataframe for venue types
venuesdf = pd.DataFrame(columns=['neighborhood', 'borough', 'venue', 'venue_name'])

In [82]:
# for loop to get venues info for each neighborhood
#get venues info for each neighborhood
for i in range(len(df)):
    nhoodname = df.at[i, 'neighborhood']
    borough = df.at[i, 'borough']
    coord = str(df.at[i, 'latitude']) + ',' + str(df.at[i, 'longitude'])
    fs_res = get_venues(coord)
    print(nhoodname, len(fs_res['results']), end=';')
    for j in range(len(fs_res['results'])):
        if len(fs_res['results'][j]['categories']) > 0:
            venue_type = fs_res['results'][j]['categories'][0]['name']
            venue_name = fs_res['results'][j]['name']
            dict_to_df = {'neighborhood' : nhoodname, 'borough' : borough, 'venue': venue_type, 'venue_name': venue_name}
            venuesdf = venuesdf.append(dict_to_df, ignore_index=True)
    time.sleep(7)

Wakefield 40
Co-op City 44
Eastchester 41
Fieldston 34
Riverdale 26
Kingsbridge 33
Marble Hill 47
Woodlawn 31
Norwood 39
Williamsbridge 50
Baychester 47
Pelham Parkway 36
City Island 21
Bedford Park 38
University Heights 30
Morris Heights 46
Fordham 50
East Tremont 39
West Farms 23
High  Bridge 10
Melrose 50
Mott Haven 48
Port Morris 45
Longwood 50
Hunts Point 32
Morrisania 35
Soundview 40
Clason Point 5
Throgs Neck 24
Country Club 41
Parkchester 47
Westchester Square 45
Van Nest 43
Morris Park 50
Belmont 50
Spuyten Duyvil 37
North Riverdale 34
Pelham Bay 44
Schuylerville 37
Edgewater Park 29
Castle Hill 45
Olinville 50
Pelham Gardens 21
Concourse 42
Unionport 50
Edenwald 43
Bay Ridge 50
Bensonhurst 40
Sunset Park 50
Greenpoint 50
Gravesend 31
Brighton Beach 40
Sheepshead Bay 50
Manhattan Terrace 33
Flatbush 50
Crown Heights 44
East Flatbush 38
Kensington 44
Windsor Terrace 50
Prospect Heights 50
Brownsville 44
Williamsburg 50
Bushwick 38
Bedford Stuyvesant 50
Brooklyn Heights 50
Cobbl

In [132]:
venuesdf.head()

Unnamed: 0,neighborhood,borough,venue,venue_name
0,Wakefield,Bronx,Ice Cream Parlor,Lollipops Gelato
1,Wakefield,Bronx,Drugstore,Walgreens
2,Wakefield,Bronx,Ice Cream Parlor,Carvel
3,Wakefield,Bronx,Bagel Shop,Dunkin'
4,Wakefield,Bronx,Fast Food Restaurant,Subway


In [96]:
# save venue dataframe to csv
venuesdf.to_csv('data/venues.csv', index=False)

In [141]:
venuesdf.venue.value_counts()

Pizzeria                       602
Restaurant                     445
Bakery                         389
Grocery Store / Supermarket    383
Bagel Shop                     337
                              ... 
Luggage Store                    1
Golf                             1
Flea Market                      1
Rental Service                   1
Entertainment Service            1
Name: venue, Length: 419, dtype: int64

In [104]:
# rename categories to make them more general
for i in range(len(venuesdf)):
    if 'Restaurant' in venuesdf.at[i, 'venue']:
        venuesdf.at[i, 'venue'] = 'Restaurant'
    if 'Laundr' in venuesdf.at[i, 'venue']:
        venuesdf.at[i, 'venue'] = 'Laundry'
    if 'Bar' in venuesdf.at[i, 'venue']:
        venuesdf.at[i, 'venue'] = 'Bar'

In [185]:
# one hot encoding venues and drop venue column
venues_onehot = pd.get_dummies(venuesdf, prefix='', prefix_sep='', columns=['venue'])

In [187]:
venues_grouped = venues_onehot.groupby(['neighborhood', 'borough']).mean().reset_index()

In [188]:
len(venues_grouped)

306

In [190]:
# save grouped venues
venues_grouped.to_csv('data/venues_grouped.csv', index=False)

In [191]:
venues_grouped.head()

Unnamed: 0,neighborhood,borough,ATM,Accounting and Bookkeeping Service,Advertising Agency,Afghan Restaurant,African Restaurant,Agriculture and Forestry Service,Airport Service,American Restaurant,...,Waterfront,Whisky Bar,Wine Bar,Wine Store,Winery,Wings Joint,Women's Store,Xinjiang Restaurant,Youth Organization,Zoo
0,Allerton,Bronx,0.0,0.021277,0.0,0.0,0.0,0.0,0.0,0.042553,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Annadale,Staten Island,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Arden Heights,Staten Island,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Arlington,Staten Island,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Arrochar,Staten Island,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [237]:
# make a dataframe with each neighborhoods top 10 venues
num_top_venues = 10

columns = ['neighborhood', 'borough']
for i in range(num_top_venues):
    columns.append(str(i+1) + ' common')

common_venues = pd.DataFrame(columns=columns)
common_venues['neighborhood'] = venues_grouped['neighborhood']
common_venues['borough'] = venues_grouped['borough']

# for each neighborhood, get the top 10 venus an add to common_venues
for i in range(len(venues_grouped)):
    venueslist = venues_grouped.iloc[i, 2:].sort_values(ascending=False)[:10].index.to_list()
    for j in range(len(venueslist)):
        common_venues.iloc[i, j + 2] = venueslist[j]

common_venues.head()

Unnamed: 0,neighborhood,borough,1 common,2 common,3 common,4 common,5 common,6 common,7 common,8 common,9 common,10 common
0,Allerton,Bronx,Pizzeria,Hair Salon,Drugstore,Fast Food Restaurant,Bagel Shop,Furniture and Home Store,Barbershop,Deli,American Restaurant,Chinese Restaurant
1,Annadale,Staten Island,Diner,Pizzeria,Dance Studio,Hiking Trail,Italian Restaurant,Playground,Chinese Restaurant,Eyecare Store,Sushi Restaurant,Hair Salon
2,Arden Heights,Staten Island,Playground,Park,Chinese Restaurant,Restaurant,Bridge,Sushi Restaurant,Spa,Shopping Mall,Bank,Hiking Trail
3,Arlington,Staten Island,Deli,Laundry Service,Storage Facility,Food Truck,General Contractor,Car Parts and Accessories,Organization,Fast Food Restaurant,Hardware Store,Coffee Shop
4,Arrochar,Staten Island,Baseball Field,Sporting Goods Retail,American Restaurant,Pizzeria,Beach,Grocery Store / Supermarket,Plaza,Lounge,Sports and Recreation,Optometrist


In [238]:
# save to csv
common_venues.to_csv('data/venues_common.csv', index=False)

#### HOUSING PRICES DATA

In [153]:
# CREATE A MAP OF HOUSING PRICES DATA TO COMPARE TO
#get housing prices data
febexcel = pd.read_excel('data/nyc_housing_prices_feb_2021.xlsx')
julcsv = pd.read_csv('data/nyc_housing_prices_jul_2020.csv')

In [154]:
#working on these columns a lot
process_cols = ['studio', '1_bedroom', '2_bedroom']

In [155]:
#remove commas and dollar signs so we can count mean
for i in process_cols:
    febexcel[i] = febexcel[i].str.replace(',', '')
    febexcel[i] = febexcel[i].str.replace('$', '')

for i in julcsv.columns[2:6]:
    julcsv[i] = julcsv[i].str.replace(',', '')
    julcsv[i] = julcsv[i].str.replace('$', '')


In [156]:
#realign excel table data because it is not aligned in the end (check febexcel.tail())
#take it all apart first. im sure theres a more elegant way
febexcelstud = febexcel[['neighborhood','studio']].copy()
febexcelonebd = febexcel[['neighborhood.1','1_bedroom']].copy()
febexcelonebd.rename(columns={"neighborhood.1":"neighborhood"}, inplace=True)
febexceltwobd = febexcel[['neighborhood.2', '2_bedroom']].copy()
febexceltwobd.rename(columns={"neighborhood.2":"neighborhood"}, inplace=True)

#drop null values from datasets, otherwise merge will give too many results
febexcelstud.dropna(subset=['studio'], inplace=True)
febexcelonebd.dropna(subset=['1_bedroom'], inplace=True)
febexceltwobd.dropna(subset=['2_bedroom'], inplace=True)

#make existing values floats
febexcelstud['studio'] = febexcelstud['studio'].astype(float)
febexcelonebd['1_bedroom'] = febexcelonebd['1_bedroom'].astype(float)
febexceltwobd['2_bedroom'] = febexceltwobd['2_bedroom'].astype(float)

In [157]:
#merge separated tables on neighborhood
#merge on right because the second and third columns had more values
feb_merged_one = pd.merge(febexcelstud, febexcelonebd, how='right', on='neighborhood')
feb_merged = pd.merge(feb_merged_one, febexceltwobd, how='right', on='neighborhood')

In [158]:
#fill nans with column mean
for i in process_cols:
    feb_merged[i].fillna((feb_merged[i].mean()), inplace=True)

In [159]:
#make new column called mean based on the other values
feb_merged['mean'] = feb_merged.mean(axis=1)

In [160]:
feb_merged.head()

Unnamed: 0,neighborhood,studio,1_bedroom,2_bedroom,mean
0,Upper West Side,1975.0,2750.0,4350.0,3025.0
1,Bedford-Stuyvesant,1650.0,1975.0,2200.0,1941.666667
2,Bushwick,1731.0,2150.0,2100.0,1993.666667
3,Upper East Side,1898.0,2395.0,3295.0,2529.333333
4,Williamsburg,2491.0,2500.0,2584.0,2525.0


In [161]:
#on to the july 2021 dataset
#spelling error in column name
julcsv.rename(columns={"neigborhood":"neighborhood"}, inplace=True)

#drop borough and 3_bedroom because we dont have that info in the prev dataset
julcsv.drop(columns=['borough', '3_bedroom'], inplace=True)

# red hook is missing a val
julcsv.at[50, 'studio'] = 0
#make those values float
for i in process_cols:
    julcsv[i] = julcsv[i].astype(float)
    
#red hook again
julcsv.at[50, 'studio'] = julcsv['studio'].mean()

#get mean for this dataset as well
julcsv['mean'] = julcsv.mean(axis=1)

In [162]:
# MERGE TWO DATASETS
prices = pd.merge(feb_merged, julcsv, how='outer', on='neighborhood')
prices.sort_values(by=['neighborhood'], inplace=True)

In [163]:
#if left dataset has nan, take it from the right dataset
for i in range(len(prices)):
    if (prices.at[i, 'studio_x'] != prices.at[i, 'studio_x']):
             prices.at[i, 'mean_x'] = prices.at[i, 'mean_y']

In [164]:
#drop columns we wont use
prices.drop(columns=['studio_x', '1_bedroom_x', '2_bedroom_x', 'studio_y', '1_bedroom_y', '2_bedroom_y', 'mean_y'], inplace=True)

In [165]:
# merge housing price data with neighborhood data
prices = pd.merge(prices, df, how='inner', on='neighborhood')


In [166]:
prices.head()

Unnamed: 0,neighborhood,mean_x,borough,latitude,longitude
0,Allerton,1625.0,Bronx,40.865788,-73.859319
1,Arden Heights,2099.274673,Staten Island,40.549286,-74.185887
2,Astoria,1883.333333,Queens,40.768509,-73.915654
3,Auburndale,1916.666667,Queens,40.76173,-73.791762
4,Bath Beach,2133.333333,Brooklyn,40.599519,-73.998752


In [169]:
# show neighborhood data on map with mean_x

# color producer function
def color_producer(mean):
    if mean < 2000:
        return 'green'
    elif mean >= 2000 and mean < 2500:
        return 'yellow'
    elif mean >= 2500 and mean < 3000:
        return 'orange'
    elif mean >= 3000:
        return 'red'

#create a map
map_nyc_housing = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map, color code mean_x
for i in range(len(prices)):
    folium.Circle(
        location=[prices.at[i, 'latitude'], prices.at[i, 'longitude']],
        radius=900,
        fill_color=color_producer(prices.at[i, 'mean_x']),
        fill_opacity=0.6,
        stroke=False,
        tooltip=prices.at[i, 'mean_x']
    ).add_to(map_nyc_housing)

# save map and diplay it
map_nyc_housing.save('data/nyc_housing_map.html')
map_nyc_housing

In [170]:
#save the merged dataset
prices.to_csv('data/housing_avg.csv')

#### PARSING LYFT PICKUP LOCATIONS

In [113]:
from tqdm import tqdm

# import lyft data
lyft = pd.read_csv('data/other_lyft_b02510.csv')

#one zero, remove it and reset index for looping
lyft = lyft.dropna().reset_index()
lyftdf = df.copy()

#make lyft pickups column
lyftdf['lyft_pickups'] = 0

In [114]:
len(lyftdf)

306

In [205]:
# assign lyft pickups to neighborhoods by checking distance from neighborhood
# assign it to neighborhood if its less than 900m from it
# This takes about an hour to run
for i in tqdm(range(len(lyft))):   
    pickupcoords = (lyft.at[i, 'start_lat'], lyft.at[i, 'start_lng'])
    for j in range(len(df)):
        nbhoodcoords = (lyftdf.at[j, 'latitude'], lyftdf.at[j, 'longitude'])
        dist = distance.distance(pickupcoords, nbhoodcoords).m
        if dist < 900:
            lyftdf.at[j, 'lyft_pickups'] += 1
            break

100%|██████████| 267700/267700 [1:06:42<00:00, 66.88it/s] 


In [206]:
lyftdf.to_csv('data/lyftpickups.csv', index=False)

In [207]:
lyftdf.head()

Unnamed: 0,neighborhood,borough,latitude,longitude,lyft_pickups
0,Wakefield,Bronx,40.894705,-73.847201,12
1,Co-op City,Bronx,40.874294,-73.829939,36
2,Eastchester,Bronx,40.887556,-73.827806,13
3,Fieldston,Bronx,40.895437,-73.905643,28
4,Riverdale,Bronx,40.890834,-73.912585,41
