# The *Perfect* Fit
  With this Notebook I will be extracting data with the Foursquare API and scrapping RentHop.com for average rent for one bedroom apartments. Once all data is obtained, it will be cleaned and processed by city, by neighborhood. This will utilize the k-means clustering algorithm to further choose between which city is my personal best fit and will contribute to neighborhood decisions as well. 

In [19]:
import numpy as np                          # library to handle data in a vectorized manner
import pandas as pd                         # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json                                 # library to handle JSON files
from geopy.geocoders import Nominatim       # convert an address into latitude and longitude values
import requests                             # library to handle requests
from pandas.io.json import json_normalize   # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium                               # map rendering library

%matplotlib inline

print('Libraries imported.')

Libraries imported.


### Downloading Datasets for each city
  New York City - This data is the same as the week 3's "Segmenting and Clustering Neighborhoods" data  
    
  Boston - https://data.boston.gov/dataset/boston-neighborhoods/resource/13ee2b65-6547-4168-b112-83995f138602 geojson data of Boston's neighborhoods  
    
  Chicago - https://data.cityofchicago.org/Facilities-Geographic-Boundaries/Boundaries-Neighborhoods/bbvz-uum9 geojson data of Chicago's neighborhoods

In [20]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)
    print("New York JSON data loaded")

with open('Boston_Neighborhoods.json') as json_data:
    boston_data = json.load(json_data)
    print("Boston JSON data loaded")

with open('Chicago_Neighborhoods.json') as json_data:
    chicago_data = json.load(json_data)
    print("Chicago JSON data loaded")

nyc_neighborhoods_data = newyork_data['features']
bos_neighborhoods_data = boston_data['features']
chi_neighborhoods_data = chicago_data['features']

New York JSON data loaded
Boston JSON data loaded
Chicago JSON data loaded


### Transform into pandas df

In [21]:
nyc_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 
bos_names = ['Neighborhood', 'Latitude', 'Longitude']
chi_names = ['Primary Neighborhood', 'Secondary Neighborhood', 'Latitude', 'Longitude']

nyc_hoods = pd.DataFrame(columns=nyc_names)
bos_hoods = pd.DataFrame(columns=bos_names)
chi_hoods = pd.DataFrame(columns=chi_names)

### If the JSON files were all configured the same...
  ~~hoods = ny_hoods, bos_hoods, chi_hoods  
  for hood in hoods:~~

In [22]:
# Loop for New York City Data
for data in nyc_neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    nyc_hoods = nyc_hoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [23]:
# Loop for Boston Data
for data in bos_neighborhoods_data:
    neighborhood_name = data["properties"]["Name"]
    address = neighborhood_name + ", MA"

    geolocator = Nominatim(user_agent="bos_explorer")
    location = geolocator.geocode(address)
    try:
        latitude = location.latitude
        longitude = location.longitude
    except AttributeError:
        pass
    #print('The geograpical coordinate of {} are {}, {}.'.format(neighborhood_name, latitude, longitude))
    bos_hoods = bos_hoods.append({'Neighborhood': neighborhood_name,
                                  'Latitude': latitude,
                                  'Longitude': longitude}, ignore_index=True)

In [24]:
nyc_hoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [25]:
# Manually fixing specific neighborhoods
### Leather District
bos_hoods.loc[[5], ["Latitude"]] = 42.3505
bos_hoods.loc[[5], ["Longitude"]] = -71.0579

### Harbor Islands
bos_hoods.loc[[25], ["Latitude"]] = 42.3167
bos_hoods.loc[[25], ["Longitude"]] = -70.9681

### South Boston Waterfront
bos_hoods.loc[[22], ["Latitude"]] = 42.3483
bos_hoods.loc[[22], ["Longitude"]] = -71.0428

### Downtown
bos_hoods.loc[[15], ["Latitude"]] = 42.3557
bos_hoods.loc[[15], ["Longitude"]] = -71.0572

bos_hoods.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Roslindale,42.291209,-71.124497
1,Jamaica Plain,42.30982,-71.12033
2,Mission Hill,42.33256,-71.103608
3,Longwood,42.341718,-71.109922
4,Bay Village,42.350011,-71.066948


In [26]:
for data in chi_neighborhoods_data:
    primary_name = data["properties"]["pri_neigh"]
    secondary_name = data["properties"]["sec_neigh"].title()
    address = primary_name + ", Chicago"

    geolocator = Nominatim(user_agent="chi_explorer")
    location = geolocator.geocode(address)
    try:
        latitude = location.latitude
        longitude = location.longitude
    except AttributeError:
        pass
    #print('The geograpical coordinate of {} are {}, {}.'.format(primary_name, latitude, longitude))
    chi_hoods = chi_hoods.append({'Primary Neighborhood': primary_name,
                                  'Secondary Neighborhood': secondary_name,
                                  'Latitude': latitude,
                                  'Longitude': longitude}, ignore_index=True)

In [27]:
# Manually fixing specific neighborhoods
### River North
chi_hoods.loc[[32], ["Latitude"]] = 41.8924
chi_hoods.loc[[32], ["Longitude"]] = -87.6341

### Musuem Campus 
chi_hoods.loc[[97], ["Latitude"]] = 41.8636
chi_hoods.loc[[97], ["Longitude"]] = -87.6163

chi_hoods.head()

Unnamed: 0,Primary Neighborhood,Secondary Neighborhood,Latitude,Longitude
0,Grand Boulevard,Bronzeville,41.813923,-87.617272
1,Printers Row,Printers Row,41.873787,-87.6289
2,United Center,United Center,41.880683,-87.674185
3,Sheffield & DePaul,Sheffield & Depaul,41.880683,-87.674185
4,Humboldt Park,Humboldt Park,41.905767,-87.704174


### Creating Maps of each city

In [28]:
address = ['New York City, NY', "Boston, MA", "Chicago, IL"]
city_dict = {}
for city in address:
    geolocator = Nominatim(user_agent="ny_explorer")
    location = geolocator.geocode(city)
    latitude = location.latitude
    longitude = location.longitude
    city_dict[city] = [latitude, longitude]
    print('The geograpical coordinate of {} are {}, {}.'.format(city, latitude, longitude))

The geograpical coordinate of New York City, NY are 40.7127281, -74.0060152.
The geograpical coordinate of Boston, MA are 42.3602534, -71.0582912.
The geograpical coordinate of Chicago, IL are 41.8755616, -87.6244212.


In [29]:
# create map of New York using latitude and longitude values
#map_newyork = folium.Map(location=city_dict['New York City, NY'], zoom_start=10)

# add markers to map
#for lat, lng, borough, neighborhood in zip(nyc_hoods['Latitude'], nyc_hoods['Longitude'], nyc_hoods['Borough'], nyc_hoods['Neighborhood']):
#    label = '{}, {}'.format(neighborhood, borough)
#    label = folium.Popup(label, parse_html=True)
#    folium.CircleMarker(
#        [lat, lng],
#        radius=5,
#        popup=label,
#        color='blue',
#        fill=True,
#        fill_color='#3186cc',
#        fill_opacity=0.7,
#        parse_html=False).add_to(map_newyork)  

#map_newyork

In [30]:
# create map of Boston using latitude and longitude values
#map_boston = folium.Map(location=city_dict['Boston, MA'], zoom_start=12)

# add markers to map
#for lat, lng, neighborhood in zip(bos_hoods['Latitude'], bos_hoods['Longitude'], bos_hoods['Neighborhood']):
#    label = '{}'.format(neighborhood)
#    label = folium.Popup(label, parse_html=True)
#    folium.CircleMarker(
#        [lat, lng],
#        radius=5,
#        popup=label,
#        color='green',
#        fill=True,
#        fill_color='#90ee90',
#        fill_opacity=0.7,
#        parse_html=False).add_to(map_boston)  

#map_boston

In [31]:
# create map of Chicago using latitude and longitude values
#map_chicago = folium.Map(location=city_dict['Chicago, IL'], zoom_start=10)

# add markers to map

#for lat, lng, borough, neighborhood in zip(chi_hoods['Latitude'], chi_hoods['Longitude'], chi_hoods['Secondary Neighborhood'], chi_hoods['Primary Neighborhood']):
#    label = '{}, {}'.format(neighborhood, borough)
#    label = folium.Popup(label, parse_html=True)
#    folium.CircleMarker(
#        [lat, lng],
#        radius=5,
#        popup=label,
#        color='red',
#        fill=True,
#        fill_color='#E62020',
#        fill_opacity=0.35,
#        parse_html=False).add_to(map_chicago)  

#map_chicago

In [32]:
# create map of United States using latitude and longitude values
map_usa = folium.Map(location=[42, -79], zoom_start=6)

for lat, lng, borough, neighborhood in zip(nyc_hoods['Latitude'], nyc_hoods['Longitude'], nyc_hoods['Borough'], nyc_hoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_usa) 

for lat, lng, neighborhood in zip(bos_hoods['Latitude'], bos_hoods['Longitude'], bos_hoods['Neighborhood']):
    label = '{}, Boston'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#90ee90',
        fill_opacity=0.7,
        parse_html=False).add_to(map_usa)  

for lat, lng, neighborhood in zip(chi_hoods['Latitude'], chi_hoods['Longitude'], chi_hoods['Primary Neighborhood']):
    label = '{}, Chicago'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#E62020',
        fill_opacity=0.35,
        parse_html=False).add_to(map_usa)  

map_usa

# Using Foursquare API to Categorize each neighborhood

### Defining Foursquare Credientials

In [33]:
CLIENT_ID = 'GNZ1Q222RCBIGH1JROMBBDG54HVDWGSTYKARWOIV2S3M2H1R' # your Foursquare ID
CLIENT_SECRET = 'H0YAPWYAUDS0OTC0XFHPWKFPY5RZCFMP1N3ZRFF0YYGUZZEL' # your Foursquare Secret
VERSION = '20200723' # Foursquare API version
LIMIT = 50 # limit of number of venues returned by Foursquare API

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: GNZ1Q222RCBIGH1JROMBBDG54HVDWGSTYKARWOIV2S3M2H1R
CLIENT_SECRET:H0YAPWYAUDS0OTC0XFHPWKFPY5RZCFMP1N3ZRFF0YYGUZZEL


## Venues Function
   With this function, venues throughout all three cities will be captured

In [34]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Obtaining Venues for all neighborhoods

In [35]:
try:
    nyc_venues = getNearbyVenues(names=nyc_hoods['Neighborhood'],
                                 latitudes=nyc_hoods['Latitude'],
                                 longitudes=nyc_hoods['Longitude']
                                  )
except KeyError:
    pass

print("\n All NYC Neighborhoods Analyzed! \n")

try:
    bos_venues = getNearbyVenues(names=bos_hoods['Neighborhood'],
                                 latitudes=bos_hoods['Latitude'],
                                 longitudes=bos_hoods['Longitude']
                                 )
except KeyError:
    pass

print("\n All BOS Neighborhoods Analyzed! \n")

try:
    chi_venues = getNearbyVenues(names=chi_hoods['Primary Neighborhood'],
                             latitudes=chi_hoods['Latitude'],
                             longitudes=chi_hoods['Longitude']
                             )
except KeyError:
    pass

print("\n All CHI Neighborhoods Analyzed! \n")

Wakefield

 All NYC Neighborhoods Analyzed! 

Roslindale

 All BOS Neighborhoods Analyzed! 

Grand Boulevard

 All CHI Neighborhoods Analyzed! 



### Inspecting Data to see how it was formatted

In [36]:
print(nyc_venues.shape)
nyc_venues.head()

NameError: name 'nyc_venues' is not defined

In [None]:
print(bos_venues.shape)
bos_venues.head()

In [None]:
print(chi_venues.shape)
chi_venues.head()

### One Hot encoding technique

In [79]:
# one hot encoding
nyc_onehot = pd.get_dummies(nyc_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
nyc_onehot['Neighborhood'] = nyc_venues['Neighborhood'] + ", NY"

# move neighborhood column to the first column
fixed_columns = [nyc_onehot.loc[:, "Neightborhoods"]] + list(nyc_onehot.columns[:-1])
nyc_onehot = nyc_onehot[fixed_columns]

nyc_onehot.head()

In [None]:
# one hot encoding
bos_onehot = pd.get_dummies(bos_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
bos_onehot['Neighborhood'] = bos_venues['Neighborhood'] ", MA"

# move neighborhood column to the first column
fixed_columns = [bos_onehot.loc[:, "Neightborhoods"]] + list(bos_onehot.columns[:-1])
bos_onehot = bos_onehot[fixed_columns]

bos_onehot.head()

In [None]:
# one hot encoding
chi_onehot = pd.get_dummies(chi_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
chi_onehot['Neighborhood'] = chi_venues['Neighborhood'] ", IL"

# move neighborhood column to the first column
fixed_columns = [chi_onehot.loc[:, "Neightborhoods"]] + list(chi_onehot.columns[:-1])
chi_onehot = chi_onehot[fixed_columns]

chi_onehot.head()

### Frequencies by neighborhoods

# CREATE A FULL TABLE OF ALL NEIGHBORHOODS!!!

In [None]:
nyc_grouped = nyc_onehot.groupby('Neighborhood').mean().reset_index()
bos_grouped = bos_onehot.groupby('Neighborhood').mean().reset_index()
chi_grouped = chi_onehot.groupby('Neighborhood').mean().reset_index()

cities_grouped = 

### This function will sort venues by descending order

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = cities_grouped['Neighborhood']