## Define Neighborhoods
Neighborhoods will be defined by five-digit ZIP code in New York, and by three-character postal code prefix (e.g. M5A) in Toronto.


### Toronto Neighborhoods
Follows the procedure developed in previous weeks.
<li>Scrape Wikipedia Page for Postal Codes</li>
<li>Add geocoding data</li>
<li>Remove three postal codes that are grouped too closely to others</li>

In [1]:
# scrape wikipedia page
import os
import urllib.request
from bs4 import BeautifulSoup

target_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Parse inside a context manager so the HTTP response is closed as soon as
# BeautifulSoup has read it, rather than staying open for the kernel's lifetime.
with urllib.request.urlopen(target_url) as webpage:
    soup = BeautifulSoup(webpage, 'html.parser')

import pandas as pd
import numpy as np
# Build one [PostalCode, Borough, Neighborhood] record per postal code.
# Records are collected in a dict (insertion-ordered) and turned into a
# DataFrame once at the end; this avoids growing the frame row by row and
# avoids mixing index labels with positional `iloc` lookups.
nbh_columns = ['PostalCode', 'Borough', 'Neighborhood']
nbh_records = {}

for entry in soup.find_all('tr'):
    table_row = [cell.get_text().strip() for cell in entry.find_all('td')]
    # Only rows with exactly three cells are treated as table data.
    # This works for the postal code page, but is not robust.
    if len(table_row) != 3:
        continue
    postal_code, borough, neighborhood = table_row
    if borough == 'Not assigned':
        # postal code has no borough, so it is skipped entirely
        continue
    if neighborhood == 'Not assigned':
        # an unnamed neighborhood takes the name of its borough
        neighborhood = borough
    if postal_code in nbh_records:
        # duplicate postal zone, so append neighborhood to the existing record
        nbh_records[postal_code][2] += ', ' + neighborhood
    else:
        nbh_records[postal_code] = [postal_code, borough, neighborhood]

nbh_frame = pd.DataFrame(list(nbh_records.values()), columns=nbh_columns)
print("The Toronto Neighborhood defines {} individual neighborhoods".format(len(nbh_frame)))
nbh_frame.head()

The Toronto Neighborhood defines 103 individual neighborhoods


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [8]:
# attach latitude/longitude to every postal code using the provided geospatial table
target_url = 'http://cocl.us/Geospatial_data'
geodata_df = pd.read_csv(target_url)

toronto_geo_df = nbh_frame.merge(geodata_df, left_on='PostalCode', right_on='Postal Code')
# the right-hand key column duplicates 'PostalCode'; drop it when it is present
if 'Postal Code' not in toronto_geo_df.columns:
    print('not here')
else:
    print('here')
    toronto_geo_df.drop(columns='Postal Code', inplace=True)
toronto_geo_df.head()

here


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494


In [3]:
# Map the postal codes to get an idea about distribution
import folium

latitude, longitude = 43.66586, -79.38316  # center map on Church and Wellesley Postal Code Location
postal_zone_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# One marker per postal code. The loop uses its own names so that
# `latitude`/`longitude` keep holding the map center after the cell has run.
for marker_lat, marker_lon, postal_code in zip(toronto_geo_df['Latitude'],
                                               toronto_geo_df['Longitude'],
                                               toronto_geo_df['PostalCode']):
    folium.vector_layers.CircleMarker(
        [marker_lat, marker_lon],
        radius=5,
        color='blue',
        popup=postal_code,  # the popup shows only the postal code
        fill=True,
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(postal_zone_map)
# display map
postal_zone_map

In [4]:
# Remove postal codes that sit too close to others:
#   M5W - Stn A PO Boxes, 25 The Esplanade
#   M5L - Commerce Court, Victoria Hotel
#   M5X - First Canadian Place, Underground city
# plus one more neighborhood (M5K), to make the spacing of neighborhoods
# consistent with the New York ZIP codes.
crowded_codes = ['M5W', 'M5L', 'M5X', 'M5K']
crowded_rows = toronto_geo_df.index[toronto_geo_df['PostalCode'].isin(crowded_codes)]
toronto_geo_df.drop(crowded_rows, inplace=True)

In [5]:
# number of Toronto rows left after the removals
toronto_geo_df.shape[0]

99

### New York Neighborhoods
Uses data saved in Excel spreadsheets
<li>Multiple New York ZIP codes are grouped into a single 'Neighborhood' in the table from https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoods.htm</li>
<li>Add geocoding data</li>
<li>Remove ZIP codes that are grouped too closely to others</li>

In [6]:
# Read the neighborhood table. Its first three columns are used positionally:
# borough, neighborhood name, and a text list of ZIP codes.
dfs = pd.read_excel('nyc.xlsx', sheet_name=None)
ny_df = dfs['Sheet1']

# Expand each neighborhood row into one record per ZIP code.
# Columns are selected with `iloc`, so the lookup is explicitly positional and
# does not rely on integer keys falling back to positions on a label-indexed
# Series. The frame is built once from the collected records.
zip_records = [
    [zip_token.rstrip(','), borough, neighborhood]  # strip the list separator
    for borough, neighborhood, zip_list in ny_df.iloc[:, :3].itertuples(index=False, name=None)
    for zip_token in str(zip_list).split()
]
# NOTE: the 'Bourough' spelling is kept as-is because it is the column name
# carried into the merged and pickled frames.
nyzip_df = pd.DataFrame(zip_records, columns=['ZIPCode', 'Bourough', 'Neighborhood'])

# load the ZIP code coordinate table and give its three columns standard names
zip_geo_columns = ['ZIP', 'Latitude', 'Longitude']
geo_df = pd.read_csv('uszipcodes.csv')
geo_df.columns = zip_geo_columns
def convert_zip(int_zip):
    """Return `int_zip` as text, left-padded with zeros to at least five characters."""
    return str(int_zip).zfill(5)

# turn the ZIP keys into zero-padded text so they can be matched against nyzip_df['ZIPCode']
geo_df['ZIP'] = geo_df['ZIP'].map(convert_zip)

newyork_geo_df = nyzip_df.merge(geo_df, left_on='ZIPCode', right_on='ZIP')
# the right-hand key column duplicates 'ZIPCode'; drop it when it is present
if 'ZIP' not in newyork_geo_df.columns:
    print('not here')
else:
    print('here')
    newyork_geo_df.drop(columns='ZIP', inplace=True)
newyork_geo_df.head()


here


Unnamed: 0,ZIPCode,Bourough,Neighborhood,Latitude,Longitude
0,10453,Bronx,Central Bronx,40.852779,-73.912332
1,10457,Bronx,Central Bronx,40.84715,-73.89868
2,10460,Bronx,Central Bronx,40.841758,-73.879571
3,10458,Bronx,Bronx Park and Fordham,40.862543,-73.888143
4,10467,Bronx,Bronx Park and Fordham,40.869953,-73.865746


In [8]:
# Map the ZIP codes to get an idea about distribution
import folium

latitude, longitude = 40.852779, -73.912332  # center map on the ZIP 10453 (Central Bronx) location
postal_zone_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# One marker per ZIP code. The loop uses its own names so that
# `latitude`/`longitude` keep holding the map center after the cell has run.
for marker_lat, marker_lon, zip_code in zip(newyork_geo_df['Latitude'],
                                            newyork_geo_df['Longitude'],
                                            newyork_geo_df['ZIPCode']):
    folium.vector_layers.CircleMarker(
        [marker_lat, marker_lon],
        radius=5,
        color='blue',
        popup=zip_code,  # the popup shows only the ZIP code
        fill=True,
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(postal_zone_map)
# display map
postal_zone_map

In [9]:
# There is one pair of zip codes that are very closely spaced, I will drop one of them
# The area west of the park also seems to be missing postal codes.
newyork_geo_df.drop(newyork_geo_df.loc[newyork_geo_df['ZIPCode']=='10280'].index, inplace=True)

# For exploring distances between zip codes I am pickling these dataframes
# see distance notebook to see some of the tools used
newyork_geo_df.to_pickle("new_york_geodata.pkl")
toronto_geo_df.to_pickle("toronto_geodata.pkl")

# Show the remaining New York row count. It has to be the last expression of
# the cell to be rendered; placed mid-cell it was evaluated and discarded.
len(newyork_geo_df)

In [10]:
# After this additional review, remove five more ZIP codes to make the spacing
# of neighborhoods consistent across New York and Toronto
extra_zip_codes = ['10007', '10018', '10021', '10024', '10028']
extra_rows = newyork_geo_df.index[newyork_geo_df['ZIPCode'].isin(extra_zip_codes)]
newyork_geo_df.drop(extra_rows, inplace=True)


In [11]:
# temporary - write the trimmed neighborhood dataframes to disk for the next step
toronto_geo_df.to_pickle("toronto_n_geodata.pkl")
newyork_geo_df.to_pickle("new_york_n_geodata.pkl")