# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import numpy as np 

import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json 

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim 

import requests 
from pandas.io.json import json_normalize 

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes 
import folium 

from bs4 import BeautifulSoup

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.


## Part 1: Web Scraping Toronto Data

In [2]:
# Wikipedia page that the postal-code table is scraped from.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
source = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of parsing an error page in the next cell.
source.raise_for_status()

In [3]:
# Parse the downloaded page and locate the first <table> element in it.
soup = BeautifulSoup(source.content, 'lxml')
table = soup.find('table')
if table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next statement.
    raise ValueError('No <table> element found in the downloaded page.')

# Stripped text of every <p> inside the table; cells of 15 characters or fewer
# are discarded.
cell_data = [p.text.strip() for p in table.find_all('p')]
cell_data = [cell for cell in cell_data if len(cell) > 15]

postal_codes = []
boroughs = []
neighborhoods = []
for cell in cell_data:
    borough_index = cell.find('(')
    if borough_index == -1:
        # str.find returns -1 when there is no '('; slicing with -1 would
        # silently produce wrong values, so cells without that layout are skipped.
        continue
    # First three characters are the postal code, the text up to '(' follows,
    # and the remainder (minus the final character) is the parenthesised part.
    postal_codes.append(cell[:3])
    neighborhoods.append(cell[3:borough_index])
    boroughs.append(cell[borough_index + 1:-1])

# NOTE(review): the text before '(' ends up in the 'Neighborhood' column and the
# parenthesised text in 'Borough'. The displayed sample (e.g. 'North York' under
# 'Neighborhood') suggests the two labels may be swapped. The later Toronto filter
# relies on the current labels, so they are kept as-is here; confirm before renaming.
data = list(zip(postal_codes, boroughs, neighborhoods))
df = pd.DataFrame(data, columns=['Postal Code', 'Borough', 'Neighborhood'])
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,Parkwoods,North York
1,M4A,Victoria Village,North York
2,M5A,Regent Park / Harbourfront,Downtown Toronto
3,M6A,Lawrence Manor / Lawrence Heights,North York
4,M7A,Ontario Provincial Government,Queen's Park


## Part 2: Cleaning Data and Finding Geocodes

### Appending Latitude and Longitude Columns

In [5]:
# Table with 'Postal Code', 'Latitude' and 'Longitude' columns, downloaded as CSV.
lat_long = pd.read_csv('https://cocl.us/Geospatial_data')
# Bare last expression so the notebook renders the frame with its rich display
# instead of the plain-text form produced by print().
lat_long.head()

  Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476


In [6]:
# Attach the coordinates to each postal code (default inner join on 'Postal Code').
# Coordinate columns left over from an earlier run of this cell are dropped first,
# so re-running it does not produce duplicated '_x' / '_y' columns.
coord_columns = lat_long.columns.difference(['Postal Code'])
df = (
    df
    .drop(columns=coord_columns, errors='ignore')
    .merge(lat_long, on='Postal Code')
)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,Parkwoods,North York,43.753259,-79.329656
1,M4A,Victoria Village,North York,43.725882,-79.315572
2,M5A,Regent Park / Harbourfront,Downtown Toronto,43.65426,-79.360636
3,M6A,Lawrence Manor / Lawrence Heights,North York,43.718518,-79.464763
4,M7A,Ontario Provincial Government,Queen's Park,43.662301,-79.389494


### Filtering Only Neighborhoods in Toronto

In [7]:
# Keep only the rows whose 'Neighborhood' value contains 'Toronto'.
# na=False makes missing values count as non-matches, so the mask is always boolean.
toronto_mask = df['Neighborhood'].str.contains('Toronto', na=False)
# reset_index returns a new frame; this avoids an in-place change on a filtered slice of df.
toronto_df = df[toronto_mask].reset_index(drop=True)
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Regent Park / Harbourfront,Downtown Toronto,43.65426,-79.360636
1,M5B,"Garden District, Ryerson",Downtown Toronto,43.657162,-79.378937
2,M5C,St. James Town,Downtown Toronto,43.651494,-79.375418
3,M4E,The Beaches,East Toronto,43.676357,-79.293031
4,M5E,Berczy Park,Downtown Toronto,43.644771,-79.373306


## Part 3: Clustering and Analyzing Data

### Mapping Toronto Neighborhoods

In [8]:
# Look up the coordinates used to centre the maps below.
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode returns None when the service finds no match; fail with a clear
    # message instead of an AttributeError on the attribute access below.
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [9]:
# Base map centred on the geocoded coordinates.
toronto_map = folium.Map(
    location=[latitude, longitude],
    zoom_start=11,
)

In [10]:
# Add one circle marker per row of df, with a "<Neighborhood>, <Borough>" popup.
marker_columns = df[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for point_lat, point_lng, borough_name, neighborhood_name in marker_columns.itertuples(index=False, name=None):
    popup_text = '{}, {}'.format(neighborhood_name, borough_name)
    marker = folium.CircleMarker(
        [point_lat, point_lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(toronto_map)
toronto_map

### K-Means Clustering

In [11]:
# Number of clusters for K-Means.
k = 5
# Cluster on the coordinates only. Selecting the two columns by name (rather than
# dropping the others) keeps 'Cluster Labels' out of the features when this cell is
# re-run, and avoids the positional `axis` argument to drop(), which pandas 2.0 rejects.
X = toronto_df[['Latitude', 'Longitude']]
# random_state fixes the centroid initialisation so the labels are reproducible.
k_means = KMeans(init='k-means++', n_clusters=k, n_init=12, random_state=0).fit(X)
labels = k_means.labels_

In [12]:
# Put the K-Means label of each row in the first column.
# A 'Cluster Labels' column from a previous run is dropped first, because
# DataFrame.insert raises ValueError when the column already exists.
toronto_df = toronto_df.drop(columns=['Cluster Labels'], errors='ignore')
toronto_df.insert(0, 'Cluster Labels', labels)

In [14]:
# Preview the first five rows, now including the cluster label column.
toronto_df.head(5)

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,2,M5A,Regent Park / Harbourfront,Downtown Toronto,43.65426,-79.360636
1,2,M5B,"Garden District, Ryerson",Downtown Toronto,43.657162,-79.378937
2,2,M5C,St. James Town,Downtown Toronto,43.651494,-79.375418
3,1,M4E,The Beaches,East Toronto,43.676357,-79.293031
4,2,M5E,Berczy Park,Downtown Toronto,43.644771,-79.373306


In [15]:
# Map of the filtered rows, with each marker coloured by its K-Means cluster.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# One hex colour per cluster, evenly spaced over the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

cluster_points = zip(
    toronto_df['Latitude'],
    toronto_df['Longitude'],
    toronto_df['Neighborhood'],
    toronto_df['Cluster Labels'],
)
for lat, lon, poi, cluster in cluster_points:
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # K-Means labels run from 0 to k-1, so they index the colour list directly;
    # no reliance on negative-index wraparound for cluster 0.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## In the map shown above, the Toronto neighborhoods were clustered with K-Means using only latitude and longitude, so each cluster groups neighborhoods that are geographically close to one another.