# Week 3 - Segmenting and Clustering Neighborhoods in Toronto

## This notebook contains all three parts of the Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto.

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Passing None removes pandas' display limit, so frames shown in this notebook
# are rendered without column or row truncation.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

!conda install -c conda-forge geopy --yes
!conda install -c conda-forge folium=0.5.0 --yes

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# NOTE(review): pandas >= 1.0 exposes this function as `pandas.json_normalize` and
# documents the `pandas.io.json` import path as deprecated -- confirm against the
# installed pandas version before upgrading.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library
from bs4 import BeautifulSoup # HTML parser used to scrape the postal code table

print('Libraries imported.')

Collecting package metadata: done
Solving environment: \ 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - conda-forge/linux-64::altair==2.4.1=py36_0
  - conda-forge/noarch::branca==0.3.1=py_0
  - conda-forge/linux-64::certifi==2019.3.9=py36_0
  - conda-forge/linux-64::conda==4.6.14=py36_0
  - conda-forge/noarch::folium==0.5.0=py_0
  - conda-forge/linux-64::geopy==1.11.0=py36_0
  - conda-forge/linux-64::giflib==5.1.7=h516909a_1
  - defaults/linux-64::grpcio==1.16.1=py36hf8bcb03_1
  - conda-forge/linux-64::ipython-sql==0.3.9=py36_1000
  - conda-forge/noarch::ipywidgets==7.4.2=py_0
  - conda-forge/linux-64::jpeg==9c=h14c3975_1001
  - conda-forge/linux-64::leptonica==1.76.0=h7f84942_1005
  - conda-forge/linux-64::libgcc==7.2.0=h69d50b8_2
  - conda-forge/linux-64::libwebp==1.0.2=h99fbfcb_2
  - conda-forge/linux-64::nodejs==9.11.0=0
  - conda-forge/linux-64::openjpeg==2.3.1=h58a6597_0
  - conda-forge/linux-64

### Download and Explore Dataset

In [3]:
# Download the Wikipedia page listing Canadian postal codes that start with "M"
# and parse it into a BeautifulSoup tree.
# NOTE(review): this is a live page, so its layout may differ between runs.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get(WIKI_URL, timeout=30)
# Fail here on an HTTP error instead of silently parsing an error page.
response.raise_for_status()

# `url` holds the page's HTML text (not a URL); the name is kept so that any
# other cell referring to it keeps working.
url = response.text
soup = BeautifulSoup(url, 'lxml')

In [4]:
# Turn the first <table> of the page into a DataFrame with one row per
# (postcode, borough, neighbourhood) table row.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page.')

Postcode = []
Borough = []
Neighbourhood = []

# Walk the table row by row rather than stepping through a flat list of cells
# in threes: a row that does not have exactly three <td> cells (e.g. a header
# row made of <th>) is skipped instead of shifting every later value into the
# wrong column or raising IndexError at the end of the list.
rows = table.find_all('tr')
for row in rows:
    cells = [cell.text.strip() for cell in row.find_all('td')]
    if len(cells) != 3:
        continue
    postcode, borough, neighbourhood = cells
    Postcode.append(postcode)
    Borough.append(borough)
    Neighbourhood.append(neighbourhood)

# Build the frame column-wise; the dict order fixes the column order.
df = pd.DataFrame({
    'Postcode': Postcode,
    'Borough': Borough,
    'Neighbourhood': Neighbourhood,
})
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Cleaning the data

In [5]:
# Keep only the rows that have a real borough.
# The filter is applied to the frame itself and the result is re-bound to `df`:
# calling an in-place method on `df['Borough']` acts on an intermediate Series,
# which recent pandas warns about and which does not update `df` at all when
# Copy-on-Write is enabled.
has_borough = df['Borough'] != 'Not assigned'
# dropna also removes rows whose borough is genuinely missing (NaN).
df = df.loc[has_borough].dropna(subset=['Borough'])
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [6]:
# Collapse rows sharing a postcode and borough into one row whose remaining
# columns are comma-joined; sort=False keeps the first-seen order of the keys.
key_columns = ['Postcode', 'Borough']
grouped = df.groupby(key_columns, sort=False)
# reset_index turns the group keys back into ordinary columns.
df = grouped.agg(','.join).reset_index()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned


In [7]:
# A row with a borough but a 'Not assigned' neighbourhood takes the borough
# name as its neighbourhood. Copying the value from the 'Borough' column (instead
# of hard-coding "Queen's Park") keeps this correct for any such row.
# Assigning through .loc updates `df` directly; an in-place replace on
# `df['Neighbourhood']` would act on an intermediate Series and is not applied
# to `df` when pandas Copy-on-Write is enabled.
unnamed = df['Neighbourhood'] == 'Not assigned'
df.loc[unnamed, 'Neighbourhood'] = df.loc[unnamed, 'Borough']
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [8]:
# Show (number of rows, number of columns) of the cleaned frame as a quick size check.
df.shape

(103, 3)

### Download the geographical coordinates of each postal code

In [9]:
# Read the postal-code coordinates CSV and give its three columns the names
# used by the rest of the notebook ('Postcode' matches the scraped frame).
GEO_CSV_URL = 'http://cocl.us/Geospatial_data'
GEO_COLUMNS = ['Postcode', 'Latitude', 'Longitude']

df_geo = pd.read_csv(GEO_CSV_URL)
df_geo.columns = GEO_COLUMNS
df_geo.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
# Attach latitude/longitude to each postcode (default inner join on
# 'Postcode'), then expose the key column under the name 'PostalCode'.
df_new = (
    df.merge(df_geo, on='Postcode')
      .rename(columns={'Postcode': 'PostalCode'})
)
df_new.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


### Create a dataframe of only the boroughs whose name contains the word "Toronto"

In [11]:
# Select the rows whose borough name contains 'Toronto'.
is_toronto_borough = df_new['Borough'].str.contains('Toronto')
df_Toronto = df_new.loc[is_toronto_borough]
df_Toronto.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


### Generate a map to visualize the neighborhoods and how they cluster together

In [12]:
# Geocode the city to get the map centre, then draw one circle marker per
# row of df_Toronto with a "<neighbourhood>, <borough>" popup.
address = 'Toronto'
geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; stop with a clear
# message instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude

Toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# The loop uses its own names (lat, lng) so that `latitude`/`longitude` keep
# holding the city-centre coordinates after this cell has run; reusing those
# names as loop variables would leave them set to the last marker's position.
marker_rows = zip(
    df_Toronto['Latitude'],
    df_Toronto['Longitude'],
    df_Toronto['Borough'],
    df_Toronto['Neighbourhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html=True makes folium escape the label text, so names containing
    # quotes (e.g. "Queen's Park") do not break the generated page.
    label = folium.Popup(label, parse_html=True)
    # parse_html is an option of Popup only, so it is not passed to CircleMarker.
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
    ).add_to(Toronto_map)

Toronto_map