# Segmenting and Clustering Toronto

### First section

We start by importing the postal code data from the Wikipedia page into a pandas DataFrame, per the specifications.

In [1]:
#install BeautifulSoup and import library
#The "!" prefix runs conda as a shell command from the notebook to install/update beautifulsoup4

!conda install beautifulsoup4

#NOTE(review): requests does not appear to be used in the cells visible here - confirm before removing
from bs4 import BeautifulSoup
import requests

Fetching package metadata ...........
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following packages will be UPDATED:

    beautifulsoup4: 4.6.0-py35h442a8c9_1 --> 4.6.3-py35_0

beautifulsoup4 100% |################################| Time: 0:00:00  39.97 MB/s


In [2]:
#start parsing the website
import urllib.request as urllib2

#Wikipedia page listing the Canadian postal codes that start with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#Download the page and parse it; the context manager closes the HTTP response afterwards.
#The parser is named explicitly so the resulting tree does not depend on which optional
#parsers happen to be installed (and so bs4 does not warn about guessing one).
with urllib2.urlopen(url) as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')

In [3]:
#import data processing libraries and create Dataframe
from io import StringIO

import pandas as pd
import numpy as np

#Locate the postal code table in the parsed page
table = soup.find('table', class_='wikitable sortable')
if table is None:
    #Fail with a clear message instead of handing the string "None" to the HTML parser
    raise ValueError("No table with class 'wikitable sortable' was found in the page")

#Wrap the markup in StringIO: passing literal HTML text to read_html is deprecated in recent pandas
df = pd.read_html(StringIO(str(table)), header = 0)[0]
df = df.rename(columns={'Postcode':'Postalcode'})

#Clean NA Borough values
#Assign the result back rather than calling replace(inplace=True) on df['Borough']:
#the chained in-place form works on an intermediate object and may leave df unchanged
df['Borough'] = df['Borough'].replace('Not assigned', np.nan)
df = df.dropna(subset=['Borough'], axis = 0).reset_index(drop=True)
df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [4]:
#For NA neighbourhoods, fill with Borough
#Select the affected rows with a boolean mask and copy the Borough of the same row.
#This replaces Series.replace(..., df['Borough'], inplace=True): a Series is not a valid
#row-aligned replacement value there, and the chained in-place call may not update df.
not_assigned = df['Neighbourhood'] == 'Not assigned'
df.loc[not_assigned, 'Neighbourhood'] = df.loc[not_assigned, 'Borough']
df.head(10)

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [5]:
#For more than one Postcode, combine neighbourhoods
#Each (Postalcode, Borough) pair becomes one row whose Neighbourhood is the
#comma-separated list of the neighbourhoods in that group
df_grouped = (
    df.groupby(['Postalcode','Borough'])[['Neighbourhood']]
      .agg(lambda names: ', '.join(names))
      .reset_index()
)
df_grouped.head(10)

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## Using Geocoder to obtain Latitude and Longitude for each Postalcode

In [10]:
#import geocoder library
#Install geocoder with pip (run as a shell command via "!"), then import it for the geocoding cells
!pip install geocoder
import geocoder

Requirement not upgraded as not directly required: geocoder in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages
Requirement not upgraded as not directly required: requests in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: future in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: six in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: ratelim in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: click in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: chardet<3.1.0,>=3.0.2 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from requests->geocoder)
Requirement not upgraded as not directly required: i

In [47]:
#Getting lists for longitude and latitude using ArcGIS (World Geocoding Service)

#Upper bound on lookups per postal code so a persistent failure cannot loop forever
MAX_GEOCODE_ATTEMPTS = 10

latitude = []
longitude = []

for postal_code in df_grouped['Postalcode']:
    query = '%s, Toronto, Ontario' % postal_code

    #Retry until the service returns coordinates. Nothing is appended for a failed
    #attempt, so both lists keep exactly one entry per row of df_grouped, in row order.
    for attempt in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.arcgis(query)
        if g.lat is not None and g.lng is not None:
            break
    else:
        raise RuntimeError(
            'No coordinates returned for %r after %d attempts'
            % (query, MAX_GEOCODE_ATTEMPTS)
        )

    latitude.append(g.lat)
    longitude.append(g.lng)

In [54]:
#Adding columns
#Attach each coordinate list to df_grouped under its column name

for column_name, values in zip(('Latitude', 'Longitude'), (latitude, longitude)):
    df_grouped[column_name] = values

In [57]:
#Show the first ten rows of df_grouped, now including the Latitude and Longitude columns
df_grouped.head(10)

Unnamed: 0,Postalcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.81165,-79.195561
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.785605,-79.158701
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76569,-79.175299
3,M1G,Scarborough,Woburn,43.768216,-79.21761
4,M1H,Scarborough,Cedarbrae,43.769608,-79.23944
5,M1J,Scarborough,Scarborough Village,43.743085,-79.232172
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.72626,-79.26367
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.713213,-79.28491
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.723575,-79.234976
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.69669,-79.260069
