# Coursera IBM Capstone Notebook

## Purpose
Classify Toronto neighbourhoods into 5 Clusters

#### Data
https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M



### Import libraries

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    geopy-1.18.1               |             py_0          51 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          84 KB

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0                conda-forge

The following packages will be UPDATED:

    conda:         4.5.12-py36_1000         conda-forge --> 4.6.2-py36_0            conda-forge
    cryptography:  2.3.1-py36hb7f436b_1000  conda-forge --> 2.4.2-py36h1ba5d50_0               
    curl:          7.63.0-h646f8bb_1000     conda-forge --> 7.63.0-hbc83047_1000               
 

### Read Wiki Page

In [3]:
# create URL
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

wiki_page = requests.get(url)

### Parse HTML Wikimeadia page to extract the Table of postcodes

In [4]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(wiki_page.text, 'html.parser')

# find table with values and create a dataframe
table = soup.find('table', {'class': "wikitable sortable"})
tbody = table.find('tbody')
tr = tbody.findNext('tr')

### Create Dataframes

In [5]:
#this creates a dataframe of all tables found in the HTML
html_tables = pd.read_html(url)
toronto_df = html_tables[0] # only want the first table of Toronto postcodes
toronto_df.columns = ['PostalCode','Borough','Neighborhood']
toronto_df.drop(toronto_df.index[0], inplace=True) #Drorp the title row
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


### Drop all rows with "Not assigned" Borough

In [6]:
toronto_df = toronto_df[toronto_df.Borough != 'Not assigned']
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


### Replace "Not assigned" with Borough

In [8]:
toronto_df.Neighborhood = np.where(toronto_df["Neighborhood"] == 'Not assigned', toronto_df.Borough,toronto_df.Neighborhood)
toronto_df.head(10)


Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Queen's Park
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


### Join Boroughs

Join neighbourhoods

In [10]:
#Very efficient way to merge
unique_toronto_df = toronto_df.groupby(['PostalCode','Borough'])['Neighborhood'].apply(','.join).reset_index()
unique_toronto_df.shape

(103, 3)

In [11]:
unique_toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Read Geospatial data from file

Append the Lat/Long to the data based on the file.

#### Data
http://cocl.us/Geospatial_data

In [12]:
#Read data from URL
latlng = pd.read_csv("http://cocl.us/Geospatial_data")
latlng.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge with Geo Data

In [16]:
geo_toronto_df = unique_toronto_df.set_index('PostalCode').join(latlng.set_index('Postal Code'))
print(geo_toronto_df.shape)
geo_toronto_df.head()

(103, 4)


Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
M1G,Scarborough,Woburn,43.770992,-79.216917
M1H,Scarborough,Cedarbrae,43.773136,-79.239476
