Before we get the data and start exploring it, let's install and import all the dependencies that we will need.


In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Passing None removes the display limit, so DataFrames are shown without truncation
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# NOTE(review): the install command below is not commented out, so it runs every time this cell is executed
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# NOTE(review): newer pandas releases expose this function as pandas.json_normalize - confirm against the installed version
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# NOTE(review): this install command is also active and pins folium to version 0.5.0
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Waiting for a Spark session to start...
Spark Initialization Done! ApplicationId = app-20181102125445-0000
Solving environment: done

## Package Plan ##

  environment location: /opt/ibm/conda/miniconda3

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2018.10.15 |       ha4d7672_0         135 KB  conda-forge
    openssl-1.0.2p             |       h470a237_1         3.1 MB  conda-forge
    certifi-2018.8.24          |        py35_1001         139 KB  conda-forge
    geopy-1.17.0               |             py_0          49 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    conda-4.5.11               |           py35_0         636 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.1 MB

The followin

# 1. Obtain Dataset

In [15]:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

wikipedia_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the Wikipedia page. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops us from silently parsing an HTTP error page.
raw_random_wikipedia_page = requests.get(wikipedia_link, timeout=30)
raw_random_wikipedia_page.raise_for_status()

# Keep the decoded body of the response (the page's HTML) as a string
page = raw_random_wikipedia_page.text

# Parse the HTML and collect every table row (<tr>) found on the page
html_soup = bs(page, 'html.parser')
table_rows = html_soup.find_all('tr')

# Rows whose first cell starts with one of these prefixes are skipped.
# NOTE(review): presumably these belong to a table other than the postal-code table - confirm.
SKIPPED_ROW_PREFIXES = ('NL', 'A')

# Build one list of stripped, non-empty cell texts per table row
res = []
for tr in table_rows:
    cell_texts = (cell.text.strip() for cell in tr.find_all('td'))
    row = [text for text in cell_texts if text]
    # Ignore rows that have no non-empty <td> cell and rows with a skipped prefix.
    # The prefix is tested on the first cell itself rather than on the list's repr.
    if row and not row[0].startswith(SKIPPED_ROW_PREFIXES):
        res.append(row)
        
# One DataFrame row per scraped table row: postal code, borough, neighbourhood
df = pd.DataFrame(res, columns=["PostalCode", "Borough", "Neighbourhood"])

# Order by postal code and renumber the rows from 0; drop=True discards the old
# index instead of adding it as a column that then has to be deleted.
df_sorted = df.sort_values(by='PostalCode').reset_index(drop=True)

# Collapse to one row per (PostalCode, Borough), joining the neighbourhood names with ", "
df_sorted = df_sorted.groupby(['PostalCode','Borough'])['Neighbourhood'].apply(', '.join).reset_index()

# Where the borough is known but the neighbourhood is 'Not assigned', use the
# borough name as the neighbourhood. A boolean mask replaces the row-by-row loop.
needs_borough_name = (df_sorted['Borough'] != 'Not assigned') & (df_sorted['Neighbourhood'] == 'Not assigned')
df_sorted.loc[needs_borough_name, 'Neighbourhood'] = df_sorted.loc[needs_borough_name, 'Borough']
                      
# Location of the CSV file with the coordinates for each postal code
website_URL = "https://cocl.us/Geospatial_data"

# Load the CSV and rename its key column so it matches the one in df_sorted
df_geo = pd.read_csv(website_URL).rename(columns={'Postal Code': 'PostalCode'})

# Inner join: only postal codes present in both frames are kept
df_postal = pd.merge(df_sorted, df_geo, how='inner', on="PostalCode")
print(df_postal.head(10))

  PostalCode      Borough                                    Neighbourhood  \
0        M1B  Scarborough                                   Malvern, Rouge   
1        M1C  Scarborough           Port Union, Rouge Hill, Highland Creek   
2        M1E  Scarborough                West Hill, Morningside, Guildwood   
3        M1G  Scarborough                                           Woburn   
4        M1H  Scarborough                                        Cedarbrae   
5        M1J  Scarborough                              Scarborough Village   
6        M1K  Scarborough      East Birchmount Park, Ionview, Kennedy Park   
7        M1L  Scarborough                  Oakridge, Golden Mile, Clairlea   
8        M1M  Scarborough  Cliffcrest, Cliffside, Scarborough Village West   
9        M1N  Scarborough                      Cliffside West, Birch Cliff   

    Latitude  Longitude  
0  43.806686 -79.194353  
1  43.784535 -79.160497  
2  43.763573 -79.188711  
3  43.770992 -79.216917  
4  43.77313

# 2. Explore and cluster the neighborhoods in Toronto

In [16]:
# Keep only the rows whose borough name contains "Toronto", then renumber from 0
is_toronto_borough = df_postal['Borough'].str.contains("Toronto")
df_postal = df_postal.loc[is_toronto_borough].reset_index(drop=True)
df_postal.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


# 3. List number of boroughs and neighbourhoods

In [17]:
# Number of distinct boroughs, and number of rows in the frame
borough_count = df_postal['Borough'].unique().size
row_count = len(df_postal)
print('The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count))

The dataframe has 4 boroughs and 38 neighborhoods.


# 4. Use the geopy library to get the latitude and longitude values of Toronto

In [18]:
address = 'Toronto, CA'

# Identify this application to the Nominatim service. Constructing the geocoder
# without a user agent is deprecated in geopy 1.x and rejected by geopy 2.x.
geolocator = Nominatim(user_agent='toronto_neighbourhood_explorer')
location = geolocator.geocode(address)

# geocode() returns None when no match is found; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Toronto are 43.653963, -79.387207.


# 5. Create a map of Toronto with neighborhoods superimposed on top

In [19]:
# Base map centred on the coordinates obtained for Toronto
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Columns read for each marker, in the order they are unpacked below
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']

# Add one circle marker per row of df_postal, labelled "<neighbourhood>, <borough>"
for lat, lng, borough, neighbourhood in zip(*(df_postal[col] for col in marker_columns)):
    popup = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

# Last expression of the cell: renders the map
map_toronto