# Dependencies Download

In [16]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Lift pandas' display limits so frames are rendered with every column and row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

import requests # library to handle HTTP requests
# NOTE(review): newer pandas releases expose this helper as `pd.json_normalize`;
# confirm which import path the installed pandas version expects.
from pandas.io.json import json_normalize # transform a JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means for the clustering stage
from sklearn.cluster import KMeans



print('Libraries imported.')

Libraries imported.


# Dataset Scraping

In [17]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [18]:
# Fetch the Wikipedia page listing the "M" (Toronto) postal codes and parse it.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Open the response as a context manager so the HTTP connection is closed as
# soon as the page has been parsed, rather than being left open for the
# lifetime of the kernel. `html_doc` stays bound afterwards, but is closed.
with urlopen(url) as html_doc:
    soup = BeautifulSoup(html_doc, 'lxml')


In [19]:
# Locate the first "wikitable" on the page and turn it into a list of rows,
# each row being the list of its header/data cell texts.
table = soup.find("table", class_="wikitable")

# Every <tr> except the last one is kept; within a cell, runs of whitespace
# (including newlines) are collapsed to single spaces.
kanj_data = [
    [' '.join(cell.text.split()) for cell in row.find_all(['th', 'td'])]
    for row in table.find_all("tr")[:-1]
]

kanj_data

[['Postcode', 'Borough', 'Neighbourhood'],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', 'Downtown Toronto', "Queen's Park"],
 ['M8A', 'Not assigned', 'Not assigned'],
 ['M9A', "Queen's Park", 'Not assigned'],
 ['M1B', 'Scarborough', 'Rouge'],
 ['M1B', 'Scarborough', 'Malvern'],
 ['M2B', 'Not assigned', 'Not assigned'],
 ['M3B', 'North York', 'Don Mills North'],
 ['M4B', 'East York', 'Woodbine Gardens'],
 ['M4B', 'East York', 'Parkview Hill'],
 ['M5B', 'Downtown Toronto', 'Ryerson'],
 ['M5B', 'Downtown Toronto', 'Garden District'],
 ['M6B', 'North York', 'Glencairn'],
 ['M7B', 'Not assigned', 'Not assigned'],
 ['M8B', 'Not assigned', 'Not assigned'],
 ['M9B', 'Etobicoke', 'Cloverdale'],
 ['M9B', 'Etobicoke', 'Islington'],
 ['M9B',

In [20]:
# Load the scraped rows into a frame with explicit column names, then discard
# the row labelled 0, which holds the scraped table header rather than data.
df = pd.DataFrame(
    kanj_data,
    columns=['Postcode', 'Borough', 'Neighbourhoud'],
).drop(index=0)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhoud
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [21]:
# Remove every row whose Borough is 'Not assigned', then renumber the
# remaining rows from 0.
indexNames = df.loc[df['Borough'].eq('Not assigned')].index
df = df.drop(index=indexNames).reset_index(drop=True)
df

Unnamed: 0,Postcode,Borough,Neighbourhoud
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Not assigned
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [22]:
# Where a neighbourhood is 'Not assigned', fall back to that row's borough
# name; all other neighbourhood values are left as they are.
unassigned = df['Neighbourhoud'].eq('Not assigned')
df['Neighbourhoud'] = df['Neighbourhoud'].mask(unassigned, df['Borough'])
df

Unnamed: 0,Postcode,Borough,Neighbourhoud
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [23]:
# Collapse to one row per Postcode: the neighbourhood names sharing a postcode
# are joined into a single comma-separated string.
neighbourhoods_by_postcode = df.groupby('Postcode')['Neighbourhoud']
df_stack = neighbourhoods_by_postcode.apply(lambda names: ', '.join(names)).reset_index()

df_stack.head()

Unnamed: 0,Postcode,Neighbourhoud
0,M1B,"Rouge, Malvern"
1,M1C,"Highland Creek, Rouge Hill, Port Union"
2,M1E,"Guildwood, Morningside, West Hill"
3,M1G,Woburn
4,M1H,Cedarbrae


In [24]:
# Right-join the per-postcode neighbourhood strings onto the original rows so
# each row regains its Borough. Both frames carry a 'Neighbourhoud' column, so
# pandas suffixes them: `_x` (joined string from df_stack) and `_y` (from df).
# Fully identical rows are then removed.
df_clean = df_stack.merge(df, on='Postcode', how='right').drop_duplicates()
df_clean

Unnamed: 0,Postcode,Neighbourhoud_x,Borough,Neighbourhoud_y
0,M1B,"Rouge, Malvern",Scarborough,Rouge
1,M1B,"Rouge, Malvern",Scarborough,Malvern
2,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough,Highland Creek
3,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough,Rouge Hill
4,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough,Port Union
5,M1E,"Guildwood, Morningside, West Hill",Scarborough,Guildwood
6,M1E,"Guildwood, Morningside, West Hill",Scarborough,Morningside
7,M1E,"Guildwood, Morningside, West Hill",Scarborough,West Hill
8,M1G,Woburn,Scarborough,Woburn
9,M1H,Cedarbrae,Scarborough,Cedarbrae


In [25]:
# The per-row neighbourhood column from the merge is no longer needed; only
# the joined `Neighbourhoud_x` string is kept.
df_clean = df_clean.drop(columns='Neighbourhoud_y')
df_clean

Unnamed: 0,Postcode,Neighbourhoud_x,Borough
0,M1B,"Rouge, Malvern",Scarborough
1,M1B,"Rouge, Malvern",Scarborough
2,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough
3,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough
4,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough
5,M1E,"Guildwood, Morningside, West Hill",Scarborough
6,M1E,"Guildwood, Morningside, West Hill",Scarborough
7,M1E,"Guildwood, Morningside, West Hill",Scarborough
8,M1G,Woburn,Scarborough
9,M1H,Cedarbrae,Scarborough


In [26]:
# Put the columns in Postcode / Borough / Neighbourhoud order, restore the
# plain column name, renumber the rows, and drop the repeated rows left over
# from the merge. Surviving rows keep their pre-deduplication index labels.
cols = ['Postcode', 'Borough', 'Neighbourhoud_x']
df_clean = (
    df_clean[cols]
    .rename(columns={"Neighbourhoud_x": "Neighbourhoud"})
    .reset_index(drop=True)
    .drop_duplicates()
)
df_clean

Unnamed: 0,Postcode,Borough,Neighbourhoud
0,M1B,Scarborough,"Rouge, Malvern"
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
5,M1E,Scarborough,"Guildwood, Morningside, West Hill"
8,M1G,Scarborough,Woburn
9,M1H,Scarborough,Cedarbrae
10,M1J,Scarborough,Scarborough Village
11,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
14,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
17,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
20,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [27]:
# Inspect the (rows, columns) dimensions of the cleaned frame.
df_clean.shape

(103, 3)

In [28]:
# Install the mapping (folium) and geocoding (geopy) packages from conda-forge.
# The leading `!` runs each line as a shell command; `--yes` skips the
# confirmation prompt. folium is pinned to 0.5.0, geopy is left unpinned.
!conda install -c conda-forge folium=0.5.0 --yes 
!conda install -c conda-forge geopy --yes
import folium
from folium.plugins import MarkerCluster
from geopy.geocoders import Nominatim
# NOTE(review): the imports below (KMeans, requests, json, json_normalize, cm,
# colors) repeat imports already made in the notebook's first code cell.
from sklearn.cluster import KMeans
import requests
import json
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors
print('done')

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

done


# Map of Toronto

In [35]:
import urllib.request
# Download the geospatial coordinates file and store it locally under
# `filename`. Note that `url` is rebound here, replacing the Wikipedia URL
# assigned in the scraping cell.
url = 'http://cocl.us/Geospatial_data'
filename = 'geospatial_coordinates.csv'
# urlretrieve returns a (local filename, response headers) tuple, which is
# what this cell displays.
urllib.request.urlretrieve(url, filename)

('geospatial_coordinates.csv', <http.client.HTTPMessage at 0x7fd5f6102e80>)

In [36]:
# Load the downloaded CSV into a frame and preview its first rows.
df_geo = pd.read_csv(filename)
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Merging two dataframes together

In [37]:
# Attach coordinates to each postcode: default (inner) join of the geospatial
# frame's 'Postal Code' against the cleaned frame's 'Postcode'.
data = df_geo.merge(df_clean, left_on='Postal Code', right_on='Postcode')

# Keep only the columns used downstream, in display order.
data = data.loc[:, ['Postcode', 'Borough', 'Neighbourhoud', 'Latitude', 'Longitude']]

data.head()


Unnamed: 0,Postcode,Borough,Neighbourhoud,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Generating Toronto Map

In [38]:
# Look up the coordinates of Toronto with the Nominatim geocoder; they are
# used to centre the map.
address = 'Toronto'
geolocator = Nominatim(user_agent="explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on `location.latitude`.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The coordinates of Tornoto are 43.653963, -79.387207.


In [40]:
# Draw a map centred on the geocoded Toronto coordinates and add one circle
# marker per row of `data`.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

marker_rows = zip(
    data['Latitude'],
    data['Longitude'],
    data['Postcode'],
    data['Borough'],
    data['Neighbourhoud'],
)
for lat, lng, postalcode, borough, neighborhood in marker_rows:
    # Popup text shown when a marker is clicked: "postcode; borough; neighbourhoods".
    label = folium.Popup(
        '{}; {}; {}'.format(postalcode, borough, neighborhood),
        parse_html=True,
    )
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

## Fourth neighborhood exploration

In [41]:
# Index label 3 is the fourth row of `data` (labels start at 0).
data.loc[3, 'Neighbourhoud']

'Woburn'

In [43]:
# Pull the name and coordinates stored under index label 3 of `data`
# (the fourth row) and report them.
row_label = 3
neighbourhood_name = data.at[row_label, 'Neighbourhoud']
neighbourhood_latitude = data.at[row_label, 'Latitude']
neighbourhood_longitude = data.at[row_label, 'Longitude']

summary = 'Latitude and longitude values of {} are {}, {}.'
print(summary.format(neighbourhood_name, neighbourhood_latitude, neighbourhood_longitude))

Latitude and longitude values of Woburn are 43.7709921, -79.21691740000001.
