# Part 1

In [9]:
import pandas as pd
import numpy as np
import requests
import geocoder

In [10]:
#get data
html = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

In [11]:
#load into pandas, first table is the one we want. Also we set first row as header
loaded_table = pd.read_html(html.text,header=0)[0]
loaded_table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Now we have to remove rows where Borough is not assigned:

In [12]:
#this filters out all rows, where Borough has value 'Not assigned'
loaded_table = loaded_table[loaded_table['Borough'] != 'Not assigned']
loaded_table.head(7)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Etobicoke,Islington Avenue


For cells which have _Not assigned_ neighbourhood use borough name as neighbourhood name:

In [13]:
#define function applied to each row. This will change Neighbourhood in case its value is 'Not assigned'
def change(row):
    postcode,borough,neighbourhood = row
    if neighbourhood == 'Not assigned':
        neighbourhood = borough
    return pd.Series([postcode,borough,neighbourhood])

#apply the function to each row
loaded_table = loaded_table.apply(func=change, axis=1, result_type='broadcast')
loaded_table.head(7)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Etobicoke,Islington Avenue


Now we need to regroup rows in our table, so that all neighbourhoods with same postcode (and same borough) are on one row:

In [14]:
#grouping by postcode --> that means each row will represent unique postcode. Then aggregating by unique neighbourhoods
final_table = loaded_table.groupby('Postcode').agg(lambda x: ', '.join(set(x))).reset_index()
final_table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek"
2,M1E,Scarborough,"Guildwood, West Hill, Morningside"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Now we will print shape of final output

In [15]:
print('shape of final table is: '+str(final_table.shape))

shape of final table is: (103, 3)


# Part 2
We will use geospatial data from https://cocl.us/Geospatial_data to get latitude and longtitude of each post code.

In [16]:
geospat = pd.read_csv('https://cocl.us/Geospatial_data')

In [17]:
#create function which requests coordinates for each postcode from ArcGis
def add_lat_long(row):
    postcode,borough,neigh = row
    latlong = geospat[geospat['Postal Code'] == postcode].iloc[0].to_dict()
    return pd.Series([latlong['Latitude'],latlong['Longitude']])

#get coordinates of each postcode, save it to coordinates dataframe
coordinates = final_table.apply(axis=1, func=add_lat_long, result_type=None)
coordinates.columns = ['latitude','longtitude']
coordinates.head()

Unnamed: 0,latitude,longtitude
0,43.806686,-79.194353
1,43.784535,-79.160497
2,43.763573,-79.188711
3,43.770992,-79.216917
4,43.773136,-79.239476


In [18]:
#now concatenate columns from coordinates dataframe with columns from original dataframe
data_with_coordinates = pd.concat([final_table,coordinates],axis=1)
data_with_coordinates.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,latitude,longtitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, West Hill, Morningside",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Part 3
Lets show a map of boroughs and neighbourhoods. Each point represents one postcode

In [19]:
import folium

# create map of Torronto
map_torr = folium.Map(location=[np.median(data_with_coordinates['latitude']), np.median(data_with_coordinates['longtitude'])], 
                      zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(data_with_coordinates['latitude'],
                                           data_with_coordinates['longtitude'],
                                           data_with_coordinates['Borough'],
                                           data_with_coordinates['Neighbourhood']):
    label = 'Borough: {},\n Neighbourhoods: {}'.format(borough,neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_torr)  
    
map_torr

Now I will cluster different zip code locations by there distance from Torronto City Airport, which Latitude and Longitude are 43.628724, -79.395968 (found by Google Maps).

For doing this, another column, called _AirportDistance_ will be added, and it will be calculated using function _distance_ from geopy library. This function allows us to calculate distance between two locations when we know only their latitudes and longitudes

In [20]:
from geopy import distance
airport_coordinates = [43.628724, -79.395968]

#define function which calculates distance from the airport.
#It takes only one argument which is row from df with latitude and longitude
def airportDist(latlong):
    apdist = distance.distance(latlong,airport_coordinates)
    return round(apdist.miles,1)

#now add new column and determine its values by calculating distances of each location from airport
data_with_coordinates['AirportDistance'] = data_with_coordinates[['latitude','longtitude']].apply(airportDist, axis=1)
data_with_coordinates.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,latitude,longtitude,AirportDistance
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,15.9
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497,16.0
2,M1E,Scarborough,"Guildwood, West Hill, Morningside",43.763573,-79.188711,13.9
3,M1G,Scarborough,Woburn,43.770992,-79.216917,13.3
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,12.7


Now when we have distance of each location to AirPort, we can perform some clustering. For that we will use KMeans, implemented by scikit

In [30]:
from sklearn.cluster import KMeans

#now initialize DBSCAN
clst = KMeans(n_clusters=4)
clst.fit(data_with_coordinates[['AirportDistance']])

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

Now we can print category for each zip code. We can see that algorithm created 4 categories

In [31]:
clst.labels_

array([3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 3, 3, 3, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1,
       1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 1, 0, 2,
       2, 2, 2, 2, 2, 2, 0, 0, 3, 0, 0, 0, 0, 3, 0])

Now we have clustered data, but we need to append them to original dataframe, to be able to plot results to map of Torronto. Categories for each location are saved in _clst.labels_ in same order like locations from original dataframe. So to merge the data together, we will just add categories as new column.

In [32]:
#lets copy original data
data_clustered = pd.DataFrame(data_with_coordinates)

#add column with categories
data_clustered['AirPortDistCategories'] = clst.labels_

Now we will show locations on map, even with their clusters. We will use different colors for each cluster

In [33]:
# create dictionary with different colors for different clusters
colors_mapping = {0:'green',1:'yellow',2:'orange',3:'red'}

# create map of Torronto with categories
map_torr_apdist = folium.Map(location=[np.median(data_clustered['latitude']),
                                       np.median(data_clustered['longtitude'])], 
                      zoom_start=10)

# add marker for airport
folium.CircleMarker(airport_coordinates,
                    radius=12,
                    tooltip='Airport Here',
                    fill=True,
                    color='black',
                    fill_color='white',
                    fill_opacity=0.5).add_to(map_torr_apdist)

# add markers to map
for lat, lng, borough, neighborhood, airportdist in zip(
                                           data_clustered['latitude'],
                                           data_clustered['longtitude'],
                                           data_clustered['Borough'],
                                           data_clustered['Neighbourhood'],
                                           data_clustered['AirPortDistCategories']):
    label = 'Borough: {},\n Neighbourhoods: {}'.format(borough,neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=colors_mapping[airportdist],
        fill=True,
        fill_color=colors_mapping[airportdist],
        fill_opacity=0.7,
        parse_html=False).add_to(map_torr_apdist)
    
map_torr_apdist