# 1. First Part: Cleaning the Dataframe

In [1]:
# Importing all the necessary libraries
import pandas as pd
!conda install -c anaconda beautifulsoup4 --y
from bs4 import BeautifulSoup
import requests

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    beautifulsoup4-4.9.3       |     pyhb0f4dca_0          87 KB  anaconda
    ca-certificates-2020.10.14 |                0         128 KB  anaconda
    certifi-2020.6.20          |           py36_0         160 KB  anaconda
    openssl-1.1.1h             |       h7b6447c_0         3.8 MB  anaconda
    soupsieve-2.0.1            |             py_0          33 KB  anaconda
    ------------------------------------------------------------
                                           Total:         4.2 MB

The following NEW packages will be INSTALLED:

  beautifulsoup4     anaconda/noarch::beautifulsoup4-4.9.3-pyhb0f4dca_0
  soupsieve       

First, let's get the table from Wikipedia.

In [21]:
# Download the Wikipedia page that lists the Canadian postal codes starting
# with "M", then print the HTTP status code to confirm the request succeeded.
wikiurl = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Presumably the CSS class of the table to scrape; it is not referenced by the
# other cells visible in this notebook. TODO confirm whether it is still needed.
table_class = "wikitable sortable jquery-tablesorter"
# Without a timeout, requests waits indefinitely if the server never answers.
response = requests.get(wikiurl, timeout=30)
print(response.status_code)

200


We got a 200 response, which means the request succeeded.
So we proceed to extract the table from the page:

In [22]:
# read_html returns a list of DataFrames parsed from the page's tables;
# the first entry is the one used as our dataframe.
page_tables = pd.read_html(wikiurl, flavor='html5lib')
df = pd.DataFrame(page_tables[0])
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


To avoid confusion with other data, let's rename the column `Neighbourhood` to `Neighborhood`.

In [23]:
# Switch the column label from 'Neighbourhood' to 'Neighborhood'
df = df.rename({'Neighbourhood': 'Neighborhood'}, axis='columns')

We check the shape of the dataframe:

In [24]:
# Dimensions of df as a (rows, columns) tuple
df.shape

(180, 3)

We need to drop the rows with 'Not assigned' cells.

First, we build a condition to locate any rows that have an assigned Borough but a 'Not assigned' Neighborhood.

In [25]:
# Rows whose Borough is assigned while the Neighborhood is still 'Not assigned'
has_borough = df['Borough'].ne('Not assigned')
lacks_neighborhood = df['Neighborhood'].eq('Not assigned')
df2 = df.loc[has_borough & lacks_neighborhood]
df2

Unnamed: 0,Postal Code,Borough,Neighborhood


We see that no such rows exist, so we proceed to remove all rows with a 'Not assigned' Borough.

In [26]:
# Keep only the rows whose Borough is something other than 'Not assigned'
df = df.loc[df['Borough'].ne('Not assigned')]
df

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


Some postal codes might be repeated, so we group them and join their neighborhood names together, separated by commas.

In [27]:
# Collapse rows that share a postal code into a single row:
#   - 'Borough' keeps the first value seen for that postal code,
#   - 'Neighborhood' values are joined into one comma-separated string.
# sort=False keeps the postal codes in their order of first appearance and
# as_index=False leaves 'Postal Code' as an ordinary column.
df_exp = (
    df.groupby('Postal Code', as_index=False, sort=False)
      .agg({'Borough': 'first', 'Neighborhood': ', '.join})
)

# Assign the grouped result back to df (with a clean 0..n-1 index) so the
# grouping really takes effect; comparing df.shape before and after this cell
# then shows whether any postal code was actually repeated.
df = df_exp.reset_index(drop=True)
df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [28]:
# Dimensions of df as a (rows, columns) tuple, to compare with the earlier shape
df.shape

(103, 3)

We see that the dataframe has the same dimensions as before, so nothing changed. It seems that the values were already grouped together.

# 2. Second part: Latitude and Longitude

Let's import the csv file with all the latitudes and longitudes so we can add them to our dataframe.

In [29]:
# Load the CSV holding one latitude/longitude pair per postal code
df_coordinates = pd.read_csv(filepath_or_buffer='http://cocl.us/Geospatial_data')
df_coordinates

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


Now we merge both dataframes:

In [30]:
# Attach the coordinates to each row by matching on the shared 'Postal Code'
# column (default inner join: only postal codes present in both frames survive).
df = df.merge(df_coordinates, on='Postal Code')
df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


# 3. Third Part: Plotting and analysis

In [31]:
# Let's import the libraries so we can plot graphics
import folium
from sklearn.cluster import KMeans

Let's use k-means to cluster the Boroughs.

First, we need to know how many unique Boroughs there are.

In [32]:
# Count the distinct Borough values (dropna=False so a missing value would
# still be counted once, exactly as len(unique()) does)
n_boroughs = df['Borough'].nunique(dropna=False)
print('There are {} unique boroughs'.format(n_boroughs))

There are 10 unique boroughs


In [33]:
# Coordinates used as the centre of the Toronto map
lat = 43.6532
lon = -79.3832

# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[lat, lon], zoom_start=10)

# Add one circle marker per row, with a "<neighborhood>, <borough>" popup.
# The loop variables are deliberately not named lat/lon: reusing those names
# would overwrite the map-centre values above, which later cells read.
for pt_lat, pt_lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html=True makes folium escape the label text instead of treating it as raw HTML
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [pt_lat, pt_lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto


We can see all the neighborhoods in the Toronto area. Now, let's apply k-means to see how the algorithm divides each Borough.

In [34]:
# set number of clusters: one per borough, using the count computed above
# instead of repeating it as a hard-coded number
kclusters = n_boroughs

# k-means is fitted on the coordinates only
df2 = df[['Latitude','Longitude']]

# run k-means clustering; random_state fixes the initialisation and n_init is
# given explicitly because its default differs between scikit-learn releases,
# which would otherwise change the labels from one environment to another
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(df2)

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10]

array([4, 8, 1, 5, 1, 9, 7, 4, 8, 1], dtype=int32)

In [35]:
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np

# Store each row's k-means label as the first column. Any existing
# 'Cluster Labels' column is dropped first, so re-running this cell does not
# make DataFrame.insert fail on a duplicate column name.
df = df.drop(columns='Cluster Labels', errors='ignore')
df.insert(0, 'Cluster Labels', kmeans.labels_)

# Centre the map on the mean coordinate of the plotted points, so the view
# does not depend on whatever values lat/lon happen to hold when this cell runs.
map_clusters = folium.Map(location=[df['Latitude'].mean(), df['Longitude'].mean()], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map; labels run from 0 to kclusters-1, so they index the
# color list directly. Loop variables avoid the names lat/lon so the
# map-centre values defined earlier are not overwritten.
for pt_lat, pt_lon, poi, cluster in zip(df['Latitude'], df['Longitude'], df['Neighborhood'], df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [pt_lat, pt_lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

Let's count the entries for each Borough.

In [36]:
# Per-borough count of non-null values in every remaining column
grouped = df.groupby(by='Borough').count()
grouped

Unnamed: 0_level_0,Cluster Labels,Postal Code,Neighborhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Central Toronto,9,9,9,9,9
Downtown Toronto,19,19,19,19,19
East Toronto,5,5,5,5,5
East York,5,5,5,5,5
Etobicoke,12,12,12,12,12
Mississauga,1,1,1,1,1
North York,24,24,24,24,24
Scarborough,17,17,17,17,17
West Toronto,6,6,6,6,6
York,5,5,5,5,5


Let's look only at the Boroughs that have 'York' as part of their name.

In [37]:
# Keep the rows whose Borough contains 'York', and discard the previous
# cluster labels because k-means is about to be applied again on this subset
is_york = df['Borough'].str.contains('York')
grouped = df.loc[is_york].drop(columns=['Cluster Labels'])
grouped.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937


In [39]:
# Cluster only the 'York' rows into three groups and draw them on a new map.
kclusters = 3

# k-means is fitted on the coordinates only
df3 = grouped[['Latitude','Longitude']]

# run k-means clustering; n_init is given explicitly because its default
# differs between scikit-learn releases, which would change the labels
kmeans2 = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(df3)

# Store each row's label as the first column. Any existing 'Cluster Labels'
# column is dropped first, so re-running this cell does not make
# DataFrame.insert fail on a duplicate column name.
grouped = grouped.drop(columns='Cluster Labels', errors='ignore')
grouped.insert(0, 'Cluster Labels', kmeans2.labels_)

# Centre the map on the mean coordinate of the plotted points, so the view
# does not depend on whatever values lat/lon happen to hold when this cell runs.
map_clusters2 = folium.Map(location=[grouped['Latitude'].mean(), grouped['Longitude'].mean()], zoom_start=11)

# set color scheme for the clusters: label 0 -> red, 1 -> green, 2 -> blue
colors_array = ['red','green','blue']
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map; labels run from 0 to kclusters-1, so they index the
# color list directly. Loop variables avoid the names lat/lon so those
# notebook-level values are not overwritten.
for pt_lat, pt_lon, poi, cluster in zip(grouped['Latitude'], grouped['Longitude'], grouped['Neighborhood'], grouped['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [pt_lat, pt_lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters2)

map_clusters2

# Examine Clusters

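I expected the algorithm to separate the Boroughs by name, but it turns out that some of the North York neighborhoods are in the same cluster as the York ones; that is cluster 1. The same happens in cluster 2, which contains North York neighborhoods instead of just East York ones. The only cluster that matched a single Borough was cluster 3. This is not surprising in hindsight: k-means only sees latitude and longitude, so it groups rows by geographic proximity rather than by Borough name. It may also reflect the fact that the dataset consists mostly of North York neighborhoods.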

Cluster 1

In [35]:
# Rows assigned label 0 (shown as "Cluster 1" in the text)
grouped[grouped['Cluster Labels'].eq(0)]

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
3,0,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
10,0,M6B,North York,Glencairn,43.709577,-79.445073
16,0,M6C,York,Humewood-Cedarvale,43.693781,-79.428191
21,0,M6E,York,Caledonia-Fairbanks,43.689026,-79.453512
34,0,M3J,North York,"Northwood Park, York University",43.76798,-79.487262
40,0,M3K,North York,Downsview,43.737473,-79.464763
46,0,M3L,North York,Downsview,43.739015,-79.506944
49,0,M6L,North York,"North Park, Maple Leaf Park, Upwood Park",43.713756,-79.490074
50,0,M9L,North York,Humber Summit,43.756303,-79.565963
53,0,M3M,North York,Downsview,43.728496,-79.495697


Cluster 2

In [33]:
# Rows assigned label 1 (shown as "Cluster 2" in the text)
grouped[grouped['Cluster Labels'].eq(1)]

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,1,M3A,North York,Parkwoods,43.753259,-79.329656
1,1,M4A,North York,Victoria Village,43.725882,-79.315572
7,1,M3B,North York,Don Mills,43.745906,-79.352188
8,1,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
13,1,M3C,North York,Don Mills,43.7259,-79.340923
14,1,M4C,East York,Woodbine Heights,43.695344,-79.318389
23,1,M4G,East York,Leaside,43.70906,-79.363452
29,1,M4H,East York,Thorncliffe Park,43.705369,-79.349372
35,1,M4J,East York,"East Toronto, Broadview North (Old East York)",43.685347,-79.338106


Cluster 3

In [34]:
# Rows assigned label 2 (shown as "Cluster 3" in the text)
grouped[grouped['Cluster Labels'].eq(2)]

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
27,2,M2H,North York,Hillcrest Village,43.803762,-79.363452
28,2,M3H,North York,"Bathurst Manor, Wilson Heights, Downsview North",43.754328,-79.442259
33,2,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556
39,2,M2K,North York,Bayview Village,43.786947,-79.385975
45,2,M2L,North York,"York Mills, Silver Hills",43.75749,-79.374714
52,2,M2M,North York,"Willowdale, Newtonbrook",43.789053,-79.408493
55,2,M5M,North York,"Bedford Park, Lawrence Manor East",43.733283,-79.41975
59,2,M2N,North York,"Willowdale, Willowdale East",43.77012,-79.408493
66,2,M2P,North York,York Mills West,43.752758,-79.400049
72,2,M2R,North York,"Willowdale, Willowdale West",43.782736,-79.442259
