# 1. Data Collection

In [150]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [151]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')

In [152]:
panel = soup.find('div', class_='mw-parser-output')

# 2. Data Formation

## 2.1 Data Preprocess

In [153]:
# Set up empty lists to save the results
PostalCodes_list = []
Boroughs_list = []
Neigborhoods_list = []
assigned_counter = 0
unassigned_counter = 0
total = 0
exception_counter = 0
#Scan segment with 'p' in the the table
for Block in panel.find('table').find_all('p'):
    #Set up empty temporary variables for data preprocessing
    PostalCode = ''
    Borough = ''
    Neigborhoods = ''
    Neigborhood = ''
    #Fetch the Postal Code
    PostalCode = Block.b.text
    #Fetch all the cities with the same Postal Code
    Cities = Block.span.text
    total = total + 1
    try:
        #Extract the first city that is the Borough
        Borough = Cities.split("(")[0].strip()
        #Extract the Neigborhoods near their Borough
        if Borough != 'Not assigned':
            #print(Borough)
            Neigborhoods = Cities.split("(")[1].split(")")[0].strip().split('/')
            Neigborhoods = [string.strip() for string in Neigborhoods]
            Neigborhood = ", ".join(Neigborhoods)
            # Appending the Postal Code, Borough and Neigborhood in their lists
            PostalCodes_list.append(PostalCode)
            Boroughs_list.append(Borough)
            Neigborhoods_list.append(Neigborhood)
            assigned_counter = assigned_counter + 1
        else:
            unassigned_counter = unassigned_counter + 1
    except Exception as e:
        exception_counter = exception_counter + 1
        PostalCodes_list.append(PostalCode)
        Boroughs_list.append(Borough)
        Neigborhoods_list.append(Neigborhood)
        pass
print(assigned_counter)
print(unassigned_counter)
print(total)
print(exception_counter)

102
77
180
1


## 2.2 Dataframe Creation

In [154]:
dict = {'PostalCode': PostalCodes_list, 'Borough': Boroughs_list, 'Neigborhoods': Neigborhoods_list}
#print(len(PostalCodes_list))
#print(len(Boroughs_list))
#print(len(Neigborhoods_list))
#print(PostalCodes_list[-1])
#print(Boroughs_list[-1])
#print(Neigborhoods_list[-1])
df = pd.DataFrame(dict)
df.shape

(103, 3)

# 3. Locations

## 3.1 Download locations

In [155]:
#!wget -O Geospatial_Coordinates.csv http://cocl.us/Geospatial_data

In [156]:
df_locations = pd.read_csv('Geospatial_Coordinates.csv')

In [157]:
df_locations.rename(columns={'Postal Code':'PostalCode'}, inplace=True)
df_locations.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## 3.2 Combining dataframes

In [158]:
df.head()

Unnamed: 0,PostalCode,Borough,Neigborhoods
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park / Ontario Provincial Government,


In [159]:
df_merge = df.set_index('PostalCode').join(df_locations.set_index('PostalCode'))
df_merge.head()

Unnamed: 0_level_0,Borough,Neigborhoods,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3A,North York,Parkwoods,43.753259,-79.329656
M4A,North York,Victoria Village,43.725882,-79.315572
M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
M7A,Queen's Park / Ontario Provincial Government,,43.662301,-79.389494


In [160]:
df_merge = df_merge.reset_index()
df_merge.head()

Unnamed: 0,PostalCode,Borough,Neigborhoods,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park / Ontario Provincial Government,,43.662301,-79.389494


# 4. Clustering

## 4.1 Explore Neighborhoods

In [161]:
Toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neigborhoods,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383


In [162]:
# Grouping the neighborhoods of Toronto
Toronto_data = df_merge[df_merge['Borough'] == 'Downtown Toronto']
Toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neigborhoods,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383


### Let's check the size of the neigborhoods

In [163]:
Toronto_data.groupby('Borough').count()

Unnamed: 0_level_0,PostalCode,Neigborhoods,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Downtown Toronto,17,17,17,17


In [164]:
Toronto_data

Unnamed: 0,PostalCode,Borough,Neigborhoods,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
36,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752
42,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576
48,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817


In [165]:
neighborhoods = Toronto_data.drop(['PostalCode', 'Borough'], axis=1)

In [166]:
neighborhoods.head()

Unnamed: 0,Neigborhoods,Latitude,Longitude
2,"Regent Park, Harbourfront",43.65426,-79.360636
9,"Garden District, Ryerson",43.657162,-79.378937
15,St. James Town,43.651494,-79.375418
20,Berczy Park,43.644771,-79.373306
24,Central Bay Street,43.657952,-79.387383


## 4.2 Cluster neighborhoods

Run *k*-means to cluster the neighborhood into 6 clusters.

In [167]:
# import k-means from clustering stage
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 6

toronto_grouped_clustering = neighborhoods.drop('Neigborhoods', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 2, 2, 5, 3, 1, 0, 3])

In [168]:
# add clustering labels
Toronto_data.insert(0, 'Cluster Labels', kmeans.labels_)
Toronto_data

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neigborhoods,Latitude,Longitude
2,1,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
9,0,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,0,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
20,0,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,4,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,0,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
36,0,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752
42,0,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576
48,0,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817


In [169]:
Toronto_data.rename(columns={'Neigborhoods':'Neighborhoods'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


## 4.3 Visualization

### Use geopy to retrieve the latitude and longitude of Downtown Toronto

In [170]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import geopy.geocoders
geopy.geocoders.options.default_timeout = 20
address = 'Downtown Toronto'
geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Downtown Toronto are 43.6541737, -79.38081164513409.


In [171]:
import folium
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_data['Latitude'], Toronto_data['Longitude'], Toronto_data['Neighborhoods'], Toronto_data['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# 5. Examine Clusters

#### Cluster 1

In [172]:
Toronto_data.loc[Toronto_data['Cluster Labels']==0]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhoods,Latitude,Longitude
9,0,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,0,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
20,0,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
30,0,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
36,0,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752
42,0,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576
48,0,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817
97,0,M5X,Downtown Toronto,"First Canadian Place, Underground city",43.648429,-79.38228


#### Cluster 2

In [173]:
Toronto_data.loc[Toronto_data['Cluster Labels']==1]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhoods,Latitude,Longitude
2,1,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
96,1,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675


Cluster 3

In [174]:
Toronto_data.loc[Toronto_data['Cluster Labels']==2]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhoods,Latitude,Longitude
80,2,M5S,Downtown Toronto,"University of Toronto, Harbord",43.662696,-79.400049
84,2,M5T,Downtown Toronto,"Kensington Market, Chinatown, Grange Park",43.653206,-79.400049


Cluster 4

In [175]:
Toronto_data.loc[Toronto_data['Cluster Labels']==3]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhoods,Latitude,Longitude
91,3,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
99,3,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316


Cluster 5

In [176]:
Toronto_data.loc[Toronto_data['Cluster Labels']==4]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhoods,Latitude,Longitude
25,4,M6G,Downtown Toronto,Christie,43.669542,-79.422564


Cluster 6

In [177]:
Toronto_data.loc[Toronto_data['Cluster Labels']==5]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhoods,Latitude,Longitude
87,5,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442
