## Part 1

Transform the Wikipedia table into a dataframe.

In [81]:

import pandas as pd

# Pinned (oldid) revision of the Wikipedia page, so the table layout does not change between runs
wiki = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&direction=prev&oldid=946126446'

# read_html returns one DataFrame per table found on the page; keep the first one
tables = pd.read_html(wiki)
df = tables[0]

# Exclude rows whose Borough is 'Not assigned' and keep only the three columns of interest
has_borough = df['Borough'] != 'Not assigned'
df1 = df.loc[has_borough, ['Postcode', 'Borough', 'Neighbourhood']].reset_index(drop=True)

# Combine rows sharing the same Postcode and Borough into a single row,
# with the neighbourhood names separated by a comma.
neighbourhoods_by_code = df1.groupby(['Postcode', 'Borough'])['Neighbourhood']
df2 = neighbourhoods_by_code.agg(lambda names: ', '.join(names)).reset_index()
print(df2)

    Postcode      Borough                                      Neighbourhood
0        M1B  Scarborough                                     Rouge, Malvern
1        M1C  Scarborough             Highland Creek, Rouge Hill, Port Union
2        M1E  Scarborough                  Guildwood, Morningside, West Hill
3        M1G  Scarborough                                             Woburn
4        M1H  Scarborough                                          Cedarbrae
..       ...          ...                                                ...
98       M9N         York                                             Weston
99       M9P    Etobicoke                                          Westmount
100      M9R    Etobicoke  Kingsview Village, Martin Grove Gardens, Richv...
101      M9V    Etobicoke  Albion Gardens, Beaumond Heights, Humbergate, ...
102      M9W    Etobicoke                                          Northwest

[103 rows x 3 columns]


In [82]:
# Sanity check: the grouped dataframe should have 103 rows (one per Postcode/Borough pair) and 3 columns
df2.shape

(103, 3)

## Part 2

Create a new dataframe including geographical coordinates.

In [83]:
import pandas as pd

# CSV holding one Latitude/Longitude pair per postal code
geo_csv = (
    'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/'
    'IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv'
)

# Load the coordinates table and show it
grid = pd.read_csv(filepath_or_buffer=geo_csv)
print(grid)

    Postal Code   Latitude  Longitude
0           M1B  43.806686 -79.194353
1           M1C  43.784535 -79.160497
2           M1E  43.763573 -79.188711
3           M1G  43.770992 -79.216917
4           M1H  43.773136 -79.239476
..          ...        ...        ...
98          M9N  43.706876 -79.518188
99          M9P  43.696319 -79.532242
100         M9R  43.688905 -79.554724
101         M9V  43.739416 -79.588437
102         M9W  43.706748 -79.594054

[103 rows x 3 columns]


In [84]:
# Attach coordinates to each postal code. The key column is named differently in the
# two tables, and merge defaults to an inner join, so only codes present in both are kept.
df3 = (
    df2
    .merge(grid, left_on='Postcode', right_on='Postal Code')
    .loc[:, ['Postal Code', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude']]
)
df3

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437


## Part 3

Explore and cluster the neighborhoods in Toronto.

In [108]:
# Keep only the rows whose Borough name contains the word "Toronto"
df4 = df3.loc[
    lambda frame: frame['Borough'].str.contains('Toronto')
]
df4

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
47,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


In [107]:
#Create Toronto map

import folium

# Centre the map on the mean coordinate of the selected postal codes
toronto_map1 = folium.Map(
    location=[df4['Latitude'].mean(), df4['Longitude'].mean()],
    zoom_start=12,
)

# One unfilled blue circle per postal code, labelled "<neighbourhood(s)>, <borough>"
for lat, lng, borough, neighbourhood in zip(df4['Latitude'], df4['Longitude'], df4['Borough'], df4['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    # parse_html is an option of folium.Popup, so it is set here and not on the marker
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='blue',
        fill=False,
    ).add_to(toronto_map1)

toronto_map1

In [109]:
#Cluster
from sklearn.cluster import KMeans

# Cluster on the coordinates only. Selecting the two columns by name (instead of
# dropping the others) keeps the feature matrix unchanged if this cell is re-run
# after Cluster_No has already been added to df4.
toronto_clustered = df4[['Latitude', 'Longitude']]

# Number of clusters
k = 5

# Keep a reference to the fitted estimator: its labels_ attribute is needed below
kmeans = KMeans(n_clusters=k, random_state=0).fit(toronto_clustered)

# Rebuild df4 without any Cluster_No left over from a previous run, then put the
# fresh labels in the first column. drop() returns a new frame, so the insert
# does not touch df3.
df4 = df4.drop(columns='Cluster_No', errors='ignore')
df4.insert(0, 'Cluster_No', kmeans.labels_)

In [80]:
# Display df4 to check the Cluster_No column
df4

Unnamed: 0,Cluster_No,Postal Code,Borough,Neighbourhood,Latitude,Longitude
37,4,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,4,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,4,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,4,M4M,East Toronto,Studio District,43.659526,-79.340923
44,3,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,3,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,3,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
47,3,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,3,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,3,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


In [115]:
# Centre the map on the mean coordinate of the clustered postal codes
toronto_map2 = folium.Map(
    location=[df4['Latitude'].mean(), df4['Longitude'].mean()],
    zoom_start=12,
)

# set color scheme for the clusters: k evenly spaced colours from the rainbow colormap
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(df4['Latitude'], df4['Longitude'], df4['Neighbourhood'], df4['Cluster_No']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # KMeans labels run from 0 to k-1, so they index the k-colour palette directly
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7,
    ).add_to(toronto_map2)

toronto_map2