# IBM CAPSTONE PROJECT
## Notebook Part 3: Clustering and Analysis
### by Ignacio de Juan
November 2020

In [1]:
pip install lxml html5lib beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

# Wikipedia page listing the Canadian postal codes that start with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns a list with one DataFrame per table it parses from the page
dfs = pd.read_html(url)

# Report how many tables were parsed
n_tables = len(dfs)
print(n_tables)

3


In [3]:
# Plain-text preview of the first table parsed from the page
first_table = dfs[0]
print(first_table)

    Postal Code           Borough  \
0           M1A      Not assigned   
1           M2A      Not assigned   
2           M3A        North York   
3           M4A        North York   
4           M5A  Downtown Toronto   
..          ...               ...   
175         M5Z      Not assigned   
176         M6Z      Not assigned   
177         M7Z      Not assigned   
178         M8Z         Etobicoke   
179         M9Z      Not assigned   

                                         Neighbourhood  
0                                         Not assigned  
1                                         Not assigned  
2                                            Parkwoods  
3                                     Victoria Village  
4                            Regent Park, Harbourfront  
..                                                 ...  
175                                       Not assigned  
176                                       Not assigned  
177                                       

In [4]:
# The first parsed table holds the Postal Code / Borough / Neighbourhood data
df_toronto = dfs[0]

# Show the first five rows
df_toronto.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [5]:
# Keep only the rows whose borough is not the 'Not assigned' placeholder
has_borough = df_toronto['Borough'] != 'Not assigned'
df_toronto = df_toronto[has_borough]

In [6]:
# First five rows after removing unassigned boroughs
df_toronto.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
# (rows, columns) of the filtered table
n_rows, n_cols = df_toronto.shape
(n_rows, n_cols)

(103, 3)

In [8]:
# Number of distinct postal codes; compare with the row count to confirm uniqueness
postal_codes = df_toronto['Postal Code']
postal_codes.nunique()

103

All postal codes are unique: the number of distinct codes (103) equals the number of rows.

In [9]:
# List any remaining rows whose neighbourhood is still the 'Not assigned' placeholder.
# The original literal had a leading space (' Not assigned') and so could never match;
# stripping whitespace on the column makes the check robust to padded cell values.
neighbourhood_clean = df_toronto['Neighbourhood'].str.strip()
df_toronto[neighbourhood_clean == 'Not assigned']

Unnamed: 0,Postal Code,Borough,Neighbourhood


The query above returns no rows, so no remaining neighbourhood is marked 'Not assigned' and the data is clean.

In [10]:
# Re-check the table dimensions as (rows, columns)
(len(df_toronto.index), len(df_toronto.columns))

(103, 3)

In [11]:
# Add placeholder coordinate columns to be filled in by the geocoding step.
# NaN (instead of '') keeps both columns float64, so the coordinates written later
# are stored as numbers rather than as generic Python objects.
df_toronto = df_toronto.assign(latitude=float('nan'), longitude=float('nan'))
df_toronto

Unnamed: 0,Postal Code,Borough,Neighbourhood,latitude,longitude
2,M3A,North York,Parkwoods,,
3,M4A,North York,Victoria Village,,
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",,
5,M6A,North York,"Lawrence Manor, Lawrence Heights",,
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",,
...,...,...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",,
165,M4Y,Downtown Toronto,Church and Wellesley,,
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",,
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",,


## Part 2: Adding Latitude and Longitude

In [12]:
pip install geocoder

Note: you may need to restart the kernel to use updated packages.


In [13]:
import os

import geocoder  # geocoding client

# Read the Google API key from the environment instead of hard-coding it in the notebook.
# NOTE(review): the key that was previously committed here should be revoked.
GOOGLE_API_KEY = os.environ['GOOGLE_API_KEY']

# Upper bound on lookups per postal code, so a persistent failure cannot loop forever
MAX_GEOCODE_ATTEMPTS = 5

# reset index so rows can be addressed by position 0..n-1 with .loc
df_toronto = df_toronto.reset_index()

# Look up and store the coordinates of every postal code
for i in range(len(df_toronto)):
    postal_code = df_toronto.loc[i, 'Postal Code']
    query = '{}, Toronto, Ontario'.format(postal_code)

    # Retry until the geocoder returns coordinates or the attempts run out
    lat_lng_coords = None
    for _ in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.google(query, key=GOOGLE_API_KEY)
        lat_lng_coords = g.latlng
        # A failed lookup yields no coordinates (None or an empty value); retry in that case
        if lat_lng_coords:
            break

    if not lat_lng_coords:
        raise RuntimeError(
            'Could not geocode postal code {} after {} attempts'.format(
                postal_code, MAX_GEOCODE_ATTEMPTS
            )
        )

    # Write the result only after a successful lookup; in the original these
    # assignments sat inside the retry loop and raised TypeError on a failed lookup.
    df_toronto.loc[i, 'latitude'] = lat_lng_coords[0]
    df_toronto.loc[i, 'longitude'] = lat_lng_coords[1]

In [14]:
# First five rows with the geocoded latitude / longitude filled in
df_toronto.iloc[:5]

Unnamed: 0,index,Postal Code,Borough,Neighbourhood,latitude,longitude
0,2,M3A,North York,Parkwoods,43.7533,-79.3297
1,3,M4A,North York,Victoria Village,43.7259,-79.3156
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6543,-79.3606
3,5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7185,-79.4648
4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6623,-79.3895


## Part 3: Clustering and Analysis

In [17]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library


# Map centre: the coordinates obtained above for postal code M7A (Queen's Park, Downtown Toronto)
latitude = 43.6623
longitude = -79.3895

# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per postal code, labelled "<neighbourhood(s)>, <borough>"
for lat, lng, borough, neighborhood in zip(
    df_toronto['latitude'],
    df_toronto['longitude'],
    df_toronto['Borough'],
    df_toronto['Neighbourhood'],
):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html belongs to Popup; the stray parse_html=False previously passed to
    # CircleMarker is not one of its parameters and has been dropped.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

# display the map
map_toronto