# IBM Capstone Assignment


## Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto


### Load required libs

In [1]:
# import libs
import pandas as pd
import seaborn as sns
import requests
import folium
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
from bs4 import BeautifulSoup


### Data scraping

In [2]:
# data scraping: download the Wikipedia page that lists the "M" postal codes
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url, timeout=30)  # never hang forever on a stalled connection
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
data = response.text
soup = BeautifulSoup(data,"html5lib")
tables = soup.find_all('table')

# create dataframe
# Rows are collected in a plain list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call, which made the original loop quadratic.
records = []
for row in tables[0].tbody.find_all("tr"):
    col = row.find_all("td")
    # header rows have no <td> cells; also skip malformed rows with < 3 cells
    if len(col) < 3:
        continue
    records.append({
        "PostalCode": col[0].text,
        "Borough": col[1].text,
        "Neighborhood": col[2].text.strip(),
    })
pdf = pd.DataFrame(records, columns=["PostalCode", "Borough", "Neighborhood"])

### Data wrangling

In [3]:
# Clean the scraped table in a single chain:
#   1. turn every new line symbol (\n) into a blank space,
#   2. trim the whitespace around the postal code,
#   3. keep only the rows whose borough is not "Not assigned".
pdf = (
    pdf
    .replace(to_replace='\n', value=' ', regex=True)
    .assign(PostalCode=lambda frame: frame['PostalCode'].str.strip())
    .loc[lambda frame: ~frame['Borough'].str.contains('Not assigned')]
)
pdf

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


### Quick data analysis on repeated values

1) Whenever a "Not assigned" value was present for a Borough, it would also be present for its corresponding Neighborhood. Thus, when those values from Borough were removed, they were also removed for the Neighborhood column.

2) Below we check that there are no duplicated values for Postal code.

3) Therefore, since every postal code appears only once, grouping by postal code with the "groupby" function does not change the shape of the data frame.

In [4]:
# Flag, for each column, whether at least one value occurs more than once
multi_pcode, multi_borough, multi_neighborhood = (
    pdf[column].duplicated().any()
    for column in ('PostalCode', 'Borough', 'Neighborhood')
)

# Report templates, one per column
str1 = 'Are there repeted values for Postal Code? {}'
str2 = 'Are there repeted values for Borough? {}'
str3 = 'Are there repeted values for Neighborhood? {}'

# Print the three answers in the same order as the columns above
for template, has_duplicates in zip(
    (str1, str2, str3),
    (multi_pcode, multi_borough, multi_neighborhood),
):
    print(template.format(has_duplicates))

Are there repeted values for Postal Code? False
Are there repeted values for Borough? True
Are there repeted values for Neighborhood? True


In [5]:
# shape before group by
print('Shape before groupby: ', pdf.shape)

# shape after group by
# DataFrame.groupby() alone only builds a GroupBy object; the original code
# discarded it and printed the shape of the untouched frame. Here the groups
# are actually aggregated (neighborhoods sharing a postal code are joined with
# a comma) so the printed shape really is the result of the grouping.
pdf_grouped = pdf.groupby('PostalCode', as_index=False, sort=False).agg(
    {'Borough': 'first', 'Neighborhood': ', '.join}
)
print('Shape after groupby: ', pdf_grouped.shape)

Shape before groupby:  (103, 3)
Shape after groupby:  (103, 3)


### Get the coordinates for each postal code

#### Using geocoder library:

The geocoder library is not reliable, but we try it first. If it doesn't work, the coordinates are loaded from the CSV file at the provided link.

In [6]:
import geocoder

# postal codes that will be sent to the geocoder, one request per code
postal_codes = pdf.loc[:, 'PostalCode']

# module-level placeholder for the coordinates; get_geocode assigns its own
# local variable of the same name, so this value is not read by it
lat_lng_coords = None

# try to geocode each postal code up to 100 times; if that still fails, fall back to the provided CSV file
def get_geocode(postal_codes, pdf, max_tries=100, lookup=None):
    """Fill ``pdf`` with the latitude/longitude of every postal code.

    Parameters
    ----------
    postal_codes : iterable of str
        Postal codes to geocode; each must appear in ``pdf['PostalCode']``.
    pdf : pandas.DataFrame
        Frame with a ``PostalCode`` column. It is modified in place:
        ``Latitude`` and ``Longitude`` columns are (re)created on it.
    max_tries : int, optional
        Maximum number of geocoding attempts per postal code (default 100).
    lookup : callable, optional
        Function mapping an address string to ``[lat, lng]`` (or ``None`` /
        an empty value on failure). Defaults to ``geocoder.google``.

    Returns
    -------
    (int, pandas.DataFrame)
        ``(1, pdf)`` when every postal code was resolved, ``(0, pdf)`` as soon
        as one postal code could not be resolved within ``max_tries`` attempts
        (in that case ``'Geocoder failed'`` is printed).
    """
    if lookup is None:
        # default provider, resolved lazily so that a custom lookup does not
        # need the geocoder package at all
        def lookup(query):
            return geocoder.google(query).latlng

    # create columns (NaN placeholders keep them numeric)
    pdf['Latitude'] = float('nan')
    pdf['Longitude'] = float('nan')

    for code in postal_codes:
        query = '{}, Toronto, Ontario'.format(code)
        # the coordinates must be reset for every postal code; the previous
        # version kept the first hit and silently skipped all later codes
        lat_lng_coords = None
        for _ in range(max_tries):
            lat_lng_coords = lookup(query)
            if lat_lng_coords:
                break

        # None or an empty result after all attempts means the provider is unusable
        if not lat_lng_coords:
            print('Geocoder failed')
            return 0, pdf

        # write to the row(s) of *this* postal code (not always the first one)
        rows = pdf['PostalCode'] == code
        pdf.loc[rows, 'Latitude'] = lat_lng_coords[0]
        pdf.loc[rows, 'Longitude'] = lat_lng_coords[1]

    return 1, pdf

# calls geocoder function
# geocoder_status is 1 when geocoding succeeded and 0 when it failed.
# df_latlon is the same DataFrame object that is passed in as pdf: get_geocode
# adds the Latitude/Longitude columns to it in place and returns it.
geocoder_status,df_latlon=get_geocode(postal_codes,pdf)


Geocoder failed


#### Using CSV file (whenever geocode function fails)

In [7]:
# Fallback: when geocoding failed, load the coordinates from the course CSV.
if geocoder_status==0:
    # Remove the placeholder coordinate columns left by the failed geocoding.
    # errors='ignore' keeps the cell re-runnable: without it a second run
    # raises KeyError because the columns are already gone.
    df_latlon.drop(columns=['Latitude', 'Longitude'], inplace=True, errors='ignore')
    csv_url='https://cocl.us/Geospatial_data'
    csv_df=pd.read_csv(csv_url)
    # align the key column name with the scraped table so both can be merged
    csv_df=csv_df.rename({'Postal Code':'PostalCode'}, axis=1)
    print('Data was successfully loaded from remote CSV')

Data was successfully loaded from remote CSV


In [8]:
# allocate Latitude and Longitude in the main data frame
# Merge only while the coordinates are still missing. Merging unconditionally
# fails with a NameError when geocoding succeeded (csv_df is never created) and
# produces Latitude_x/Latitude_y columns when this cell is run a second time.
# validate="one_to_one" raises if a postal code is duplicated on either side
# instead of silently multiplying rows.
if 'Latitude' not in df_latlon.columns:
    df_latlon=pd.merge(df_latlon, csv_df, on="PostalCode", validate="one_to_one")
df_latlon

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


### Create zones based on geographical coordinates

We are going to divide the region into 5 different clusters based on latitude and longitude data. This could be useful for organizing waste management operations in zones/areas, e.g. creating operational headquarters to improve logistics in each zone.

In [9]:
# set number of clusters
kclusters = 5

# cluster on the geographical coordinates only
canada_clustering = df_latlon[['Latitude','Longitude']]

# run k-means clustering
# n_init is spelled out because its default changed across scikit-learn
# releases (10 -> 'auto'); fixing it together with random_state keeps the
# labels reproducible and avoids the FutureWarning.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(canada_clustering)

# add clustering labels to dataframe
# Plain column assignment instead of insert(5, ...): insert raises ValueError
# when the cell is re-run (the column already exists) and hard-codes the
# position; assignment appends the column once and overwrites it afterwards.
df_latlon['Cluster'] = kmeans.labels_
df_latlon


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster
0,M3A,North York,Parkwoods,43.753259,-79.329656,4
1,M4A,North York,Victoria Village,43.725882,-79.315572,4
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636,2
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,2
...,...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944,1
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160,2
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,4
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509,1


### Map design

In [11]:
# create map

# initial latitude and longitude to start the map (centroid of all postal codes)
lat_i,lon_i=df_latlon[['Latitude','Longitude']].mean(axis=0)

map_clusters = folium.Map(location=[lat_i, lon_i], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(df_latlon['Latitude'], df_latlon['Longitude'], df_latlon['Neighborhood'], df_latlon['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # k-means labels run from 0 to kclusters-1, so they index the palette
    # directly; the former rainbow[cluster-1] only worked because index -1
    # wraps around to the last color
    marker_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

# trigger the map
map_clusters

### Based on proximity, we have our region separated into 5 different zones.