## Segmenting and Clustering Neighborhoods in Toronto

#### Assignment Notebook

In [1]:
import numpy as np 
import pandas as pd 

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
import lxml.html as lh # to parse the relevant fields
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Libraries imported.


In [165]:
# Source page: Wikipedia's list of Canadian postal codes beginning with "M"
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() surfaces HTTP errors here instead
# of letting an error page be parsed as if it were the table.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Parse the raw response bytes into an lxml element tree
doc = lh.fromstring(page.content)

# Collect every <tr> element found anywhere in the document
tr_elements = doc.xpath('//tr')

In [166]:
# Parse the first <tr> as the header row.
# tr_elements already holds every <tr> of the page (queried when the page was
# parsed), so it is reused here rather than running the same XPath again.
header_row = tr_elements[0]

# col is a list of (header text, values) pairs; the value lists start empty
# and are filled by the row-parsing cell below.
col = []
for i, header_cell in enumerate(header_row, start=1):
    name = header_cell.text_content()
    print(i, name)
    col.append((name, []))

1 Postcode
2 Borough
3 Neighbourhood



In [167]:
# The first <tr> is the header, so data rows start at index 1.

# Start from empty value lists so that re-running this cell does not append
# every row a second time.
for _, values in col:
    values.clear()

# Every data row must have exactly one cell per header column.
n_columns = len(col)

for row in tr_elements[1:]:
    # A row of a different width does not belong to the table being scraped;
    # stop at the first one.
    if len(row) != n_columns:
        break

    for i, ((_, values), cell) in enumerate(zip(col, row.iterchildren())):
        data = cell.text_content()
        # Leave the first column as text; in the remaining columns turn purely
        # numeric cells into integers.
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                # Not a number: keep the original text.
                pass
        # Append the value to the list of the i'th column
        values.append(data)

In [168]:
# Build the DataFrame: each (header, values) pair in col becomes one column
Dict = dict(col)
df = pd.DataFrame(Dict)

In [169]:
# Remove unwanted characters.
# Header cells can carry surrounding whitespace from the HTML (here
# 'Neighbourhood\n'), so strip every column label instead of renaming one
# hard-coded header, then delete newline characters from the string values.
# Chaining returns a new frame, so nothing is mutated in place.
df = df.rename(columns=str.strip).replace({'\n': ''}, regex=True)

In [170]:
# Show the column labels to confirm the trailing newline is gone
df.columns

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')

In [171]:
# Keep only the rows whose Borough is something other than 'Not assigned'
has_borough = df['Borough'].ne('Not assigned')
df = df[has_borough]

In [172]:
# Merge postcodes that have more than one neighbourhood into a single row:
# the borough is taken with max() and the neighbourhood names of the postcode
# are joined with ', '.
per_postcode = {
    'Borough': lambda boroughs: boroughs.max(),
    'Neighbourhood': lambda names: ', '.join(names),
}

# reset_index() turns the Postcode group key back into an ordinary column
df = df.groupby('Postcode').agg(per_postcode).reset_index()

In [173]:
# Preview the first rows of the merged frame
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [174]:
# List the rows that have a borough but whose neighbourhood text contains 'Not assigned'
unnamed_neighbourhood = df['Neighbourhood'].str.contains("Not assigned")
df[unnamed_neighbourhood]

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Not assigned


In [175]:
# Where a postcode has a borough but no assigned neighbourhood, use the
# borough name as the neighbourhood. The rows are selected by value rather
# than by a hard-coded index label (85), which only holds for one particular
# snapshot of the scraped table.
unassigned = df['Neighbourhood'] == 'Not assigned'
df.loc[unassigned, 'Neighbourhood'] = df.loc[unassigned, 'Borough']

In [176]:
# (rows, columns) of the cleaned frame
df.shape

(103, 3)