# Applied Data Science Capstone Project: Segmenting and Clustering Neighborhoods in Toronto-Problem 3

In [3]:
# Import necessary libraries

import requests
import lxml.html as lh
import pandas as pd

In [4]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#Create a handle, page, to handle the contents of the website
page = requests.get(url)

#Store the contents of the website under doc
doc = lh.fromstring(page.content)

#Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')

In [5]:
#Check the length of the first 12 rows
[len(T) for T in tr_elements[:12]]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

This means that there are 3 columns per row

In [6]:
# Parse the first row as our header
tr_elements = doc.xpath('//tr')

#Create empty list
col=[]
i=0

#For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    print ('%d:"%s"'%(i,name))
    col.append((name,[]))

1:"Postcode"
2:"Borough"
3:"Neighborhood
"


**Creating Pandas DataFrame** <br>
Each header is appended to a tuple along with an empty list.

In [7]:
#Since out first row is the header, data is stored on the second row onwards
for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 3, the //tr data is not from our table 
    if len(T)!=3:
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        if i>0:
        #Convert any numerical value to integers
            try:
                data=int(data)
            except:
                pass
        #Append the data to the empty list of the i'th column
        col[i][1].append(data)
        #Increment i for the next column
        i+=1

In [8]:
# Check the length of each column. Ideally, they should all be the same
[len(C) for (title,C) in col]

[287, 287, 287]

This shows that each of the 3 columns has exactly 287 rows

**Creating the pandas data frame**

In [9]:
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)

In [10]:
# Access the top 5 rows of the data frame 
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


**Rearranging and renaming the columns**

In [14]:
df.columns = ['Borough', 'Neighbourhood','Postcode']

cols = df.columns.tolist()
cols

cols = cols[-1:] + cols[:-1]

df = df[cols]

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


**Cleaning the messy string in the Borough column**

In [15]:
df = df.replace('\n',' ', regex=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


**Dropping all cells with a borough that is Not assigned**

In [16]:
df.drop(df.index[df['Borough'] == 'Not assigned'], inplace = True)

# Reset the index and dropping the previous index
df = df.reset_index(drop=True)

df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Queen's Park,Not assigned
6,M9A,Downtown Toronto,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


**Combining Neighbourhoods based on similar Postcode and Borough**

In [17]:
df = df.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(','.join).reset_index()
df.columns = ['Postcode','Borough','Neighbourhood']
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge ,Malvern"
1,M1C,Scarborough,"Highland Creek ,Rouge Hill ,Port Union"
2,M1E,Scarborough,"Guildwood ,Morningside ,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park ,Ionview ,Kennedy Park"
7,M1L,Scarborough,"Clairlea ,Golden Mile ,Oakridge"
8,M1M,Scarborough,"Cliffcrest ,Cliffside ,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff ,Cliffside West"


In [18]:
#Removing any space in the start of the string
df['Neighbourhood'] = df['Neighbourhood'].str.strip()

In [19]:
#Assigning Borough values to the Neighbourhood where vlaue is "Not assigned"
df.loc[df['Neighbourhood'] == 'Not assigned', 'Neighbourhood'] = df['Borough']

In [20]:
# Check if the Neighbourhood for Queen's Park changed 
df[df['Borough'] == 'Queen\'s Park']

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Queen's Park


In [21]:
# Check the shape of the data frame
df.shape

(103, 3)

In [22]:
#Save this file to a csv
df.to_csv(r'df_can.csv')

In [23]:
#Extracting the Latitude and Longitude of Canada Ontario
"""import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

print('Folium installed')
print('Libraries imported.')"""

"import requests # library to handle requests\nimport pandas as pd # library for data analsysis\nimport numpy as np # library to handle data in a vectorized manner\nimport random # library for random number generation\n\n!conda install -c conda-forge geopy --yes \nfrom geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values\n\n# libraries for displaying images\nfrom IPython.display import Image \nfrom IPython.core.display import HTML \n    \n# tranforming json file into a pandas dataframe library\nfrom pandas.io.json import json_normalize\n\n!conda install -c conda-forge folium=0.5.0 --yes\nimport folium # plotting library\n\nprint('Folium installed')\nprint('Libraries imported.')"

In [24]:
"""import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]"""

"import geocoder # import geocoder\n\n# initialize your variable to None\nlat_lng_coords = None\n\n# loop until you get the coordinates\nwhile(lat_lng_coords is None):\n  g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))\n  lat_lng_coords = g.latlng\n\nlatitude = lat_lng_coords[0]\nlongitude = lat_lng_coords[1]"

### Couldn't figure out the error, So I used the csv given in the link

In [25]:
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner

link = "http://cocl.us/Geospatial_data"
df1 = pd.read_csv(link)

df1.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [26]:
df1.shape

(103, 3)

Both the data frames have 103 rows and 3 columns

**Changing the column name Postal code to Postcode to merge the two data frames together**

In [27]:
df1.columns = ['Postcode','Latitude','Longitude']

cols = df1.columns.tolist()
cols

['Postcode', 'Latitude', 'Longitude']

In [28]:
#Read in the CSV file saved in the previous assignment
df = pd.read_csv(r'df_can.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Postcode,Borough,Neighbourhood
0,0,M1B,Scarborough,"Rouge ,Malvern"
1,1,M1C,Scarborough,"Highland Creek ,Rouge Hill ,Port Union"
2,2,M1E,Scarborough,"Guildwood ,Morningside ,West Hill"
3,3,M1G,Scarborough,Woburn
4,4,M1H,Scarborough,Cedarbrae


**Merging the two data frames together based on their Postcode**

In [29]:
df_new = pd.merge(df, df1, on='Postcode')
df_new.head()

Unnamed: 0.1,Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,0,M1B,Scarborough,"Rouge ,Malvern",43.806686,-79.194353
1,1,M1C,Scarborough,"Highland Creek ,Rouge Hill ,Port Union",43.784535,-79.160497
2,2,M1E,Scarborough,"Guildwood ,Morningside ,West Hill",43.763573,-79.188711
3,3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [30]:
df_new.to_csv(r'df_final.csv')

# Problem 3: Explore and cluster the neighborhoods in Toronto

In [31]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Libraries imported.


**Note<br>
I will focus on a subset of all the Borough that contain the word "Toronto".<br>
There are 38 Borough with word Toronto.**

In [32]:
toronto_data= df_new[df_new['Borough'].str.contains('Toronto', na = False)].reset_index(drop=True)
toronto_data.head()

Unnamed: 0.1,Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,37,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,41,M4K,East Toronto,"The Danforth West ,Riverdale",43.679557,-79.352188
2,42,M4L,East Toronto,"The Beaches West ,India Bazaar",43.668999,-79.315572
3,43,M4M,East Toronto,Studio District,43.659526,-79.340923
4,44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [82]:
toronto_data.shape

(39, 6)

## From google : the latitud and longitud for Toronto, as follows: 43.6532° N, 79.3832° W

In [83]:
latitude = 43.6532
longitude= -79.3832

## Let's create a map of Toronto with Boroughs that cointain the word Toronto

In [84]:
# create map of TORONTO using latitude and longitude values above:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto