# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto - Week 3 Assessment

First, it is necessary to import all the required dependencies and libraries

In [24]:
#import
import pandas as pd
from bs4 import BeautifulSoup
import requests
import collections

We will use BeautifulSoup to retrieve the information from the specified Wikipedia page

In [25]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M"
# and parse it into a BeautifulSoup tree.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(wiki_url, timeout=30)  # timeout: do not hang forever on a stalled connection
response.raise_for_status()  # stop here on an HTTP error instead of parsing an error page
source = response.text
newsoup = BeautifulSoup(source, 'lxml')  # build the parse tree with the lxml parser
#print(newsoup.prettify())  # uncomment to inspect the downloaded HTML

Because the postal codes and neighbourhoods are in a table, we can just work with the tbody element from the website and then find all the table rows (trs)

In [26]:
# The data lives in the first <tbody> of the page; collect every row (<tr>) inside it.
tbl = newsoup.find('tbody')
trs = tbl.find_all('tr')

# The first row holds the header cells (<th>); their text becomes the column titles.
header_cells = trs[0].find_all('th')
title1, title2 = header_cells[0].text, header_cells[1].text
title3 = header_cells[2].text.split('\n')[0]  # keep only the text before the first newline
titles = [title1, title2, title3]  # titles for the dataframe columns
print(titles)

['Postcode', 'Borough', 'Neighbourhood']


We have to reorganize the information from the table so that it can be used to create a dataframe. For this we create three empty lists (postcodes, boroughs and neighbourhoods) and fill them row by row.

In [27]:

# Walk the table rows and collect postcode / borough / neighbourhood values.
# Rows whose borough is 'Not assigned' are dropped; a neighbourhood that is
# 'Not assigned' falls back to the borough name.
info = []            # one [postcode, borough, neighbourhood] list per kept row
postcodes = []
boroughs = []
neighbourhoods = []

for tr in trs[1:]:  # trs[0] is the header row
    tds = tr.find_all('td')
    if len(tds) < 3:
        # Not a full data row (no <td> cells, or too few of them): nothing to extract.
        continue

    # Strip surrounding whitespace so that a trailing newline in the cell
    # markup cannot defeat the 'Not assigned' comparisons below.
    cell1 = tds[0].text.strip()
    cell2 = tds[1].text.strip()
    cell3 = tds[2].text.strip().split('\n')[0]  # first line of the cell only

    if cell2 == 'Not assigned':
        continue
    if cell3 == 'Not assigned':
        cell3 = cell2

    postcodes.append(cell1)
    boroughs.append(cell2)
    neighbourhoods.append(cell3)
    info.append([cell1, cell2, cell3])

#print(info) #in case someone wants to look at the information collected from the website

Then we can create a dictionary using the previously mentioned lists (arrays), and pass this as an argument to create a pandas dataframe

In [29]:
# Column name -> list of values; the three lists are parallel (one entry per kept row).
columnas = dict(Postcode=postcodes, Borough=boroughs, Neighbourhood=neighbourhoods)
df = pd.DataFrame(columnas)

# Cell output: how many distinct postcodes the dataframe contains.
len(df['Postcode'].unique())

103

Because some postcodes are shared by several neighbourhoods, we use the groupby method to combine them into a single row per postcode and borough, and then display the first five entries of the updated dataframe

In [30]:
# Collapse rows sharing the same (Postcode, Borough) pair into one row whose
# Neighbourhood is the comma-separated list of the original neighbourhood names.
joined_names = df.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(', '.join)
df = joined_names.reset_index()  # turn the group keys back into ordinary columns
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Finally, we display the number of rows in the dataframe

In [31]:
# Cell output: (number of rows, number of columns) of the grouped dataframe.
df.shape

(103, 3)

### Include Latitude and Longitude Coordinates 

In [32]:
# NOTE(review): in the notebook as shown, geocoder is referenced only by the
# commented-out lookup loop in the following cell.
import geocoder
print('geocoder has been imported!')  # confirmation message once the import succeeded

geocoder has been imported!


In [33]:
# Placeholders for a geocoder-based coordinate lookup. The lookup loop itself is
# disabled (commented out) below, so after this cell lat_lng_coords is still None
# and both lists are still empty.
lat_lng_coords = None
lats=[]
longs=[]

# Disabled lookup, kept for reference.
# NOTE(review): as written, lat_lng_coords is initialised once above and never
# reset inside the for-loop, so once the first postal code resolves the
# while-loop would be skipped and every later postal code would reuse the
# first coordinates.
#for postal_code in postcodes:
#    while(lat_lng_coords is None):
#        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#        lat_lng_coords = g.latlng
#    lats.append(lat_lng_coords[0])
#    longs.append(lat_lng_coords[1])

To ensure the information retrieved is reliable and can be properly added to our dataframe, we will use a csv file containing the relevant latitude and longitude coordinates for the different postcodes

In [34]:
# Load the CSV of per-postal-code coordinates directly from its URL into a dataframe.
geo_csv_url = 'http://cocl.us/Geospatial_data'
data = pd.read_csv(geo_csv_url)

# Cell output: the first five rows of the loaded data.
data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [35]:
# Attach Latitude/Longitude by matching on the postal code rather than on the row
# index: index-based assignment silently pairs the wrong coordinates with a
# postcode if the two frames ever differ in order or length.
coords = data.rename(columns={'Postal Code': 'Postcode'})[['Postcode', 'Latitude', 'Longitude']]
df = (
    df.drop(columns=['Latitude', 'Longitude'], errors='ignore')  # lets the cell be re-run safely
      .merge(
          coords,
          on='Postcode',
          how='left',              # keep every row of df; unmatched postcodes get NaN
          validate='many_to_one',  # raise if the CSV lists a postal code more than once
      )
)

In [36]:
df.head() #preview the first 5 entries of the dataframe, which now includes the Latitude and Longitude columns

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Clustering Step

Now that we have our dataframe with all the proper information about postal codes, boroughs, neighbourhoods and coordinates, we can cluster the entries based on their borough (given that each borough is repeated and encompasses more than one postcode and/or neighbourhood).

In [43]:

# Count the distinct values in the Borough column and report the total.
borough_count = len(df['Borough'].unique())
print('There are ' + str(borough_count) + ' Boroughs in the dataframe')


There are 11 Boroughs in the dataframe


In [44]:
# Hard-coded latitude / longitude used for Toronto (the map cell centres on these).
tor_lat, tor_lon = 43.6532, -79.3832
print('The geograpical coordinates of Toronto are {}, {}.'.format(tor_lat, tor_lon))

The geograpical coordinates of Toronto are 43.6532, -79.3832.


We need to import the folium library so that a map can be created.

In [46]:
import folium # map rendering library
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [57]:
# Distinct borough names taken from the Borough column.
bor_names = df['Borough'].unique()

# Cell output: the first borough in that array.
bor_names[0]

'Scarborough'

We will create markers that will vary in colour based on the borough the dataframe row belongs to

In [61]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[tor_lat, tor_lon], zoom_start=10)

# Marker palette; the borough at position i of bor_names gets colores[i].
colores = ['red', 'blue', 'green', 'purple', 'black', 'darkblue', 'darkgreen',
           'darkpurple', 'pink', 'lightgreen', 'gray']

# Build the borough -> colour lookup once, instead of re-creating the palette and
# walking an if/elif chain for every row. The modulo wraps around the palette if
# there are ever more boroughs than colours.
borough_colour = {
    name: colores[position % len(colores)]
    for position, name in enumerate(bor_names)
}
# Colour for a borough missing from bor_names; without a fallback the marker would
# raise a NameError or silently reuse the previous row's colour.
fallback_colour = 'lightgray'

# add one circle marker per dataframe row, coloured by its borough
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    c_out = borough_colour.get(borough, fallback_colour)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=c_out,
        fill=True,
        fill_color=c_out,  # fill uses the same colour as the outline
        fill_opacity=0.75,
        parse_html=False).add_to(map_toronto)

# Cell output: render the map.
map_toronto