<h4> Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto </h4> <b>Part 1</b>

In [3]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import matplotlib.pyplot as plt
%matplotlib inline
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests # library to handle requests
import csv

In [55]:
# Download the Wikipedia page that lists Canadian postal codes starting with "M"
# and parse it with BeautifulSoup.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Fetch the page once. Previously it was downloaded twice: via urlopen (whose
# handle was never closed) and via requests (whose response was never used).
source = requests.get(url, timeout=30)
source.raise_for_status()  # stop here on an HTTP error instead of parsing an error page
html = source.content  # raw bytes; name kept in case other cells read `html`

soup = BeautifulSoup(html, 'lxml')
type(soup)

bs4.BeautifulSoup

#### Finding the HTML of the postal-code table in the webpage

In [161]:
# Grab the first <table> element whose class matches 'wikitable sortable'.
table = soup.find('table', attrs={'class': 'wikitable sortable'})
type(table)

bs4.element.Tag

#### Using pandas `read_html` to create a DataFrame

In [245]:
from io import StringIO  # wraps the HTML text in a file-like object for read_html

# Parse the table's HTML into DataFrames. The markup is passed as a file-like
# object because newer pandas deprecates passing literal HTML strings.
df = pd.read_html(StringIO(str(table)))

# Take the first parsed table and use the American spelling for the column name.
# rename() returns a new frame here, so df[0] itself is left unmodified.
df_table = df[0].rename(columns={'Neighbourhood': 'Neighborhood'})
df_table.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Cleaning Data 

In [247]:
# Ignore rows whose borough is 'Not assigned'.
# .copy() gives an independent frame, so the later .loc assignment on df_table
# does not write into a slice of the original (avoids SettingWithCopyWarning).
df_table = df_table[df_table['Borough'] != 'Not assigned'].copy()

df_table.head()

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


##### If a cell has a borough but its neighborhood is 'Not assigned', the neighborhood is set to the borough name

In [249]:
# Show the rows whose neighborhood is still 'Not assigned'.
unassigned_mask = df_table['Neighborhood'].eq('Not assigned')
df_table.loc[unassigned_mask]

Unnamed: 0,Postcode,Borough,Neighborhood
9,M9A,Queen's Park,Not assigned


In [250]:
# For rows whose neighborhood is 'Not assigned', copy the borough name into the
# Neighborhood column. The mask is computed once and reused on both sides.
needs_name = df_table['Neighborhood'] == 'Not assigned'
df_table.loc[needs_name, 'Neighborhood'] = df_table.loc[needs_name, 'Borough']


#### Group the DataFrame by Postcode and join the values of the other columns with ','

In [251]:
def _join_unique(values):
    """Return the distinct entries of ``values`` joined into one comma-separated string."""
    return ','.join(values.unique())


# Collapse to one row per postcode; every other column becomes a comma-separated
# string of its distinct values within that postcode.
grouped_by_postcode = df_table.groupby('Postcode')
df_group = grouped_by_postcode.agg(_join_unique).reset_index()

df_group

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [252]:
# Display the (rows, columns) dimensions of the grouped frame.
df_group.shape


(103, 3)

## Part 2
#### Creating the geospatial DataFrame from the given CSV file

In [211]:
# Load the CSV of coordinates from the course-provided URL.
geo_csv_url = 'https://cocl.us/Geospatial_data'
df_geo = pd.read_csv(geo_csv_url)
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Joining both DataFrames on postal code

In [253]:
# Match each grouped postcode row with its coordinates row, then drop the
# 'Postal Code' key column carried over from df_geo.
merged_with_key = df_group.merge(df_geo, left_on='Postcode', right_on='Postal Code')
df_merge = merged_with_key.drop(columns='Postal Code')
df_merge.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Part 3 
#### Segmenting only boroughs that contain the word Toronto

In [254]:
# Keep only the rows whose borough name contains the word "Toronto".
is_toronto_borough = df_merge['Borough'].str.contains("Toronto")
df_toronto = df_merge.loc[is_toronto_borough]

df_toronto

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
47,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316
49,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049


#### Getting lat and lng for Toronto

In [255]:
# Look up the latitude/longitude of Toronto with the Nominatim geocoder.
# Nominatim was used without ever being imported, which raises NameError on a
# fresh kernel; import it here, next to its only use.
from geopy.geocoders import Nominatim

address = 'Toronto Ontario, CA'

geolocator = Nominatim(user_agent="ca_explorer")
location = geolocator.geocode(address)
if location is None:
    # No match came back: fail with a clear message instead of an
    # AttributeError on the attribute reads below.
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


#### Visualizing Neighborhoods in Toronto using Folium

In [257]:
# folium must be installed in the kernel's environment; the original notebook
# suggested `conda install -c conda-forge folium=0.5.0 --yes` if it is missing.
import folium  # map rendering library

# Base map centered on the Toronto coordinates stored in latitude / longitude.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# Add one blue circle marker per row of df_toronto, with the neighborhood name
# as the popup text.
marker_rows = zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighborhood'])
for lat, lng, neighborhood in marker_rows:
    popup = folium.Popup(neighborhood, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto