# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
from bs4 import BeautifulSoup as soup   # Used to extract data from HTML, which is useful for web scraping
import pandas as pd
import requests 

In [2]:
url = ("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

In [3]:
data = requests.get(url)
data

<Response [200]>

In [4]:
page_data = soup(data.text,'html.parser') # or html5lib
#page_data

#page_data = page_data.prettify()
#page_data

### From the above raw data, lets get a better view of what we want to scrap

In [5]:
page_data2 = page_data.find_all('tr')[1:]
#page_data2

In [6]:
page_data2[0].find('td')

<td>M1A</td>

In [7]:
page_data2[1].text.split('\n')[1] # needed variables index 1,2,3

'M2A'

### Lets create an empty dataframe

In [8]:
df = pd.DataFrame()
df

### Lets append into the dataframe the useful data

In [9]:
for each in page_data2: 
    try:
        Postal_ = each.text.split('\n')[1]
        Borough_ = each.text.split('\n')[2]
        Neighourhood_ = each.text.split('\n')[3]
          
        df =df.append(pd.DataFrame({'PostalCode': Postal_,'Borough': Borough_,'Neighborhood': Neighourhood_} , index = [0]),ignore_index = True)
    except:
        pass
   

In [10]:
df = df[['PostalCode','Borough','Neighborhood']]
df.tail()

Unnamed: 0,PostalCode,Borough,Neighborhood
287,M9Z,Not assigned,Not assigned
288,,,Canadian postal codes
289,,,
290,NL,,NS
291,A,,B


### We will need to remove the 'not assigned' and empty space from the dataframe

In [11]:
df=df[df.Borough != 'Not assigned']

In [12]:
df=df[df.Borough != '']

In [13]:
df.tail()

Unnamed: 0,PostalCode,Borough,Neighborhood
282,M8Z,Etobicoke,Kingsway Park South West
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West
286,M8Z,Etobicoke,South of Bloor


In [14]:
df = df.reset_index()
del df['index']
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [15]:
df.shape

(211, 3)

### At this point we will need to merge perhaps group duplicate PostalCode and reset index accordingly 

In [18]:
new_df = df.groupby(['PostalCode','Borough'])['Neighborhood'].apply(tuple)

new_df = new_df.reset_index()

new_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"(Rouge, Malvern)"
1,M1C,Scarborough,"(Highland Creek, Rouge Hill, Port Union)"
2,M1E,Scarborough,"(Guildwood, Morningside, West Hill)"
3,M1G,Scarborough,"(Woburn,)"
4,M1H,Scarborough,"(Cedarbrae,)"
5,M1J,Scarborough,"(Scarborough Village,)"
6,M1K,Scarborough,"(East Birchmount Park, Ionview, Kennedy Park)"
7,M1L,Scarborough,"(Clairlea, Golden Mile, Oakridge)"
8,M1M,Scarborough,"(Cliffcrest, Cliffside, Scarborough Village West)"
9,M1N,Scarborough,"(Birch Cliff, Cliffside West)"


In [19]:
new_df.shape

(103, 3)

### Lets import csv file of the latitude and longitude into our notebook - 'Geospatial_Coordinates.csv'

In [20]:

import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share your notebook.
client_fda2ef03aa0b4e0f9c3a2bb219b26ebe = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='juIEbKwpG8GERJbfMxowv1NLqAmk4zKYMf8n20UIxeAQ',
    ibm_auth_endpoint="https://iam.bluemix.net/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

body = client_fda2ef03aa0b4e0f9c3a2bb219b26ebe.get_object(Bucket='adscapstone-donotdelete-pr-jklj5f01yjwchv',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

df_data_1 = pd.read_csv(body)
df_data_1.head()



Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [21]:
df_data_1.columns = ['PostalCode', 'Latitude', 'Longitude']
df_data_1.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Lets add Latitide and Longitude into initial dataframe

In [22]:
new_df['Latitude'] = df_data_1['Latitude']

new_df['Longitude'] = df_data_1['Longitude']

In [23]:
new_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"(Rouge, Malvern)",43.806686,-79.194353
1,M1C,Scarborough,"(Highland Creek, Rouge Hill, Port Union)",43.784535,-79.160497
2,M1E,Scarborough,"(Guildwood, Morningside, West Hill)",43.763573,-79.188711
3,M1G,Scarborough,"(Woburn,)",43.770992,-79.216917
4,M1H,Scarborough,"(Cedarbrae,)",43.773136,-79.239476
5,M1J,Scarborough,"(Scarborough Village,)",43.744734,-79.239476
6,M1K,Scarborough,"(East Birchmount Park, Ionview, Kennedy Park)",43.727929,-79.262029
7,M1L,Scarborough,"(Clairlea, Golden Mile, Oakridge)",43.711112,-79.284577
8,M1M,Scarborough,"(Cliffcrest, Cliffside, Scarborough Village West)",43.716316,-79.239476
9,M1N,Scarborough,"(Birch Cliff, Cliffside West)",43.692657,-79.264848


### Importing necessary libraries to help plot location 

In [24]:
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Libraries imported.


### Use geopy library to get the latitude and longitude values of Toronto

In [25]:
address = 'Toronto, CA'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Lets create map of Toronto using latitude and longitude values

In [26]:

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(new_df['Latitude'], new_df['Longitude'], new_df['Borough'], new_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto