# Capstone: Segmenting and Clustering Neighborhoods in Toronto

## Importing the Libraries:

In [1]:
from bs4 import BeautifulSoup
import requests
import csv
import json
import xml
import numpy as np
import pandas as pd

## Downloading the Wikipedia page:

In [2]:
# Wikipedia page that holds the postal-code table scraped in the cells below.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Bound the wait with a timeout and fail loudly on an HTTP error status, so an
# error page is never handed to the parser as if it were the real table.
wiki_response = requests.get(url, timeout=30)
wiki_response.raise_for_status()
WikiToronto = wiki_response.text

## Parsing the downloaded HTML into a BeautifulSoup Object:

In [3]:
# Parse the downloaded HTML text, using lxml as the parser backend.
Toronto_soup = BeautifulSoup(WikiToronto, 'lxml')

## Finding the Table that is needed:

In [4]:
# find() returns only the first <table> element in the document.
# NOTE(review): this assumes the postal-code table is the first table on the page - confirm if the page layout changes.
Toronto_table=Toronto_soup.find('table')

## Finding all of the table cells (td):

In [5]:
# Collect every <td> cell found inside the selected table.
fields = Toronto_table.find_all('td')
# Show the cell count; the DataFrame-building loop consumes these cells three at a time.
len(fields)

864

## Creating the Data Frame:

In [6]:
# The scraped cells form one flat list that repeats in groups of three:
# postal code, borough, neighbourhood.
CELLS_PER_ROW = 3

# Ignore a trailing partial group so that a cell count which is not an exact
# multiple of three cannot index past the end of the list.
usable_cells = len(fields) - len(fields) % CELLS_PER_ROW

# A strided slice picks out one column at a time; strip() removes the
# surrounding whitespace/newlines from each cell's text.
postcode = [cell.text.strip() for cell in fields[0:usable_cells:CELLS_PER_ROW]]
borough = [cell.text.strip() for cell in fields[1:usable_cells:CELLS_PER_ROW]]
neighbourhood = [cell.text.strip() for cell in fields[2:usable_cells:CELLS_PER_ROW]]

# Build the frame column-wise from named lists (no transpose needed).
df_tor = pd.DataFrame({
    'Postalcode': postcode,
    'Borough': borough,
    'Neighborhood': neighbourhood,
})
df_tor.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Removing the "Not assigned" Boroughs from the DataFrame

In [7]:
# Keep only rows whose borough is known: neither the literal 'Not assigned'
# placeholder nor a missing value.
#
# This is done with a boolean mask on the frame itself. Calling
# .replace(..., inplace=True) on a selected column is chained in-place
# mutation, which pandas does not guarantee to write back to the parent frame
# (it is a no-op under Copy-on-Write).
has_borough = df_tor['Borough'].notna() & df_tor['Borough'].ne('Not assigned')

# .copy() makes the result an independent frame so later assignments are
# unambiguous; the original row labels are kept (the index is not reset).
df_tor = df_tor.loc[has_borough].copy()
df_tor.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


## Replacing the "Not assigned" Neighborhoods with the Borough Name

In [8]:
# List the rows whose Neighborhood text starts with 'Not assigned' (str.match anchors at the start of the string).
df_tor[df_tor['Neighborhood'].str.match('Not assigned')]

Unnamed: 0,Postalcode,Borough,Neighborhood
8,M7A,Queen's Park,Not assigned


In [9]:
# For every row whose neighbourhood is the 'Not assigned' placeholder, use that
# row's own borough name instead of a hard-coded value, so the fix stays
# correct if other boroughs ever have unassigned neighbourhoods.
# Assigning through .loc on the frame avoids chained in-place mutation of a
# selected column, which pandas does not guarantee to write back.
neighborhood_missing = df_tor['Neighborhood'] == 'Not assigned'
df_tor.loc[neighborhood_missing, 'Neighborhood'] = df_tor.loc[neighborhood_missing, 'Borough']

In [10]:
# Re-run the same filter as a check: an empty result means no 'Not assigned' neighbourhoods remain.
df_tor[df_tor['Neighborhood'].str.match('Not assigned')]

Unnamed: 0,Postalcode,Borough,Neighborhood


In [11]:
# Preview the first ten rows of the cleaned dataframe.
df_tor.head(10)

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


## Grouping the Neighborhoods together for the same Postalcode and Borough

In [12]:
# Collapse the frame to one row per (Postalcode, Borough) pair: the
# neighbourhood names sharing a pair are joined into a single ', '-separated
# string, and the group keys are turned back into ordinary columns.
neighborhoods_by_code = df_tor.groupby(['Postalcode', 'Borough'])['Neighborhood']
tor_df = neighborhoods_by_code.agg(', '.join).reset_index()

In [13]:
# Preview the first ten grouped rows.
tor_df.head(10)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## Displaying the shape of the dataframe

In [14]:
# (rows, columns) of the grouped frame.
tor_df.shape

(103, 3)

## Loading in the Geospatial data from a CSV file at a URL

In [15]:
# Read the postal-code coordinates CSV directly from its URL (needs network access).
df_geo = pd.read_csv('http://cocl.us/Geospatial_data')
df_geo.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


## Merging the data together

In [16]:
# Rename the columns by position so the key column is called 'Postalcode', matching tor_df for the merge.
# NOTE(review): positional assignment assumes the CSV columns arrive as postal code, latitude, longitude (as in the preview above) - confirm if the source changes.
df_geo.columns = ['Postalcode', 'Latitude', 'Longitude']

In [17]:
# Attach Latitude/Longitude to each grouped row by postal code; the inner join
# keeps only postal codes that appear in both frames.
df_torcomplete = tor_df.merge(df_geo, how='inner', on='Postalcode')

In [18]:
# Preview the first ten rows of the merged frame.
df_torcomplete.head(10)

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [19]:
# Number of distinct borough values in the merged frame.
borough_count = len(df_torcomplete['Borough'].unique())
# Number of rows in the merged frame.
# NOTE(review): each row is one postal code whose neighbourhoods were joined
# into a single string, so the "neighborhoods" figure printed below is a row
# count - confirm that wording is intended.
row_count = df_torcomplete.shape[0]

summary_template = 'The dataframe has {} boroughs and {} neighborhoods.'
print(summary_template.format(borough_count, row_count))


The dataframe has 11 boroughs and 103 neighborhoods.


In [20]:
# (rows, columns) of the merged frame.
df_torcomplete.shape

(103, 5)

## Loading in more Libraries

In [24]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium # map rendering library
print('Libraries imported.')

Libraries imported.


## Find the coordinates of Toronto

In [22]:
# Look up the coordinates of the city; they are used to centre the map.
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; stop with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Geocoding returned no result for address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of the City of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of the City of Toronto are 43.653963, -79.387207.


In [23]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per row of df_torcomplete
# The loop names are kept distinct from the `borough` list built in the
# DataFrame-creation cell so that this cell does not overwrite it.
marker_rows = zip(
    df_torcomplete['Latitude'],
    df_torcomplete['Longitude'],
    df_torcomplete['Borough'],
    df_torcomplete['Neighborhood'],
)
for lat, lng, borough_name, neighborhood_name in marker_rows:
    popup_text = '{}, {}'.format(neighborhood_name, borough_name)
    # parse_html is an option of Popup; CircleMarker's signature does not
    # define it, so it is passed here only.
    popup = folium.Popup(popup_text, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=popup,
        color='red',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
    ).add_to(map_toronto)

# last expression of the cell: render the map
map_toronto