# Capstone: Segmenting and Clustering Neighborhoods in Toronto

## Importing the Libraries:

In [14]:
from bs4 import BeautifulSoup
import requests
import csv
import json
import xml
import pandas as pd
import numpy as np


## Downloading the Wiki URL:

In [38]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
WikiToronto = requests.get(url).text

## Parsing the URL into a BeautifulSoup Object:

In [39]:
Toronto_soup = BeautifulSoup(WikiToronto, 'lxml')

## Finding the Table that is needed:

In [40]:
Toronto_table=Toronto_soup.find('table')

## Finding all of the fields (td) data:

In [18]:
fields = Toronto_table.find_all('td')
len(fields)

864

## Creating the Data Frame:

In [23]:
# range(start, stop, step)
# declaring variables
postcode = []
borough = []
neighbourhood = []

#looping through the dataset appending to the variables
for i in range(0, len(fields), 3):
    postcode.append(fields[i].text.strip())
    borough.append(fields[i+1].text.strip())
    neighbourhood.append(fields[i+2].text.strip())

# creating the pandas dataframe:        
df_tor = pd.DataFrame(data=[postcode, borough, neighbourhood]).transpose()
df_tor.columns = ['Postalcode', 'Borough', 'Neighborhood']
df_tor.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Removing the "Not assigned" Boroughs from the DataFrame

In [24]:
df_tor['Borough'].replace('Not assigned', np.nan, inplace=True)
df_tor.dropna(subset=['Borough'], inplace=True)
df_tor.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


## Replacing the "Not assigned" Neighborhoods with the Borough Name

In [25]:
df_tor[df_tor['Neighborhood'].str.match('Not assigned')]

Unnamed: 0,Postalcode,Borough,Neighborhood
8,M7A,Queen's Park,Not assigned


In [26]:
df_tor['Neighborhood'].replace('Not assigned', "Queen's Park", inplace=True)

In [27]:
df_tor[df_tor['Neighborhood'].str.match('Not assigned')]

Unnamed: 0,Postalcode,Borough,Neighborhood


In [28]:
#Visualizing the first few rows of the dataframe
df_tor.head(10)

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


## Grouping the Neighborhoods together for the same Postalcode and Borough

In [32]:
tor_df = df_tor.groupby(['Postalcode', 'Borough'])['Neighborhood'].apply(', '.join).reset_index()

In [41]:
tor_df.head(10)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## Displaying the shape of the dataframe

In [37]:
tor_df.shape

(103, 3)

In [43]:
df_geo = pd.read_csv('http://cocl.us/Geospatial_data')
df_geo.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [44]:
df_geo.columns = ['Postalcode', 'Latitude', 'Longitude']

In [45]:
df_torcomplete = pd.merge(df_tor, df_geo, on=['Postalcode'], how='inner')

In [46]:
df_torcomplete.head(10)

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
4,M6A,North York,Lawrence Heights,43.718518,-79.464763
5,M6A,North York,Lawrence Manor,43.718518,-79.464763
6,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
7,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
8,M1B,Scarborough,Rouge,43.806686,-79.194353
9,M1B,Scarborough,Malvern,43.806686,-79.194353
