## Segmenting and Clustering Neighborhoods in Toronto
### A Work by Joey Higgins

*Here I'm importing the libraries reccomended in the BeautifulSoup YouTube tutorial I watched*


In [211]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import csv

*In this chunk I'm parsing through the HTML data available within the Wiki page and "prettifying" it*

In [168]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'html5')
# print(soup.prettify())
table = soup.find('table')
print(table.prettify())

<table class="wikitable">
 <tbody>
  <tr>
   <th>
    Postal code
   </th>
   <th>
    Borough
   </th>
   <th>
    Neighborhood
   </th>
  </tr>
  <tr>
   <td>
    M1A
   </td>
   <td>
    Not assigned
   </td>
   <td>
   </td>
  </tr>
  <tr>
   <td>
    M2A
   </td>
   <td>
    Not assigned
   </td>
   <td>
   </td>
  </tr>
  <tr>
   <td>
    M3A
   </td>
   <td>
    North York
   </td>
   <td>
    Parkwoods
   </td>
  </tr>
  <tr>
   <td>
    M4A
   </td>
   <td>
    North York
   </td>
   <td>
    Victoria Village
   </td>
  </tr>
  <tr>
   <td>
    M5A
   </td>
   <td>
    Downtown Toronto
   </td>
   <td>
    Regent Park / Harbourfront
   </td>
  </tr>
  <tr>
   <td>
    M6A
   </td>
   <td>
    North York
   </td>
   <td>
    Lawrence Manor / Lawrence Heights
   </td>
  </tr>
  <tr>
   <td>
    M7A
   </td>
   <td>
    Downtown Toronto
   </td>
   <td>
    Queen's Park / Ontario Provincial Government
   </td>
  </tr>
  <tr>
   <td>
    M8A
   </td>
   <td>
    Not assigned
   </

*In this chunk I'm structuring the HTML data and printing it*

In [212]:
table_rows = table.find_all('tr')

for tr in table_rows:
    td = tr.find_all('td')
    row = [i.text for i in td]
    print(row)

[]
['M1A\n', 'Not assigned\n', '\n']
['M2A\n', 'Not assigned\n', '\n']
['M3A\n', 'North York\n', 'Parkwoods\n']
['M4A\n', 'North York\n', 'Victoria Village\n']
['M5A\n', 'Downtown Toronto\n', 'Regent Park / Harbourfront\n']
['M6A\n', 'North York\n', 'Lawrence Manor / Lawrence Heights\n']
['M7A\n', 'Downtown Toronto\n', "Queen's Park / Ontario Provincial Government\n"]
['M8A\n', 'Not assigned\n', '\n']
['M9A\n', 'Etobicoke\n', 'Islington Avenue\n']
['M1B\n', 'Scarborough\n', 'Malvern / Rouge\n']
['M2B\n', 'Not assigned\n', '\n']
['M3B\n', 'North York\n', 'Don Mills\n']
['M4B\n', 'East York\n', 'Parkview Hill / Woodbine Gardens\n']
['M5B\n', 'Downtown Toronto\n', 'Garden District, Ryerson\n']
['M6B\n', 'North York\n', 'Glencairn\n']
['M7B\n', 'Not assigned\n', '\n']
['M8B\n', 'Not assigned\n', '\n']
['M9B\n', 'Etobicoke\n', 'West Deane Park / Princess Gardens / Martin Grove / Islington / Cloverdale\n']
['M1C\n', 'Scarborough\n', 'Rouge Hill / Port Union / Highland Creek\n']
['M2C\n', 'No

*In this chunk I'm cleaning the data, removing unnecessary rows and outputting to a local repository*

In [249]:
l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l.append(row)

Postal_Code_Scrape = pd.DataFrame(l, columns=["Postal_Code", "Borough", "Neighborhood"])
Postal_Code_Scrape = Postal_Code_Scrape[Postal_Code_Scrape.Borough != "Not assigned\n"]
Postal_Code_Scrape = Postal_Code_Scrape.iloc[1:]

Postal_Code_Scrape['Postal_Code'] = Postal_Code_Scrape['Postal_Code'].str.strip("\n")
Postal_Code_Scrape['Borough'] = Postal_Code_Scrape['Borough'].str.strip("\n")
Postal_Code_Scrape['Neighborhood'] = Postal_Code_Scrape['Neighborhood'].str.strip("\n")
Postal_Code_Scrape['Neighborhood'] = Postal_Code_Scrape['Neighborhood'].str.replace("/",",")

Postal_Code_Scrape.head()

Unnamed: 0,Postal_Code,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park , Harbourfront"
6,M6A,North York,"Lawrence Manor , Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [210]:
Postal_Code_Scrape.shape

(103, 3)

In [219]:
Geo_Data = pd.read_csv('C:/Users/Broseph Higgins/Coursera_Capstone/Data/Geospatial_Coordinates.csv', names=['Postal_Code', 'Latitude', 'Longitude'])
Geo_Data.head()

Unnamed: 0,Postal_Code,Latitude,Longitude
0,Postal Code,Latitude,Longitude
1,M1B,43.8066863,-79.1943534
2,M1C,43.7845351,-79.1604971
3,M1E,43.7635726,-79.1887115
4,M1G,43.7709921,-79.2169174


In [252]:
df1 = Postal_Code_Scrape
df2 = Geo_Data

Neighborhood_Data = pd.merge(df1, df2, on='Postal_Code', how='inner')
Neighborhood_Data.head(10)

Unnamed: 0,Postal_Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7532586,-79.3296565
1,M4A,North York,Victoria Village,43.7258823,-79.3155716
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.6542599,-79.3606359
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.4647633
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.6623015,-79.3894938
5,M9A,Etobicoke,Islington Avenue,43.6678556,-79.5322424
6,M1B,Scarborough,"Malvern , Rouge",43.8066863,-79.1943534
7,M3B,North York,Don Mills,43.7459058,-79.352188
8,M4B,East York,"Parkview Hill , Woodbine Gardens",43.7063972,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.6571618,-79.3789371
