In [None]:
## Segmenting and Clustering Neighborhoods in Toronto
### Author: Isaac Shareef

#### This notebook scrapes the Wikipedia page https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M to obtain the data in its table of postal codes, and transforms that data into a pandas dataframe.



In [1]:
#importing necessary libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [33]:
#define the URL where the data will be scraped
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
#fetch the page; the timeout keeps the cell from hanging indefinitely, and
#raise_for_status() turns an HTTP error into an exception instead of letting
#an error page be parsed as if it were the real content
page = requests.get(url, timeout=30)
page.raise_for_status()
#response holds the page body as text
response = page.text
#the page is HTML, so parse it with an HTML parser rather than the 'xml' parser
soup = BeautifulSoup(response, 'html.parser')

In [34]:
#assigning the first HTML table on the page to a variable named data
data = soup.find('table')
#find() returns None when nothing matches; fail here with a clear message
#instead of an AttributeError when the rows are read in a later cell
if data is None:
    raise ValueError('No <table> element found in the scraped page')

In [35]:
#creating an empty pandas dataframe named 'df' with the three column headers;
#the rows are filled in by a later cell from the scraped table
column_names = ['Postal Code', 'Borough', 'Neighborhood']
df = pd.DataFrame(columns = column_names)
#display the (still empty) dataframe to confirm the column headers
df

Unnamed: 0,Postal Code,Borough,Neighborhood


In [43]:
# Collect the postal codes, boroughs, & neighborhoods from every table row.
# The rows are gathered in a plain list and the dataframe is built once at the
# end: this avoids growing the dataframe one row at a time, and re-running the
# cell rebuilds df from scratch instead of appending duplicate rows.
rows = []
for tr_cell in data.find_all('tr'):
    # text of each <td> cell in this row, with surrounding whitespace removed
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    # keep only rows that have exactly one value per column (3); rows with a
    # different number of <td> cells are skipped
    if len(row_data) == len(column_names):
        rows.append(row_data)
df = pd.DataFrame(rows, columns=column_names)

In [37]:
#using .info() and .head() to check the dtypes, columns, row count, etc.
#.info() prints its summary and returns None, so it is called on its own line;
#.head() is left as the last expression so the notebook renders the first rows
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 180 entries, 0 to 179
Data columns (total 3 columns):
Postal Code     180 non-null object
Borough         180 non-null object
Neighborhood    180 non-null object
dtypes: object(3)
memory usage: 5.6+ KB


(None,   Postal Code           Borough               Neighborhood
 0         M1A      Not assigned               Not assigned
 1         M2A      Not assigned               Not assigned
 2         M3A        North York                  Parkwoods
 3         M4A        North York           Victoria Village
 4         M5A  Downtown Toronto  Regent Park, Harbourfront)

### The scraped table has 180 rows and 3 columns

In [38]:
#remove every row whose 'Borough' and 'Neighborhood' are both 'Not assigned'
unassigned_mask = df['Borough'].eq('Not assigned') & df['Neighborhood'].eq('Not assigned')
#index labels of the matching rows
indexNames = df.loc[unassigned_mask].index
#drop those labels from df in place
df.drop(indexNames, inplace=True)

In [40]:
#re-check the dataframe after the drop; .info() prints its summary and returns
#None, so it is called on its own line and .head() is left as the last
#expression so the notebook renders the first rows
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 2 to 178
Data columns (total 3 columns):
Postal Code     103 non-null object
Borough         103 non-null object
Neighborhood    103 non-null object
dtypes: object(3)
memory usage: 3.2+ KB


(None,
   Postal Code           Borough                                 Neighborhood
 2         M3A        North York                                    Parkwoods
 3         M4A        North York                             Victoria Village
 4         M5A  Downtown Toronto                    Regent Park, Harbourfront
 5         M6A        North York             Lawrence Manor, Lawrence Heights
 6         M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government)

### After dropping the rows where both Borough and Neighborhood are 'Not assigned', the dataframe has 103 rows and 3 columns

In [42]:
#show the dataframe's dimensions as (rows, columns)
df.shape

(103, 3)

In [45]:
#load the coordinate data (a CSV file) from the cocl.us link into a dataframe
geo_url = 'http://cocl.us/Geospatial_data'
coords = pd.read_csv(geo_url)

In [46]:
#preview the first rows of the coordinate data
coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [47]:
#inner-join the coordinates with the borough/neighborhood dataframe on the
#shared 'Postal Code' column
merged = coords.merge(df, how='inner', on='Postal Code')

In [48]:
#preview the first rows of the merged dataframe
merged.head()

Unnamed: 0,Postal Code,Latitude,Longitude,Borough,Neighborhood
0,M1B,43.806686,-79.194353,Scarborough,"Malvern, Rouge"
1,M1C,43.784535,-79.160497,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae
