# Segmenting and Clustering Neighborhoods in Toronto
## Part 1 - Obtaining the Data


Start by importing the necessary packages


In [2]:
# import packages
import pandas as pd
from bs4 import BeautifulSoup
import requests

# Reaching this line means every import above succeeded (a failed import raises first)
print("Packages imported")

Packages imported


We will use Beautiful Soup to scrape the neighborhood information table
from Wikipedia. The variable `url` holds the address of the page that contains it.

In [3]:
# Wikipedia article listing the Canadian postal codes that begin with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Download the page. requests has no default timeout, so one is set explicitly
# to keep the cell from hanging forever on a stalled connection.
wiki = requests.get(url, timeout=30)

# Stop here on a 4xx/5xx response instead of parsing an error page as if it
# were the article.
wiki.raise_for_status()

# Parse the HTML with Python's built-in parser
toronto_soup = BeautifulSoup(wiki.text, 'html.parser')

Find the table in the webpage, then extract the column headings and the data rows, skipping any row whose postal code is not assigned.

In [4]:
# Find the first table on the page that carries the 'wikitable' class
toronto_table = toronto_soup.find('table', attrs={'class': 'wikitable'})
if toronto_table is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise ValueError("No table with class 'wikitable' found in the downloaded page")

# html.parser does not add a <tbody> that is absent from the markup, so fall
# back to searching the <table> itself when there is none.
toronto_table_data = (
    toronto_table.tbody if toronto_table.tbody is not None else toronto_table
).find_all('tr')

number_of_rows = len(toronto_table_data)

print('Table countains {} rows'.format(number_of_rows))

if number_of_rows == 0:
    raise ValueError("The 'wikitable' table contains no rows")

# Extract the table headings from the <th> cells of the first row,
# dropping the newline characters embedded in the cell text
toronto_table_headings = [
    th.text.replace('\n', '') for th in toronto_table_data[0].find_all('th')
]

# Extract the rows of data from the remaining rows
toronto_table_body = []
for row_tag in toronto_table_data[1:]:
    # Only extract data if the postal code is assigned
    if "Not assigned" in row_tag.text:
        continue

    row_cells = [td.text.replace('\n', '') for td in row_tag.find_all('td')]

    # A row without any <td> cells carries no data; skipping it avoids an
    # all-empty record in the resulting table.
    if row_cells:
        toronto_table_body.append(row_cells)

# Number of data rows kept (same meaning as the counter in earlier versions
# of this cell, retained in case a later cell reads it)
table_row = len(toronto_table_body)

print('Number of rows={}'.format(len(toronto_table_body)))
print('Data table generated')

Table countains 181 rows
Number of rows=103
Data table generated


Finally, the extracted data is converted to a pandas DataFrame and saved to `Toronto_neighborhoods.csv`.


In [7]:
# Convert to a pandas dataframe, assigning the column headings at construction
# time. Passing `columns=` here (rather than setting `df.columns` afterwards)
# also yields a correctly-labelled empty frame when no rows were scraped,
# instead of a length-mismatch error.
df_toronto = pd.DataFrame(toronto_table_body, columns=toronto_table_headings)

print("DataFrame generated")

# export to csv (index=False leaves the row index out of the file)
df_toronto.to_csv('Toronto_neighborhoods.csv', index=False)

# display dataframe
df_toronto

DataFrame generated


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [10]:
# Report the dimensions of the scraped table as a (rows, columns) tuple
shape_message = 'Dataframe has shape: {}'.format(df_toronto.shape)
print(shape_message)

Dataframe has shape: (103, 3)
