### Create and clean the Toronto neighborhood dataset

This dataset excludes every postal code whose borough is "Not assigned". A neighborhood listed as "Not assigned" takes the name of its borough.

Source: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [23]:
#import libraries
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [24]:
#scrape information from Wikipedia to build the Toronto dataset
# NOTE: despite its name, `website_url` holds the page's HTML text, not a URL.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # fail instead of hanging the kernel if the server never answers
)
response.raise_for_status()  # stop on an HTTP error rather than parsing an error page
website_url = response.text
soup = BeautifulSoup(website_url, 'lxml')
#extract table
my_table = soup.find('table', {'class':'wikitable sortable'})
if my_table is None:
    # soup.find returns None when nothing matches; fail here with a clear message
    raise ValueError("No table with class 'wikitable sortable' was found on the page")
# Wrap the markup in StringIO: read_html accepts file-like objects, while passing
# literal HTML as a plain string is deprecated in recent pandas releases.
my_table_string = pd.read_html(StringIO(str(my_table)))
# NOTE(review): the positional labels 0/1/2 assume read_html did not promote the
# table header to column names (the cleaning cell then drops row 0) -- confirm
# against the installed pandas version.
raw_table = my_table_string[0]
df = pd.DataFrame(
    {
        'PostalCode': raw_table[0],
        'Borough': raw_table[1],
        'Neighborhood': raw_table[2],
    },
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)

In [25]:
### data cleaning

#discard the row labelled 0
df = df.drop(index=0)
#collect the labels of every row whose borough contains "Not assigned" ...
unassigned_boroughs = df.index[df['Borough'].str.contains("Not assigned")]
#... and drop those rows by label
df = df.drop(index=unassigned_boroughs)

In [27]:
#group neighborhoods that share the same postal code and borough
by_postcode = df.groupby(['PostalCode', 'Borough'])
#join each group's neighborhood names into one comma-separated string
joined_names = by_postcode['Neighborhood'].apply(lambda names: ', '.join(names))
#turn the group keys back into regular columns
df = joined_names.reset_index()

In [46]:
#flag the rows whose neighborhood is "Not assigned"
unassigned = df['Neighborhood'] == "Not assigned"
#keep the labels of those rows available under the original name
nassign = df.index[unassigned].tolist()
#give each flagged row the name of its OWN borough.
# A row-aligned .loc assignment is used on purpose: a value-based
# Series.replace would overwrite every "Not assigned" entry with the borough of
# the first flagged row, and a chained df['Neighborhood'].replace(..., inplace=True)
# may not write back to df at all when pandas copy-on-write is enabled.
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']

In [52]:
#save the cleaned dataset to a CSV file (relative path)
csv_path = 'toronto.csv'
df.to_csv(csv_path)

In [51]:
#display the dataset's dimensions as a (rows, columns) tuple
df.shape

(103, 3)