In [129]:
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup

## Get data

In [113]:
html = urllib.request.urlopen("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").read()
soup = BeautifulSoup(html, "html.parser")

In [114]:
table = soup.find('table', attrs={'class':'wikitable'})
table_rows = table.find_all('tr')

In [115]:
data = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td if tr.text.strip()]
    if row:
        data.append(row)


df = pd.DataFrame(data, columns=["PostalCode", "Borough", "Neighborhood"])

In [130]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Filter not assigned borough

In [131]:
df2 = df[df.Borough != 'Not assigned']

## Populate not assigned neighborhood with the value of boroguh

In [124]:
df2[df2['Neighborhood'] == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighborhood
8,M7A,Queen's Park,Not assigned


In [132]:
df2.loc[df2['Neighborhood'] == 'Not assigned', 'Neighborhood'] = df2['Borough']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [134]:
df2[df2['Neighborhood'] == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighborhood


## Group by PostalCode and Borough so that Neighborhoods are separated by comma

In [135]:
df3 = df2.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(lambda x: "%s" % ', '.join(x)).reset_index(name='Neighborhood')

In [137]:
df3.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Get the shape of the data set

In [139]:
df3.shape

(103, 3)

## Save dataframe for further use

In [140]:
df3.to_pickle('clustering-1.pkl')