## Import libraries for dataframes and web scraping

In [171]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

## Pull the data from Wikipedia

In [172]:
# Download the Wikipedia page and parse it into rows of table-cell text.
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() stops here on an HTTP error instead of
# silently parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source, 'lxml')
table = soup.find('table')  # first <table> element in the document
if table is None:
    raise ValueError('No <table> element found on the page')
rows = table.find_all('tr')

# One list of stripped, non-empty <td> texts per row; the first row is skipped.
data = []
for row in rows[1:]:
    cols = [ele.text.strip() for ele in row.find_all('td')]
    data.append([ele for ele in cols if ele])


## Create the dataframe from the Wikipedia output

In [173]:
# Turn the scraped rows into a DataFrame with one named column per cell,
# then preview the first rows as the cell output.
df = pd.DataFrame(
    data=data,
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)
df.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Clean the data
The first thing I did was remove all the rows where the borough was not assigned.
Then, because I hate when indexes skip numbers, I reset the index. However, the `drop`
option in `reset_index` was not dropping the previous index as expected, so I dropped
the leftover `index` column in a separate step. Finally, I looked for all rows where the
Neighborhood was "Not assigned" and set it to be the same as the Borough. I only found 1.

In [174]:
# Keep only rows that have a borough and renumber them from 0.
# drop=True discards the old index outright rather than inserting it as an
# 'index' column, so no follow-up drop is needed; chaining without inplace
# also avoids mutating a filtered slice of the previous frame.
df = df[df.Borough != 'Not assigned'].reset_index(drop=True)

# Where the neighborhood is 'Not assigned', fall back to the row's borough.
unassigned = df.Neighborhood == 'Not assigned'
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']

df.head(7)




Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park


## Show shape of dataframe

In [175]:
# Show the (row count, column count) of df as the cell output.
df.shape

(211, 3)