## Import libraries for DataFrames and web scraping

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

## Pull the data from Wikipedia

In [2]:
# Download the Wikipedia page and collect the text of every table row into
# `data`, one list of cell strings per row.
data = []

# A timeout keeps the cell from hanging indefinitely if the site is
# unreachable, and raise_for_status() stops us from parsing an HTTP error page
# as if it were the article.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source, 'lxml')
table = soup.find('table')  # first <table> element on the page
if table is None:
    # Fail with a clear message instead of an AttributeError on the next line.
    raise ValueError('No <table> element found in the downloaded page')

rows = table.find_all('tr')
for row in rows[1:]:  # loop through but skip the first row (presumably the header)
    cols = [ele.text.strip() for ele in row.find_all('td')]
    data.append([ele for ele in cols if ele])  # keep only non-empty cell texts


## Create the DataFrame from the scraped Wikipedia data

In [3]:
# Turn the scraped rows into a three-column frame, then preview the top of it.
df = pd.DataFrame(
    data=data,
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)
df.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Clean the data
First, I removed all the rows where the borough was "Not assigned".
Because I dislike indexes that skip numbers, I then reset the index; however, the `drop`
option of `reset_index` was not dropping the previous index as expected, so I dropped the
leftover `index` column in a separate statement. Finally, I looked for all rows where the
Neighborhood was "Not assigned" and set it to be the same as the Borough. I found only one such row.

In [4]:
# Keep only the rows whose borough was assigned. The explicit .copy() makes the
# result an independent frame, so the later assignment never acts on a slice of
# the unfiltered data (the situation pandas flags with SettingWithCopyWarning).
df = df[df.Borough != 'Not assigned'].copy()

# Renumber the rows from 0. reset_index() moves the old index into a column
# named 'index', which is then discarded in a separate step.
# NOTE: df.reset_index(drop=True) is the single-call equivalent, provided its
# result is assigned back to df.
df = df.reset_index()
df = df.drop(columns=['index'])

# Where the neighborhood is 'Not assigned', fall back to the borough name.
# Both sides use the same mask, so values are matched row for row.
unassigned = df.Neighborhood == 'Not assigned'
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']

df.head(7)




Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park


## Show shape of dataframe

In [5]:
# Dimensions of df as (number of rows, number of columns).
df.shape

(211, 3)

## Get lat and long values
Geocoder's loop just kept running without finishing, so I'm using the CSV file of coordinates instead. Watson Studio handled importing the file on its own.

In [6]:

import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

# Placeholder used only to give `body` an `__iter__` attribute (see below).
# It returns 0 rather than an iterator, so it is not meant to be iterated with;
# presumably only the attribute's presence matters to pandas -- TODO confirm.
def __iter__(self): return 0


# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share your notebook.

# NOTE(review): the credential/client setup and the expression assigned to
# `body` were removed before sharing, so this cell cannot run as-is.
#removed

body = #removed
# add missing __iter__ method, so pandas accepts body as file-like object
# (the shim is bound to this one object only, and only when it has no
# `__iter__` attribute already)
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

# Parse the object as CSV into a DataFrame and preview its first rows.
df_data_1 = pd.read_csv(body)
df_data_1.head()



Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Join the DataFrames

In [7]:
# Attach the columns of df_data_1 to every row of df by postal code.
# df_data_1 is re-indexed by 'Postal Code' so that join() can match it against
# df's 'PostalCode' column; join() defaults to a left join, so every row of df
# is kept and rows without a match get NaN.
#
# Latitude/Longitude columns left over from a previous run of this cell are
# dropped first. Without that, re-running the cell fails with
# "ValueError: columns overlap but no suffix specified", because df would
# already contain the columns being joined in.
coords = df_data_1.set_index('Postal Code')
df = df.drop(columns=['Latitude', 'Longitude'], errors='ignore').join(coords, on='PostalCode')
df.head(12)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
4,M6A,North York,Lawrence Heights,43.718518,-79.464763
5,M6A,North York,Lawrence Manor,43.718518,-79.464763
6,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
7,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
8,M1B,Scarborough,Rouge,43.806686,-79.194353
9,M1B,Scarborough,Malvern,43.806686,-79.194353
