In [1]:
!pip install bs4
!pip install lxml

from bs4 import BeautifulSoup
import pandas as pd
import requests
print('bs4 and lxml installed')

Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.9.1-py3-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 3.4 MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2
  Downloading soupsieve-2.0.1-py3-none-any.whl (32 kB)
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1272 sha256=c8c5c75393f0c62bde11b437e83ce9cc281cd938c5b7f1b7087bdf89423eec80
  Stored in directory: /home/jovyan/.cache/pip/wheels/19/f5/6d/a97dd4f22376d4472d5f4c76c7646876052ff3166b3cf71050
Successfully built bs4
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.9.1 bs4-0.0.1 soupsieve-2.0.1
Collecting lxml
  Downloading lxml-4.5.1-cp36-cp36m-manylinux1_x86_64.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 3.3 MB/s eta 0:00:01
[?25hInstalling collected package

# Scraping the Wikipedia page of Canada's postal codes

In [2]:
# Download the Wikipedia page and locate the first <table> element in it.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# requests has no default timeout; without one a stalled connection hangs the cell.
response = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'html.parser')
table = soup.find('table')
if table is None:
    # find() returns None when nothing matches; stop before the parsing cell
    # fails with an AttributeError on table.find_all.
    raise ValueError('No <table> element found at {}'.format(url))

In [3]:
# Column labels for the scraped table; the parsing cell keeps only rows
# that have exactly three <td> cells, one per label.
list_columns = [
    'Postalcode',
    'Borough',
    'Neighborhood',
]
# Empty frame that carries only the column labels.
df = pd.DataFrame(columns=list_columns)

In [4]:
# Collect the stripped text of every <td> in each table row. Rows that do not
# yield exactly one value per column in list_columns are skipped.
rows = []
for tr_cell in table.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    if len(row_data) == len(list_columns):
        rows.append(row_data)

# Build the frame once from the collected rows. Appending with
# df.loc[len(df)] copies the frame on every row and, because it extends the
# existing df, duplicates all rows whenever the cell is re-run.
df = pd.DataFrame(rows, columns=list_columns)

In [5]:
# Preview the first five scraped rows.
df.head(n=5)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


# Cleaning the data

In [6]:
# Keep only rows whose Borough is not the 'Not assigned' placeholder.
df = df[df['Borough'].ne('Not assigned')]

In [7]:
# Keep only rows whose Neighborhood is not the 'Not assigned' placeholder.
df = df[df['Neighborhood'].ne('Not assigned')]

In [8]:
# Preview the first five rows left after filtering.
df.head(n=5)

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [9]:
# One row per postal code: comma-join the Neighborhood values of each
# Postalcode group, then expose the result as 'Neighborhood_joined'.
temp_df = (
    df.groupby('Postalcode')['Neighborhood']
    .apply(','.join)
    .reset_index()
    .rename(columns={'Neighborhood': 'Neighborhood_joined'})
)

In [10]:
# Attach the joined neighborhood string to every row of df by postal code.
df_merge = df.merge(temp_df, on='Postalcode')

In [11]:
# Remove the per-row Neighborhood column; 'Neighborhood_joined' replaces it.
df_merge.drop(columns=['Neighborhood'], inplace=True)

In [12]:
# Collapse fully identical rows, keeping the first occurrence of each.
df_merge.drop_duplicates(keep='first', inplace=True)

In [13]:
# Give the joined column the plain 'Neighborhood' name again.
df_merge.rename(
    columns={'Neighborhood_joined': 'Neighborhood'},
    inplace=True,
)

In [14]:
# Preview the first five rows of df.
# NOTE(review): this shows df (the filtered frame), not the df_merge built by
# the preceding cells - confirm which frame was meant to be previewed.
df.head(n=5)

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [15]:
# (rows, columns) of the merged, de-duplicated frame.
df_merge.shape

(103, 3)

In [18]:
def get_geocode(postal_code, max_attempts=10):
    """Return the (latitude, longitude) of a Toronto postal code.

    Queries the Google provider of the ``geocoder`` package with
    ``'<postal_code>, Toronto, Ontario'`` and retries while the response
    carries no coordinates.

    Parameters
    ----------
    postal_code : str
        Postal code to look up, e.g. ``'M5A'``.
    max_attempts : int, optional
        Maximum number of requests to send before giving up (default 10).

    Returns
    -------
    tuple
        ``(latitude, longitude)``: the first two items of the first
        non-empty ``latlng`` result.

    Raises
    ------
    RuntimeError
        If none of the ``max_attempts`` requests returned coordinates.
    """
    # Imported locally: the function already depends on geocoder, but no cell
    # of the notebook imports it, so a fresh kernel would raise NameError.
    import geocoder

    query = '{}, Toronto, Ontario'.format(postal_code)

    # Bounded retry: the original `while` loop never terminated when the
    # service kept answering without coordinates.
    for _ in range(max_attempts):
        # Was `geocoder.goole`, which raised AttributeError on every call.
        lat_lng_coords = geocoder.google(query).latlng
        # Retry on None, and also on an empty result, which could not be indexed.
        if lat_lng_coords:
            latitude = lat_lng_coords[0]
            longitude = lat_lng_coords[1]
            return latitude, longitude

    raise RuntimeError(
        'No coordinates returned for {!r} after {} attempts'.format(
            postal_code, max_attempts
        )
    )

In [19]:
# Load the postal-code coordinates CSV straight from its URL.
geo_df = pd.read_csv(filepath_or_buffer='http://cocl.us/Geospatial_data')

In [27]:
# Preview the first five coordinate rows.
geo_df.head(n=5)

Unnamed: 0,Postalcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [28]:
# Align the key column name with df_merge so the two frames can be merged
# on 'Postalcode'.
# NOTE(review): the preview output recorded before this cell already shows
# 'Postalcode', so the cells appear to have been run out of order - confirm.
geo_df.rename(
    columns={'Postal Code': 'Postalcode'},
    inplace=True,
)

In [30]:
# Join coordinates with borough/neighborhood data on the postal code.
geo_merged = geo_df.merge(df_merge, on='Postalcode')

In [32]:
# Select the final columns in presentation order.
geo_data = geo_merged[
    [
        'Postalcode',
        'Borough',
        'Neighborhood',
        'Latitude',
        'Longitude',
    ]
]

In [33]:
# Preview the first five rows of the final table.
geo_data.head(n=5)

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
