# Ilyes Kabbourim Notebook for scraping wikipedia tables

## Scraping the table

In [2]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(URL, allow_redirects=False, timeout=10)

In [4]:
# The soup object contains all of the original HTML, but structured
soup = BeautifulSoup(response.content, 'html.parser')
#
wikitables = soup.body.find_all('table', class_='wikitable sortable')
#len(wikitables)
#
rows = wikitables[0].find_all('tr')
len(rows)
#
def get_headers(row):
    return [th.text for th in row.find_all('th')]
#
df = pd.DataFrame(columns=get_headers(rows[0]))

for i in range(len(rows)-1):
    df.loc[i] = rows[i+1].text.split('\n')[1:4]

df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


## Dropping the 'Borough' 'Not assigned' rows

In [5]:
df.set_index('Borough', inplace = True)
df.drop(index='Not assigned', axis = 0, inplace = True)
df.reset_index(inplace = True)
data = df[['Postcode','Borough','Neighbourhood\n']]

In [6]:
data

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


## Grouping and merging and Boroughs neighborhoods by postcode

In [7]:
data_n = pd.DataFrame(data.groupby('Postcode')['Neighbourhood\n'].apply(lambda x: "%s" % ', '.join(x))).reset_index()
data_n.head()

Unnamed: 0,Postcode,Neighbourhood
0,M1B,"Rouge, Malvern"
1,M1C,"Highland Creek, Rouge Hill, Port Union"
2,M1E,"Guildwood, Morningside, West Hill"
3,M1G,Woburn
4,M1H,Cedarbrae


In [8]:
data_b = data[['Postcode','Borough']].groupby('Postcode').first()
data_b.reset_index(inplace = True)
data_b.head()

Unnamed: 0,Postcode,Borough
0,M1B,Scarborough
1,M1C,Scarborough
2,M1E,Scarborough
3,M1G,Scarborough
4,M1H,Scarborough


In [9]:
final = pd.merge(data_b, data_n, how='inner', on='Postcode',left_index=True, right_index=True, sort=True, copy=True, indicator=False,
         validate=None)

In [10]:
final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Now we have a dataframe with all neighborhoods grouped by their postcodes, however we are not done yet, we still have to see if there are any rows where Borough is assigned but
the neighbourhood is not.

In [11]:
final.loc[final['Neighbourhood\n'] == 'Not assigned']

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Not assigned


Let's have a loop that can be applied to other data, that looks for the Neighbourhood not assigned and replace the name of the neighbourhood with the name of the Borough:

In [12]:
for idx,neigh in enumerate(final['Neighbourhood\n']):
    if neigh == 'Not assigned':
        final.set_value(idx, "Neighbourhood\n", final['Borough'][idx])
        print(final['Borough'][idx])

Queen's Park


  This is separate from the ipykernel package so we can avoid doing imports until


Let's see if there are any neighbourhoods not assigned !

In [13]:
final.loc[final['Neighbourhood\n'] == 'Not assigned']

Unnamed: 0,Postcode,Borough,Neighbourhood


In [14]:
final.shape

(103, 3)

# Second Part

from geopy.geocoders import Nominatim

neighborhood = 'M2H'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode('Toronto ON {}'.format(neighborhood))
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

In [50]:
location = geolocator.geocode('Highland Creek, Toronto, Ontario'.format(neighborhood))
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

43.7901172 -79.1733344


In [None]:
from geopy.geocoders import Nominatim

df_loc = pd.DataFrame(columns = ['Latitude','Longitude'])

geolocator = Nominatim(user_agent="pp")

for idx,p in enumerate(final['Borough']):
    location = geolocator.geocode('{}, Toronto ON'.format(p))
    df_loc['Latitude'] = location.latitude
    df_loc['Longitude'] = location.longitude
    df_loc.set_value(idx, "Latitude", location.latitude)
    df_loc.set_value(idx, "Longitude", location.latitude)    

In [41]:
final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [43]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="pp")

for idx,p in enumerate(data['Neighbourhood\n']):
    #print('Toronto ON {}'.format(p))
    #location = geolocator.geocode('Toronto {}'.format(p))
    lat=geolocator.geocode('{}, Toronto ON'.format(p)).latitude
    lon=geolocator.geocode('{}, Toronto ON'.format(p)).longitude
    print(lat,lon)

43.7612239 -79.3239857
43.732658 -79.3111892
43.6400801 -79.3801495
43.6607056 -79.3604569
43.7227784 -79.4509332
43.7220788 -79.4375067


AttributeError: 'NoneType' object has no attribute 'latitude'

In [55]:
location = geolocator.geocode('North York, Toronto ON'.format(p))
df_loc['Latitude'] = location.latitude
df_loc['Longitude'] = location.longitude
print(latitude, longitude)

43.7901172 -79.1733344
