## PART 1

In [2]:
import pandas as pd
import numpy as np
import requests

In [13]:
pip install beautifulsoup4  #Installing new package 


Note: you may need to restart the kernel to use updated packages.


In [14]:
pip install lxml     #Installing the parser

Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/dd/ba/a0e6866057fc0bbd17192925c1d63a3b85cf522965de9bc02364d08e5b84/lxml-4.5.0-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K     |████████████████████████████████| 5.8MB 7.7MB/s eta 0:00:01     |█████████                       | 1.6MB 7.7MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
from bs4 import BeautifulSoup

### Getting data from the Wikipedia link

In [14]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops with an HTTPError on a 4xx/5xx response instead of
# silently handing an error page to the parser.
response = requests.get(link, timeout=30)
response.raise_for_status()
raw_wikipedia_page = response.text

# Parse the HTML with Python's built-in parser.
soup = BeautifulSoup(raw_wikipedia_page, "html.parser")


In [18]:
# Keep only the first <table> element found in the parsed page.
raw_wikipedia_page_table = soup.find("table")

### Extracting the table

In [20]:
# Collect every <tr> (table row) element of the table.
results=raw_wikipedia_page_table.find_all('tr')
nrows=len(results)
nrows         # Display the row count; it includes results[0], which the later cells skip (presumably the header row)

181

In [29]:
# Look at the raw text of one row to see how its cells are laid out.
results[1].get_text()

'\nM1A\n\nNot assigned\n\n\n'

In [30]:
# Break the same row on newlines to find the position of each field.
results[1].get_text().split('\n')

['', 'M1A', '', 'Not assigned', '', '', '']

In [34]:

# The postal code is piece 1 of the newline-split row text.
Postcode = results[1].get_text().split('\n')[1]
Postcode

'M1A'

In [35]:

# The borough is piece 3 of the newline-split row text.
Borough = results[1].get_text().split('\n')[3]
Borough

'Not assigned'

In [45]:

# The neighbourhood is piece 5 of the newline-split row text. Row 3 is used here
# because row 1 (inspected above) has an empty neighbourhood field.
Neighborhood = results[3].get_text().split('\n')[5]
Neighborhood

'Parkwoods'

In [46]:
# Build one (postal code, borough, neighbourhood) tuple per table row.
# Iterating over results[1:] skips results[0] exactly as the exploration cells do
# (data starts at index 1) and removes the dependency on a separately computed
# `nrows`, which could be stale if the cells are run out of order.
records = []
for row in results[1:]:
    # Split the row text once; the three fields sit at pieces 1, 3 and 5.
    fields = row.text.split('\n')
    records.append((fields[1], fields[3], fields[5]))

# Assemble the scraped rows into a DataFrame and preview the first five.
df = pd.DataFrame(records, columns=['PostalCode', 'Borough', 'Neighbourhood'])
df.head(5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [47]:
# Count the rows whose Borough is exactly 'Not assigned'.
df[df.Borough == 'Not assigned'].count()

PostalCode       77
Borough          77
Neighbourhood    77
dtype: int64

In [48]:

# Keep only the rows whose Borough is assigned, then renumber the index from 0.
# An exact comparison is used instead of str.contains (a regex substring match)
# so that the filter removes precisely the rows counted with == in the cell above
# and does not fail on missing values.
df_cleaned = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)

In [49]:
# Preview the first five rows of the cleaned table (head() defaults to 5 rows).
df_cleaned.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [52]:
# Check for repeated values by counting the distinct entries in each column.
# This cell only reports the counts; it does not merge or consolidate any rows.
postalcodes = df_cleaned.PostalCode.nunique()
boroughs = df_cleaned.Borough.nunique()
neighbourhoods = df_cleaned.Neighbourhood.nunique()

print('Unique Postalcodes : ', postalcodes, sep='')
print('Unique Boroughs  : ', boroughs, sep='')
print('Unique Neighbourhoods  :', neighbourhoods, sep='')

Unique Postalcodes : 103
Unique Boroughs  : 10
Unique Neighbourhoods  :98


In [60]:
# Count how many rows share each (PostalCode, Borough, Neighbourhood) combination
# and show the first few groups.
(
    df_cleaned
    .groupby(['PostalCode', 'Borough', 'Neighbourhood'])
    .size()
    .reset_index(name='Count')
    .head()
)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Count
0,M1B,Scarborough,Malvern / Rouge,1
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,1
2,M1E,Scarborough,Guildwood / Morningside / West Hill,1
3,M1G,Scarborough,Woburn,1
4,M1H,Scarborough,Cedarbrae,1


In [56]:
# Show the ten rows with the lowest postal codes; df_cleaned itself is not reordered.
df_cleaned.sort_values(by='PostalCode').head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
6,M1B,Scarborough,Malvern / Rouge
12,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
18,M1E,Scarborough,Guildwood / Morningside / West Hill
22,M1G,Scarborough,Woburn
26,M1H,Scarborough,Cedarbrae
32,M1J,Scarborough,Scarborough Village
38,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park
44,M1L,Scarborough,Golden Mile / Clairlea / Oakridge
51,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West
58,M1N,Scarborough,Birch Cliff / Cliffside West


In [61]:
# Show the (rows, columns) dimensions of the cleaned table.
df_cleaned.shape

(103, 3)

## PART 2

###  We get the latitude and the longitude coordinates of each neighborhood

#### I will be using the CSV file of coordinates because the geocoder did not work reliably

In [64]:
# Read the CSV file of postal-code coordinates into a DataFrame and preview it

url='http://cocl.us/Geospatial_data'
df_pcodes=pd.read_csv(url)
df_pcodes.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [66]:
# Rename 'Postal Code' to 'PostalCode' so it matches the key column of df_cleaned
# for the merge. Renaming by name, rather than overwriting the whole column list
# by position, cannot mislabel columns if the CSV's column order or count changes.
df_pcodes = df_pcodes.rename(columns={'Postal Code': 'PostalCode'})
df_pcodes.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [67]:
# Attach the coordinates to every scraped postal code.
# A left join keeps df_cleaned as the driving table: all of its rows and their
# order are preserved, and postal codes that exist only in the coordinates file
# cannot add rows with an empty Borough/Neighbourhood.
df_merged = pd.merge(df_cleaned, df_pcodes, how='left', on='PostalCode')
df_merged.head(15)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,Parkview Hill / Woodbine Gardens,43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [68]:
# Show the (rows, columns) dimensions of the merged table.
df_merged.shape

(103, 5)