##  importing libraries

In [1]:
import numpy as np 
import pandas as pd
import json # (library to handle JSON files)
import requests # (library to handle requests)
from pandas.io.json import json_normalize # (tranform JSON file into a pandas dataframe)

# import k-means from clustering stage
from sklearn.cluster import KMeans

# for webscraping import Beautiful Soup 
from bs4 import BeautifulSoup

import xml


In [2]:
url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
scrap= BeautifulSoup(url,'lxml')


In [3]:
table_post = scrap.find('table')
fields = table_post.find_all('td')

postal_code = []
borough = []
neighbourhood = []

for i in range(0, len(fields), 3):
    postal_code.append(fields[i].text.strip())
    borough.append(fields[i+1].text.strip())
    neighbourhood.append(fields[i+2].text.strip())
        
df= pd.DataFrame(data=[postal_code, borough, neighbourhood]).transpose()
df.columns = ['Postal_code', 'Borough', 'Neighbourhood']
df

Unnamed: 0,Postal_code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [4]:
# printing 1st 5 rows
df.head()

Unnamed: 0,Postal_code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## removing "not assigned " from borough coloumn

In [5]:
df['Borough'].replace('Not assigned', np.nan, inplace=True)
df.dropna(subset=['Borough'], inplace=True)


In [6]:
# printing dataframe after removing not assigned from borough coloumn
df

Unnamed: 0,Postal_code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [7]:
import geocoder

In [8]:
print(df.shape)
df.describe()

(103, 3)


Unnamed: 0,Postal_code,Borough,Neighbourhood
count,103,103,103
unique,103,10,99
top,M5M,North York,Downsview
freq,1,24,4


In [9]:
def get_latilong(postal_code):
    lati_long_coords = None
    while(lati_long_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lati_long_coords = g.latlng
    return lati_long_coords
    
get_latilong('M4G')

[43.70909000000006, -79.36409999999995]

## Retrieving Postal Code Co-ordinates

In [10]:
postalcodes = df['Postal_code']    
coords = [ get_latilong(postal_code) for postal_code in postalcodes.tolist() ]

## Adding Columns Latitude & Longitude

In [11]:
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])
df['Latitude'] = df_coords['Latitude']
df['Longitude'] = df_coords['Longitude']

In [12]:
df.head(15)


Unnamed: 0,Postal_code,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.65514,-79.36265
3,M4A,North York,Victoria Village,43.72321,-79.45141
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.66449,-79.39302
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.66277,-79.52831
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.81153,-79.19552
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.70794,-79.3116
9,M1B,Scarborough,"Malvern, Rouge",43.65736,-79.37818
11,M3B,North York,Don Mills,43.65279,-79.55406
12,M4B,East York,"Parkview Hill, Woodbine Gardens",43.78564,-79.15871
13,M5B,Downtown Toronto,"Garden District, Ryerson",43.72184,-79.3434
