In [146]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd
import requests
import json

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
#!conda install geocoder --y
import geocoder # import geocoder

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library


print('Libraries imported.')

Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.


## Importing data from wiki page

In [288]:
df=pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")[0]

In [289]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [290]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288 entries, 0 to 287
Data columns (total 3 columns):
Postcode         288 non-null object
Borough          288 non-null object
Neighbourhood    288 non-null object
dtypes: object(3)
memory usage: 6.8+ KB


In [291]:
df['Borough'].value_counts()

Not assigned        77
Etobicoke           45
North York          38
Downtown Toronto    37
Scarborough         37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Queen's Park         1
Mississauga          1
Name: Borough, dtype: int64

#### Drop rows where Borough equals to 'Not assigned'

In [292]:
df = df.loc[df['Borough'] != 'Not assigned']
print(df.shape)
df['Borough'].value_counts()

(211, 3)


Etobicoke           45
North York          38
Downtown Toronto    37
Scarborough         37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Queen's Park         1
Mississauga          1
Name: Borough, dtype: int64

#### Inspect rows with duplicated (Postcode, Borough)

In [293]:
df.sort_values(by = ['Postcode','Neighbourhood']).head()

Unnamed: 0,Postcode,Borough,Neighbourhood
12,M1B,Scarborough,Malvern
11,M1B,Scarborough,Rouge
27,M1C,Scarborough,Highland Creek
29,M1C,Scarborough,Port Union
28,M1C,Scarborough,Rouge Hill


#### Transform column 'Neighbourhood' by joing values with the same Postcode and Borough

In [294]:
df['Neighbourhood'] = df.groupby(['Postcode','Borough'])['Neighbourhood'].transform(lambda x: ','.join(x))
df = df.drop_duplicates()
df = df.reset_index(drop = True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned


In [233]:
df.shape

(103, 3)

## Using Google API  to find latitude and longitude data

In [295]:
df['Address'] = 'Tornto'+ ', '+ df['Borough'] +', ' + df['Postcode']
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Address
0,M3A,North York,Parkwoods,"Tornto, North York, M3A"
1,M4A,North York,Victoria Village,"Tornto, North York, M4A"
2,M5A,Downtown Toronto,"Harbourfront,Regent Park","Tornto, Downtown Toronto, M5A"
3,M6A,North York,"Lawrence Heights,Lawrence Manor","Tornto, North York, M6A"
4,M7A,Queen's Park,Not assigned,"Tornto, Queen's Park, M7A"


#### using a single address to find the latitude and longitude

In [300]:
r = requests.get("https://maps.googleapis.com/maps/api/geocode/json?address="+ "Tornto, North York, M3A" +"&key="+ "AIzaSyCkYuko2tS24H4hllsnranGW6HwmRIlAwk")

location = json.loads(r.content)
location['results'][0]['geometry']['location']

{'lat': 43.7532586, 'lng': -79.3296565}

#### using pandas to create two columns latitude and longitude

I'll use Google API to loop through each address and find the latitude and longitude data.

In [303]:
# define the dataframe columns
column_names = ['Latitude', 'Longitude'] 

# instantiate the dataframe
locations = pd.DataFrame(columns = column_names)

apiKey = "AIzaSyCkYuko2tS24H4hllsnranGW6HwmRIlAwk"
for index, row in df.iterrows():
    address = row['Address']
    r = requests.get("https://maps.googleapis.com/maps/api/geocode/json?address="+ address +"&key="+ apiKey)
    location = json.loads(r.content)
    if location['status'] != 'ZERO_RESULTS':
        locations = locations.append({'Latitude': location['results'][0]['geometry']['location']['lat'],
                                          'Longitude': location['results'][0]['geometry']['location']['lng'],}, ignore_index=True)
df = df.join(locations)
df = df.drop(['Address'], axis = 1)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Not assigned,43.662278,-79.391527
