# Week 5-1: Extract Toronto's geographic information from Wikipedia

In [7]:
!pip install lxml
import pandas as pd



### Import wiki into dataframe

In [8]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# read_html yields one DataFrame per HTML table found on the page; keep the first.
wiki_tables = pd.read_html(url)
toronto = wiki_tables[0]

###  Ignore cells with a borough that is Not assigned.

In [9]:
# Keep only the rows whose borough has been assigned. The explicit .copy()
# turns the filtered result into an independent frame, so assigning to one of
# its columns in a later cell does not raise pandas' SettingWithCopyWarning.
toronto = toronto.loc[toronto['Borough'] != 'Not assigned'].copy()

### Replace a Neighbourhood that is Not assigned with its Borough

In [10]:
# Where the neighbourhood is 'Not assigned', fall back to the borough of the
# same row. Series.where keeps the original value where the condition is True
# and takes the index-aligned 'Borough' value elsewhere, which states the
# row-wise fallback explicitly instead of passing a Series as the replacement
# value to Series.replace.
has_neighbourhood = toronto['Neighbourhood'] != 'Not assigned'
# assign() returns a new frame, so no column is written onto a filtered slice.
toronto = toronto.assign(
    Neighbourhood=toronto['Neighbourhood'].where(has_neighbourhood, toronto['Borough'])
)
toronto

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
...,...,...,...
282,M8Z,Etobicoke,Kingsway Park South West
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West


### Combine rows with same Postcode

In [11]:
# Collapse all rows that share a postcode/borough pair into a single row whose
# neighbourhood names are joined with commas.
toronto = (
    toronto
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(','.join)
    .reset_index()
)
# Switch to the column names used by the rest of the notebook.
toronto.columns = ['PostalCode', 'Borough', 'Neighborhood']
toronto

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


In [12]:
# Display (rows, columns) of the combined postcode table.
toronto.shape

(103, 3)

# Week 5-2: Add latitude and longitude 

### Import geographical coordinates info

In [13]:
# Postal code -> latitude/longitude lookup table.
geospatial_csv = 'http://cocl.us/Geospatial_data'
postal_code = pd.read_csv(geospatial_csv)
# Rename the columns so the key column is called 'PostalCode', matching `toronto`.
postal_code.columns = ['PostalCode', 'Latitude', 'Longitude']
postal_code.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge two dataframes into one dataframe

In [14]:
# Attach coordinates to each postal code; the inner join keeps only the
# postal codes present in both frames.
toronto = toronto.merge(postal_code, how='inner', on='PostalCode')
toronto

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",43.739416,-79.588437


# Week 5-3: Explore neighborhoods in Cherryhill and Toronto

### Define Foursquare Credentials and Version

In [15]:
import os
import warnings

# Foursquare API credentials are read from the environment so that secrets are
# never stored in the notebook. Export FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair previously committed in this cell should be
# treated as leaked and revoked in the Foursquare developer console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
VERSION = '20191001'  # value sent as the API's `v` (version) query parameter

if not (CLIENT_ID and CLIENT_SECRET):
    # Warn here, next to the configuration, rather than failing later with an
    # opaque API error when the first request is made.
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare requests will be rejected.'
    )

### Extract only the rows for Downtown Toronto

In [17]:
# Restrict the table to the 'Downtown Toronto' borough and renumber rows from 0.
is_downtown = toronto['Borough'] == 'Downtown Toronto'
toronto = toronto.loc[is_downtown].reset_index(drop=True)
toronto

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"Cabbagetown,St. James Town",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
4,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937
5,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
6,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
7,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
8,M5H,Downtown Toronto,"Adelaide,King,Richmond",43.650571,-79.384568
9,M5J,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station",43.640816,-79.381752


### Create a dataframe with Cherryhill's geo information

In [38]:
# Per Google Maps, Cherryhill's postcode is N6H 2M4 and its coordinates are
# 42.9964° N, 81.3344° W (west longitude is stored as a negative number).
london = {
    'PostalCode': ['N6H'],
    'Borough': ['London'],
    'Neighborhood': ['Cherryhill'],
    'Latitude': [42.9964],
    'Longitude': [-81.3344],
}
# Single-row frame with the same columns as `toronto`.
cherryhill = pd.DataFrame(london)
cherryhill

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,N6H,London,Cherryhill,42.9964,-81.3344


### Merge Cherryhill and Toronto's dataframe

In [39]:
# Stack the Toronto rows and the Cherryhill row into one frame.
# ignore_index=True renumbers the rows 0..n-1; without it the Cherryhill row
# keeps its own label 0 and the result has a duplicate index label.
geo = pd.concat([toronto, cherryhill], ignore_index=True)
geo

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"Cabbagetown,St. James Town",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
4,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937
5,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
6,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
7,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
8,M5H,Downtown Toronto,"Adelaide,King,Richmond",43.650571,-79.384568
9,M5J,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station",43.640816,-79.381752


### Create a function that will return nearby venues

In [18]:
import numpy as np 

import json 

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.



In [19]:
def _fetch_venue_items(lat, lng, radius, limit):
    """Return the Foursquare 'explore' items around one coordinate.

    Sends a GET request to the venues/explore endpoint using the notebook-level
    CLIENT_ID, CLIENT_SECRET and VERSION. Returns the ``items`` list of the
    first group in the response, or an empty list when the response has no
    groups.

    Raises:
        requests.HTTPError: if the API answers with an error status code.
    """
    response = requests.get(
        'https://api.foursquare.com/v2/venues/explore',
        # Let requests build and encode the query string.
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        },
        timeout=30,  # never hang the notebook on an unresponsive server
    )
    # Surface HTTP errors (bad credentials, quota exceeded) directly instead
    # of failing later with a KeyError on the missing payload.
    response.raise_for_status()
    groups = response.json().get('response', {}).get('groups') or []
    return groups[0]['items'] if groups else []


def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    """Collect the venues Foursquare recommends around each location.

    Args:
        names: iterable of location names (one per coordinate pair).
        latitudes: iterable of latitudes, parallel to ``names``.
        longitudes: iterable of longitudes, parallel to ``names``.
        radius: value sent as the ``radius`` query parameter.
        LIMIT: value sent as the ``limit`` query parameter.

    Returns:
        A DataFrame with one row per returned venue and the columns
        'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category'.
        When no venue is returned at all, the frame is empty but still has
        these columns.

    Raises:
        requests.HTTPError: if a request is answered with an error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        for item in _fetch_venue_items(lat, lng, radius, LIMIT):
            venue = item['venue']
            # A venue may have no category; record None rather than raising
            # IndexError on the empty list.
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))

    # Passing columns= keeps the schema even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

### Getting Cherryhill and Toronto's venues info

In [40]:
# Fetch nearby venues for every row of `geo`, using the function's default
# radius and limit.
geo_venues = getNearbyVenues(
    names=geo['Neighborhood'],
    latitudes=geo['Latitude'],
    longitudes=geo['Longitude'],
)
geo_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rosedale,43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
1,Rosedale,43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park
2,Rosedale,43.679563,-79.377529,Alex Murray Parkette,43.6783,-79.382773,Park
3,Rosedale,43.679563,-79.377529,Milkman's Lane,43.676352,-79.373842,Trail
4,"Cabbagetown,St. James Town",43.667967,-79.367675,Butter Chicken Factory,43.667072,-79.369184,Indian Restaurant


### Converting venue to category

In [42]:
# one hot encoding
geo_onehot = pd.get_dummies(geo_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
geo_onehot['Neighborhood '] = geo_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [geo_onehot.columns[-1]] + list(geo_onehot.columns[:-1])
geo_onehot = geo_onehot[fixed_columns]

geo_onehot.head()

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,"Cabbagetown,St. James Town",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Grouping rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [44]:
# Average the indicator columns per neighbourhood, giving each category's share
# of that neighbourhood's venues; as_index=False keeps the label as a column.
geo_grouped = geo_onehot.groupby('Neighborhood ', as_index=False).mean()
geo_grouped

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,"Adelaide,King,Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0
2,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,0.0625,0.0625,0.0625,0.125,0.1875,0.125,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Cabbagetown,St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011765,0.0,...,0.0,0.0,0.0,0.0,0.011765,0.0,0.0,0.011765,0.0,0.011765
5,Cherryhill,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,"Chinatown,Grange Park,Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.01,0.0,0.0,0.05,0.0,0.05,0.01,0.0,0.0
7,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Church and Wellesley,0.011494,0.0,0.0,0.0,0.0,0.0,0.0,0.011494,0.0,...,0.011494,0.0,0.0,0.0,0.0,0.011494,0.011494,0.0,0.011494,0.011494
9,"Commerce Court,Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0


In [51]:
# Display (rows, columns): one row per neighbourhood, first column is the label.
geo_grouped.shape

(19, 205)

# Week 5-4: Calculate similarity using Euclidean distance

In [45]:
import scipy

### Calculate similarity using Euclidean distance

In [126]:
# Import the function explicitly: a bare `import scipy` does not guarantee that
# the scipy.spatial subpackage has been loaded.
from scipy.spatial.distance import euclidean

# Find the reference neighbourhood by name instead of by a hard-coded row
# position, and use every column after the label column as a feature instead
# of a hard-coded column bound, so the cell keeps working when the set of
# neighbourhoods or venue categories changes.
target_name = 'Cherryhill'
label_column = geo_grouped.columns[0]
is_target = geo_grouped[label_column] == target_name
if is_target.sum() != 1:
    raise ValueError(
        'expected exactly one row for {!r}, found {}'.format(target_name, int(is_target.sum()))
    )

features = geo_grouped.iloc[:, 1:].to_numpy(dtype=float)

# x: category-frequency vector of the reference neighbourhood.
x = features[is_target.to_numpy()][0]
# d: Euclidean distance from x to every neighbourhood, in geo_grouped row order.
d = [euclidean(x, y) for y in features]

In [127]:
# One-column frame of the distances, in the same row order as `d`.
distance = pd.DataFrame(d, columns=['euclidean_distance'])
distance

Unnamed: 0,euclidean_distance
0,0.603987
1,0.613586
2,0.690335
3,0.570684
4,0.632825
5,0.0
6,0.610082
7,0.678924
8,0.618864
9,0.622254


### Merge two dataframes and extract Neighborhood and euclidean_distance

In [147]:
# Attach each neighbourhood's distance by joining on the row index.
similarity = geo_grouped.merge(distance, left_index=True, right_index=True)

# Select the two wanted columns by name rather than dropping a hard-coded
# positional range, which breaks as soon as the number of venue-category
# columns changes.
label_column = similarity.columns[0]
r = (
    similarity[[label_column, 'euclidean_distance']]
    .sort_values('euclidean_distance')  # most similar (smallest distance) first
    .reset_index(drop=True)
)
r

Unnamed: 0,Neighborhood,euclidean_distance
0,Cherryhill,0.0
1,"Cabbagetown,St. James Town",0.570684
2,St. James Town,0.596825
3,"Adelaide,King,Richmond",0.603987
4,"Ryerson,Garden District",0.603987
5,"Chinatown,Grange Park,Kensington Market",0.610082
6,"First Canadian Place,Underground city",0.612372
7,Berczy Park,0.613586
8,Stn A PO Boxes 25 The Esplanade,0.616049
9,"Harbourfront East,Toronto Islands,Union Station",0.616766
