In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bsoup
import requests

Hi. I am using the same notebook for both questions in this assignment. 

The first part 'Data Wrangling' is where I scrape the web data and create the working dataframe. 

The second part is my analysis.

# Data Wrangling

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url)
data = bsoup(page.text,'html.parser')

In [19]:
postcodes = data.find_all('td')[:540:3]
borough = data.find_all('td')[1:540:3]
hood = data.find_all('td')[2:541:3]

i=0
while i < len(postcodes):
    postcodes[i] = postcodes[i].string.strip('\n')
    i=i+1
i=0
while i < len(borough):
    borough[i] = borough[i].string.strip('\n')
    i=i+1
i=0

while i < len(hood):
    if not hood[i].text is None:
        hood[i] = hood[i].text.strip('\n')        
    i=i+1

info = {'Postal Code':postcodes,\
        'Borough':borough,\
        'Neighbourhood':hood}

frame = pd.DataFrame(data=info)

frame = frame[frame['Neighbourhood'] != ''] #drop empty unasigned postal codes
frame = frame.sort_values(['Postal Code'])
frame = frame.reset_index(drop=True) 
#frame.head()

#----------------------------------
geo = pd.read_csv("geocoor.csv")
frame[['Latitude','Longitude']] = geo[['Latitude','Longitude']]
frame.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


Great! A full dataset was made, now I get to pick out what to look at specifically. 

In this case I followed the assignments suggestion and isolated all Toronto Boroughs from the set. 

In [37]:
tor_dat = frame[frame['Borough'].str.contains("Toronto")]
tor_dat.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,The Danforth West / Riverdale,43.679557,-79.352188
42,M4L,East Toronto,India Bazaar / The Beaches West,43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


# Analysis

First. we will import all the visualisation goodies. 

In [91]:
from sklearn.cluster import KMeans
import folium
from geopy.geocoders import Nominatim #
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

In [23]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 43.6534817, -79.3839347.


In [54]:
map_tor = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(tor_dat['Latitude'], tor_dat['Longitude'], tor_dat['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_tor)  
    
map_tor

Great! Now that we have this visualized. Lets take a closer look at one particular Borough  and see what amenities are available.

Lets arbitrarily choose 'the Beaches' Neighbourhood

In [86]:
neighbourhood_lat = float(tor_dat.head(1)['Latitude'])
neighbourhood_lng = float(tor_dat.head(1)['Longitude'])
neighbourhood_name = "The Beaches"

Now lets load the Foursquare API to do a little analysis on this neighbourhood

In [83]:
CLIENT_ID = 'RA245WQGCR2DRQOXIOOE3STMEUUPZ0JEDXNLQ1JH15MUS0TY' # your Foursquare ID
CLIENT_SECRET = '5A1S200S1P1K3G1NK3TB0IDLVNZRXNOCTDHLHZNTDZUXK0TM' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: RA245WQGCR2DRQOXIOOE3STMEUUPZ0JEDXNLQ1JH15MUS0TY
CLIENT_SECRET:5A1S200S1P1K3G1NK3TB0IDLVNZRXNOCTDHLHZNTDZUXK0TM


In [87]:
radius = 500
LIMIT = 100

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_lat, 
    neighbourhood_lng, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=RA245WQGCR2DRQOXIOOE3STMEUUPZ0JEDXNLQ1JH15MUS0TY&client_secret=5A1S200S1P1K3G1NK3TB0IDLVNZRXNOCTDHLHZNTDZUXK0TM&v=20180605&ll=43.67635739999999,-79.2930312&radius=500&limit=100'

In [88]:
results = requests.get(url).json()

In [89]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [92]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
2,Grover Pub and Grub,Pub,43.679181,-79.297215
3,Upper Beaches,Neighborhood,43.680563,-79.292869
