In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# Libraries for scrapping 
#!conda install -c conda-forge butifulsoup4 
from bs4 import BeautifulSoup

#!conda install -c conda-forge lxml

#!conda install -c conda-forge html5lib

print('Libraries imported.')

Libraries imported.


# Scrapping Data  

In [2]:
# getting HTML as a text
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')
#print(soup.prettify())

In [3]:
# Our required data is @ table class="wikitable sortable jquery-tablesorter"

table = soup.find('table', class_='wikitable sortable')
#print(table.prettify())

#### Tranform the data into a *pandas* dataframe

The next task is essentially transforming this data of nested Python dictionaries into a *pandas* dataframe. 

In [4]:
table_rows = table.find_all('tr')
res = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td if tr.text.strip()]
    if row:
        res.append(row)

neighborhoods = pd.DataFrame(res, columns=['Postcode','Borough','Neighbourhood'])

#### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [5]:
neighborhoods['Borough'].replace('Not assigned', np.nan, inplace=True)
neighborhoods.dropna(subset=['Borough'], inplace=True)

neighborhoods.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Not assigned
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


#### Aggregate data 

In [6]:
neighborhoods = neighborhoods.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(', '.join).reset_index()
neighborhoods.columns = ['Postcode', 'Borough', 'Neighbourhood']
neighborhoods(

SyntaxError: unexpected EOF while parsing (<ipython-input-6-ec3cc7b8b0e9>, line 3)

In [7]:
neighborhoods[neighborhoods.Neighbourhood == 'Not assigned']

Unnamed: 0,Postcode,Borough,Neighbourhood
9,M9A,Queen's Park,Not assigned


In [8]:
neighborhoods['Neighbourhood'].replace('Not assigned', "Queen's Park", inplace=True)
neighborhoods.tail(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
268,M8Y,Etobicoke,Mimico NE
269,M8Y,Etobicoke,Old Mill South
270,M8Y,Etobicoke,The Queensway East
271,M8Y,Etobicoke,Royal York South East
272,M8Y,Etobicoke,Sunnylea
281,M8Z,Etobicoke,Kingsway Park South West
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor


In [9]:
neighborhoods.shape

(210, 3)

# Adding geograpical coordinate

In [10]:
df_geo = pd.read_csv('http://cocl.us/Geospatial_data')
df_geo.columns = ['Postcode', 'Latitude', 'Longitude']

In [11]:
df_pos = pd.merge(neighborhoods, df_geo, on=['Postcode'], how='inner')

df_tor = df_pos[['Borough', 'Neighbourhood', 'Postcode', 'Latitude', 'Longitude']].copy()

df_tor.head()

Unnamed: 0,Borough,Neighbourhood,Postcode,Latitude,Longitude
0,North York,Parkwoods,M3A,43.753259,-79.329656
1,North York,Victoria Village,M4A,43.725882,-79.315572
2,Downtown Toronto,Harbourfront,M5A,43.65426,-79.360636
3,North York,Lawrence Heights,M6A,43.718518,-79.464763
4,North York,Lawrence Manor,M6A,43.718518,-79.464763


In [12]:
address = 'Toronto, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of the City of Toronto are {}, {}.'.format(latitude, longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of the City of Toronto are 43.653963, -79.387207.


### Toronto map

In [16]:


#create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_tor['Latitude'], df_tor['Longitude'], df_tor['Borough'], df_tor['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

# Exploration

Define Foursquare Credentials and Version

In [17]:
CLIENT_ID = '1DASOPA2LCFSX2R1YVW2KMZEKDRTIEQG1QJ1B5IUXE2MILGZ' # your Foursquare ID
CLIENT_SECRET = 'UR40SGY2M2BONJVMMKOQXPSIRCDITLASCDC5H2MKMG3GEDTE' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 1DASOPA2LCFSX2R1YVW2KMZEKDRTIEQG1QJ1B5IUXE2MILGZ
CLIENT_SECRET:UR40SGY2M2BONJVMMKOQXPSIRCDITLASCDC5H2MKMG3GEDTE


In [18]:
df_t4 = df_tor[df_tor['Borough'].str.contains('Toronto')]

to_data = df_t4.reset_index(drop=True)
to_data

Unnamed: 0,Borough,Neighbourhood,Postcode,Latitude,Longitude
0,Downtown Toronto,Harbourfront,M5A,43.65426,-79.360636
1,Downtown Toronto,Queen's Park,M7A,43.662301,-79.389494
2,Downtown Toronto,Ryerson,M5B,43.657162,-79.378937
3,Downtown Toronto,Garden District,M5B,43.657162,-79.378937
4,Downtown Toronto,St. James Town,M5C,43.651494,-79.375418
5,East Toronto,The Beaches,M4E,43.676357,-79.293031
6,Downtown Toronto,Berczy Park,M5E,43.644771,-79.373306
7,Downtown Toronto,Central Bay Street,M5G,43.657952,-79.387383
8,Downtown Toronto,Christie,M6G,43.669542,-79.422564
9,Downtown Toronto,Adelaide,M5H,43.650571,-79.384568


## Explore the first neighbourhood¶


In [19]:
to_data.loc[0, 'Neighbourhood']

'Harbourfront'

#### let's grab the neighbourhood long and lat values¶

In [20]:
neighbourhood_latitude = to_data.loc[0, 'Latitude'] # neighbourhood latitude value
neighbourhood_longitude = to_data.loc[0, 'Longitude'] # neighbourhood longitude value

neighbourhood_name = to_data.loc[0, 'Neighbourhood'] # neighbourhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of Harbourfront are 43.6542599, -79.3606359.


In [21]:
LIMIT = 100
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=1DASOPA2LCFSX2R1YVW2KMZEKDRTIEQG1QJ1B5IUXE2MILGZ&client_secret=UR40SGY2M2BONJVMMKOQXPSIRCDITLASCDC5H2MKMG3GEDTE&v=20180605&ll=43.6542599,-79.3606359&radius=500&limit=100'

In [22]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e379b62f7706a001be98891'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Corktown',
  'headerFullLocation': 'Corktown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 46,
  'suggestedBounds': {'ne': {'lat': 43.6587599045, 'lng': -79.3544279001486},
   'sw': {'lat': 43.6497598955, 'lng': -79.36684389985142}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '54ea41ad498e9a11e9e13308',
       'name': 'Roselle Desserts',
       'location': {'address': '362 King St E',
        'crossStreet': 'Trinity St',
        'lat': 43.653446723052674,
        'lng': -79.3620167174383,
        'labeledLatLngs': [{'label': 'display',
 

In [23]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

#### Clean json and structure into a pandas dataframe

In [24]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()


Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Cooper Koo Family YMCA,Gym / Fitness Center,43.653191,-79.357947
3,Body Blitz Spa East,Spa,43.654735,-79.359874
4,Impact Kitchen,Restaurant,43.656369,-79.35698


#### and how many venus were returned from Foursquare?

In [25]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

46 venues were returned by Foursquare.
