In [2]:
#%%===== Initialization =====%%#
#===== Import Libs =====#
print('Importing libraries...')

import numpy as np
import pandas as pd
import json as js
import requests
import geocoder # import geocoder
import folium

from bs4 import BeautifulSoup
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.pyplot as plt # plotting library
import matplotlib.colors as colors

from sklearn.cluster import KMeans 
from sklearn.datasets.samples_generator import make_blobs

print('Libraries imported.')
#===========================#
#===== Define Foursquare Credentials and other variables =====#
print('Defining Foursquare credentials...')
CLIENT_ID = 'BIJW5DEUCZQWIW4DIUQHYVJDRNBIFH0PHNPWUX1PFYRVTJJY' # your Foursquare ID
CLIENT_SECRET = 'NM1S4N0GGK2F2FDXRQFL50AR4K4SSY0USJWVOTMCCITL3DQ2' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
searchEverywhere = False # Search the entire Toronto metropolitan area. Caution: Very large, may exceed the Foursquare daily call limit!
searchOnly_Borough = 'Downtown Toronto' # Define the target area/borough
explore_radius = 500 # The radius around a neighborhood to be explored
venue_limit = 100 # The first 100 venues near a neighborhood
num_clusters = 5 # Number of clusters in the K-Mean clustering method
print('Explore radius: {}, venue limit: {}'.format(explore_radius,venue_limit))
if searchEverywhere == False:
    print('Only search the venues in this borough: {}'.format(searchOnly_Borough))
#=====================================================#
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)
print('Foursquare credentials defined successfully.')
print('Search all the neiborhoods in the Toronto metropolitan area: {}'.format(searchEverywhere))
if searchEverywhere == True:
    print('Warning: You are searching the venues in the entire Toronto metropolitan area. May exceed the Foursquare daily call limit. Proceed with caution!')
    print("If 'KeyError: groups' appears, you reach the Foursquare daily call limit. Please search for a smaller area/less venues/less premium contents, or search 24 hours later.")
#=====================================================#
##========================##

Importing libraries...
Libraries imported.
Defining Foursquare credentials...
Explore radius: 500, venue limit: 100
Only search the venues in this borough: Downtown Toronto
Your credentails:
CLIENT_ID: BIJW5DEUCZQWIW4DIUQHYVJDRNBIFH0PHNPWUX1PFYRVTJJY
CLIENT_SECRET:NM1S4N0GGK2F2FDXRQFL50AR4K4SSY0USJWVOTMCCITL3DQ2
Foursquare credentials defined successfully.
Search all the neiborhoods in the Toronto metropolitan area: False


In [3]:
#%%===== Read the postal code table from Wiki =====%%#
print('Reading Canadian zip data from Wiki...')

Url_Wiki = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Read the zip codes on Wiki #
Wiki_html = requests.get(Url_Wiki).text

zipCodes_Soup = BeautifulSoup(Wiki_html,'lxml')
# print(Can_Zip_Soup.prettify()) #For test use. "soup.prettify is a method that is used to display the html file in a readable way"

zipCodes_table = zipCodes_Soup.find('table', class_='wikitable sortable')
# print(zipCodes_table.prettify()) # For test use. Display the html in a readable way
# print(type(zipCodes_table)) # For test use.

zipCodes_list = [] # Create an empty array that can be used to store the zip codes
borough_list = [] # Create an empty array that can be used to store the boroughs
neighborhood_list = [] # Create an empty array that can be used to store the neighborhoods

for row in zipCodes_table.findAll("tr"):
    cells = row.findAll("td")
    # For each "tr", assign each "td" to a variable.
    if len(cells) == 3: # Number of columns in the table
        zipCodes_txt = cells[0].find(text=True) # Index from the first element
        borough_txt = cells[1].find(text=True)
        neighborhood_txt = cells[2].find(text=True)
#        print('Post code=',postCodes_txt,',', 'Borough=',borough_txt,',', 'Neighborhood=',neighborhood_txt) # For test use
        
        zipCodes_list.append(zipCodes_txt)
        borough_list.append(borough_txt)
        neighborhood_list.append(neighborhood_txt)

df_CanZipCodes = pd.DataFrame()
df_CanZipCodes['Postal Code'] = zipCodes_list
df_CanZipCodes['Borough'] = borough_list
df_CanZipCodes['Neighborhood'] = neighborhood_list
df_CanZipCodes = df_CanZipCodes[df_CanZipCodes.Borough != 'Not assigned']
print('Canadian Postal Code dataframe has been created successfully.')
df_CanZipCodes.head()
##==========================##

Reading Canadian zip data from Wiki...
Canadian Postal Code dataframe has been created successfully.


Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [4]:
#%%===== Get the Longitudes and Latitudes of rach neighborhood =====%%#
print('Collecting the geological information of each neighborhood...')
# Try to use the Geocoder lib first #
print('Trying to use the Geocoders lib...')
# initialize your variable to None
lat_lng_coords = None
tryCount = 0 # Initialize a counter
geoLibFail = False # Initialize a failer indicator

# loop until you get the coordinates
while(geoLibFail == False):
    while(lat_lng_coords is None):
        if tryCount == 5:
            geoLibFail = True
            print('Geocoder lib failed. Will read the geological information csv file instead.')
            break
        g = geocoder.google('{}, Toronto, Ontario'.format('M2H'))
        lat_lng_coords = g.latlng
        tryCount = tryCount + 1
    if tryCount < 5:
        latitude = lat_lng_coords[0]
        longitude = lat_lng_coords[1]
if geoLibFail == True:
    # Read the csv file for the geological information
    print('Reading the csv file geological information...')
    geoCsvUrl = 'http://cocl.us/Geospatial_data' # The URL of the geo csv file
    df_geoInfo = pd.read_csv(geoCsvUrl) # Read the geo csv file
    df_geoInfo.head()
    print('Geological information of each neighborhood has been successfully collected.')
##====================================================##
#%%===== Insert the longitudinal information to the postal code DataFrame =====%%#
    df_geoInfo.set_index('Postal Code', inplace=True)
    df_CanZipCodes.set_index('Postal Code', inplace=True)
    df_CanZipCodes['Latitude'] = ''
    df_CanZipCodes['Longitude'] = ''
    allZipCodes = df_geoInfo.index # Get all of the zip codes
    for postCode in allZipCodes: # Access every row in the geoInfo dataframe
        df_CanZipCodes.loc[postCode, 'Latitude'] = df_geoInfo.loc[postCode,'Latitude']
        df_CanZipCodes.loc[postCode, 'Longitude'] = df_geoInfo.loc[postCode,'Longitude']
#        print('Postal Code: ',postCode, 'Latitude: ',df_CanZipCodes.loc[postCode, 'Latitude'], 'Longitude: ',df_CanZipCodes.loc[postCode, 'Longitude'])
    # Test if there is any 'Not assigned' in the dataframe
    df_notAssigned = df_CanZipCodes.loc[df_CanZipCodes['Neighborhood'] == 'Not assigned', ['Neighborhood']]
    if df_notAssigned.empty:
        print ("Test passed successfully. No 'Not assigned' value in the 'Neighborhood' column.")
##===========================================================================##

Collecting the geological information of each neighborhood...
Trying to use the Geocoders lib...
Geocoder lib failed. Will read the geological information csv file instead.
Reading the csv file geological information...
Geological information of each neighborhood has been successfully collected.
Test passed successfully. No 'Not assigned' value in the 'Neighborhood' column.


In [5]:
df_CanZipCodes.head()

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3A,North York,Parkwoods,43.7533,-79.3297
M4A,North York,Victoria Village,43.7259,-79.3156
M5A,Downtown Toronto,Harbourfront,43.6543,-79.3606
M5A,Downtown Toronto,Regent Park,43.6543,-79.3606
M6A,North York,Lawrence Heights,43.7185,-79.4648
