### Question 1
Use pandas, or the BeautifulSoup package, or any other way you are comfortable with to transform the data in the table on the Wikipedia page into the above pandas dataframe.

#### Import required libraries for Webscraping and setting up DataFrame

In [1]:
import pandas as pd
import numpy as np
import bs4 as BeautifulSoup
import matplotlib.pyplot as plt
import requests

In [2]:
# Download the Wikipedia page listing Toronto ("M") postal codes and parse it.
url = r'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text
soup = BeautifulSoup.BeautifulSoup(html, 'html5lib')

#### Web scraping steps taken:
- Find the Table containing the location information
- Extract the Postal Code through the "b" tag
- Borough and Neighbourhood data are scraped from the row's span text and separated from each other at the '('
- If no Neighbourhood found set Borough and Neighbourhood to the Borough

In [3]:
# The first table on the page holds one <td> per postal code.
table = soup.find('table')
table_contents = []
for row in table.find_all('td'):
    cell_text = row.span.text

    # Postal codes that have no borough assigned are left out entirely.
    if 'Not assigned' in cell_text:
        continue

    # The postal code is the bold text of the cell.
    cell = {'Postal Code': row.b.string}
    if '(' in cell_text:
        # Text before the first "(" is the borough; text inside the
        # parentheses is the " /"-separated list of neighbourhoods.
        split = cell_text.split("(")
        cell['Borough'] = split[0]
        neighbourhoods = split[1][:split[1].rfind(')')]
        cell['Neighbourhood'] = neighbourhoods.replace(' /', ',')
    else:
        # If no Neighbourhood is assigned to the postal code set Neighbourhood to Borough
        cell['Borough'] = cell_text
        cell['Neighbourhood'] = cell_text

    # Every assigned postal code is recorded, including the ones without a
    # neighbourhood list (previously these were built but never appended).
    table_contents.append(cell)

# Convert list of data cells into a DataFrame
df = pd.DataFrame(table_contents)

# A few cells run extra address text into the borough name; map them to clean labels.
df['Borough'] = df['Borough'].replace({
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
})

In [4]:
# Sanity check: (rows, columns) of the scraped postal-code table.
df.shape

(103, 3)

### Question 2
Append the longitude and latitude to the above DataFrame

Once you are able to create the above dataframe, submit a link to the new Notebook on your Github repository. (2 marks)

In [5]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
# import geocoder # import geocoder

Set a rate limiter for applying to full dataframes

In [6]:
# Nominatim geocoding client, identified to the service as "toronto_neighbourhoods".
geolocator = Nominatim(user_agent="toronto_neighbourhoods")
# Rate-limited wrapper around geolocator.geocode (min_delay_seconds=1), meant for
# row-by-row use over a whole DataFrame.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

In [11]:
def check_location_geocoder(df, max_attempts=5):
    """Look up the coordinates of one postal code with ``geocoder.google``.

    Designed for ``DataFrame.apply(..., axis=1)``, so ``df`` is a single row
    (anything indexable by ``'Postal Code'``).

    Parameters
    ----------
    df : mapping-like row
        Must provide ``df['Postal Code']``.
    max_attempts : int, default 5
        Maximum number of lookups before giving up. The original version
        retried forever, which hangs the notebook when the service keeps
        timing out.

    Returns
    -------
    pandas.Series
        ``[latitude, longitude]``, or ``[NaN, NaN]`` when every attempt failed.

    Notes
    -----
    Relies on the third-party ``geocoder`` module being imported in the
    notebook namespace.
    """
    query = '{}, Toronto, Ontario'.format(df['Postal Code'])

    for _ in range(max_attempts):
        lat_lng_coords = geocoder.google(query).latlng
        if lat_lng_coords is not None:
            return pd.Series([lat_lng_coords[0], lat_lng_coords[1]])

    # No usable answer: report missing values instead of looping indefinitely.
    return pd.Series([np.nan, np.nan])

Tested the geocoder code with a smaller dataset. Was unable to reliably retrieve the correct location data.

In [12]:
# Try the geocoder-based helper on the first 10 rows only, before committing to the full frame.
df1 = df.head(10)
print(df1.apply(check_location_geocoder, axis=1))

Status code Unknown from https://maps.googleapis.com/maps/api/geocode/json: ERROR - HTTPSConnectionPool(host='maps.googleapis.com', port=443): Read timed out. (read timeout=5.0)
Status code Unknown from https://maps.googleapis.com/maps/api/geocode/json: ERROR - HTTPSConnectionPool(host='maps.googleapis.com', port=443): Read timed out. (read timeout=5.0)
Status code Unknown from https://maps.googleapis.com/maps/api/geocode/json: ERROR - HTTPSConnectionPool(host='maps.googleapis.com', port=443): Read timed out. (read timeout=5.0)
Status code Unknown from https://maps.googleapis.com/maps/api/geocode/json: ERROR - HTTPSConnectionPool(host='maps.googleapis.com', port=443): Read timed out. (read timeout=5.0)
Status code Unknown from https://maps.googleapis.com/maps/api/geocode/json: ERROR - HTTPSConnectionPool(host='maps.googleapis.com', port=443): Read timed out. (read timeout=5.0)
Status code Unknown from https://maps.googleapis.com/maps/api/geocode/json: ERROR - HTTPSConnectionPool(host='

KeyboardInterrupt: 

In [7]:
def check_location_geopy(df):
    """Geocode one row's "<Neighbourhood>, <Borough>" with the rate-limited ``geocode``.

    ``df`` is a single row exposing ``Neighbourhood`` and ``Borough`` attributes
    (as passed by ``DataFrame.apply(..., axis=1)``). The lookup is restricted to
    Canada and tried at most six times.

    Returns a ``pandas.Series`` ``[latitude, longitude]`` for the first
    successful lookup, or ``[0, 0]`` when no attempt produced a result.
    """
    query = f'{df.Neighbourhood}, {df.Borough}'

    # Six tries in total, stopping at the first hit.
    for _attempt in range(6):
        match = geocode(query, country_codes='ca', addressdetails=True)
        if match is not None:
            return pd.Series([match.latitude, match.longitude])

    return pd.Series([0, 0])

In [8]:
# Try the Nominatim-based helper on the first 10 rows; rows it could not geocode come back as (0, 0).
df2 = df.head(10)
print(df2.apply(check_location_geopy, axis=1))

           0          1
0  43.758800 -79.320197
1  43.732658 -79.311189
2   0.000000   0.000000
3  43.716391 -79.442566
4   0.000000   0.000000
5  43.638959 -79.521050
6  43.809196 -79.221701
7  43.775347 -79.345944
8   0.000000   0.000000
9   0.000000   0.000000


#### Loaded in CSV data of the location data, using Postal Code as index for joining DataFrames together

In [7]:
# Course-provided latitude/longitude per postal code, indexed by 'Postal Code' for the join.
loc_data = pd.read_csv(r'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv').set_index('Postal Code')

# Start from the three scraped columns only. This makes the cell safe to re-run:
# without the selection, a second run fails because Latitude/Longitude from the
# first run overlap with the columns being joined.
df = (
    df[['Postal Code', 'Borough', 'Neighbourhood']]
    .set_index('Postal Code')
    .join(loc_data)
    .reset_index()
)
display(df.head())

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


#### Question 4



In [49]:
# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library
from sklearn.cluster import KMeans
from details import CLIENT_ID, CLIENT_SECRET, LIMIT, VERSION
import matplotlib.cm as cm
import matplotlib.colors as colors

In [9]:
# Geocode the city itself to get a centre point for the maps below.
address = 'Toronto, ON'
location = geolocator.geocode(address)
# geocode() yields None when nothing matches; fail with a clear message rather
# than an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [115]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# Keep only the postal codes whose borough name contains "Toronto".
toronto_df = df.loc[df.Borough.str.contains('Toronto')].reset_index(drop=True)

# One blue circle per postal code, with a "<neighbourhood>, <borough>" popup.
marker_rows = toronto_df[['Latitude', 'Longitude', 'Borough', 'Neighbourhood']].itertuples(index=False, name=None)
for lat, lng, borough, neighborhood in marker_rows:
    label = folium.Popup(f'{neighborhood}, {borough}', parse_html=True)
    circle = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    circle.add_to(map_toronto)

map_toronto

In [12]:
def getNearbyVenues(df, radius=500):
    """Collect Foursquare venues around every location in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'Neighbourhood', 'Latitude' and 'Longitude' columns; one
        "explore" request is made per row.
    radius : int, default 500
        Search radius passed to the API.

    Returns
    -------
    pandas.DataFrame
        One row per venue with columns 'Neighbourhood', 'Latitude',
        'Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude',
        'Venue Category'. The frame is empty (but has these columns) when no
        venue was returned at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.

    Notes
    -----
    Reads CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT from the notebook
    namespace and prints each neighbourhood name as a progress indicator.
    """
    venue_columns = [
        'Neighbourhood',
        'Latitude',
        'Longitude',
        'Venue',
        'Venue Latitude',
        'Venue Longitude',
        'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    venues_list = []
    for neigh, lat, lng in zip(df['Neighbourhood'], df['Latitude'], df['Longitude']):
        print(neigh)

        # Let requests build and escape the query string instead of formatting
        # credentials into the URL by hand.
        response = requests.get(
            endpoint,
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        response.raise_for_status()

        # A location with nothing nearby may come back without any group.
        groups = response.json().get('response', {}).get('groups', [])
        results = groups[0]['items'] if groups else []

        # return only relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue.get('categories') or []
            venues_list.append((
                neigh,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # Some venues carry no category; record that as missing.
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the schema intact even
    # when no venues were found (assigning .columns afterwards would raise).
    return pd.DataFrame(venues_list, columns=venue_columns)

In [15]:
# Query venues around every row of toronto_df (one API request per row; each neighbourhood name is printed as it is processed).
toronto_venues = getNearbyVenues(toronto_df)

Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Dufferin, Dovercourt Village
The Danforth  East
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
The Danforth West, Riverdale
Toronto Dominion Centre, Design Exchange
Brockton, Parkdale Village, Exhibition Place
India Bazaar, The Beaches West
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West
High Park, The Junction South
North Toronto West
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
University of Toronto, Harbord
Runnymede, Swansea
Moore Park, Summerhill East
Kensington Market, Chinatown, Grange Park
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Rosedale
Enclave of M5E
St. James Town, Cabbagetown
First Canadi

In [16]:
# Inspect the combined venue table: one row per venue returned for a neighbourhood.
toronto_venues

Unnamed: 0,Neighbourhood,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.654260,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.654260,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.654260,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.654260,-79.360636,Impact Kitchen,43.656369,-79.356980,Restaurant
4,"Regent Park, Harbourfront",43.654260,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
...,...,...,...,...,...,...,...
1571,Enclave of M4L,43.662744,-79.321558,Jonathan Ashbridge Park,43.664702,-79.319898,Park
1572,Enclave of M4L,43.662744,-79.321558,The Ten Spot,43.664815,-79.324213,Spa
1573,Enclave of M4L,43.662744,-79.321558,TTC Stop #03049,43.664470,-79.325145,Light Rail Station
1574,Enclave of M4L,43.662744,-79.321558,Greenwood Cigar & Variety,43.664538,-79.325379,Smoke Shop


Create a one hot dataframe listing each venue and type

In [101]:
# One indicator column per venue category; the empty prefix keeps the bare category names.
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood']

# Rotate the column order by one so that the last column comes first.
fixed_columns = toronto_onehot.columns.tolist()
fixed_columns = fixed_columns[-1:] + fixed_columns[:-1]
toronto_onehot = toronto_onehot[fixed_columns]
toronto_onehot.shape

(1576, 233)

In [102]:
# Average the indicator columns per neighbourhood: each value becomes the share of
# that neighbourhood's venues that fall in the category.
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped.shape

(39, 233)

In [103]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the ``num_top_venues`` largest entries of ``row``.

    The first element of ``row`` is treated as an identifier and skipped; the
    remaining values are ranked in descending order and the index labels of
    the leading entries are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.head(num_top_venues).index.values

num_top_venues = 10

# Ordinal suffixes for ranks 1-3; every later rank uses 'th'.
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in range(num_top_venues):
    # Explicit bounds check instead of catching every exception to detect
    # "no special suffix for this rank".
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

# Fill each neighbourhood's row with its top venue categories, most frequent first.
for ind in range(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Restaurant,Beer Bar,Cheese Shop,Seafood Restaurant,Pharmacy,Farmers Market,Fish Market
1,"Brockton, Parkdale Village, Exhibition Place",Café,Bakery,Breakfast Spot,Coffee Shop,Italian Restaurant,Stadium,Furniture / Home Store,Nightclub,Climbing Gym,Bar
2,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Boutique,Harbor / Marina,Rental Car Location,Sculpture Garden,Boat or Ferry,Plane,Airport,Airport Food Court
3,Central Bay Street,Coffee Shop,Café,Sandwich Place,Italian Restaurant,Burger Joint,Middle Eastern Restaurant,Japanese Restaurant,Salad Place,Bubble Tea Shop,Thai Restaurant
4,Christie,Grocery Store,Café,Park,Restaurant,Candy Store,Italian Restaurant,Athletics & Sports,Coffee Shop,Baby Store,Nightclub


# Clustering Algorithm
Apply a k-means clustering algorithm to classify each type of neighbourhood

In [107]:
kclusters = 7

# Cluster on the venue-frequency columns only. Dropping by keyword works on
# current pandas (the positional axis argument was removed), and also dropping
# 'Cluster Labels' keeps a re-run from feeding earlier labels into the model
# once that column has been added to toronto_grouped.
toronto_grouped_cluster = toronto_grouped.drop(
    columns=['Neighbourhood', 'Cluster Labels'], errors='ignore')

# n_init is pinned to the historical default of 10 so results do not shift
# between scikit-learn versions.
k_mean = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_cluster)

k_mean.labels_

array([2, 0, 0, 2, 0, 2, 2, 0, 2, 0, 0, 2, 2, 6, 2, 2, 0, 0, 2, 5, 2, 1,
       2, 0, 2, 2, 3, 4, 2, 2, 0, 2, 2, 0, 2, 3, 0, 2, 0], dtype=int32)

In [106]:
# add clustering labels
toronto_merged = toronto_df

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
toronto_merged['Cluster Labels'] = k_mean.labels_

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,2
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,2
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0


In [108]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# Cluster properties 
- List the 10 most common amenities within each cluster
- First by assigning the cluster labels to each neighbourhood in the toronto_grouped venue list
- Groupby and average based on the new cluster label

In [109]:
# neighborhoods_venues_sorted was built row-for-row from toronto_grouped, the frame
# the model was fitted on, so the labels line up by position here.
neighborhoods_venues_sorted['Cluster Labels'] = k_mean.labels_

In [110]:
# Attach each neighbourhood's cluster label to its venue-frequency row.
# NOTE(review): this adds a column to toronto_grouped in place, so any earlier cell
# that reads toronto_grouped will see the extra column if it is re-run afterwards.
toronto_grouped['Cluster Labels'] = k_mean.labels_

In [111]:
# Mean venue-category frequency per cluster. numeric_only=True skips the text
# 'Neighbourhood' column explicitly; recent pandas raises on it instead of
# silently dropping it.
group_groupby = toronto_grouped.groupby('Cluster Labels').mean(numeric_only=True).reset_index()
group_groupby

Unnamed: 0,Cluster Labels,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,American Restaurant,Antique Shop,Aquarium,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,0,0.0,0.0,0.005917,0.005917,0.011834,0.017751,0.001876,0.003205,0.0,...,0.002198,0.001876,0.0,0.0,0.002481,0.0,0.0,0.0,0.0,0.004358
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.000649,0.000649,0.0,0.0,0.0,0.0,0.013293,0.001636,0.0025,...,0.0,0.01,0.0015,0.011842,0.0,0.008764,0.005846,0.001136,0.000806,0.010274
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [114]:
num_top_venues = 10

# Ordinal suffixes for ranks 1-3; every later rank uses 'th'.
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Cluster Labels']
for ind in range(num_top_venues):
    # Explicit bounds check instead of catching every exception to detect
    # "no special suffix for this rank".
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe
group_venues_sorted = pd.DataFrame(columns=columns)
group_venues_sorted['Cluster Labels'] = group_groupby['Cluster Labels']

# Fill each cluster's row with its top venue categories, most frequent first.
for ind in range(group_groupby.shape[0]):
    group_venues_sorted.iloc[ind, 1:] = return_most_common_venues(group_groupby.iloc[ind, :], num_top_venues)

group_venues_sorted

Unnamed: 0,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,Café,Coffee Shop,Park,Italian Restaurant,Grocery Store,Restaurant,Bakery,Sandwich Place,Bar,Pizza Place
1,1,Park,Restaurant,Adult Boutique,Music Venue,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Modern European Restaurant
2,2,Coffee Shop,Café,Restaurant,Hotel,Park,Pub,Clothing Store,Italian Restaurant,Japanese Restaurant,Bakery
3,3,Park,Playground,Trail,Metro Station,Convenience Store,Hospital,Home Service,Middle Eastern Restaurant,Miscellaneous Shop,Modern European Restaurant
4,4,Garden,Home Service,Adult Boutique,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Modern European Restaurant,Molecular Gastronomy Restaurant,Monument / Landmark
5,5,Park,Bus Line,Swim School,Adult Boutique,Music Venue,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Modern European Restaurant
6,6,Park,Trail,Sushi Restaurant,Jewelry Store,Adult Boutique,Museum,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant
