## Import Libraries

In [1]:
#! pip install beautifulsoup4  / delete # if needed
#! pip install lxml            / delete # if needed
#! pip install 'pandas==1.1.0'

from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

## Create request reponse, convert to text, soupify, and narrow down to just the table

In [2]:
#create request response and convert it to text, then turn it into soup

response = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

html = response.text

soup = BeautifulSoup(html)

In [3]:
#narrow the soup down to just the table and print the contents to ensure it was read correctly

table = soup.table
#table.contents

## Convert to DataFrame

In [4]:
#probably a better way to do this, but each row in the table has 3 <td> tags - one for postal code, one for borough,
#and one for neighborhood. Adding all the <td> tags and dividing by three then should calculate number of rows to be expected
#from the dataframe created below (180)

value_count = 0
for child in table.find_all('td'):
    value_count = value_count + 1
total_rows = value_count / 3

print(total_rows)

180.0


In [5]:
#simplest way I could find to convert table contents into df and double-checked to ensure 180 rows and three columns

table = soup.find_all('table')
df = pd.read_html(str(table))[0]
#df

## Clean dataframe of 'not assigned" rows, combine multiple neighborhoods with same postal code, make 'not assigned' neighborhoods equivalent to boroughs

In [6]:
#Replace all 'not assigned' values with NumPy's NaN in order to drop them with dropna function

df = df.replace({'Borough': 'Not assigned'}, value=np.nan)
df = df.dropna()
#print('Boroughs: \n{}'.format(df['Borough'].value_counts()))


In [7]:
# Check to see if multiple neighborborhoods in one postal code need combined

duplicateSeries = df.duplicated(subset='Postal Code')
duplicateSeries.value_counts()

# as it turns out, the table must have been updated since creating the assignment, as the the postal codes that 
# represent multiple neighborhoods have already been consolidated

False    103
dtype: int64

In [8]:
#df['Neighbourhood'].value_counts()

In [9]:
# rename "not assigned" neighborhoods to their corresponding Borough

for neighborhood in df['Neighbourhood']:
    df = df.replace({'Neighbourhood': 'Not assigned'}, len(df['Borough']))

In [10]:
df.shape

(103, 3)

## Bring in CSV and append latitude/longitude to Dataframe

In [11]:
# Read CSV and set/sort index to standardize 
latLong_df = pd.read_csv('/Users/jimmy/Downloads/Geospatial_Coordinates.csv')
latLong_df = latLong_df.set_index('Postal Code')
latLong_df = latLong_df.sort_index()

In [12]:
new_df = df[['Postal Code']]

new_df = new_df.set_index('Postal Code')
new_df = new_df.sort_index()

# Luckily for the sake of this project, the CSV file containing lat/long is the same indexes in the same order as 
# the new dataframe, so the only thing necessary was to make the indexes the same
new_df = new_df.join(latLong_df)

    


In [13]:
df = df.set_index('Postal Code')
df = df.sort_index()

In [14]:
# notable this is a terrible way of doing this as there is no real assurance that the data matches other than 
# picking a few values and making sure they're correct, a better way would have been to implement a RaiseError 
# condition if there was any index that did not match in the two dataframes


df = df.join(new_df)

In [15]:
# could also check if there are any null values to make sure all was added correctly
df.head(20)

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
M1G,Scarborough,Woburn,43.770992,-79.216917
M1H,Scarborough,Cedarbrae,43.773136,-79.239476
M1J,Scarborough,Scarborough Village,43.744734,-79.239476
M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [58]:
import json 

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim 

from pandas import json_normalize 

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium 


Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [17]:
# I'm just going to use Downtown Toronto data points
downtown_data = df[df['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
downtown_data.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,Downtown Toronto,Rosedale,43.679563,-79.377529
1,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675
2,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [18]:
address = 'Downtown Toronto, ON'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Downtown Toronto are 43.6563221, -79.3809161.


In [19]:
#code monkey stuff - copy and pasting and slightly editing to fit my new data set

map_downtownToronto = folium.Map(location=[latitude, longitude], zoom_start=10)


for lat, lng, borough, neighborhood in zip(downtown_data['Latitude'], downtown_data['Longitude'], downtown_data['Borough'], downtown_data['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_downtownToronto)  
    
map_downtownToronto

In [20]:
# assigned Foursquare API stuff

CLIENT_ID = 'FQATKQEFO0CWQSG0GGXEEHPOPQJSFZ4N5T1H2FBZMTKR4CU4' 
CLIENT_SECRET = '' #hidden
ACCESS_TOKEN = 'WRUBXT3GTE1XL5IR5PO2UVF2G4CWSHISGXPGAFZ05XOXOUSB'
VERSION = '20180604'
LIMIT = 100
radius = 500


print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)


Your credentials:
CLIENT_ID: FQATKQEFO0CWQSG0GGXEEHPOPQJSFZ4N5T1H2FBZMTKR4CU4


In [21]:
neighbourhood_latitude = downtown_data.loc[0, 'Latitude'] 
neighbourhood_longitude = downtown_data.loc[0, 'Longitude'] 

neighbourhood_name = downtown_data.loc[0, 'Neighbourhood'] 

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of Rosedale are 43.6795626, -79.37752940000001.


In [61]:
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&oauth_token={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    ACCESS_TOKEN,
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)


In [62]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [91]:
# codemonkey the getNearbyVenues after struggling and maxing out the requests per day two days in a row

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
            
            
        
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
            venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [92]:
# run the function and create a dataframe
downtown_venues = getNearbyVenues(names=downtown_data['Neighbourhood'],
                                   latitudes=downtown_data['Latitude'], 
                                   longitudes=downtown_data['Longitude'])


Rosedale
St. James Town, Cabbagetown
Church and Wellesley
Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Stn A PO Boxes
First Canadian Place, Underground city
Christie
Queen's Park, Ontario Provincial Government


## analyze each neighborhood

In [98]:
#one hot encodemonkey
toronto_onehot = pd.get_dummies(downtown_venues[['Venue Category']], prefix="", prefix_sep="")


toronto_onehot['Neighborhood'] = downtown_venues['Neighborhood'] 


fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Thai Restaurant,Theater,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [101]:

toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head(5)

Unnamed: 0,Neighborhood,Yoga Studio,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Thai Restaurant,Theater,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.017241,0.0,0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0
1,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.0625,0.0625,0.0625,0.125,0.125,0.125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Central Bay Street,0.015152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.030303,0.0,0.0,0.0,0.0,0.015152,0.0,0.0,0.015152,0.0
3,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Church and Wellesley,0.025316,0.012658,0.0,0.0,0.0,0.0,0.0,0.0,0.012658,...,0.012658,0.012658,0.012658,0.0,0.0,0.0,0.0,0.012658,0.0,0.0


In [104]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
            venue  freq
0     Coffee Shop  0.10
1    Cocktail Bar  0.05
2          Bakery  0.03
3  Farmers Market  0.03
4      Restaurant  0.03


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
              venue  freq
0    Airport Lounge  0.12
1   Airport Service  0.12
2  Airport Terminal  0.12
3     Boat or Ferry  0.06
4   Harbor / Marina  0.06


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.18
1      Sandwich Place  0.06
2  Italian Restaurant  0.06
3                Café  0.06
4     Bubble Tea Shop  0.03


----Christie----
           venue  freq
0  Grocery Store  0.25
1           Café  0.19
2           Park  0.12
3      Nightclub  0.06
4     Restaurant  0.06


----Church and Wellesley----
                  venue  freq
0           Coffee Shop  0.08
1      Sushi Restaurant  0.06
2   Japanese Restaurant  0.06
3            Restaurant  0.04
4  Fast Food Restaurant  0.0

In [105]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [107]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Beer Bar,Cheese Shop,Bakery,Restaurant,Farmers Market,Seafood Restaurant,Liquor Store,Museum
1,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Airport Terminal,Coffee Shop,Boat or Ferry,Airport,Airport Food Court,Airport Gate,Sculpture Garden,Bar
2,Central Bay Street,Coffee Shop,Sandwich Place,Italian Restaurant,Café,Thai Restaurant,Salad Place,Bubble Tea Shop,Burger Joint,Fried Chicken Joint,Miscellaneous Shop
3,Christie,Grocery Store,Café,Park,Coffee Shop,Nightclub,Italian Restaurant,Restaurant,Candy Store,Athletics & Sports,Baby Store
4,Church and Wellesley,Coffee Shop,Sushi Restaurant,Japanese Restaurant,Fast Food Restaurant,Gay Bar,Restaurant,Café,Pub,Hotel,Yoga Studio


## cluster analysis

In [111]:

kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 2, 0, 4, 0, 0, 0, 0, 0, 0], dtype=int32)

In [119]:
# add clustering labels
#neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = downtown_data

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,Rosedale,43.679563,-79.377529,1,Park,Trail,Playground,Cupcake Shop,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Diner
1,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675,0,Pizza Place,Coffee Shop,Restaurant,Pub,Bakery,Italian Restaurant,Park,Café,Chinese Restaurant,Grocery Store
2,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,0,Coffee Shop,Sushi Restaurant,Japanese Restaurant,Fast Food Restaurant,Gay Bar,Restaurant,Café,Pub,Hotel,Yoga Studio
3,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Bakery,Park,Café,Pub,Breakfast Spot,Theater,Wine Shop,Event Space,Performing Arts Venue
4,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Clothing Store,Coffee Shop,Café,Middle Eastern Restaurant,Cosmetics Shop,Japanese Restaurant,Hotel,Bubble Tea Shop,Ramen Restaurant,Pizza Place


In [121]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## I hope you had as much fun as I did