FIRST PART -CREATING DF-

In [1]:
import pandas as pd
import requests 
from bs4 import BeautifulSoup
import numpy as np

In [2]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [3]:
page= requests.get(url)

In [4]:
soup=BeautifulSoup(page.text,'html.parser') 

In [5]:
table_contents=[]
table=soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)


In [6]:
# creating the df and cleaning up data
hoods=pd.DataFrame.from_dict(table_contents)
hoods['Borough']=hoods['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

In [45]:
# FIRST QUESTION ANSWER
hoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Queen's Park,Ontario Provincial Government,43.6641,-79.3889


In [8]:
print('The number of rows is : ',hoods.shape[0])

The number of rows is :  103


SECOND PART -GETTING LATITUDE AND LONGITUDE--

In [9]:
# I used a different package to get location lat and long
# ! pip install pgeocode

In [10]:
import pgeocode

In [11]:
nomi = pgeocode.Nominatim('ca')  # creating an instance with contry Canada

In [12]:
def get_coordinates(postal):  # function to get coordinates, returns both lat and long in a tupple (to save calls)
    latitude=nomi.query_postal_code(postal).latitude
    longitude= nomi.query_postal_code(postal).longitude
    return latitude,longitude

In [13]:
hoods['coordinates']=hoods['PostalCode'].apply(get_coordinates) # applying function 

In [14]:
hoods['latitude']=hoods['coordinates'].apply(lambda x: x[0]) # splitting coordinates into LAT and LONG columns
hoods['longitude']=hoods['coordinates'].apply(lambda x: x[1])

In [15]:
hoods.drop('coordinates',axis=1,inplace=True)

In [16]:
hoods[hoods['longitude'].isnull()==True]

Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude
76,M7R,Mississauga,Enclave of L4W,,


In [17]:
# since it was not able to detect Mississauga lat and long we insert it manually

In [18]:
hoods.loc[76,'latitude']=43.5488
hoods.loc[76,'longitude']=-79.6627


In [19]:
hoods.isnull().sum()

PostalCode      0
Borough         0
Neighborhood    0
latitude        0
longitude       0
dtype: int64

In [20]:
#now all rows can be used

THIRD PART --- 

In [21]:
# !conda install -c conda-forge folium=0.5.0 --yes 

import folium # map rendering library

In [22]:
# create map of Toronto using latitude and longitude values
lat_toronto=43.6532
long_toronto=-79.3832
map_toronto = folium.Map(location=[lat_toronto, long_toronto], zoom_start=10)
map_toronto

In [23]:
# creating a df only with Boroughs containing Toronto
toronto_hoods=hoods[hoods['Borough'].str.contains('Toronto')]

In [24]:
toronto_hoods.shape

(39, 5)

In [25]:
# SECOND PART ANSWER
toronto_hoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783
15,M5C,Downtown Toronto,St. James Town,43.6513,-79.3756
19,M4E,East Toronto,The Beaches,43.6784,-79.2941
20,M5E,Downtown Toronto,Berczy Park,43.6456,-79.3754


In [26]:
for lat, lng, borough, postal in zip(toronto_hoods['latitude'], toronto_hoods['longitude'], toronto_hoods['Borough'], toronto_hoods['PostalCode']):
    label = '{}, {}'.format(postal, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [27]:
# details to work with FourSquare
CLIENT_ID = 'FFZEHH2EXHUKK0FC34GUCMXJSRFMSAGF1V2UJ0IARRZ22TIJ' # your Foursquare ID
CLIENT_SECRET = 'JZAW5CKEBGZZAKT35FFCGG5DEBMVOQOU0RG2WJV3RATGOBFK' # your Foursquare Secret
ACCESS_TOKEN = 'FVU5BQH3NGFLUR5NH4A5L5GWGYNFJWHJERCUEW3ZMXJ0KTQV' # your FourSquare Access Token
VERSION = '20180605'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: FFZEHH2EXHUKK0FC34GUCMXJSRFMSAGF1V2UJ0IARRZ22TIJ
CLIENT_SECRET:JZAW5CKEBGZZAKT35FFCGG5DEBMVOQOU0RG2WJV3RATGOBFK


In [28]:
# function to get all the venues in each neighbourhood
def getNearbyVenues(names,pcodes,latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, pcode,lat, lng in zip(names, pcodes , latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()     
        results=results['response']['groups'][0]['items']
       
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            pcode,
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                   'Postal Code',
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [29]:
# getting all venues in Toronto dataframe 
toronto_venues = getNearbyVenues(names=toronto_hoods['Borough'],
                                pcodes=toronto_hoods['PostalCode'],
                                   latitudes=toronto_hoods['latitude'],
                                   longitudes=toronto_hoods['longitude'])

Downtown Toronto
Downtown Toronto
Downtown Toronto
East Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
West Toronto
East York/East Toronto
Downtown Toronto
West Toronto
East Toronto
Downtown Toronto
West Toronto
East Toronto
Downtown Toronto
East Toronto
Central Toronto
Central Toronto
Central Toronto
Central Toronto
West Toronto
Central Toronto
Central Toronto
West Toronto
Central Toronto
Downtown Toronto
West Toronto
Central Toronto
Downtown Toronto
Central Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto Stn A
Downtown Toronto
Downtown Toronto
Downtown Toronto
East Toronto Business


In [30]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Postal Code,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Downtown Toronto,M5A,43.6555,-79.3626,Tandem Coffee,43.653559,-79.361809,Coffee Shop
1,Downtown Toronto,M5A,43.6555,-79.3626,Roselle Desserts,43.653447,-79.362017,Bakery
2,Downtown Toronto,M5A,43.6555,-79.3626,Figs Breakfast & Lunch,43.655675,-79.364503,Breakfast Spot
3,Downtown Toronto,M5A,43.6555,-79.3626,The Yoga Lounge,43.655515,-79.364955,Yoga Studio
4,Downtown Toronto,M5A,43.6555,-79.3626,Body Blitz Spa East,43.654735,-79.359874,Spa


In [31]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 176 uniques categories.


In [32]:
# transforming venue categories into columns
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

In [33]:
# add neighborhood and postal code columns back to dataframe
toronto_onehot['Hood'] = toronto_venues['Neighborhood'] 
toronto_onehot['PostalCode']=toronto_venues['Postal Code']
# move neighborhood column to the first column and postal code to second
fixed_columns = [toronto_onehot.columns[-2]] +[toronto_onehot.columns[-1]]+ list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

In [34]:
toronto_onehot.head()

Unnamed: 0,Hood,PostalCode,Accessories Store,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Yoga Studio,Hood.1
0,Downtown Toronto,M5A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Downtown Toronto
1,Downtown Toronto,M5A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Downtown Toronto
2,Downtown Toronto,M5A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Downtown Toronto
3,Downtown Toronto,M5A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,Downtown Toronto
4,Downtown Toronto,M5A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Downtown Toronto


In [35]:
# grouping postal codes and getting the average of each venue category
toronto_grouped = toronto_onehot.groupby('PostalCode').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,PostalCode,Accessories Store,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Baby Store,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,...,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4J,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333
3,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M4M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# creating column names for new df
num_top_venues = 10
indicators = ['st', 'nd', 'rd']
# create columns according to number of top venues
columns = ['PostalCode']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

In [37]:
# create a new dataframe to hold values
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['PostalCode'] = toronto_grouped['PostalCode']

In [38]:
# filling out the dataframe with values from above
for i in list(range(neighborhoods_venues_sorted.shape[0])):
    temp_df=pd.DataFrame(toronto_grouped.iloc[i,1:].sort_values(ascending=False))
    temp_df=temp_df[temp_df.iloc[:,0]>0].head(10)    
    for j in list(range(temp_df.shape[0])): 
        neighborhoods_venues_sorted.iat[i,j+1]=temp_df.transpose().columns[j]                                                 

In [39]:
neighborhoods_venues_sorted.head()

Unnamed: 0,PostalCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,Pub,Trail,Bakery,Neighborhood,Cheese Shop,Asian Restaurant,Health Food Store,Gastropub,Pizza Place,
1,M4J,Convenience Store,Coffee Shop,Park,,,,,,,
2,M4K,Greek Restaurant,Restaurant,Ice Cream Shop,Italian Restaurant,Yoga Studio,Pub,Brewery,Cocktail Bar,Indian Restaurant,Spa
3,M4L,Pizza Place,Park,Fast Food Restaurant,Movie Theater,Restaurant,Liquor Store,Pub,Italian Restaurant,Brewery,Steakhouse
4,M4M,Gym,Coffee Shop,Park,Baseball Field,Garden Center,Diner,Coworking Space,,,


In [40]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# setting number of clusters
kclusters = 5
# dropping Postal code, we are only using the features
toronto_grouped_clustering = toronto_grouped.drop('PostalCode', 1)
# run k-means clustering, creating an instance and fitting it 
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 1, 0, 4, 0, 0])

In [41]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_hoods

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('PostalCode'), on='PostalCode')


In [42]:
toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626,0,Coffee Shop,Breakfast Spot,Yoga Studio,Spa,Bakery,Beer Store,Distribution Center,Electronics Store,Event Space,Food Truck
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783,0,Café,Ramen Restaurant,Theater,Clothing Store,Shopping Mall,Tanning Salon,Sandwich Place,Burrito Place,Burger Joint,Japanese Restaurant
15,M5C,Downtown Toronto,St. James Town,43.6513,-79.3756,0,Café,Farmers Market,Coffee Shop,Restaurant,Speakeasy,Indian Restaurant,Italian Restaurant,Japanese Restaurant,Jazz Club,Cocktail Bar
19,M4E,East Toronto,The Beaches,43.6784,-79.2941,0,Pub,Trail,Bakery,Neighborhood,Cheese Shop,Asian Restaurant,Health Food Store,Gastropub,Pizza Place,
20,M5E,Downtown Toronto,Berczy Park,43.6456,-79.3754,0,Japanese Restaurant,Cocktail Bar,Farmers Market,Beer Bar,Basketball Stadium,Café,Pub,Seafood Restaurant,Jazz Club,Liquor Store


In [43]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# creating a toronto map
lat_toronto=43.6532
long_toronto=-79.3832
map_toronto = folium.Map(location=[lat_toronto, long_toronto], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['latitude'], toronto_merged['longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_toronto)
       
map_toronto

In [44]:
toronto_merged.loc[toronto_merged['Cluster Labels'] ==0, toronto_merged.columns[[1]+[2] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Downtown Toronto,"Regent Park, Harbourfront",0,Coffee Shop,Breakfast Spot,Yoga Studio,Spa,Bakery,Beer Store,Distribution Center,Electronics Store,Event Space,Food Truck
9,Downtown Toronto,"Garden District, Ryerson",0,Café,Ramen Restaurant,Theater,Clothing Store,Shopping Mall,Tanning Salon,Sandwich Place,Burrito Place,Burger Joint,Japanese Restaurant
15,Downtown Toronto,St. James Town,0,Café,Farmers Market,Coffee Shop,Restaurant,Speakeasy,Indian Restaurant,Italian Restaurant,Japanese Restaurant,Jazz Club,Cocktail Bar
19,East Toronto,The Beaches,0,Pub,Trail,Bakery,Neighborhood,Cheese Shop,Asian Restaurant,Health Food Store,Gastropub,Pizza Place,
20,Downtown Toronto,Berczy Park,0,Japanese Restaurant,Cocktail Bar,Farmers Market,Beer Bar,Basketball Stadium,Café,Pub,Seafood Restaurant,Jazz Club,Liquor Store
24,Downtown Toronto,Central Bay Street,0,Coffee Shop,Café,Clothing Store,Sandwich Place,Ramen Restaurant,Shopping Mall,Bubble Tea Shop,Poke Place,Plaza,Spa
30,Downtown Toronto,"Richmond, Adelaide, King",0,Coffee Shop,Café,Seafood Restaurant,Gym,Restaurant,Gym / Fitness Center,Speakeasy,Plaza,Pizza Place,Concert Hall
31,West Toronto,"Dufferin, Dovercourt Village",0,Park,Bakery,Furniture / Home Store,Pizza Place,Café,Pharmacy,Bar,Bank,Middle Eastern Restaurant,Smoke Shop
35,East York/East Toronto,The Danforth East,0,Convenience Store,Coffee Shop,Park,,,,,,,
36,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",0,Park,Harbor / Marina,Music Venue,Café,,,,,,
