### Import libraries

In [1]:
import pandas as pd
import numpy as np
import sys
import requests

### Read data

In [2]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
# The timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() fails loudly instead of letting an HTTP error page be
# parsed as if it were the table.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()

### Parse page

The HTML `td` tag marks table cells, so lines containing it can be used to identify the rows of the table.

In [3]:
# The HTTP response is left untouched and every parsing step gets its own
# name, so this cell can be re-run without downloading the page again.

# separate lines
html_lines = response.text.split("\n")

# table entries have the td tag
td_lines = [line for line in html_lines if "td" in line]

# join them all and separate by the end-of-row tag </tr>
table_rows = "".join(td_lines).split("</tr>")

# drop the first 4 and the last 5 characters of each row (presumably the
# outer "<td>" and "</td>") and split what remains into its cells
table_cells = [row[4:-5].split("</td><td>") for row in table_rows]

# make a dataframe from the list of rows
df = pd.DataFrame(table_cells, columns=["Postcode", "Borough", "Neighborhood"])

### Clean dataframe

Here we clean the dataframe as instructed in the assignment.

In [4]:
# Drop the rows whose Borough is "Not assigned" or None (the latter comes
# from non-table lines that happened to contain "td"), then renumber the rows.
is_unassigned = df["Borough"].astype(str).isin(["Not assigned", "None"])
df = df[~is_unassigned]
df.reset_index(drop=True, inplace=True)

# remove html tags
def remove_hrefs(x):
    """
    Strip HTML tags from a table cell that contains a hyperlink.

    Parameters:
        x: the cell value. Anything that is not a string, or a string
           without "href=", is returned unchanged.

    Returns:
        The text of the cell with every "<...>" tag removed. All visible
        text is kept, including text outside the link or in further links.
    """
    import re  # local import keeps this cell independent of the import cell

    # Values such as None (from malformed rows) and plain-text cells carry
    # no link markup, so there is nothing to strip.
    if not isinstance(x, str) or "href=" not in x:
        return x

    # Remove every tag instead of slicing around the first ">" so that text
    # following the first link is not silently dropped.
    return re.sub(r"<[^>]*>", "", x)

# Replace link markup in both text columns by the visible text.
df["Borough"] = df["Borough"].apply(remove_hrefs)
df["Neighborhood"] = df["Neighborhood"].apply(remove_hrefs)

# Not assigned neighborhoods get the value of "Borough"
unassigned = df["Neighborhood"] == "Not assigned"
df.loc[unassigned, "Neighborhood"] = df.loc[unassigned, "Borough"]

# Aggregate rows from duplicate postcode entries.
# Series.unique() drops duplicates in order of appearance; a set would make
# the order inside the joined string vary from one kernel session to the next.
df = df.groupby(["Postcode"], as_index=False).agg(lambda x: ", ".join(x.unique()))

print(df.shape)

(103, 3)


### Get coordinates for postal codes

First try using geocoder.

In [5]:
import geocoder

In [6]:
# NOTE: this cell is intentionally disabled. It looked up the coordinates of
# each postal code through geocoder.google; the lookup did not return usable
# results, so the notebook loads the coordinates from a CSV table instead.
# NOTE(review): with `or` in the loop condition below, the loop never ends
# while lat_lng_coords stays None and otherwise still runs 5 times; `and`
# was probably intended.
# def getCoor(postal_code):
#     # initialize your variable to None
#     lat_lng_coords = None
#     trial=0
#     # loop until you get the coordinates
#     while(lat_lng_coords is None) or (trial < 5) :
#         g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#         lat_lng_coords = g.latlng
#         trial=trial+1
#     if lat_lng_coords is None:
#         print("For", postal_code, "it was not possible to retrieve coordinates.")
#         latitude = np.nan
#         longitude = np.nan 
#     else:
#         latitude = lat_lng_coords[0]
#         longitude = lat_lng_coords[1]
#     return str(latitude)+'::'+str(longitude)

# df["coordidantes"]=df["Postcode"].apply(lambda x: getCoor(x)) getCoor(M1B)

As the block above did not work, we try just the simplest parts.

In [7]:
# print("geocoder:", geocoder.google('{}, Toronto, Ontario'.format("M1B")) )
# print("Function:", getCoor("M1B") )

Retrieving data from geocoder did not work. We are therefore using the supplied table.

In [8]:
# Load the supplied table of coordinates per postal code.
pcdf=pd.read_csv("https://cocl.us/Geospatial_data")
# Rename the columns positionally (this requires the file to have exactly
# three columns) so that "Postcode" matches the key column of df.
pcdf.columns=["Postcode","Latitude","Longitude"]

In [9]:
# Left-join the coordinates onto the neighborhood table by postal code;
# every row of df is kept, with or without a match in pcdf.
df = df.merge(pcdf, how="left", on=["Postcode"])
df

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497
2,M1E,Scarborough,"Morningside, Guildwood, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Martin Grove Gardens, Kingsview Village, St. P...",43.688905,-79.554724
101,M9V,Etobicoke,"Thistletown, Silverstone, Albion Gardens, Moun...",43.739416,-79.588437


#### Create a map of Toronto with neighborhoods superimposed on top.

We take the average latitude and longitude to center the map.

In [10]:
import folium # map rendering library

In [11]:
# Center the map on the mean coordinate of all postal codes.
latitude = np.mean(df['Latitude'])
longitude = np.mean(df['Longitude'])

# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per postal code; the popup shows "neighborhood, borough"
marker_rows = zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood'])
for lat, lng, borough, neighborhood in marker_rows:
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

### Check the different boroughs and choose one to work on.

In [12]:
# List the distinct boroughs so that one can be picked for the analysis.
boroughs = df["Borough"].unique()
print(boroughs)

['Scarborough' 'North York' 'East York' 'East Toronto' 'Central Toronto'
 'Downtown Toronto' 'York' 'West Toronto' "Queen's Park" 'Mississauga'
 'Etobicoke']


We have chosen Central Toronto and will now draw a map with the respective neighborhoods.

In [13]:
# Keep only the postal codes of the chosen borough.
in_central = df["Borough"] == "Central Toronto"
central_toronto = df[in_central]

# Center the map on the mean coordinate of the borough.
latitude = np.mean(central_toronto['Latitude'])
longitude = np.mean(central_toronto['Longitude'])

# create map of Central Toronto using latitude and longitude values
map_central = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per postal code; the popup shows "neighborhood, borough"
marker_rows = zip(central_toronto['Latitude'], central_toronto['Longitude'],
                  central_toronto['Borough'], central_toronto['Neighborhood'])
for lat, lng, borough, neighborhood in marker_rows:
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_central)

map_central

### Foursquare credentials and API version

In [15]:
import os  # imported here so this cell works without editing the import cell

# Read the Foursquare credentials from the environment so that real keys
# never have to be saved inside the notebook. The '******' placeholders are
# used only when the variables are not set.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '******') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '******') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# The secret is masked: printing it would store it in the notebook output.
print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET: ' + '*' * len(CLIENT_SECRET))

### Explore Neighborhoods in Toronto

In [16]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100, timeout=10):
    """
    Collect the venues that the Foursquare "explore" endpoint returns around
    each neighborhood.

    Parameters:
        names, latitudes, longitudes: iterables that are walked in parallel,
            one entry per neighborhood.
        radius: search radius sent to the API (presumably meters - confirm
            against the Foursquare documentation).
        limit: maximum number of venues requested per neighborhood.
        timeout: seconds to wait for each HTTP response.

    Returns:
        A DataFrame with one row per venue and the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        is empty, but still has these columns, when no venue is returned.
        'Venue Category' is None for a venue that has no category.

    Raises:
        Whatever response.raise_for_status() raises when the API answers
        with an HTTP error status.

    Uses the notebook-level CLIENT_ID, CLIENT_SECRET and VERSION.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Hand the query to requests as a dict so that it builds and
        # URL-encodes the query string itself.
        reply = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=timeout,
        )
        # Stop on an HTTP error instead of failing later with a KeyError on
        # the error payload.
        reply.raise_for_status()
        results = reply.json()['response']['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            # A venue may come without any category; record None rather
            # than raising IndexError on the empty list.
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works for zero rows,
    # where assigning seven names to an empty frame would raise.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return nearby_venues

In [17]:
# Query the venues around every Central Toronto postal code
# (default radius and limit of getNearbyVenues).
central_toronto_venues = getNearbyVenues(
    names=central_toronto['Neighborhood'],
    latitudes=central_toronto['Latitude'],
    longitudes=central_toronto['Longitude'],
)

Lawrence Park
Davisville North
North Toronto West
Davisville
Summerhill East, Moore Park
Summerhill West, Forest Hill SE, South Hill, Deer Park, Rathnelly
Roselawn
Forest Hill North, Forest Hill West
Yorkville, North Midtown, The Annex


In [18]:
# Preview the first rows of the venue table.
central_toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Lawrence Park,43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,Lawrence Park,43.72802,-79.38879,The Photo School – Toronto,43.730429,-79.388767,Photography Studio
2,Lawrence Park,43.72802,-79.38879,Zodiac Swim School,43.728532,-79.38286,Swim School
3,Lawrence Park,43.72802,-79.38879,TTC Bus #162 - Lawrence-Donway,43.728026,-79.382805,Bus Line
4,Davisville North,43.712751,-79.390197,Sherwood Park,43.716551,-79.387776,Park


In [19]:
# Count the distinct venue categories found in Central Toronto.
category_count = len(central_toronto_venues['Venue Category'].unique())
print('There are {} uniques categories.'.format(category_count))

There are 60 uniques categories.


To properly analyse the different categories we will one-hot encode them.

In [20]:
# One-hot encode the venue categories: one indicator column per category,
# named after the category itself (no prefix).
venue_categories = central_toronto_venues[['Venue Category']]
central_onehot = pd.get_dummies(venue_categories, prefix="", prefix_sep="")

# Re-attach the neighborhood of every venue. The column lands in the last
# position as long as no category is itself called 'Neighborhood'.
central_onehot['Neighborhood'] = central_toronto_venues['Neighborhood']

# Rotate the last column to the front.
fixed_columns = list(central_onehot.columns[-1:]) + list(central_onehot.columns[:-1])
central_onehot = central_onehot[fixed_columns]

central_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,BBQ Joint,Bagel Shop,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,Chinese Restaurant,...,Sports Bar,Supermarket,Sushi Restaurant,Swim School,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,Lawrence Park,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Davisville North,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# (number of venues, number of category columns + the Neighborhood column)
central_onehot.shape

(114, 61)

For each neighborhood we compute the mean frequency of occurrence of each category.

In [23]:
# Average the indicator columns per neighborhood: each value is the share of
# that neighborhood's venues that fall into the category.
category_share = central_onehot.groupby('Neighborhood').mean()
central_grouped = category_share.reset_index()
central_grouped

Unnamed: 0,Neighborhood,American Restaurant,BBQ Joint,Bagel Shop,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,Chinese Restaurant,...,Sports Bar,Supermarket,Sushi Restaurant,Swim School,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Davisville,0.029412,0.0,0.0,0.0,0.029412,0.0,0.0,0.058824,0.029412,...,0.0,0.0,0.058824,0.0,0.029412,0.029412,0.0,0.0,0.0,0.0
1,Davisville North,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Forest Hill North, Forest Hill West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0
3,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,...,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0
4,North Toronto West,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.045455,0.045455,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455
5,Roselawn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,"Summerhill East, Moore Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Summerhill West, Forest Hill SE, South Hill, D...",0.066667,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,...,0.066667,0.066667,0.066667,0.0,0.0,0.0,0.0,0.0,0.066667,0.0
8,"Yorkville, North Midtown, The Annex",0.043478,0.043478,0.0,0.0,0.0,0.043478,0.0,0.130435,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0


In [24]:
# (number of neighborhoods, number of category columns + the Neighborhood column)
central_grouped.shape

(9, 61)

We now create a dataframe with the top 10 categories of venues for each neighborhood.

In [25]:
def return_most_common_venues(row, num_top_venues):
    """
    Return the labels of the highest-valued entries of a row.

    The first entry of the row (the neighborhood name) is skipped; the
    remaining entries are sorted in descending order and the index labels of
    the first num_top_venues of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [26]:
num_top_venues = 10

# ordinal suffixes of the first three ranks; every later rank gets 'th'
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in range(num_top_venues):
    # An explicit bounds check replaces the former bare `except:`, which
    # would also have hidden unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = central_grouped['Neighborhood']

# fill each row with the top categories of the corresponding neighborhood
for ind in range(central_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(central_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Davisville,Dessert Shop,Sandwich Place,Café,Italian Restaurant,Coffee Shop,Gym,Sushi Restaurant,Pizza Place,Indian Restaurant,Gourmet Shop
1,Davisville North,Hotel,Gym,Sandwich Place,Clothing Store,Park,Food & Drink Shop,Breakfast Spot,Gym / Fitness Center,Health & Beauty Service,Farmers Market
2,"Forest Hill North, Forest Hill West",Trail,Jewelry Store,Sushi Restaurant,Park,Yoga Studio,Gourmet Shop,Food & Drink Shop,Fried Chicken Joint,Garden,Gift Shop
3,Lawrence Park,Swim School,Bus Line,Park,Photography Studio,Yoga Studio,Farmers Market,History Museum,Health & Beauty Service,Gym / Fitness Center,Gym
4,North Toronto West,Sporting Goods Shop,Coffee Shop,Yoga Studio,Salon / Barbershop,Ice Cream Shop,Gift Shop,Mexican Restaurant,Park,Pet Store,Diner


### KMeans clustering

The clustering is based on the mean frequency of every venue category in each neighborhood.

In [27]:
from sklearn.cluster import KMeans

In [28]:
# set number of clusters
kclusters = 5

# Keep only the numeric category columns. `columns=` is used because the
# positional axis argument of DataFrame.drop was removed in pandas 2.0.
central_grouped_clustering = central_grouped.drop(columns='Neighborhood')

# run k-means clustering
# n_init is spelled out because its default differs between scikit-learn
# versions; 10 is the value of the long-standing former default.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(central_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 4, 3, 0, 2, 1, 0, 0], dtype=int32)

In [29]:
# add clustering labels
# DataFrame.insert raises when the column already exists, so overwrite the
# column instead when this cell is run a second time.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# add the cluster label and the top venues of each neighborhood to the
# Central Toronto table, which already holds latitude/longitude
central_merged = central_toronto.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

central_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,3,Swim School,Bus Line,Park,Photography Studio,Yoga Studio,Farmers Market,History Museum,Health & Beauty Service,Gym / Fitness Center,Gym
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197,0,Hotel,Gym,Sandwich Place,Clothing Store,Park,Food & Drink Shop,Breakfast Spot,Gym / Fitness Center,Health & Beauty Service,Farmers Market
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,0,Sporting Goods Shop,Coffee Shop,Yoga Studio,Salon / Barbershop,Ice Cream Shop,Gift Shop,Mexican Restaurant,Park,Pet Store,Diner
47,M4S,Central Toronto,Davisville,43.704324,-79.38879,0,Dessert Shop,Sandwich Place,Café,Italian Restaurant,Coffee Shop,Gym,Sushi Restaurant,Pizza Place,Indian Restaurant,Gourmet Shop
48,M4T,Central Toronto,"Summerhill East, Moore Park",43.689574,-79.38316,1,Restaurant,Park,Playground,Diner,History Museum,Health & Beauty Service,Gym / Fitness Center,Gym,Greek Restaurant,Gourmet Shop


### Display the different clusters on a map.

In [30]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [31]:
# create map, centered on the mean coordinate computed for Central Toronto
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# A neighborhood without any venue has no cluster label (NaN after the
# join); it cannot be colored, so it is left off the map.
clustered = central_merged.dropna(subset=['Cluster Labels'])

# add markers to the map
for lat, lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'],
                                  clustered['Neighborhood'], clustered['Cluster Labels']):
    # the label column is float whenever the join produced a NaN, and a
    # float cannot index the color list
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # cluster-1 shifts the palette by one (label 0 wraps around to the
        # last color); every cluster still gets its own color
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters