In [4]:
import io
import os
import warnings

import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    certifi-2019.9.11          |           py36_0         147 KB  conda-forge
    altair-3.2.0               |           py36_0         770 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    ca-certificates-2019.9.11  |       hecc5488_0         144 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.3 MB

The following NEW packages will be 

# Create a DataFrame from the Wikipedia page listing Canadian postal codes that start with M

In [28]:
# Download the Wikipedia list of Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops here on an HTTP error instead of parsing an error page further down.
res=requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',timeout=30)
res.raise_for_status()
soup=BeautifulSoup(res.content,'html.parser')
# The first <table> element of the page is the one that gets parsed.
tables=soup.find_all('table')
if not tables:
    raise ValueError('No <table> element found on the postal code page')
table=tables[0]
# read_html() is given a file-like object; newer pandas releases deprecate
# passing the HTML markup itself as a plain string.
canada_df=pd.read_html(io.StringIO(str(table)))[0]

In [29]:
# Clean the scraped table without mutating it step by step:
#   1. the placeholder text 'Not assigned' becomes a missing value,
#   2. rows without a borough are discarded,
#   3. remaining gaps are filled from the column to their left on the same row
#      (presumably the borough filling a missing neighbourhood -- this assumes
#      the column order Postcode, Borough, Neighbourhood; confirm on the table).
# .ffill(axis=1) replaces the deprecated fillna(method='ffill', axis=1).
canada_df=(
    canada_df
    .replace('Not assigned',np.nan)
    .dropna(subset=['Borough'],axis=0)
    .ffill(axis=1)
)
# One row per (Postcode, Borough); the neighbourhood names of each group are
# joined into a single comma-separated string.
cn_df=(
    canada_df
    .groupby(['Postcode','Borough'])['Neighbourhood']
    .apply(', '.join)
    .reset_index()
)

In [30]:
# Show (rows, columns) of the grouped frame as a quick sanity check.
cn_df.shape

(103, 3)

# Get the latitude and the longitude coordinates of each postal code from the geospatial CSV file

In [31]:
# The geocoder could not return the coordinates, so they are read from a CSV
# file instead. Its 'Postal Code' column is renamed to 'Postcode' so that it
# shares a key column with cn_df.
geo_df=(
    pd.read_csv('http://cocl.us/Geospatial_data')
    .rename(columns={'Postal Code':'Postcode'})
)
# Join the neighbourhood table and the coordinate table on 'Postcode'.
data_df=cn_df.merge(geo_df,on=['Postcode'])

In [32]:
# Keep only the rows whose borough name contains 'Toronto', renumbered from 0.
is_toronto_borough=data_df['Borough'].str.contains('Toronto')
toronto_df=data_df.loc[is_toronto_borough].reset_index(drop=True)
toronto_df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


# Explore venues in the Toronto boroughs using the Foursquare API

In [33]:
# Foursquare API credentials are read from the environment so that the secret
# is never stored in the notebook, its outputs, or version control.
# Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE: the key pair that used to be hardcoded in this cell has been exposed
# and should be revoked and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
# API version date, sent as the `v` parameter of the venue search request.
VERSION = '20191018'

if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'the Foursquare requests in this notebook cannot authenticate.'
    )

In [34]:
#define a function to clean the category data
def get_category_type(row):
    """Return the name of the first category attached to a venue record.

    Parameters
    ----------
    row : mapping (for example a pandas Series or a dict)
        Venue record holding its category list under the key 'categories' or,
        when that key is absent, under 'venue.categories'.

    Returns
    -------
    str or None
        The 'name' entry of the first category, or None when the category
        list is empty or missing (None / NaN).

    Raises
    ------
    KeyError
        If the record has neither 'categories' nor 'venue.categories'.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key selects the fallback column; any other error is
        # a real problem and is allowed to propagate.
        categories_list = row['venue.categories']

    # A missing value (None, or the float NaN pandas uses for gaps) has no
    # length, so it is reported the same way as an empty list.
    if categories_list is None or isinstance(categories_list, float):
        return None
    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [35]:
# Start with an empty venues table; the next cell populates `venues`.
venues=pd.DataFrame()

In [36]:
# Query the Foursquare venue search once per row of toronto_df (using that
# row's Latitude / Longitude) and combine all results into `venues`.
#
# The per-request frames are gathered in a list and concatenated once, and
# `venues` is rebuilt from scratch, so re-running this cell does not append
# duplicates to a previously filled (and already renamed) `venues` table.
filtered_columns=['name','location.postalCode','location.lat','location.lng','categories']
SEARCH_URL='https://api.foursquare.com/v2/venues/search'
RADIUS=500  # value of the `radius` request parameter
LIMIT=100   # value of the `limit` request parameter

venue_frames=[]
for LATITUDE,LONGITUDE in zip(toronto_df['Latitude'],toronto_df['Longitude']):
    # Passing `params` lets requests build and escape the query string.
    response=requests.get(
        SEARCH_URL,
        params={
            'client_id':CLIENT_ID,
            'client_secret':CLIENT_SECRET,
            'v':VERSION,
            'll':'{},{}'.format(LATITUDE,LONGITUDE),
            'radius':RADIUS,
            'limit':LIMIT,
        },
        timeout=30,
    )
    # Surface the HTTP error itself rather than a KeyError on the payload below.
    response.raise_for_status()
    results=response.json()
    venues_temp=json_normalize(results['response']['venues'])
    # reindex() keeps the wanted columns and creates any column this response
    # lacks (filled with NaN) instead of raising like .loc would.
    venues_temp=venues_temp.reindex(columns=filtered_columns)
    # Venues without a postal code cannot be matched to a postcode later on.
    venues_temp=venues_temp.dropna(subset=['location.postalCode'],axis=0).copy()
    if venues_temp.empty:
        continue
    # Keep the first three characters of the postal code, upper-cased, so the
    # values are comparable with the three-character Postcode keys of toronto_df.
    venues_temp['location.postalCode']=venues_temp['location.postalCode'].astype(str).str[0:3].str.upper()
    # Reduce each venue's category list to the name of its first category.
    venues_temp['categories']=venues_temp.apply(get_category_type,axis=1)
    venue_frames.append(venues_temp)

if venue_frames:
    venues=pd.concat(venue_frames,axis=0,sort=False,ignore_index=True)
else:
    # No request returned a usable venue: keep an empty table with the expected columns.
    venues=pd.DataFrame(columns=filtered_columns)

venues=venues.rename(columns={'name':'Name','location.postalCode':'Postcode','location.lat':'Venue_Latitude','location.lng':'Venue_Longitude','categories':'Categories'})
# NOTE: this rebinds `data_df`, which earlier held the postcode/coordinate
# table, to the venue-level table (one row per venue, joined on Postcode).
data_df=pd.merge(venues,toronto_df,on=['Postcode'])

In [16]:
# Preview the first rows only; rendering the whole venue table bloats the notebook.
data_df.head(10)

Unnamed: 0,Name,Postcode,Venue_Latitude,Venue_Longitude,Categories,Borough,Neighbourhood,Latitude,Longitude
0,Beaches Fitness - Personal Trainer & Health Coach,M4L,43.669253,-79.311140,Gym / Fitness Center,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
1,British Style Fish & Chips,M4L,43.668723,-79.317139,Fish & Chips Shop,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
2,Zipcar,M4L,43.667100,-79.314800,Rental Car Location,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,Paint Nite,M4L,43.667302,-79.312767,Nightlife Spot,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
4,TTC Russell Substation,M4L,43.666293,-79.315828,Power Plant,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
5,Sixgreen Labs Inc.,M4L,43.669213,-79.319305,Tech Startup,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
6,Off Centre DJ School,M4L,43.671670,-79.318887,College Classroom,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
7,Village Hardware,M4L,43.666257,-79.317169,Hardware Store,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
8,Ele Ela Hair Design,M4L,43.667545,-79.311244,Salon / Barbershop,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
9,The Tulip Steakhouse,M4L,43.666348,-79.316854,Steakhouse,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572


In [37]:
# One-hot encode the venue categories (one indicator column per category
# name), tag every venue row with its postcode, then average the indicators
# per postcode to get each category's share among that postcode's venues.
category_indicators=pd.get_dummies(data_df[['Categories']], prefix="", prefix_sep="")
toronto_onehot=category_indicators.assign(Postcode=data_df['Postcode'])
toronto_grouped=(
    toronto_onehot
    .groupby('Postcode')
    .mean()
    .reset_index()
)

In [38]:
# Cluster the postcodes by their venue-category profile with k-means.
k=5
# Feature matrix: every column except the key. 'Labels' is dropped as well
# (when present) because this cell writes it into toronto_grouped below;
# otherwise re-running the cell would feed the previous cluster labels back
# in as a feature and silently change the result.
toronto_clustering=toronto_grouped.drop(columns=['Postcode','Labels'],errors='ignore')
# n_init is set explicitly so the number of restarts does not depend on the
# default of the installed scikit-learn version.
kmeans=KMeans(n_clusters=k,random_state=0,n_init=10)
kmeans.fit(toronto_clustering)
labels=kmeans.labels_
toronto_grouped['Labels']=labels
toronto_clustered=toronto_grouped[['Postcode','Labels']]
# Attach borough, neighbourhood and coordinates to every labelled postcode.
toronto_clustered_2=pd.merge(toronto_clustered,toronto_df,on=['Postcode'])
toronto_clustered_2

Unnamed: 0,Postcode,Labels,Borough,Neighbourhood,Latitude,Longitude
0,M4E,0,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,0,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,3,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,0,East Toronto,Studio District,43.659526,-79.340923
4,M4N,0,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,4,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,2,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,0,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,3,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,4,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


In [39]:
# Draw one circle marker per postcode on a folium map, coloured by cluster label.
map_clusters=folium.Map(location=[43.653963,-79.387207],zoom_start=11)
# One evenly spaced rainbow colour per cluster (k entries, as hex strings).
colors_array=cm.rainbow(np.linspace(0,1,k))
rainbow=[colors.rgb2hex(i) for i in colors_array]

for lat, lon, poi, cluster in zip(toronto_clustered_2['Latitude'],toronto_clustered_2['Longitude'],toronto_clustered_2['Neighbourhood'],toronto_clustered_2['Labels']):
    label=folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # The labels are 0-based (0 .. k-1), so they index the palette directly;
    # `cluster-1` only worked because index -1 wraps around to the last colour.
    marker_color=rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters