# Toronto Clustering

## Obtaining and cleaning data

In [15]:
#import libraries
import pandas as pd
import numpy as np

#Read html making "Not Assigned" as NaN
tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', header=0,
                      na_values='Not assigned')

#Defines table expected heading
headings = ['Postal Code', 'Borough', 'Neighborhood']

#loop through read tables to identify target
for table in tables:
    current_headings = table.columns.values[:3]
    if len(current_headings) != len(headings):
        continue
    if all(current_headings == headings):
        break

#Drops NaN Boroughs
df = table.dropna(subset=['Borough'])

#Renames columns do keep a standard
df.columns = ['PostalCode','Borough','Neighborhood']

#Iterate to find empty Neighborhoods and insert the Borough name if found
for index, row in df.iterrows():
    if df.loc[index]['Neighborhood'] == "":
        df.loc[index]['Neighborhood'] = df.loc[index]['Borough']
    
#Group Neighborhoods by Postal Code
df = df.groupby(['PostalCode']).sum()

df.shape

(103, 2)

## Acquiring Coordinates

In [16]:
geospat = pd.read_csv('https://cocl.us/Geospatial_data')
geospat.columns = ['PostalCode','Latitude','Longitude']
geospat.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
#Merging geospatial data with neighborhoods
df = pd.merge(df,geospat, on='PostalCode')

#select only one Borough for simplicity of the example
df = df[df['Borough'] == 'Scarborough'].reset_index(drop=True)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [4]:
# Installing mapping libs
#!conda install -c conda-forge geopy --yes
#!conda install -c conda-forge folium=0.5.0 --yes


^C


In [65]:
import requests # library to handle requests

CLIENT_ID = 'K0GDPZH3XGE31BAXK4K35BJG0P23R5VZYM4V2SD3ZCFIRYPP' # your Foursquare ID
CLIENT_SECRET = '4VUESA5BAPQHYO2XFHAPXAP0OJCK5DSJBWONTY45SUHVGISE' # your Foursquare Secret
VERSION = '20200524' # Foursquare API version

#function to default to venue.categories when none avaiable
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

nearby_venuesT = pd.DataFrame()

#Loop to populate each neighborhood with most common top 10 venues categories
for i, row in df.iterrows():
    neigh_lat = row['Latitude']
    neigh_long = row['Longitude']

    url='https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&time{}&day{}&sortByPopularity{}&limit{}'.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        neigh_lat,
        neigh_long,
        'any',
        'any',
        '1',
        '10'
        )

    results = requests.get(url).json()
    venues = results['response']['groups'][0]['items']
    nearby_venues = pd.json_normalize(venues)
    filtered_columns = ['venue.categories']
    nearby_venues =nearby_venues.loc[:, filtered_columns]
    nearby_venues = nearby_venues.head(10)

    # filter the category for each row
    nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

    # clean columns
    nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

    for index in nearby_venues:
        nearby_venues['score'] = 1
    
    nearby_venues_count = nearby_venues.groupby('categories').count()
    nearby_venuesT = nearby_venuesT.append(nearby_venues_count.transpose())
    
nearby_venuesT.head()

Unnamed: 0,Athletics & Sports,Bakery,Park,Zoo,Zoo Exhibit,Breakfast Spot,Burger Joint,Campground,Cosmetics Shop,Italian Restaurant,...,Mexican Restaurant,Noodle House,Seafood Restaurant,Chinese Restaurant,Tattoo Parlor,Vegetarian / Vegan Restaurant,BBQ Joint,Hotpot Restaurant,Japanese Restaurant,Farm
score,1.0,1.0,1.0,2.0,5.0,,,,,,...,,,,,,,,,,
score,1.0,1.0,3.0,,,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
score,1.0,,2.0,,,,1.0,,,,...,,,,,,,,,,
score,1.0,,1.0,,,,1.0,,,,...,,,,,,,,,,
score,,,,,,,1.0,,,,...,,,,,,,,,,


In [66]:
# Inserting zeroes in NaN
nearby_venuesT.fillna(0,inplace=True)

# Removing low information columns
nearby_venuesT = nearby_venuesT.drop(columns=nearby_venuesT.loc[:,nearby_venuesT.max() <= 1])

#Cleaning up and adding Postal Code for merge
nearby_venuesT = nearby_venuesT.reset_index()
nearby_venuesT['PostalCode'] = df['PostalCode']

nearby_venuesT.head()

Unnamed: 0,index,Park,Zoo,Zoo Exhibit,Coffee Shop,Indian Restaurant,Caribbean Restaurant,Middle Eastern Restaurant,Noodle House,Chinese Restaurant,PostalCode
0,score,1.0,2.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,M1B
1,score,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,M1C
2,score,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,M1E
3,score,1.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,M1G
4,score,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,M1H


In [74]:
# Merging venue count with Neighborhood
df_cat = df
df_cat = df_cat.merge(nearby_venuesT, on='PostalCode', how='left') 

#Merging categories
df_cat['Zoo Venues'] = df_cat['Zoo']+df_cat['Zoo Exhibit']
df_cat['Food Venues'] = df_cat['Indian Restaurant']+df_cat['Caribbean Restaurant']+df_cat['Middle Eastern Restaurant']
+df_cat['Noodle House']+df_cat['Chinese Restaurant']

#Creating clean features dataset
df_cat_clean = df_cat[['Neighborhood','Latitude','Longitude','Park','Coffee Shop','Zoo Venues','Food Venues']]
df_cat_clean.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,Park,Coffee Shop,Zoo Venues,Food Venues
0,"Malvern, Rouge",43.806686,-79.194353,1.0,0.0,7.0,0.0
1,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,3.0,0.0,0.0,0.0
2,"Guildwood, Morningside, West Hill",43.763573,-79.188711,2.0,0.0,0.0,0.0
3,Woburn,43.770992,-79.216917,1.0,1.0,0.0,2.0
4,Cedarbrae,43.773136,-79.239476,0.0,1.0,0.0,2.0


## Preparing cluster model

In [90]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn import metrics

X = StandardScaler().fit_transform(df_cat_clean[['Latitude','Longitude','Park','Coffee Shop','Zoo Venues','Food Venues']])
kmeans = KMeans(n_clusters=4, random_state=0).fit(X)
df_plot = df_cat_clean
df_plot['cluster'] = kmeans.labels_
df_plot

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Neighborhood,Latitude,Longitude,Park,Coffee Shop,Zoo Venues,Food Venues,cluster
0,"Malvern, Rouge",43.806686,-79.194353,1.0,0.0,7.0,0.0,3
1,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,3.0,0.0,0.0,0.0,1
2,"Guildwood, Morningside, West Hill",43.763573,-79.188711,2.0,0.0,0.0,0.0,1
3,Woburn,43.770992,-79.216917,1.0,1.0,0.0,2.0,0
4,Cedarbrae,43.773136,-79.239476,0.0,1.0,0.0,2.0,0
5,Scarborough Village,43.744734,-79.239476,1.0,1.0,0.0,1.0,0
6,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029,0.0,0.0,0.0,0.0,0
7,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577,1.0,1.0,0.0,1.0,0
8,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476,4.0,1.0,0.0,1.0,1
9,"Birch Cliff, Cliffside West",43.692657,-79.264848,1.0,1.0,0.0,0.0,0


## Preparing maps

In [97]:
# Importing mapping libs
from geopy.geocoders import Nominatim
import folium
from folium.plugins import MarkerCluster

In [148]:
#center of the map
address = 'Scarborough, Toronto, ON'

geolocator = Nominatim(user_agent="scarborough")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
map_scarborough = folium.Map(location=[latitude, longitude], zoom_start=12,tiles='Stamen Toner')



#Drawing circles of 50m from each Neighborhood center and ploting clusters with different colors. Clusters are defined not only
#by distance but also by main venues
for i, row in df_plot.iterrows():
    neigh = row['Neighborhood']
    neigh = neigh[0:15]
    
    if row['cluster']==2 :
        
        folium.CircleMarker(
            location=[row['Latitude'], row['Longitude']],
            radius=50,
            popup = neigh,
            color='Black',
            fill=True,
            fill_color='Black'
        ).add_to(map_scarborough)
     
    elif row['cluster']==0:
        
        folium.CircleMarker(
            location=[row['Latitude'], row['Longitude']],
            radius=50,
            popup=row['Neighborhood'],
            color='Red',
            fill=True,
            fill_color='Red'
        ).add_to(map_scarborough)

    elif row['cluster']==1:
        
        folium.CircleMarker(
            location=[row['Latitude'], row['Longitude']],
            radius=50,
            popup=row['Neighborhood'],
            color='Blue',
            fill=True,
            fill_color='Blue'
        ).add_to(map_scarborough)
        
    elif row['cluster']==3:
        
        folium.CircleMarker(
            location=[row['Latitude'], row['Longitude']],
            radius=50,
            popup=row['Neighborhood'],
            color='Purple',
            fill=True,
            fill_color='Purple'
        ).add_to(map_scarborough)

      
map_scarborough