## Notebook for Capstone Project
In this notebook, I will be using some location data on neighborhoods in Toronto to identify similar neighborhoods based on some features such as schools, restaurants, museums, etc.

## Obtaining Data from Wikipedia

In [1]:
import pandas as pd
import numpy as np
print ('Hello Capstone Project Course!')

Hello Capstone Project Course!


In [2]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [3]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [4]:
DataSource = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_T"
source = requests.get(DataSource).text
#print(source)

In [5]:
soup = BeautifulSoup(source, 'html')

In [6]:
mydata =soup.find_all('table')[1] # Grab the second table
#print(mydata)

In [7]:
# Creating a pandas dataframe with five columns: PostalCode, Borough, Neighborhood, Latitude, and Longitude
column_names = ['Postalcode','Borough','Neighborhood', 'Latitude', 'Longitude']
df = pd.DataFrame(columns = column_names)

In [8]:
# Populating the pandas DataFrame with all the postcode, borough, neighborhood, latitude, and longitude
for tr_cell in mydata.find_all('tr'):
    row_data=[]
    for td_cell in tr_cell.find_all('td'):
        row_data.append(td_cell.text.strip())
    if len(row_data)==5:
        df.loc[len(df)] = row_data

In [9]:
df.head(5)

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,T1A,Medicine Hat,Central Medicine Hat,50.03646,-110.67925
1,T2A,Calgary,"Penbrooke Meadows, Marlborough",51.04968,-113.96432
2,T3A,Calgary,"Dalhousie, Edgemont, Hamptons, Hidden Valley",51.12606,-114.143158
3,T4A,Airdrie,East Airdrie,51.27245,-113.98698
4,T5A,Edmonton,"West Clareview, East Londonderry",53.5899,-113.4413


In [10]:
df.shape

(180, 5)

# Data Cleaning
Subsetting data to remove all boroughs with **Not assigned**

In [11]:
df=df[df['Borough']!='Not assigned']
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,T1A,Medicine Hat,Central Medicine Hat,50.03646,-110.67925
1,T2A,Calgary,"Penbrooke Meadows, Marlborough",51.04968,-113.96432
2,T3A,Calgary,"Dalhousie, Edgemont, Hamptons, Hidden Valley",51.12606,-114.143158
3,T4A,Airdrie,East Airdrie,51.27245,-113.98698
4,T5A,Edmonton,"West Clareview, East Londonderry",53.5899,-113.4413


In [12]:
df.shape

(144, 5)

Setting **Neighborhood** to **Borough** name if **Neighborhood** is **Not assigned**

In [13]:
temp_df=df.groupby('Postalcode')['Neighborhood'].apply(lambda x: "%s" % ', '.join(x))
temp_df=temp_df.reset_index(drop=False)
temp_df.rename(columns={'Neighborhood':'Neighborhood_joined'},inplace=True)
temp_df.head()

Unnamed: 0,Postalcode,Neighborhood_joined
0,T1A,Central Medicine Hat
1,T1B,South Medicine Hat
2,T1C,North Medicine Hat
3,T1G,Not assigned
4,T1H,North Lethbridge


In [14]:
df_merge = pd.merge(df, temp_df, on='Postalcode')

In [15]:
df_merge.drop(['Neighborhood'],axis=1,inplace=True)

In [16]:
df_merge.drop_duplicates(inplace=True)

In [17]:
df_merge.rename(columns={'Neighborhood_joined':'Neighborhood'},inplace=True)
df_merge.head(10)

Unnamed: 0,Postalcode,Borough,Latitude,Longitude,Neighborhood
0,T1A,Medicine Hat,50.03646,-110.67925,Central Medicine Hat
1,T2A,Calgary,51.04968,-113.96432,"Penbrooke Meadows, Marlborough"
2,T3A,Calgary,51.12606,-114.143158,"Dalhousie, Edgemont, Hamptons, Hidden Valley"
3,T4A,Airdrie,51.27245,-113.98698,East Airdrie
4,T5A,Edmonton,53.5899,-113.4413,"West Clareview, East Londonderry"
5,T6A,Edmonton,53.5483,-113.408,North Capilano
6,T7A,Drayton Valley,53.2165,-114.9893,Not assigned
7,T8A,Sherwood Park,53.519,-113.3216,West Sherwood Park
8,T9A,Wetaskiwin,52.9741,-113.3646,Not assigned
9,T1B,Medicine Hat,50.0172,-110.651,South Medicine Hat


In [18]:
df_merge.shape

(144, 5)

In [19]:
Edmonton_df1=df_merge[df_merge['Neighborhood']!='Not assigned']
Edmonton_df2=Edmonton_df1[Edmonton_df1['Latitude']!='Not assigned']
Edmonton_df3=Edmonton_df2[Edmonton_df2['Longitude']!='Not assigned']
Edmonton_df=Edmonton_df3[Edmonton_df3['Borough'].str.contains("Edmonton")]
Edmonton_df.head()

Unnamed: 0,Postalcode,Borough,Latitude,Longitude,Neighborhood
4,T5A,Edmonton,53.5899,-113.4413,"West Clareview, East Londonderry"
5,T6A,Edmonton,53.5483,-113.408,North Capilano
13,T5B,Edmonton,53.5766,-113.4608,"East North Central, West Beverly"
14,T6B,Edmonton,53.5322,-113.4404,"SE Capilano, West Southeast Industrial, East B..."
20,T5C,Edmonton,53.6129,-113.4572,Central Londonderry


## Exploring and Clustering neighborhoods in Edmonton

In [20]:
import numpy as np
import os
from sklearn.cluster import KMeans
import folium 
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt


print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.


## Using the FourSquare API to Extract data on venues

In [21]:
CLIENT_ID = 'KMILCNBP3WGHK4E4HENJZLYPLG3GHR4HTOWAETSCAO35CQJ5' # your Foursquare ID
CLIENT_SECRET = 'E0INTB2L25UG1EWAP5BND5R1ER303VJBKZNXCHHEBJOOUV2C' # your Foursquare Secret
ACCESS_TOKEN = 'OSNZFZ3BU0XBVZ55K1SAD0R5SVERF04SZSATHPMZSMDR4CN2' # your FourSquare Access Token
VERSION = '20180604'
LIMIT = 100
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: KMILCNBP3WGHK4E4HENJZLYPLG3GHR4HTOWAETSCAO35CQJ5
CLIENT_SECRET:E0INTB2L25UG1EWAP5BND5R1ER303VJBKZNXCHHEBJOOUV2C


In [22]:
def getNearbyVenues(names, latitudes, longitudes):
    radius=500
    LIMIT=100
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [23]:
Edmonton_venues = getNearbyVenues(names=Edmonton_df['Neighborhood'],
                                   latitudes=Edmonton_df['Latitude'],
                                   longitudes=Edmonton_df['Longitude']
                                  )

West Clareview, East Londonderry
North Capilano
East North Central, West Beverly
SE Capilano, West Southeast Industrial, East Bonnie Doon
Central Londonderry
Central Bonnie Doon
West Londonderry, East Calder
South Bonnie Doon, East University
North Central, Queen Mary Park, Blatchford
West University, Strathcona Place
NorthDowntown Fringe, East Downtown Fringe
Southgate, North Riverbend
North Downtown
Kaskitayo, Aspen Gardens
South Downtown, South Downtown Fringe (Alberta Provincial Government)
West Mill Woods
North Westmount, West Calder, East Mistatim
East Mill Woods
South Westmount, Groat Estate, East Northwest Industrial
Southwest Edmonton
Glenora, SW Downtown Fringe
South Industrial
North Jasper Place
East Southeast Industrial, South Clover Bar
Central Jasper Place, Buena Vista
Southgate, North Riverbend
West Northwest Industrial, Winterburn
North Clover Bar
West Jasper Place, West Edmonton Mall
The Meadows
Central Mistatim
The Palisades, West Castle Downs
Central Beverly
Heritage

In [24]:
Edmonton_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Central Beverly,4,4,4,4,4,4
Central Bonnie Doon,3,3,3,3,3,3
"Central Jasper Place, Buena Vista",9,9,9,9,9,9
Central Mistatim,3,3,3,3,3,3
East Castledowns,6,6,6,6,6,6
East Mill Woods,2,2,2,2,2,2
"East North Central, West Beverly",4,4,4,4,4,4
"East Southeast Industrial, South Clover Bar",2,2,2,2,2,2
Ellerslie,2,2,2,2,2,2
"Glenora, SW Downtown Fringe",2,2,2,2,2,2


In [25]:
# one hot encoding
Edmonton_onehot = pd.get_dummies(Edmonton_venues[['Venue Category']], prefix="", prefix_sep="")
#Scarborough_onehot.drop(['Neighborhood'],axis=1,inplace=True) 
Edmonton_onehot.insert(loc=0, column='Neighborhood', value=Edmonton_venues['Neighborhood'])
Edmonton_onehot.shape

(304, 123)

In [26]:
# one hot encoding
Edmonton_onehot = pd.get_dummies(Edmonton_venues[['Venue Category']], prefix="", prefix_sep="")
#Scarborough_onehot.drop(['Neighborhood'],axis=1,inplace=True) 
Edmonton_onehot.insert(loc=0, column='Neighborhood', value=Edmonton_venues['Neighborhood'])
Edmonton_onehot.shape

(304, 123)

In [27]:
Edmonton_grouped1 = Edmonton_onehot.groupby('Neighborhood').count().reset_index()
Edmonton_grouped1.head()

Unnamed: 0,Neighborhood,American Restaurant,Arts & Crafts Store,Asian Restaurant,Bakery,Bank,Baseball Field,Baseball Stadium,Big Box Store,Bookstore,...,Thai Restaurant,Theater,Toy / Game Store,Trail,Turkish Restaurant,Vietnamese Restaurant,Warehouse Store,Water Park,Whisky Bar,Wine Shop
0,Central Beverly,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
1,Central Bonnie Doon,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
2,"Central Jasper Place, Buena Vista",9,9,9,9,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
3,Central Mistatim,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
4,East Castledowns,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6


## Segmenting and Clustering Neighborhoods

In [31]:
# set number of clusters

Edmonton_grouped_clustering = Edmonton_grouped1.drop('Neighborhood', 1)

kclusters = 5 # since n_clusters=3 has the highest Silhouette Coefficient we choose kclusters=3

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Edmonton_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 4, 3, 4, 0, 4, 0, 4, 4, 4], dtype=int32)

In [None]:
# add clustering labels
#neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
#neighborhoods_venues_sorted.head()

In [32]:
Edmonton_grouped1.insert(0, 'Cluster Labels', kmeans.labels_)
Edmonton_grouped1.head()

Unnamed: 0,Cluster Labels,Neighborhood,American Restaurant,Arts & Crafts Store,Asian Restaurant,Bakery,Bank,Baseball Field,Baseball Stadium,Big Box Store,...,Thai Restaurant,Theater,Toy / Game Store,Trail,Turkish Restaurant,Vietnamese Restaurant,Warehouse Store,Water Park,Whisky Bar,Wine Shop
0,0,Central Beverly,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
1,4,Central Bonnie Doon,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
2,3,"Central Jasper Place, Buena Vista",9,9,9,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
3,4,Central Mistatim,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
4,0,East Castledowns,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6


In [None]:
# add clustering labels
#neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# merge Edmonton_grouped with Edmonton_data to add latitude/longitude for each neighborhood
#Edmonton_df = Edmonton_df.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

#Edmonton_df.head()

In [None]:
# merge Edmonton_grouped with Edmonton_data to add latitude/longitude for each neighborhood
Edmonton_df = Edmonton_df.join(Edmonton_grouped1.set_index('Neighborhood'), on='Neighborhood')

Edmonton_df.head()

In [None]:
Edmonton_df.dropna()

In [None]:
Edmonton_df['Cluster Labels'].value_counts()

In [None]:
address = 'Edmonton, AB'

geolocator = Nominatim(user_agent="Sc_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinate of Edmonton is {}, {}.'.format(latitude, longitude))

In [None]:
Edmonton_df.shape

In [None]:
Edmonton_df=Edmonton_df.dropna()
Edmonton_df.shape

In [None]:
Edmonton_df = Edmonton_df.astype({"Cluster Labels": int})
Edmonton_df['Cluster Labels'].dtype

In [None]:
Edmonton_df = Edmonton_df.astype({"Latitude": float})
Edmonton_df = Edmonton_df.astype({"Longitude": float})
Edmonton_df.dtypes

In [None]:
Edmonton_df['Cluster Labels'].value_counts()

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Edmonton_df['Latitude'], Edmonton_df['Longitude'], Edmonton_df['Neighborhood'], Edmonton_df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster - 1],
        fill=True,
        fill_color=rainbow[cluster - 1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters