# Capstone Project Notebook

#### Import required packages:

In [1]:
import pandas as pd
import numpy as np

In [2]:
# !pip install bs4
# !pip install requests
from bs4 import BeautifulSoup
import requests

### PartA: Obtaining information from wikipedia

In [3]:
#Mark the url and obtain lxml
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
req = requests.get(url).text
soup = BeautifulSoup(req,'lxml')

In [4]:
#cut out the required table
table = soup.find('table',{'class':'wikitable sortable'})
ths = table.find_all('th')

In [5]:
#Clean the table data and put into a dataframe
rows = table.findAll("tr")
row_lengths = [len(r.findAll(['th', 'td'])) for r in rows]
ncols = max(row_lengths)
nrows = len(rows)
data = []
for i in range(nrows):
    rowD = []
    for j in range(ncols):
        rowD.append('')
    data.append(rowD)

for i in range(len(rows)):
    row = rows[i]
    rowD = []
    cells = row.findAll(["td", "th"])
    for j in range(len(cells)):
        cell = cells[j]

        #lots of cells span cols and rows so lets deal with that
        cspan = int(cell.get('colspan', 1))
        rspan = int(cell.get('rowspan', 1))
        l = 0
        for k in range(rspan):
            # Shifts to the first empty cell of this row
            while data[i + k][j + l]:
                l += 1
            for m in range(cspan):
                cell_n = j + l + m
                row_n = i + k
                # in some cases the colspan can overflow the table, in those cases just get the last item
                cell_n = min(cell_n, len(data[row_n])-1)
                data[row_n][cell_n] += cell.text

    data.append(rowD)

In [6]:
df = pd.DataFrame(data, columns=["Postcode", "Borough", "Neighbourhood"])
#Remove no post code records
df = df[df.Postcode != 'Postcode']
#drop null values
df = df.dropna()
#Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
df = df[df.Borough != 'Not assigned']
#Clean the Neighbourhood column
df = df.replace('\n', '', regex=True)
#If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
df.loc[df.Neighbourhood == "Not assigned", 'Neighbourhood'] = df['Borough']

In [7]:
#Remove duplicate by merging
df_dup = df[df.duplicated(subset=["Postcode", "Borough"],keep=False)].groupby(["Postcode", "Borough"])['Neighbourhood'].apply(','.join).reset_index()
df_nodup = df[~df.duplicated(subset=["Postcode", "Borough"],keep=False)]
df_final = pd.concat([df_dup, df_nodup], ignore_index=True, sort = False)

In [8]:
#Checking if there are missing Postcode
print(df_final.Postcode.nunique(),df.Postcode.nunique())

103 103


In [9]:
df_final.shape

(103, 3)

### PartB: Obtain geographical coordinates

In [10]:
!pip install geocoder
import geocoder # import geocoder
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 15.7MB/s ta 0:00:01
[?25hCollecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [11]:
# Geocoder not working
# for index, row in df.iterrows():
#     g = geocoder.google('{}, Toronto, Ontario'.format(row['Postcode']))
#     lat_lng_coords = g.latlng
#     print(lat_lng_coords)

In [12]:
# Geocoder not working, use provided data instead
pd_geo = pd.read_csv('http://cocl.us/Geospatial_data')
pd_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
df_lat_lng = pd.merge(df_final, pd_geo, left_on='Postcode', right_on='Postal Code')
df_lat_lng.drop(['Postal Code'], axis = 1)
df_lat_lng.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",M1E,43.763573,-79.188711
3,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",M1K,43.727929,-79.262029
4,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",M1L,43.711112,-79.284577


### PartC - Analysing and Clustering

In [14]:
!pip install folium
import folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 7.0MB/s eta 0:00:011
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/81/6d/31c83485189a2521a75b4130f1fee5364f772a0375f81afff619004e5237/branca-0.4.0-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.1


In [15]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Canada are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Canada are 43.653963, -79.387207.


In [16]:
# create map of New York using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_lat_lng['Latitude'], df_lat_lng['Longitude'], df_lat_lng['Borough'], df_lat_lng['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

### Define Foursquare Credentials and Version

Hidden Creditials
<!-- The correct answer is:
CLIENT_ID = 'JK3IO1PHDSLTY0AMXBFYVA1A0H5Q2301SHRQMR3ONWQ5TVQC' # your Foursquare ID
CLIENT_SECRET = 'GIXYRF1MHRM22N23MOVTRTS5XVQZ1DL1ZKYAGGUSQZ5JGAWG' # your Foursquare Secret
--> 

In [50]:
VERSION = '20180605' # Foursquare API version
LIMIT = 100

In [18]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        try:
            results = requests.get(url).json()["response"]['groups'][0]['items']
        except:
            pass
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [19]:
toronto_venues = getNearbyVenues(names=df_lat_lng['Neighbourhood'],
                                   latitudes=df_lat_lng['Latitude'],
                                   longitudes=df_lat_lng['Longitude']
                                  )

Rouge,Malvern
Highland Creek,Rouge Hill,Port Union
Guildwood,Morningside,West Hill
East Birchmount Park,Ionview,Kennedy Park
Clairlea,Golden Mile,Oakridge
Cliffcrest,Cliffside,Scarborough Village West
Birch Cliff,Cliffside West
Dorset Park,Scarborough Town Centre,Wexford Heights
Maryvale,Wexford
Clarks Corners,Sullivan,Tam O'Shanter
Agincourt North,L'Amoreaux East,Milliken,Steeles East
Fairview,Henry Farm,Oriole
Silver Hills,York Mills
Newtonbrook,Willowdale
Flemingdon Park,Don Mills South
Bathurst Manor,Downsview North,Wilson Heights
Northwood Park,York University
CFB Toronto,Downsview East
Woodbine Gardens,Parkview Hill
The Danforth West,Riverdale
The Beaches West,India Bazaar
Moore Park,Summerhill East
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
Cabbagetown,St. James Town
Ryerson,Garden District
Adelaide,King,Richmond
Harbourfront East,Toronto Islands,Union Station
Design Exchange,Toronto Dominion Centre
Commerce Court,Victoria Hotel
Bedford Park,Lawrence Manor Eas

In [20]:
toronto_venues.groupby('Neighbourhood').count()
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 269 uniques categories.


### Analyze Each Neighborhood

In [21]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head(),toronto_onehot.shape

(                          Neighbourhood  Accessories Store  Afghan Restaurant  \
 0                         Rouge,Malvern                  0                  0   
 1                         Rouge,Malvern                  0                  0   
 2  Highland Creek,Rouge Hill,Port Union                  0                  0   
 3  Highland Creek,Rouge Hill,Port Union                  0                  0   
 4       Guildwood,Morningside,West Hill                  0                  0   
 
    Airport  Airport Food Court  Airport Gate  Airport Lounge  Airport Service  \
 0        0                   0             0               0                0   
 1        0                   0             0               0                0   
 2        0                   0             0               0                0   
 3        0                   0             0               0                0   
 4        0                   0             0               0                0   
 
    Airport 

In [22]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide,King,Richmond",0.0,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.020000,...,0.020000,0.00,0.000000,0.000000,0.000000,0.010000,0.0,0.000000,0.01,0.000000
1,Agincourt,0.0,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00,0.000000
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",0.0,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00,0.000000
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",0.0,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.000000,0.00,0.111111,0.000000,0.000000,0.000000,0.0,0.000000,0.00,0.000000
4,"Alderwood,Long Branch",0.0,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00,0.000000
5,"Bathurst Manor,Downsview North,Wilson Heights",0.0,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.000000,0.00,0.050000,0.000000,0.000000,0.000000,0.0,0.000000,0.00,0.000000
6,Bayview Village,0.0,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00,0.000000
7,"Bedford Park,Lawrence Manor East",0.0,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.040000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00,0.000000
8,Berczy Park,0.0,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.017857,0.00,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00,0.000000
9,"Birch Cliff,Cliffside West",0.0,0.000000,0.000000,0.0000,0.0000,0.000,0.0000,0.000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00,0.000000


In [23]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [44]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Thai Restaurant,Café,Bar,Steakhouse,Restaurant,Sushi Restaurant,Burger Joint,Breakfast Spot,Bookstore
1,Agincourt,Clothing Store,Lounge,Breakfast Spot,Skating Rink,Latin American Restaurant,Electronics Store,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",Park,Playground,Bakery,Yoga Studio,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",Grocery Store,Pizza Place,Video Store,Sandwich Place,Beer Store,Fried Chicken Joint,Pharmacy,Fast Food Restaurant,Electronics Store,Empanada Restaurant
4,"Alderwood,Long Branch",Pizza Place,Skating Rink,Pharmacy,Coffee Shop,Pub,Sandwich Place,Gym,Airport Service,Fast Food Restaurant,Falafel Restaurant


### Clustering

In [45]:
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [46]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 0, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [47]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_lat_lng

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged = toronto_merged.fillna(0)
toronto_merged = toronto_merged.astype({'Cluster Labels': 'int32'})

toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge,Malvern",M1B,43.806686,-79.194353,1,Fast Food Restaurant,Print Shop,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Dim Sum Restaurant,Eastern European Restaurant
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",M1C,43.784535,-79.160497,1,History Museum,Bar,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Electronics Store
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",M1E,43.763573,-79.188711,1,Spa,Intersection,Breakfast Spot,Rental Car Location,Medical Center,Electronics Store,Mexican Restaurant,Yoga Studio,Drugstore,Doner Restaurant
3,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",M1K,43.727929,-79.262029,1,Discount Store,Convenience Store,Department Store,Hobby Shop,Bus Station,Coffee Shop,Yoga Studio,Dumpling Restaurant,Doner Restaurant,Donut Shop
4,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",M1L,43.711112,-79.284577,1,Bakery,Bus Line,Soccer Field,Bus Station,Ice Cream Shop,Park,Intersection,Metro Station,Donut Shop,Drugstore


In [48]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Cluster 1

In [49]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,Scarborough,-79.284577,0,Park,Playground,Bakery,Yoga Studio,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
17,North York,-79.464763,0,Airport,Snack Place,Park,Yoga Studio,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
21,Central Toronto,-79.38316,0,Tennis Court,Playground,Park,Restaurant,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
40,North York,-79.490074,0,Construction & Landscaping,Bakery,Park,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
48,Etobicoke,-79.506944,0,River,Park,Pool,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
56,North York,-79.329656,0,Food & Drink Shop,Park,Electronics Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Yoga Studio
68,York,-79.453512,0,Park,Market,Women's Store,Eastern European Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
77,East York,-79.338106,0,Convenience Store,Park,Coffee Shop,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
85,Central Toronto,-79.38879,0,Construction & Landscaping,Park,Bus Line,Swim School,Yoga Studio,Dumpling Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore
88,North York,-79.400049,0,Electronics Store,Convenience Store,Bank,Park,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
