# Coursera Capstone

This is the project for coursera capstone.

In [1]:
import numpy as np
import pandas as pd
import requests
import bs4
from bs4 import BeautifulSoup
import lxml

In [2]:
print("numpy %s" % np.__version__)
print("pandas %s" % pd.__version__)
print("requests %s" % requests.__version__)
print("BeautifulSoup %s" % bs4.__version__)

numpy 1.13.3
pandas 0.21.0
requests 2.18.4
BeautifulSoup 4.6.0


## Get Toronto Neighborhood Data

In [3]:
TORONTO_WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M' 

Getting HTML page from Wikipedia.

In [4]:
response = requests.get(TORONTO_WIKI_URL)

Using `BeautifulSoup` to pull out data from the response with `lxml` as the HTML parser.

In [5]:
soup = BeautifulSoup(response.content, 'lxml')

Select the table rows from the page. There is only one `wikitable` class in the page so we can use it as part of the selector.

In [6]:
table_rows = soup.select('.wikitable tr')

Parse the table rows into a dataframe.

In [7]:
def get_header_names(table_rows):
    header_cells = table_rows[0].select('th')
    assert len(header_cells) == 3
    return [th.text.strip() for th in header_cells]

header_names = get_header_names(table_rows)

def get_neighbourhoods(table_rows):
    data = []
    for tr in table_rows[1:]:
        data_cells = tr.select('td')
        assert len(data_cells) == 3
        data.append([td.text.strip() for td in data_cells])
    return data

data = get_neighbourhoods(table_rows)
    
df = pd.DataFrame(data, columns = ['Postal Code', 'Borough', 'Neighborhood Name'])
print(df.shape)
df.head()

(289, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood Name
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Get rid of the 'Not assigned' values.

In [8]:
df = df[(df['Borough'] != 'Not assigned') & (df['Neighborhood Name'] != 'Not assigned')]
print(df.shape)
df.head()

(211, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood Name
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


Combine multiple neighborhoods per postal code.

In [9]:
df = df.groupby('Postal Code').agg(lambda x: ', '.join(sorted(set(x)))).reset_index()
print(df.shape)
df.head()

(102, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood Name
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


The shape of the data frame is: 

In [10]:
df.shape

(102, 3)

# Get the latitude and longitude for each location

In [11]:
df = df.merge(pd.read_csv('geospatial_data.csv'))
print(df.shape)
df.head()

(102, 5)


Unnamed: 0,Postal Code,Borough,Neighborhood Name,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Visualize

In [12]:
import folium
from tqdm import tqdm as make_progress_bar
from geopy.geocoders import Nominatim

Get the latitude and longitude for visualizing the map.

In [13]:
address = 'Ontario, Toronto'

geolocator = Nominatim(user_agent="coursera-assignment")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinates of Ontario, Toronto are 43.653963, -79.387207.


Visualize the neighborhoods / postal code.

In [14]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood Name']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

Set the `CLIENT_ID`, `CLIENT_SECRET` and `VERSION` to sent to the FourSquare API.

In [15]:
# The code was removed by Watson Studio for sharing.

Get nearby venues of the neighborhoods from the Foursquare API. 

In [16]:
def make_foursquare_explore_url_(lattitude, longitude, radius = 500, limit = 100):
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lattitude, 
            longitude, 
            radius, 
            limit)
    return url

def get_venues_from_foursquare_explore_response_(response_json):
    results = response_json["response"]['groups'][0]['items']
    filtered_venues = [(v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results]
    return filtered_venues

def get_nearby_venues_columns_():
    return [
        'Postal Code', 
        'Borough', 
        'Neighborhood Name', 
        'Neighborhood Latitude', 
        'Neighborhood Longitude', 
        'Venue Name', 
        'Venue Latitude', 
        'Venue Longitude', 
        'Venue Category'
    ]

def get_nearby_venues(df, radius=500):
    
    venues_list=[]
    for row in make_progress_bar(df.values, ascii = True):
        postal_code, borough, neighborhood, lat, long = row
    
        url = make_foursquare_explore_url_(lat, long)
        response = requests.get(url).json()
    
        venues = get_venues_from_foursquare_explore_response_(response)
        venues_list.append([np.append(row, venue) for venue in venues])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = get_nearby_venues_columns_()
    
    return nearby_venues

Save fetched data to 'nearby_venues.csv' so that we don't waste our FourSquare API quota.

In [17]:
nearby_venues_filename = 'nearby_venues.csv'
try:
    # raise FileNotFoundError
    nearby_venues = pd.read_csv(nearby_venues_filename)
    print("File exists. Reading from %s" % nearby_venues_filename)
except FileNotFoundError:
    print("File does not exist. Fetching from Four Square API")
    nearby_venues = get_nearby_venues(df)
    nearby_venues.to_csv(nearby_venues_filename, index = False)

File exists. Reading from nearby_venues.csv


In [18]:
nearby_venues.head()

Unnamed: 0,Postal Code,Borough,Neighborhood Name,Neighborhood Latitude,Neighborhood Longitude,Venue Name,Venue Latitude,Venue Longitude,Venue Category
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497,RIGHT WAY TO GOLF,43.785177,-79.161108,Golf Course
2,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


In [19]:
print(nearby_venues.shape)

(2201, 9)


Let's see how many categories are there.

In [20]:
print('There are {} uniques categories.'.format(len(nearby_venues['Venue Category'].unique())))

There are 269 uniques categories.


Convert categories into one hot encoding. 

In [21]:
# one hot encoding
categories_onehot = pd.get_dummies(nearby_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
venues_df = pd.concat((nearby_venues.iloc[:, :6], categories_onehot), axis = 1)

In [22]:
print(venues_df.shape)

(2201, 275)


In [23]:
venues_df

Unnamed: 0,Postal Code,Borough,Neighborhood Name,Neighborhood Latitude,Neighborhood Longitude,Venue Name,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,Wendy's,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497,RIGHT WAY TO GOLF,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497,Royal Canadian Legion,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Big Bite Burrito,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Enterprise Rent-A-Car,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Woburn Medical Centre,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Eggsmart,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,M1G,Scarborough,Woburn,43.770992,-79.216917,Starbucks,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Count the number of each category for each postal code.

In [24]:
categories_df = venues_df.groupby(['Postal Code']).mean().reset_index()
categories_df = categories_df.merge(df.iloc[:, :3], on = 'Postal Code')

Reorder the 'Neighborhood Name' column from the last to the third position.

In [25]:
categories_column_order = [categories_df.columns.tolist()[0]] + categories_df.columns.tolist()[-2:] + categories_df.columns.tolist()[1: -2]
categories_df = categories_df[categories_column_order]

In [26]:
categories_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood Name,Neighborhood Latitude,Neighborhood Longitude,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
categories_df.shape

(100, 274)

Seems some postal codes are missing let's truncate `df` to match.

In [28]:
df = df.loc[categories_df.index]

Checking for the most common venues for each postal code.

In [29]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[5:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [30]:
num_top_venues = 3

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Postal Code']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Postal Code'] = categories_df['Postal Code']

for ind in np.arange(categories_df.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(categories_df.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(20)

Unnamed: 0,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,M1B,Fast Food Restaurant,Yoga Studio,Discount Store
1,M1C,Golf Course,Bar,Empanada Restaurant
2,M1E,Electronics Store,Pizza Place,Medical Center
3,M1G,Coffee Shop,Korean Restaurant,Convenience Store
4,M1H,Hakka Restaurant,Athletics & Sports,Caribbean Restaurant
5,M1J,Playground,Yoga Studio,Empanada Restaurant
6,M1K,Train Station,Hobby Shop,Bus Station
7,M1L,Bus Line,Bakery,Park
8,M1M,Motel,American Restaurant,Doner Restaurant
9,M1N,College Stadium,General Entertainment,Skating Rink


## Modeling

In [31]:
from sklearn.cluster import KMeans

In [32]:
# set number of clusters
kclusters = 5

manhattan_grouped_clustering = categories_df.drop(['Postal Code', 'Borough', 'Neighborhood Name'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(manhattan_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [33]:
manhattan_merged = df

# add clustering labels
manhattan_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
manhattan_merged = manhattan_merged.join(neighborhoods_venues_sorted.set_index('Postal Code'), on='Postal Code')

manhattan_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood Name,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,0,Fast Food Restaurant,Yoga Studio,Discount Store
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497,1,Golf Course,Bar,Empanada Restaurant
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,1,Electronics Store,Pizza Place,Medical Center
3,M1G,Scarborough,Woburn,43.770992,-79.216917,1,Coffee Shop,Korean Restaurant,Convenience Store
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,1,Hakka Restaurant,Athletics & Sports,Caribbean Restaurant


In [34]:
manhattan_merged.tail()

Unnamed: 0,Postal Code,Borough,Neighborhood Name,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
95,M9L,North York,Humber Summit,43.756303,-79.565963,3,Empanada Restaurant,Pizza Place,Yoga Studio
96,M9M,North York,"Emery, Humberlea",43.724766,-79.532242,1,Baseball Field,Ethiopian Restaurant,Donut Shop
97,M9N,York,Weston,43.706876,-79.518188,1,Park,Yoga Studio,Empanada Restaurant
98,M9P,Etobicoke,Westmount,43.696319,-79.532242,1,Pizza Place,Intersection,Middle Eastern Restaurant
99,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724,1,Pizza Place,Park,Mobile Phone Shop


In [35]:
from matplotlib import cm, colors

In [36]:

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(manhattan_merged['Latitude'], manhattan_merged['Longitude'], manhattan_merged['Neighborhood Name'], manhattan_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters