# Opening a Cafe Shop in Barrhaven - Data Analysis Notebook

This notebook is part of the Coursera final project for the IBM Applied Data Science Capstone course.

### Import necessary Libraries

In [1]:
# library to handle data in a vectorized manner
import numpy as np

# library for data analysis
import pandas as pd 

# import the BeautifulSoup library so we can parse HTML and XML documents
from bs4 import BeautifulSoup

# library to handle requests
import requests 

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# transform JSON file into a pandas dataframe
from pandas.io.json import json_normalize 

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


## 1. Download and Explore Dataset

#### Get the data from GitHub and transform it into a pandas dataframe

In [2]:
# Load the Ottawa postal-code table (PostalCode, Borough, Neighbourhood) from the project's GitHub repository
df_ottawa = pd.read_csv("https://raw.githubusercontent.com/kenchau21/ibm_datascience_capstone/master/list_postal_code_of_ottawa.csv")
# Preview the first rows to confirm the file parsed as expected
df_ottawa.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,K1Y,Ottawa West,"Island Park, Hintonburg, Mechanicsville, Champ..."
1,K1S,Ottawa,"The Glebe, Ottawa South, Ottawa East"
2,K1K,Ottawa,"Overbrook, Forbes, Manor Park, Viscount Alexan..."
3,K1L,Ottawa,"Vanier, McKay Lake"
4,K1Z,Ottawa,"Westboro, Carlington"


In [3]:
# (rows, columns) of the postal-code table
df_ottawa.shape

(28, 3)

## 2. Importing Latitude and the Longitude Coordinates

In [4]:
# Load the latitude/longitude table (one row per postal code) from the project's GitHub repository
df_coordinates= pd.read_csv("https://raw.githubusercontent.com/kenchau21/ibm_datascience_capstone/master/list_coordinates_of_ottawa.csv")
# Preview the first rows to confirm the file parsed as expected
df_coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,K1Y,45.402,-75.733
1,K1S,45.399,-75.687
2,K1K,45.445,-75.643
3,K1L,45.44,-75.663
4,K1Z,45.387,-75.74


In [5]:
# Align the key column name with df_ottawa ('Postal Code' -> 'PostalCode').
# Reassigning instead of using inplace=True keeps this cell safe to re-run.
df_coordinates = df_coordinates.rename(columns={'Postal Code': 'PostalCode'})

# Join neighbourhood information with coordinates on the shared postal-code key.
# The key is named explicitly so the join cannot silently pick up any other
# column name the two tables might happen to share. (No set_index is needed:
# pd.merge joins on columns.)
df_combined = pd.merge(df_ottawa, df_coordinates, on='PostalCode')

# Use the 'Neighborhood' spelling that the rest of the notebook expects
df_combined = df_combined.rename(columns={'Neighbourhood': 'Neighborhood'})

df_combined

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,K1Y,Ottawa West,"Island Park, Hintonburg, Mechanicsville, Champ...",45.402,-75.733
1,K1S,Ottawa,"The Glebe, Ottawa South, Ottawa East",45.399,-75.687
2,K1K,Ottawa,"Overbrook, Forbes, Manor Park, Viscount Alexan...",45.445,-75.643
3,K1L,Ottawa,"Vanier, McKay Lake",45.44,-75.663
4,K1Z,Ottawa,"Westboro, Carlington",45.387,-75.74
5,K1G,Ottawa,"Riverview, Hawthorne, Canterbury, Hunt Club Park",45.355,-75.577
6,K1H,Ottawa,"Alta Vista, Billings Bridge",45.388,-75.659
7,K1M,Ottawa,"Rockcliffe Park, New Edinburgh",45.449,-75.682
8,K1P,Ottawa,Parliament Hill,45.422,-75.703
9,K1R,Ottawa,West Downtown area,45.412,-75.711


#### Select only Barrhaven Neighborhood

In [6]:
# Keep only the rows whose Neighborhood value is exactly 'Barrhaven' and renumber the index from 0
barrhaven_data = df_combined[df_combined['Neighborhood'].isin(['Barrhaven'])].reset_index(drop=True)
# Preview the filtered rows
barrhaven_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,K2J,Nepean,Barrhaven,45.269,-75.752


## 3. Explore Neighborhoods in Barrhaven

In [7]:
# Resolve the Barrhaven address to coordinates; they are used to centre the maps below
address = 'Barrhaven, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Barrhaven are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Barrhaven are 45.2652066, -75.7659503343141.


In [29]:
# create map of Barrhaven using latitude and longitude values
map_barrhaven = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row of barrhaven_data, using the neighbourhood name as popup text
for lat, lng, label in zip(barrhaven_data['Latitude'], barrhaven_data['Longitude'], barrhaven_data['Neighborhood']):
    # `label` is rebound here from the plain-text name to a folium Popup object
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        # NOTE(review): parse_html looks like a Popup option rather than a CircleMarker one - confirm it is needed here
        parse_html=False).add_to(map_barrhaven)  
    
# last expression of the cell: renders the map as the cell output
map_barrhaven


#### Define Foursquare Credentials and Version

In [9]:
import os

# Credentials are read from the environment so that secrets never live in the
# notebook (or its version history). Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE: any key pair that was previously committed in this notebook should be
# treated as compromised and regenerated in the Foursquare developer console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Say so up front rather than letting the first API call fail with an opaque error
if not (CLIENT_ID and CLIENT_SECRET):
    print('Foursquare credentials not found: set FOURSQUARE_CLIENT_ID and '
          'FOURSQUARE_CLIENT_SECRET before calling the API.')

##### Get the top 200 venues that are in Barrhaven within a radius of 8000 meters

In [30]:
# Maximum number of venues requested per location
LIMIT = 200

def getNearbyVenues(names, latitudes, longitudes, radius=8000, categoryIds=''):
    """Query the Foursquare ``venues/explore`` endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        One label and one (lat, lng) pair per location; iterated together with ``zip``.
    radius : int, default 8000
        Value sent as the ``radius`` query parameter.
    categoryIds : str, default ''
        When non-empty, sent as the ``categoryId`` query parameter to restrict
        the results to those categories.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty, but still has these columns, when nothing is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and URL-encode the query string instead of
        # formatting it by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        if categoryIds != '':
            params['categoryId'] = categoryIds

        # A timeout keeps a stalled connection from hanging the notebook;
        # raise_for_status surfaces API errors where they happen.
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=30)
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # some venues carry an empty category list; record None instead of failing
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            rows.append((name,
                         lat,
                         lng,
                         venue['name'],
                         venue['location']['lat'],
                         venue['location']['lng'],
                         category))

    # Passing the column names to the constructor keeps the schema stable even
    # when no venue was returned at all.
    return pd.DataFrame(rows, columns=columns)

In [31]:
# Foursquare category id used to restrict the venue search
coffee_shop_categoryId = '4bf58dd8d48988d1e0931735' # https://developer.foursquare.com/docs/resources/categories

# Fetch venues around each row of barrhaven_data (default radius of getNearbyVenues), limited to the category above
barrhaven_venues = getNearbyVenues(names=barrhaven_data['Neighborhood'],
                                   latitudes=barrhaven_data['Latitude'],
                                   longitudes=barrhaven_data['Longitude'],
                                 categoryIds=coffee_shop_categoryId
                                  )

Barrhaven


##### Check the size of the resulting dataframe

In [32]:
# Resulting dataframe size: (number of venues, number of columns)
print(barrhaven_venues.shape)
# Display the venue table as the cell output
barrhaven_venues

(20, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Barrhaven,45.269,-75.752,Café Cristal,45.264895,-75.777534,Café
1,Barrhaven,45.269,-75.752,Second Cup,45.269219,-75.746093,Coffee Shop
2,Barrhaven,45.269,-75.752,Starbucks,45.274293,-75.748052,Coffee Shop
3,Barrhaven,45.269,-75.752,Starbucks,45.269775,-75.739897,Coffee Shop
4,Barrhaven,45.269,-75.752,Starbucks,45.275007,-75.719048,Coffee Shop
5,Barrhaven,45.269,-75.752,Tim Hortons,45.274739,-75.738416,Coffee Shop
6,Barrhaven,45.269,-75.752,Tim Hortons,45.270406,-75.781876,Coffee Shop
7,Barrhaven,45.269,-75.752,Tim Hortons,45.284821,-75.722564,Coffee Shop
8,Barrhaven,45.269,-75.752,Tim Hortons,45.270906,-75.744291,Coffee Shop
9,Barrhaven,45.269,-75.752,Tim Hortons,45.297531,-75.73245,Coffee Shop


In [33]:
def addToMap(df, color, existingMap):
    """Add one circle marker per row of ``df`` to ``existingMap``.

    ``df`` must provide the columns 'Venue Latitude', 'Venue Longitude',
    'Neighborhood', 'Venue' and 'Venue Category'. Each marker is drawn and
    filled in ``color`` and carries a popup of the form
    "<venue> (<category>) - <neighborhood>". The map is modified in place and
    nothing is returned.
    """
    needed = ('Venue Latitude', 'Venue Longitude', 'Neighborhood', 'Venue', 'Venue Category')
    for venue_lat, venue_lng, hood, venue_name, category in zip(*[df[col] for col in needed]):
        popup_text = '{} ({}) - {}'.format(venue_name, category, hood)
        marker = folium.CircleMarker(
            [venue_lat, venue_lng],
            radius=5,
            popup=folium.Popup(popup_text, parse_html=True),
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.7,
        )
        marker.add_to(existingMap)

In [14]:
# Show venues in map: a fresh map centred on the geocoded Barrhaven coordinates
map_barrhaven_coffee = folium.Map(location=[latitude, longitude], zoom_start=12)
# draw every returned venue as a red circle marker
addToMap(barrhaven_venues, 'red', map_barrhaven_coffee)

# last expression of the cell: renders the map as the cell output
map_barrhaven_coffee

#### Check how many venues were returned for each neighborhood

In [34]:
# Count the non-null entries of every column per neighbourhood, i.e. how many venues each one received
barrhaven_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Barrhaven,20,20,20,20,20,20


#### Find out how many unique categories can be curated from all the returned venues

In [35]:
# Number of distinct values in the 'Venue Category' column
print('There are {} uniques categories.'.format(len(barrhaven_venues['Venue Category'].unique())))

There are 2 uniques categories.


## 4. Analyze Each Neighborhood

In [36]:
# one hot encoding: one indicator column per venue category, named after the category itself (empty prefix)
barrhaven_onehot = pd.get_dummies(barrhaven_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
# NOTE(review): if a category were itself called 'Neighborhood', this assignment would overwrite
# that indicator column instead of appending a new one - confirm no such category can occur
barrhaven_onehot['Neighborhood'] = barrhaven_venues['Neighborhood'] 

# move neighborhood column to the first column (it is the last column right after the assignment above)
fixed_columns = [barrhaven_onehot.columns[-1]] + list(barrhaven_onehot.columns[:-1])
barrhaven_onehot = barrhaven_onehot[fixed_columns]

# preview the encoded table
barrhaven_onehot.head()

Unnamed: 0,Neighborhood,Café,Coffee Shop
0,Barrhaven,1,0
1,Barrhaven,0,1
2,Barrhaven,0,1
3,Barrhaven,0,1
4,Barrhaven,0,1


In [37]:
# Average the indicator columns per neighbourhood: each value is the share of venues in that category
barrhaven_grouped = barrhaven_onehot.groupby('Neighborhood').mean().reset_index()
# Display the per-neighbourhood frequencies
barrhaven_grouped

Unnamed: 0,Neighborhood,Café,Coffee Shop
0,Barrhaven,0.05,0.95


#### Confirm the new dataframe size

In [38]:
# (number of neighbourhoods, 1 + number of venue categories)
barrhaven_grouped.shape

(1, 3)

#### Print each neighborhood along with the top 10 most common venues

In [39]:
# Upper bound on how many categories are printed per neighbourhood
num_top_venues = 10

for hood in barrhaven_grouped['Neighborhood']:
    print("----"+hood+"----")
    # Transpose this neighbourhood's single row so that each column becomes a row of (column name, value)
    temp = barrhaven_grouped[barrhaven_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    # Drop the first row: it comes from the 'Neighborhood' column and holds the name, not a frequency
    temp = temp.iloc[1:]
    # The transposed values are cast to float so they can be rounded and sorted numerically
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    # Highest frequency first, renumbered from 0, truncated to num_top_venues rows
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Barrhaven----
         venue  freq
0  Coffee Shop  0.95
1         Café  0.05




#### Let's put that into a pandas dataframe

A function to sort the venues in descending order

In [40]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the ``num_top_venues`` largest entries of ``row``.

    The first element of ``row`` is skipped (callers pass a row whose first
    entry is the neighbourhood name); the remaining entries are ordered from
    highest to lowest value and their index labels are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

Create a new dataframe and display the top venues for each neighborhood (here `num_top_venues = 2`, because only two venue categories were returned).

In [41]:
# Only two venue categories were returned, so two ranking columns are enough
num_top_venues = 2

# ordinal suffixes for 1st, 2nd and 3rd; every later position uses 'th'
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # explicit bounds check instead of a bare `except:`, which would also hide unrelated errors
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighbourhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = barrhaven_grouped['Neighborhood']

# fill the ranking columns row by row with the most frequent categories
for ind in np.arange(barrhaven_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(barrhaven_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue
0,Barrhaven,Coffee Shop,Café


## 5. Cluster Neighborhood

Run k-means to cluster the neighborhoods. Only one neighborhood (Barrhaven) is analysed here, so a single cluster (`kclusters = 1`) is used.

In [42]:
# set number of clusters (only one neighbourhood is analysed, so more than one cluster is not possible)
kclusters = 1

# keep only the numeric frequency columns for clustering;
# `columns=` replaces the positional axis argument, which newer pandas versions no longer accept
barrhaven_grouped_clustering = barrhaven_grouped.drop(columns='Neighborhood')

# run k-means clustering (fixed random_state for reproducible labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(barrhaven_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0])

In [43]:
# add clustering labels as the first column; drop a stale copy first so that
# re-running this cell does not fail with "already exists"
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# attach the cluster label and top venues to each neighbourhood's postal-code / coordinate row
# (join returns a new frame, so barrhaven_data itself is left untouched)
barrhaven_merged = barrhaven_data.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

barrhaven_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue
0,K2J,Nepean,Barrhaven,45.269,-75.752,0,Coffee Shop,Café


In [44]:
# create map centred on the geocoded Barrhaven coordinates
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Rows without a venue match get NaN labels from the join, which also turns the
# column into floats; skip those rows and cast back to int so the labels can
# index the colour list.
clustered = barrhaven_merged.dropna(subset=['Cluster Labels'])

# add markers to the map
for lat, lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'], clustered['Neighborhood'], clustered['Cluster Labels'].astype(int)):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster - 1 maps label 0 to the last colour (negative index); kept so the
    # colour assignment matches earlier renders of this map
    marker_color = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [45]:
# Rows assigned to cluster 0, showing only the column at position 1 plus every column from position 5 onwards
barrhaven_merged.loc[barrhaven_merged['Cluster Labels'] == 0, barrhaven_merged.columns[[1] + list(range(5, barrhaven_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue
0,Nepean,0,Coffee Shop,Café
