# IBM Data Science Professional Certification Capstone

### The Battle of Neighborhoods - Week 3

### Question 1

Let's import all the necessary packages for this week's assignment and scrape the HTML code of the wiki page.

In [8]:
#!pip install beautifulsoup4
#!pip install lxml
#!pip install requests

import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

#Pandas can also be used to scrap the table:
#df=pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")[0]

#Use BeautifulSoup to get the HTML code of wiki page
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
#A timeout keeps the cell from hanging forever on a stalled connection, and
#raise_for_status() stops us from parsing an HTTP error page as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_code = response.text
soup = BeautifulSoup(html_code, 'lxml')


Once we have scraped the HTML code from the wiki page, we can start processing it.

In [9]:
#Get the table from the HTML code
web_table = soup.find('table', class_="wikitable sortable")
if web_table is None:
    raise ValueError("Could not find a 'wikitable sortable' table in the downloaded page")
web_table_rows = web_table.find_all('tr')

#Feed the HTML table into a Pandas DataFrame.
#Rows without any <td> cell (such as a header row made only of <th> cells) produce an
#empty list and are skipped here, instead of being dropped from the DataFrame afterwards.
table_rows = []
for tr in web_table_rows:
    cells = [item.text.strip() for item in tr.find_all('td')]
    if cells:
        table_rows.append(cells)
table = pd.DataFrame(table_rows, columns=["PostalCode", "Borough", "Neighborhood"])

#Drop all rows with 'Not assigned' borough
table = table[table.Borough != 'Not assigned'].reset_index(drop=True)

#Replace 'Not assigned' neighborhoods with their respective borough
not_assigned = table.Neighborhood == 'Not assigned'
table.loc[not_assigned, 'Neighborhood'] = table.loc[not_assigned, 'Borough']


The last thing to do is to collapse all the neighborhoods that share the same postal code into a single comma-separated string.

In [10]:
#Collapse the rows of each postal code into one row: keep the borough of the first row and
#join every neighborhood with ', '.
#A single groupby replaces the previous DataFrame.append loop, which copied the whole frame
#on every iteration and no longer exists in pandas 2.0.
#sort=False keeps the postal codes in their order of first appearance.
new_table = (
    table
    .groupby('PostalCode', sort=False)
    .agg({'Borough': 'first', 'Neighborhood': ', '.join})
    .reset_index()
)

print(new_table.shape)
new_table.head(10)


(103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Queen's Park,Queen's Park
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


### Question 2

Next, let's get the Latitude and Longitude coordinates for our addresses.  
We start by importing the necessary packages. Geocoder has proved unreliable, so we'll use ArcGIS instead.  
Since it's a different geocoding provider, results will not be exactly the same, but the difference will be less than a dozen meters

In [11]:
from arcgis.gis import GIS
from arcgis.geocoding import geocode

gis = GIS()

#Geocode every postal code and collect the coordinates in plain lists.
#The columns are assigned in one go afterwards: writing through new_table.iloc[index][...]
#is chained indexing, which may update a temporary copy instead of the table itself.
latitudes = []
longitudes = []
for postal_code in new_table['PostalCode']:
    geocode_result = geocode(address=f'{postal_code}, Toronto, Ontario', as_featureset=False)
    if not geocode_result:
        raise ValueError(f'No geocoding result for postal code {postal_code}')
    #The first candidate is used; 'y' is stored as latitude and 'x' as longitude
    location = geocode_result[0]['location']
    latitudes.append(location['y'])
    longitudes.append(location['x'])

#Add Latitude and Longitude columns to our table
new_table['Latitude'] = latitudes
new_table['Longitude'] = longitudes


Now, let's install the necessary packages to visualize our addresses.

In [21]:
#!conda install -c conda-forge folium=0.5.0 --yes
!pip -q install folium
import folium

# coordinates the map is initially centred on
toronto_lat, toronto_long = 43.7532, -79.3832
neighb_map = folium.Map(location=[toronto_lat, toronto_long], zoom_start=11)

# draw one blue circle marker per postal code, using the code itself as popup text
marker_style = dict(radius=5, fill=True, color='blue', fill_color='blue', fill_opacity=0.6)
coordinates = zip(new_table['Latitude'], new_table['Longitude'], new_table['PostalCode'])
for latitude, longitude, postal_code in coordinates:
    marker = folium.CircleMarker([latitude, longitude], popup=postal_code, **marker_style)
    marker.add_to(neighb_map)

# display map
neighb_map

### Question 3

In this question, we'll analyze the Downtown of Toronto, get places of relevance for each Neighborhood, associated data, and cluster each one of them according to their similarity.  
NOTE: We'll be mostly following the same steps as in the lab about segmenting and clustering in New York.  

First, let's use the Foursquare API to explore the Downtown of Toronto.

In [106]:
import json # library to handle JSON files
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans

import matplotlib.cm as cm
import matplotlib.colors as colors


In [17]:
#Foursquare API credentials are read from the environment so that they never end up in the
#notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting Jupyter.
#NOTE: the key pair that used to be hard-coded in this cell was published together with
#the notebook and should be revoked and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will be rejected.')
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # sent as the 'limit' parameter of each explore request


We'll re-use the same function created by the instructor to collect the relevant data from Foursquare.

In [117]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, timeout=10):
    """Collect the venues returned by the Foursquare 'explore' endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Label and coordinates of every location to explore. They are iterated in
        parallel with zip(), so extra items in the longer ones are ignored.
    radius : int, default 500
        Value sent as the ``radius`` parameter of each request.
    timeout : float, default 10
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. The frame has these seven columns even
        when no venue at all was returned.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status (e.g. rejected credentials).
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; fail with a clear HTTP error instead of a KeyError below
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=timeout)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue with an empty category list gets None instead of raising IndexError
                categories[0]['name'] if categories else None))

    # passing the column names to the constructor also works when venue_rows is empty,
    # whereas assigning seven names to an empty frame raises ValueError
    return pd.DataFrame(venue_rows, columns=venue_columns)

Let's use the function to get the places of relevance in Downtown Toronto.

In [118]:
# keep only the postal codes whose borough is 'Downtown Toronto'
is_downtown = new_table['Borough'] == 'Downtown Toronto'
downtown_data = new_table[is_downtown]

# collect the venues around every downtown location (default radius)
dt_venues = getNearbyVenues(
    downtown_data['Neighborhood'],
    downtown_data['Latitude'],
    downtown_data['Longitude'],
)


Next, we process the places of relevance in Toronto Downtown to have only normalized scalar data.

In [119]:
# one-hot encode the venue categories: one indicator column per category
dt_onehot = pd.get_dummies(dt_venues[['Venue Category']], prefix="", prefix_sep="")

# attach the neighborhood of each venue; the column is called 'My_Neighborhood' so it
# cannot clash with a venue category that is itself named 'Neighborhood'
dt_onehot['My_Neighborhood'] = dt_venues['Neighborhood']

# rotate the last column to the front
column_order = list(dt_onehot.columns)
column_order.insert(0, column_order.pop())
dt_onehot = dt_onehot[column_order]

# average the indicators per neighborhood, i.e. the frequency of each Venue Category
category_frequency = dt_onehot.groupby('My_Neighborhood').mean()
dt_grouped = category_frequency.reset_index()

We re-use the same function as defined by the instructor to get the top places for each Neighborhood.

In [87]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest values of `row`.

    The first entry of `row` is skipped (the callers in this notebook pass rows whose
    first entry is the neighborhood name); the remaining entries are sorted in
    descending order and the index labels of the leading ones are returned as an array.
    """
    frequencies = row.iloc[1:]
    ranked = frequencies.sort_values(ascending=False)
    return ranked.index[:num_top_venues].values

In [102]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['My_Neighborhood']
for ind in np.arange(num_top_venues):
    # the first three ranks take their own ordinal suffix, every later rank takes 'th'
    # (explicit test instead of a bare except, which would also hide unrelated errors)
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighborhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['My_Neighborhood'] = dt_grouped['My_Neighborhood']

# fill the ranking columns of each row with its most frequent venue categories
for ind in np.arange(dt_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(dt_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,My_Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Steakhouse,Hotel,Gastropub,Restaurant,Breakfast Spot,American Restaurant,Asian Restaurant,Burger Joint
1,Berczy Park,Coffee Shop,Cocktail Bar,Cheese Shop,Farmers Market,Breakfast Spot,Bakery,Beer Bar,Hotel,Restaurant,Café
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",Coffee Shop,Italian Restaurant,Bar,Gym / Fitness Center,Café,Restaurant,Pub,Speakeasy,Bakery,Sandwich Place
3,"Cabbagetown, St. James Town",Bakery,Coffee Shop,Italian Restaurant,Market,Pizza Place,Café,Restaurant,Farmers Market,Japanese Restaurant,Flower Shop
4,Central Bay Street,Coffee Shop,Clothing Store,Cosmetics Shop,Bakery,Plaza,Italian Restaurant,Sushi Restaurant,Sporting Goods Shop,Japanese Restaurant,Café


Now it's time to cluster similar Neighborhoods and put together the data needed to plot the results on a map.

In [103]:
# set number of clusters
kclusters = 5

dt_grouped_clustering = dt_grouped.drop('My_Neighborhood', 1)
dt_grouped_clustering.head()
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(dt_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 3, 3, 3, 3, 3, 4, 3, 3, 3], dtype=int32)

In [104]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

dt_merged = downtown_data

# merge Toronto Clustered and Ranked data with Toronto Downtown data to add latitude/longitude for each neighborhood
dt_merged = dt_merged.join(neighborhoods_venues_sorted.set_index('My_Neighborhood'), on='Neighborhood')

dt_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,Downtown Toronto,Harbourfront,43.6503,-79.3592,3,Coffee Shop,Bakery,Boat or Ferry,Theater,French Restaurant,Café,Breakfast Spot,Brewery,Spa,Shoe Store
4,M7A,Downtown Toronto,Queen's Park,43.6612,-79.3917,0,Coffee Shop,Café,Sandwich Place,Italian Restaurant,Theater,Falafel Restaurant,Mediterranean Restaurant,Smoothie Shop,Burrito Place,Salad Place
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.6574,-79.3782,3,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Middle Eastern Restaurant,Japanese Restaurant,Bakery,Theater,Bookstore,Plaza
15,M5C,Downtown Toronto,St. James Town,43.6512,-79.3755,3,Coffee Shop,Café,Restaurant,Seafood Restaurant,Bakery,Hotel,Breakfast Spot,Cosmetics Shop,Clothing Store,Cocktail Bar
20,M5E,Downtown Toronto,Berczy Park,43.6452,-79.3737,3,Coffee Shop,Cocktail Bar,Cheese Shop,Farmers Market,Breakfast Spot,Bakery,Beer Bar,Hotel,Restaurant,Café


Finally, we can create a map of Downtown Toronto, where we display the Neighborhoods, color-coded according to the cluster assigned by the K-Means algorithm.

In [121]:
# create map
map_clusters = folium.Map(location=[toronto_lat-0.1, toronto_long], zoom_start=13)

# set color scheme for the clusters: one color of the rainbow colormap per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]
# swap this particular shade of green for a darker one, because it's too similar to others
# used by Folium; it is only part of the palette for some values of kclusters, so check
# first instead of letting list.remove raise ValueError
if '#80ffb4' in rainbow:
    rainbow.remove('#80ffb4')
    rainbow.append('#068c39')

# rows without a cluster label (NaN after the join) cannot be colored, so they are skipped;
# a NaN also turns the whole column into floats, hence the int() below
clustered = dt_merged.dropna(subset=['Cluster Labels'])

# add markers to the map
for lat, lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'], clustered['Neighborhood'], clustered['Cluster Labels']):
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster-1 is kept as the palette index: cluster 0 wraps around to the last color
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters