## IBM Data Science Specialization Capstone Project Notebook

In [138]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes

Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



### Part1

### Get HTML data from wikipedia

In [139]:
Canada_M = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

### Scrape PostalCode, Borough, Neighborhood from HTML using BeautifulSoup

In [140]:
# soup = BeautifulSoup(open("Canada_M.htm"), "lxml")
soup = BeautifulSoup(Canada_M, "lxml")

In [141]:
PostalCode = []
Borough = []
Neighborhood = []

for data in soup.tbody.find_all('tr')[1:]:
    PostalCode.append(data.find('td').text)
    Borough.append(data.find_all('td')[1].text)
    Neighborhood.append(data.find_all('td')[2].text[:-1])

In [142]:
data = {'PostalCode': PostalCode,
        'Borough': Borough,
        'Neighborhood': Neighborhood}
df = pd.DataFrame(data)
df['PostalCode']= df['PostalCode'].str.strip()
df['Borough'] = df['Borough'].str.strip()
# Drop rows where Borough is 'Not assigned', then reset index
df = df[df.Borough != 'Not assigned'].reset_index().drop('index', axis = 1)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


### Combine Neighborhood with same PostalCode

In [143]:
i = 1
while(i < len(df)):
    if df['PostalCode'].iloc[i] == df['PostalCode'].iloc[i - 1]:
        df.at[i - 1, 'Neighborhood'] = df.Neighborhood.iloc[i - 1] +', ' + df.Neighborhood.iloc[i]
        df.drop(index = i, inplace = True)
        df = df.reset_index().drop('index', axis = 1)
    else:
        i += 1

In [144]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


### Part 2
### Achieve Latitude and Longitude of each PostalCode

In [145]:
# Rename the 'Postal Code' column
Geo = pd.read_csv("Downloads/Geospatial_Coordinates.csv") 
Geo.rename({'Postal Code': 'PostalCode'}, axis='columns', inplace=True)


In [146]:
# Merge Latitude and longitude to original dataframe
df = pd.merge(df, Geo, on='PostalCode')
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.654260,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,Parkview Hill / Woodbine Gardens,43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


### Part 3
### Create a map of Toronto with neighborhoods superimposed on top.

In [147]:
import folium
# create map of Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[43.706204, -79.398752], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=7,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#e182f2',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
# Show map Toronto with borough and neighborhood 
map_Toronto

### Define Foursquare Credentials and Version

In [148]:
CLIENT_ID = '4XU4VO2GVGVJ5VNHGS0Z5OVAE50ZGSXSGBZUESNVRMKX03MN' # your Foursquare ID
CLIENT_SECRET = '4ENZ0UWWYV2Z4B4PT2412GM0IERFGCPEFCRAQNE1Z3U2HPCC' # your Foursquare Secret
VERSION = '20181018' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 4XU4VO2GVGVJ5VNHGS0Z5OVAE50ZGSXSGBZUESNVRMKX03MN
CLIENT_SECRET:4ENZ0UWWYV2Z4B4PT2412GM0IERFGCPEFCRAQNE1Z3U2HPCC


In [149]:
df[df.Neighborhood == 'Upper Rouge']

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
95,M1X,Scarborough,Upper Rouge,43.836125,-79.205636


In [150]:
neighborhood_latitude = df.loc[95, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df.loc[95, 'Longitude'] # neighborhood longitude value

neighborhood_name = df.loc[95, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Upper Rouge are 43.836124700000006, -79.20563609999999.


### Now, let's get the top 100 venues that are in Parkwoods within a radius of 500 meters.

In [151]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 2000# define radius

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=4XU4VO2GVGVJ5VNHGS0Z5OVAE50ZGSXSGBZUESNVRMKX03MN&client_secret=4ENZ0UWWYV2Z4B4PT2412GM0IERFGCPEFCRAQNE1Z3U2HPCC&v=20181018&ll=43.836124700000006,-79.20563609999999&radius=2000&limit=100'

In [152]:
results = requests.get(url).json()

In [153]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [154]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
nearby_venues

Unnamed: 0,name,categories,lat,lng
0,Pumpkinland (Whittamore Farms),Farm,43.849236,-79.206647
1,Rouge Park - Finch Meander Trail,Trail,43.82485,-79.193596
2,Harvest Moon Park,Playground,43.821684,-79.211021
3,Clownfish Village,Sculpture Garden,43.821,-79.205396
4,Taj supermarket,Grocery Store,43.818746,-79.210172
5,Cedar Brae Golf and Country Club,Golf Course,43.839154,-79.230089


In [155]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

6 venues were returned by Foursquare.


### Explore Neighborhoods in Toronto
### Let's create a function to repeat the same process to all the neighborhoods in Toronto

In [156]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    print("Finished!")
    
    return(nearby_venues)

In [157]:
Toronto_venues = getNearbyVenues(names=df['Neighborhood'],
                                 latitudes=df['Latitude'],
                                 longitudes=df['Longitude'],
                                 radius = 2000
                                )

Finished!


In [158]:
print(Toronto_venues.shape)
Toronto_venues.head()

(8638, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,Parkwoods,43.753259,-79.329656,Donalda Golf & Country Club,43.752816,-79.342741,Golf Course
2,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
3,Parkwoods,43.753259,-79.329656,Graydon Hall Manor,43.763923,-79.342961,Event Space
4,Parkwoods,43.753259,-79.329656,Galleria Supermarket,43.75352,-79.349518,Supermarket


In [159]:
Toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,100,100,100,100,100,100
Alderwood / Long Branch,100,100,100,100,100,100
Bathurst Manor / Wilson Heights / Downsview North,54,54,54,54,54,54
Bayview Village,50,50,50,50,50,50
Bedford Park / Lawrence Manor East,100,100,100,100,100,100
Berczy Park,100,100,100,100,100,100
Birch Cliff / Cliffside West,43,43,43,43,43,43
Brockton / Parkdale Village / Exhibition Place,100,100,100,100,100,100
Business reply mail Processing CentrE,100,100,100,100,100,100
CN Tower / King and Spadina / Railway Lands / Harbourfront West / Bathurst\n Quay / South Niagara / Island airport\n,100,100,100,100,100,100


### Let's find out how many unique categories can be curated from all the returned venues

In [160]:
print('There are {} uniques categories.'.format(len(Toronto_venues['Venue Category'].unique())))

There are 329 uniques categories.


 ### Analyze Each Neighborhood

In [161]:
# one hot encoding
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighborhood'] = Toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()

Unnamed: 0,Zoo Exhibit,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Antique Shop,Aquarium,...,Vietnamese Restaurant,Volleyball Court,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [162]:
Toronto_onehot.shape

(8638, 329)

In [163]:
Toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()
Toronto_grouped.head()

Unnamed: 0,Neighborhood,Zoo Exhibit,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Antique Shop,...,Vietnamese Restaurant,Volleyball Court,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Alderwood / Long Branch,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0
2,Bathurst Manor / Wilson Heights / Downsview North,0.0,0.0,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Bedford Park / Lawrence Manor East,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0


In [164]:
Toronto_grouped.shape

(98, 329)

### Let's print each neighborhood along with the top 5 most common venues

In [165]:
num_top_venues = 5

for hood in Toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = Toronto_grouped[Toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                  venue  freq
0    Chinese Restaurant  0.13
1           Coffee Shop  0.08
2            Restaurant  0.05
3              Pharmacy  0.05
4  Cantonese Restaurant  0.03


----Alderwood / Long Branch----
                  venue  freq
0           Coffee Shop  0.08
1           Pizza Place  0.05
2  Fast Food Restaurant  0.05
3      Department Store  0.04
4    Seafood Restaurant  0.03


----Bathurst Manor / Wilson Heights / Downsview North----
            venue  freq
0     Coffee Shop  0.09
1     Pizza Place  0.07
2            Park  0.06
3  Sandwich Place  0.06
4      Restaurant  0.04


----Bayview Village----
                venue  freq
0  Chinese Restaurant  0.12
1         Coffee Shop  0.08
2                Park  0.08
3                Bank  0.06
4               Trail  0.04


----Bedford Park / Lawrence Manor East----
                venue  freq
0         Coffee Shop  0.11
1              Bakery  0.07
2    Sushi Restaurant  0.06
3  Italian Restaurant  0.06
4    

4        Beach  0.05


----Islington Avenue----
          venue  freq
0      Pharmacy  0.11
1   Coffee Shop  0.09
2          Park  0.06
3          Bank  0.06
4  Liquor Store  0.04


----Kennedy Park / Ionview / East Birchmount Park----
                  venue  freq
0    Chinese Restaurant  0.07
1         Grocery Store  0.07
2  Fast Food Restaurant  0.07
3           Coffee Shop  0.07
4              Pharmacy  0.05


----Kensington Market / Chinatown / Grange Park----
         venue  freq
0         Café  0.07
1  Coffee Shop  0.06
2  Pizza Place  0.05
3     Beer Bar  0.04
4   Restaurant  0.04


----Kingsview Village / St. Phillips / Martin Grove Gardens / Richview Gardens----
            venue  freq
0     Coffee Shop  0.12
1     Pizza Place  0.08
2        Pharmacy  0.06
3  Sandwich Place  0.06
4     Gas Station  0.05


----Lawrence Manor / Lawrence Heights----
                    venue  freq
0          Clothing Store  0.10
1             Coffee Shop  0.07
2  Furniture / Home Store  0.04
3  

                   venue  freq
0                   Café  0.10
1       Greek Restaurant  0.07
2                   Park  0.06
3                    Pub  0.04
4  Vietnamese Restaurant  0.04


----The Kingsway / Montgomery Road  / Old Mill North----
                venue  freq
0         Coffee Shop  0.10
1  Italian Restaurant  0.06
2              Bakery  0.05
3                 Pub  0.04
4                Café  0.04


----Thorncliffe Park----
               venue  freq
0        Coffee Shop  0.08
1     Sandwich Place  0.05
2      Grocery Store  0.05
3               Park  0.04
4  Indian Restaurant  0.04


----Toronto Dominion Centre / Design Exchange----
                 venue  freq
0          Coffee Shop  0.06
1                Hotel  0.05
2             Beer Bar  0.04
3                 Café  0.04
4  Japanese Restaurant  0.04


----University of Toronto / Harbord----
                venue  freq
0         Coffee Shop  0.08
1  Mexican Restaurant  0.04
2          Restaurant  0.04
3                C

### Let's put that into a pandas dataframe
### First, let's write a function to sort the venues in descending order.

In [166]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [167]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Chinese Restaurant,Coffee Shop,Pharmacy,Restaurant,Indian Restaurant,Cantonese Restaurant,Sandwich Place,Breakfast Spot,Supermarket,Bank
1,Alderwood / Long Branch,Coffee Shop,Fast Food Restaurant,Pizza Place,Department Store,Burger Joint,Seafood Restaurant,Café,Restaurant,Clothing Store,Pharmacy
2,Bathurst Manor / Wilson Heights / Downsview North,Coffee Shop,Pizza Place,Park,Sandwich Place,Bank,Deli / Bodega,Gas Station,Asian Restaurant,Restaurant,Pharmacy
3,Bayview Village,Chinese Restaurant,Coffee Shop,Park,Bank,Clothing Store,Grocery Store,Café,Japanese Restaurant,Gas Station,Trail
4,Bedford Park / Lawrence Manor East,Coffee Shop,Bakery,Italian Restaurant,Sushi Restaurant,Bagel Shop,Sandwich Place,Restaurant,Café,Pizza Place,Department Store


### Cluster Neighborhoods

In [168]:
# set number of clusters
kclusters = 5

Toronto_grouped_clustering = Toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:]

array([4, 4, 4, 4, 3, 0, 4, 1, 1, 1, 3, 4, 4, 0, 1, 0, 4, 4, 0, 3, 3, 4,
       4, 4, 4, 1, 1, 4, 4, 0, 3, 0, 4, 4, 4, 0, 1, 4, 3, 4, 3, 1, 4, 4,
       1, 4, 4, 3, 3, 1, 4, 4, 4, 3, 1, 4, 3, 4, 4, 3, 1, 4, 4, 0, 0, 0,
       3, 3, 4, 1, 1, 4, 4, 0, 0, 4, 0, 1, 3, 1, 3, 1, 3, 4, 0, 0, 2, 4,
       4, 4, 4, 4, 4, 4, 4, 1, 4, 4])

In [169]:
Toronto_merged = df
# add clustering labels
Toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
Toronto_merged = Toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

Toronto_merged.head() # check the last columns!

ValueError: Length of values does not match length of index