# Week3 Capstone Segmenting Clustering

Olakunle Kadri


### #1. Web Scraping Task

__Import Needed Modules__

In [None]:
import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import requests 

import lxml.html as lh

#!conda install -c conda-forge geocoder --yes # uncomment this line if you haven't completed the Foursquare API lab
import geocoder
import sys

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# import k-means from clustering stage
from sklearn.cluster import KMeans

import numpy as np

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors


print('Libraries imported.')

__Scrape Table Content from the Web__

In [None]:
# Wikipedia page listing the "M" postal codes with their borough and neighbourhood.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops an HTTP error page from being parsed as if it
# were the postal-code table.
page = requests.get(url, timeout=30)
page.raise_for_status()
doc = lh.fromstring(page.content)

tr_elements = doc.xpath('//tr')  # every <tr>..</tr> element of the HTML document

[len(T) for T in tr_elements[:10]]  # number of child cells in each of the first 10 rows

__Extract Data from the Table__

In [None]:
# Turn every <tr> after the first into a list of stripped cell texts, then
# keep only the three-cell rows that have a non-empty first cell and whose
# second cell is not 'Not assigned'.
stripped_rows = (
    [cell.text_content().strip() for cell in row.iterchildren()]
    for row in tr_elements[1:]
)
dataset = [
    cells
    for cells in stripped_rows
    if len(cells) == 3 and cells[0] != '' and cells[1] != 'Not assigned'
]


__Create the Dataframe and Add Column names__

In [None]:
# Name the three scraped fields and load the rows into a DataFrame.
column_names = ['PostalCode', 'Borough', 'Neighborhood']
datadf = pd.DataFrame(data=dataset, columns=column_names)

datadf.head(2)

In [None]:
# Descriptive statistics for the columns of the scraped table.
datadf.describe()

__Print the Number of Rows__

In [None]:
# shape[0] is the number of rows kept after filtering the scraped table.
print(f'Number of Rows: {datadf.shape[0]}')

### #2. Get the Latitude and the Longitude Coordinates of each Neighborhood

__Download the Geospatial_data__

In [None]:
# Notebook shell escape: download the coordinates CSV quietly (-q) and save it
# as geocoord_for_postal_code.csv (-O).
!wget -q -O geocoord_for_postal_code.csv http://cocl.us/Geospatial_data

# NOTE(review): this message is printed unconditionally; it does not check
# whether the download succeeded.
print('Data downloaded!')    

__Extract geographical coordinates of each postal code__

In [None]:
# Load the downloaded coordinates file and preview its first two rows.
coordinates_csv = 'geocoord_for_postal_code.csv'
geocoord_df = pd.read_csv(coordinates_csv)

geocoord_df.head(2)

__Merge scraped data with the geographical coordinates based on Postal Code__

In [None]:
# Left join: keep every scraped row and attach the coordinate row whose
# 'Postal Code' equals the scraped 'PostalCode'.
datadf_merged_left = datadf.merge(
    geocoord_df,
    how='left',
    left_on='PostalCode',
    right_on='Postal Code',
)
datadf_merged_left.head(5)

### #3. Explore and cluster the neighborhoods in Toronto 

In [None]:
# Keep only the columns used by the exploration and clustering steps.
kept_columns = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']
datadf_merged_left = datadf_merged_left[kept_columns]
datadf_merged_left.head(2)

In [None]:
# Report the number of distinct boroughs and the total number of rows.
borough_count = len(datadf_merged_left['Borough'].unique())
record_count = datadf_merged_left.shape[0]
print('The dataframe has {} boroughs and {} records.'.format(borough_count, record_count))

__Let's work with North York Region__ 

In [None]:
# Rows whose borough is North York, re-indexed from 0.
is_north_york = datadf_merged_left['Borough'] == 'North York'
northyork_data = datadf_merged_left[is_north_york].reset_index(drop=True)
northyork_data.head(2)

In [None]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

address = 'North York, ON'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the address cannot be resolved; fail with a
# clear message instead of an AttributeError on the attribute reads below.
if location is None:
    raise ValueError(f'Could not geocode address: {address!r}')

latitude = location.latitude
longitude = location.longitude
print(f'The geograpical coordinate of {address} are {latitude}, {longitude}.')

In [None]:
# Folium map centred on the geocoded latitude/longitude.
map_northyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle per row of northyork_data, with a "<neighborhood>, <borough>" popup.
marker_fields = northyork_data[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, lng, borough, neighborhood in marker_fields.itertuples(index=False, name=None):
    popup_text = '{}, {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_northyork)

map_northyork

#### Define Foursquare Credentials and Version

In [None]:
import os

# Foursquare credentials are read from the environment so that the secret is
# never stored in the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hard-coded in this cell has been
# published together with the file and should be revoked / regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20200601' # Foursquare API version

if not CLIENT_ID or not CLIENT_SECRET:
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'the Foursquare requests in the following cells need them.')

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Show only the length of the secret, never its value.
print('CLIENT_SECRET:' + '*' * len(CLIENT_SECRET))

In [None]:
# Neighborhood value of the row labelled 0 in northyork_data.
northyork_data.loc[0, 'Neighborhood']

In [None]:
first_row = 0  # index label of the row whose values are read below

neighborhood_name = northyork_data.at[first_row, 'Neighborhood']  # neighborhood name
neighborhood_latitude = northyork_data.at[first_row, 'Latitude']  # neighborhood latitude value
neighborhood_longitude = northyork_data.at[first_row, 'Longitude']  # neighborhood longitude value

message = 'Latitude and longitude values of {} are {}, {}.'
print(message.format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))

In [None]:
LIMIT = 500  # value sent as the `limit` query parameter
radius = 500  # value sent as the `radius` query parameter

# Fill the explore-endpoint template with the credentials, API version,
# coordinates of the selected neighbourhood, radius and limit.
explore_template = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'
explore_args = (
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    radius,
    LIMIT,
)
url = explore_template.format(*explore_args)
url  # display URL

In [None]:
# Send the explore request and decode the JSON body of the response.
results = requests.get(url).json()
# Size of the decoded top-level object; the venue items themselves are read
# from results['response'] in a later cell, so this is not a venue count.
len(results)

In [None]:
def get_category_type(row):
    """Return the name of the first category of a venue record, or None.

    The categories list is read from ``row['categories']`` and, when that
    key is absent, from ``row['venue.categories']``.

    Args:
        row: a key-indexable record (for example a dict or a DataFrame row).

    Returns:
        The ``'name'`` of the first category, or ``None`` when the
        categories value is empty or ``None``.

    Raises:
        KeyError: if neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key should trigger the fallback; any other error is
        # a real problem and must not be hidden.
        categories_list = row['venue.categories']

    if not categories_list:
        return None
    return categories_list[0]['name']

In [None]:
# Venue entries of the first group in the explore response.
venues = results['response']['groups'][0]['items']

# flatten JSON; pd.json_normalize is the public entry point, so this cell does
# not depend on the deprecated pandas.io.json.json_normalize import path
nearby_venues = pd.json_normalize(venues)

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row: replace the categories list by the name
# of its first entry
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns: keep only the text after the last dot of each column name
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

In [None]:
# Number of rows in the nearby_venues table.
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

#### Explore Neighborhoods in North York

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the Foursquare venues returned around each neighbourhood.

    Uses the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT values
    to build each request.

    Args:
        names: iterable of neighbourhood names.
        latitudes: iterable of latitudes, aligned with ``names``.
        longitudes: iterable of longitudes, aligned with ``names``.
        radius: value sent as the ``radius`` query parameter.

    Returns:
        A DataFrame with one row per returned venue and the columns
        'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty, but still has these columns, when no venue is
        returned. 'Venue Category' is None for a venue without categories.

    Raises:
        requests.HTTPError: if the API answers with an HTTP error status.
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; the timeout prevents one stalled call from
        # blocking the whole loop, and an HTTP error is raised as such rather
        # than surfacing later as a KeyError on the response body
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may carry an empty categories list
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the schema intact
    # even when no rows were collected.
    return pd.DataFrame(rows, columns=column_names)

In [None]:
# Collect the venues around every North York neighbourhood, using the
# function's default radius.

northyork_venues = getNearbyVenues(names=northyork_data['Neighborhood'],
                                   latitudes=northyork_data['Latitude'],
                                   longitudes=northyork_data['Longitude']
                                  )


In [None]:
# (rows, columns) of the collected venue table, followed by its first rows.
print(northyork_venues.shape)
northyork_venues.head()

__Let's check how many venues were returned for each neighborhood__

In [None]:
# Per-neighbourhood count of non-null values in each remaining column.
northyork_venues.groupby('Neighborhood').count()

__Let's find out how many unique categories can be curated from all the returned venues__

In [None]:
# Number of distinct values in the 'Venue Category' column.
print('There are {} uniques categories.'.format(len(northyork_venues['Venue Category'].unique())))

## 3. Analyze Each Neighborhood

In [None]:
# one hot encoding: one indicator column per venue category
northyork_onehot = pd.get_dummies(northyork_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
# NOTE: if a venue category is itself named 'Neighborhood', this assignment
# overwrites that indicator column in place instead of appending a new one.
northyork_onehot['Neighborhood'] = northyork_venues['Neighborhood']

# move neighborhood column to the first column; it is selected by name because
# it is only the last column when no category is called 'Neighborhood'
fixed_columns = ['Neighborhood'] + [col for col in northyork_onehot.columns if col != 'Neighborhood']
northyork_onehot = northyork_onehot[fixed_columns]

northyork_onehot.head()

In [None]:
# (rows, columns) of the one-hot encoded table.
northyork_onehot.shape

#### Now, let's group rows by neighborhood and take the mean of the frequency of occurrence of each category

In [None]:
# Average every indicator column within each neighbourhood, then turn the
# group key back into a regular column.
per_neighborhood = northyork_onehot.groupby('Neighborhood')
northyork_grouped = per_neighborhood.mean().reset_index()
northyork_grouped.head(2)

In [None]:
# (rows, columns) of the per-neighbourhood frequency table.
northyork_grouped.shape

#### Let's print each neighborhood along with the top 5 most common venues

In [None]:
num_top_venues = 5

# For each neighbourhood, print its categories ranked by mean frequency.
for hood in northyork_grouped['Neighborhood']:
    print("----"+hood+"----")
    hood_rows = northyork_grouped[northyork_grouped['Neighborhood'] == hood]
    # Transpose so that each category becomes a row with its frequency.
    temp = hood_rows.T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]  # drop the leading 'Neighborhood' entry
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

#### Let's put that into a *pandas* dataframe

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``.

    The first entry of ``row`` is skipped; the remaining entries are sorted
    in descending order and the index labels of the leading
    ``num_top_venues`` entries are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # The first three ranks take their own suffix and every later rank takes
    # 'th'; an explicit bounds check replaces the former catch-all except.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighbourhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = northyork_grouped['Neighborhood']

# fill the ranked-venue columns of each row
for ind in np.arange(northyork_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(northyork_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

## 4. Cluster Neighborhoods

__Run *k*-means to cluster the neighborhoods into 5 clusters.__

In [None]:
# set number of clusters
kclusters = 5

# Feature matrix: every column except the neighbourhood name. The column is
# dropped by keyword because newer pandas releases no longer accept the axis
# as a positional argument of DataFrame.drop.
northyork_grouped_clustering = northyork_grouped.drop(columns='Neighborhood')

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(northyork_grouped_clustering)

# check cluster labels generated for the first 10 rows in the dataframe
kmeans.labels_[0:10]

__Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.__

In [None]:
# add clustering labels as the first column of the ranked-venues table
neighborhoods_venues_sorted.insert(loc=0, column='Cluster Labels', value=kmeans.labels_)

# attach the cluster label and ranked venues to each North York row (which
# carries the latitude/longitude), matching on the 'Neighborhood' value
venues_by_neighborhood = neighborhoods_venues_sorted.set_index('Neighborhood')
northyork_merged = northyork_data.join(venues_by_neighborhood, on='Neighborhood')

northyork_merged.head(2) # check the last columns!


In [None]:
# Store each cluster label incremented by one.
northyork_merged['Cluster Labels'] = northyork_merged['Cluster Labels'].add(1)

In [None]:
# First two rows of the merged table.
northyork_merged[:2]

__Finally, let's visualize the resulting clusters__

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []

# names of the neighbourhoods that could not be drawn
bad = []

for lat, lon, poi, cluster in zip(northyork_merged['Latitude'],
                                  northyork_merged['Longitude'],
                                  northyork_merged['Neighborhood'],
                                  northyork_merged['Cluster Labels']):

    # A neighbourhood without a matching row in the joined table has no
    # label (NaN) and therefore no colour; record it and keep going instead
    # of aborting the whole loop.
    if pd.isna(cluster):
        bad.append(poi)
        continue

    # The join can leave the labels as floats; list indices must be ints.
    cluster = int(cluster)
    # Labels were shifted by one in an earlier cell, so label 1 maps to the
    # first colour; the modulo keeps the index inside the palette.
    marker_color = rainbow[(cluster - 1) % len(rainbow)]

    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)


map_clusters

## 5. Examine Clusters

In [None]:
# Print the first member of every cluster that actually occurs in the merged
# table. The labels are taken from the data rather than from a fixed range,
# so no cluster is skipped whatever numbering the labels use.
cluster_labels = sorted(northyork_merged['Cluster Labels'].dropna().unique())

# Column 1 plus every column from position 5 onwards.
display_columns = northyork_merged.columns[[1] + list(range(5, northyork_merged.shape[1]))]

for n in cluster_labels:

    cluster_df = northyork_merged.loc[northyork_merged['Cluster Labels'] == n, display_columns]

    print(f'\n#{int(n)} --------------------------------------------------\n')
    print(cluster_df[:1])