# `Capstone Project - Segmenting and Clustering Assignment`

 **Install & Import all required libraries **

In [9]:
!pip install folium
!pip install geopy



In [12]:
import requests
import pandas as pd
import numpy as np
import folium
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim

**Define path details for input files**

In [13]:
input_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
spatial_path = 'http://cocl.us/Geospatial_data/Geospatial_data.csv'

**Below code will read the table data from Wikipedia page**

In [14]:
def get_data(url):
   try:
      result = requests.get(url)
      content_type = result.headers['Content-type'].lower()
      if result.status_code ==200 and content_type is not None and content_type.find('html')>-1:
         return result.content
      else:
         return None
   
   except requests.exception.RequestException as e:
      print('Error during request to {} :{}'.format(url, str(e)))
      return None

**Scrap the required data from link and store it in a list**

In [15]:
url_data = get_data(input_url)
parsed_data = BeautifulSoup(url_data, 'html.parser')

work_list = []
for idx in parsed_data.select('td'):
    if idx.text.isspace():
       break
    else:
       work_list.append(idx.text.strip('\n'))

**Create the data frame with scrapped data**

In [21]:
postcode_list = []
borough_list = []
neigh_list = []
for idx in range(0,len(work_list),3):
    postcode_list.append(work_list[idx])
          
for idx in range(1,len(work_list),3):
    borough_list.append(work_list[idx])

for idx in range(2,len(work_list),3):
    neigh_list.append(work_list[idx])

df_table = pd.DataFrame(columns=['PostalCode','Borough','Neighborhood'])
df_table['PostalCode'] = postcode_list
df_table['Borough'] = borough_list
df_table['Neighborhood'] = neigh_list 

In [22]:
# drops the rows with 'Borough' as 'Not assigned'.
df_table = df_table[df_table['Borough'] != 'Not assigned'] 

# If the 'Neighborhood' has the data as 'Not assigned', then the details from 'Borough' will be copied.
df_table['Neighborhood']=np.where((df_table['Neighborhood']=='NA'),df_table['Borough'],df_table['Neighborhood'])

# If there are multiple rows for a PostalCode, then the data will be combined and seperated by commas
df_table = df_table.groupby('PostalCode',as_index=False).agg(','.join)

In [23]:
print("Shape of dataframe is: ", df_table.shape)
df_table.head(12)

Shape of dataframe is:  (103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


**Populate Latitude and Longitude details**

In [24]:
df_spatial = pd.read_csv(spatial_path)
df_table.set_index('PostalCode',inplace=True)
df_spatial.set_index('Postal Code',inplace=True)
df_table = pd.concat([df_table,df_spatial], axis=1)
df_table.index.names = ['PostalCode']
df_table.reset_index(inplace=True)

In [25]:
df_table.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


**Create a map of Toronto with the Neighborhood details obtained**

In [49]:
x_nom = Nominatim(user_agent='Foursquare_agent')
x_loc = x_nom.geocode("Toronto,Canada")
Toronto_Lat = x_loc.latitude
Toronto_Long = x_loc.longitude
Toronto_map = folium.Map([Toronto_Lat,Toronto_Long],zoom_start =10)
for bor, neigh, lat, long in zip(df_table['Borough'],df_table['Neighborhood'],df_table['Latitude'],df_table['Longitude']):
    folium.CircleMarker(
       [lat,long],
       radius =5,
       color = 'blue',
       popup = folium.Popup('{},{}'.format(bor,neigh),parse_html=True),
       fill_color = 'red',
       fill_opacity = 0.6).add_to(Toronto_map)
Toronto_map

**Extract data with "Boroughs" that contain the word Toronto** 

In [106]:
df_toronto = df_table[df_table['Borough'].str.contains('Toronto')]
df_toronto.reset_index(drop=True,inplace=True)
df_toronto

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


**Neighborhood Analysis**

In [64]:
Toronto_map_1 = folium.Map([Toronto_Lat,Toronto_Long],zoom_start =12)
for bor, neigh, lat, long in zip(df_toronto['Borough'],df_toronto['Neighborhood'],df_toronto['Latitude'],df_toronto['Longitude']):
    folium.CircleMarker(
       [lat,long],
       radius =5,
       color = 'red',
       popup = folium.Popup(bor,parse_html=True),
       fill_color = 'blue',
       fill_opacity = 0.6).add_to(Toronto_map_1)
Toronto_map_1

**Define Foursquare Credentials**

In [69]:
api_client_id = '0WAUW3LYUG1C2AHQKJBAMTWCLGZPXDRTRBCZ3EZI4EEZKZHL'
api_client_secret = 'TRBZA1ORTZTYWOBOPIDNEB0HGNT24RGEKYF3IGK2VV2VBM31'
api_version = '20200601'

In [70]:
explore_api_url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&limit={}&radius={}'
limit = 100
radius = 500

## Explore all the neighborhoods in the dataframe df_toronto

**Create a list of all venues around the neighborhoods**

In [105]:
venues_list = []

for lat, long, name in zip(df_toronto['Latitude'],df_toronto['Longitude'],df_toronto['Neighborhood']):
    url = explore_api_url.format(api_client_id,api_client_secret,api_version,lat,long,limit,radius)
    results = requests.get(url).json()
    for idx in results['response']['groups'][0]['items']:
        venue_name = idx['venue']['name']
        venue_category = idx['venue']['categories'][0]['name']
        venue_lat = idx['venue']['location']['lat']
        venue_long = idx['venue']['location']['lng']
        venues_list.append([name, lat, long, venue_name, venue_category, venue_lat, venue_long])

**Create the dataframe with all venue details**

In [122]:
df_venues = pd.DataFrame( list for list in venues_list)
df_venues.columns = ['Neighborhood','Neighborhood Latitude','Neighborhood Longitude','Venue Name','Venue Category','Venue Latitude', 'Venue Longitude']

In [124]:
print("Shape of df_venue:", df_venues.shape)
df_venues.head(5)

Shape of df_venue: (1627, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Name,Venue Category,Venue Latitude,Venue Longitude
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,Trail,43.676821,-79.293942
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,Pub,43.679181,-79.297215
3,The Beaches,43.676357,-79.293031,Upper Beaches,Neighborhood,43.680563,-79.292869
4,"The Danforth West, Riverdale",43.679557,-79.352188,MenEssentials,Cosmetics Shop,43.67782,-79.351265


**Number of Venues under each Neighborhood**

In [129]:
df_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue Name,Venue Category,Venue Latitude,Venue Longitude
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,58,58,58,58,58,58
"Brockton, Parkdale Village, Exhibition Place",24,24,24,24,24,24
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",14,14,14,14,14,14
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",15,15,15,15,15,15
Central Bay Street,65,65,65,65,65,65
Christie,17,17,17,17,17,17
Church and Wellesley,74,74,74,74,74,74
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,35,35,35,35,35,35
Davisville North,8,8,8,8,8,8


In [132]:
print("Total number of Neighborhood explored: ", len(df_venues['Neighborhood'].unique()))
print("Total number of Venues explored: ", len(df_venues['Venue Category'].unique()))

Total number of Neighborhood explored:  39
Total number of Venues explored:  237


## Analyze the neighborhood data
**Fetch the 10 most commonly visited venues in all Neighborhoods**

In [177]:
df_venues_analyze = pd.get_dummies(df_venues['Venue Category'])
df_venues_analyze.rename(columns={'Neighborhood':'Neighborhood Category'}, inplace=True)
df_venues_analyze = pd.concat([df_venues['Neighborhood'],df_venues_analyze],axis=1)

In [183]:
df_venues_groupby = df_venues_analyze.groupby('Neighborhood',as_index=False).mean()

In [223]:
top_common_venues = 10
temp_list = ['st','nd','rd']
common_columns = ['Neighborhood']
for idx in range(0,top_common_venues):
    if (idx+1) < 4:
       common_columns.append("{}{} Most Common Venue".format((idx+1),temp_list[idx]))
    else:
       common_columns.append("{}th Most Common Venue".format((idx+1)))
                             
df_common_venues = pd.DataFrame(columns=common_columns)     
df_common_venues['Neighborhood'] = df_venues_groupby['Neighborhood']

for idx in range(0,df_venues_groupby.shape[0]):
    row = df_venues_groupby.iloc[idx,:]
    df_common_venues.iloc[idx,1:] = row.iloc[1:].sort_values(ascending=False).head(10).index.values
    
df_common_venues

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Cheese Shop,Café,Restaurant,Farmers Market,Beer Bar,Seafood Restaurant,Gourmet Shop
1,"Brockton, Parkdale Village, Exhibition Place",Café,Coffee Shop,Breakfast Spot,Nightclub,Yoga Studio,Stadium,Burrito Place,Restaurant,Climbing Gym,Pet Store
2,"Business reply mail Processing Centre, South C...",Light Rail Station,Garden,Comic Shop,Pizza Place,Restaurant,Burrito Place,Brewery,Skate Park,Farmers Market,Fast Food Restaurant
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Boutique,Plane,Airport,Airport Food Court,Airport Gate,Airport Terminal,Sculpture Garden,Harbor / Marina
4,Central Bay Street,Coffee Shop,Café,Sandwich Place,Italian Restaurant,Burger Joint,Department Store,Bubble Tea Shop,Salad Place,Japanese Restaurant,Ramen Restaurant
5,Christie,Grocery Store,Café,Park,Restaurant,Baby Store,Diner,Athletics & Sports,Italian Restaurant,Candy Store,Coffee Shop
6,Church and Wellesley,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Gay Bar,Restaurant,Pub,Men's Store,Mediterranean Restaurant,Hotel,Yoga Studio
7,"Commerce Court, Victoria Hotel",Coffee Shop,Restaurant,Café,Hotel,Gym,American Restaurant,Seafood Restaurant,Japanese Restaurant,Deli / Bodega,Beer Bar
8,Davisville,Pizza Place,Sandwich Place,Dessert Shop,Sushi Restaurant,Gym,Coffee Shop,Café,Italian Restaurant,Park,Gourmet Shop
9,Davisville North,Food & Drink Shop,Gym,Park,Breakfast Spot,Pizza Place,Department Store,Hotel,Sandwich Place,Yoga Studio,Diner


## Clustering the Neighborhood

In [237]:
from sklearn.cluster import KMeans
X = df_venues_groupby.drop(['Neighborhood'],axis=1)
no_of_clusters = 4
km = KMeans(init="k-means++",n_clusters=no_of_clusters,n_init=1)
km.fit(X)
km.labels_[:10]
df_common_venues.insert(0,'Cluster Label',km.labels_)

In [233]:
# Add the label details to Common venues Dataframe
df_common_venues.head()

Unnamed: 0,Cluster Label,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Cheese Shop,Café,Restaurant,Farmers Market,Beer Bar,Seafood Restaurant,Gourmet Shop
1,0,"Brockton, Parkdale Village, Exhibition Place",Café,Coffee Shop,Breakfast Spot,Nightclub,Yoga Studio,Stadium,Burrito Place,Restaurant,Climbing Gym,Pet Store
2,0,"Business reply mail Processing Centre, South C...",Light Rail Station,Garden,Comic Shop,Pizza Place,Restaurant,Burrito Place,Brewery,Skate Park,Farmers Market,Fast Food Restaurant
3,0,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Boutique,Plane,Airport,Airport Food Court,Airport Gate,Airport Terminal,Sculpture Garden,Harbor / Marina
4,0,Central Bay Street,Coffee Shop,Café,Sandwich Place,Italian Restaurant,Burger Joint,Department Store,Bubble Tea Shop,Salad Place,Japanese Restaurant,Ramen Restaurant


**The Toronto Neighborhood DataFrame is updated with most commonly visited places**

In [245]:
df_toronto_updated = df_toronto.join(df_common_venues.set_index('Neighborhood'), on='Neighborhood')

## Visualize the Cluster details

In [246]:
toronto_map_2 = folium.Map([Toronto_Lat,Toronto_Long],zoom_start=12)
import matplotlib.cm as cm
import matplotlib.colors as colors

# set color scheme for the clusters
x = np.arange(no_of_clusters)
ys = [i + x + (i*x)**2 for i in range(no_of_clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Create the markers

for lat, long, name, clusters in zip(df_toronto_updated['Latitude'], df_toronto_updated['Longitude'],df_toronto_updated['Neighborhood'],df_toronto_updated['Cluster Label']):
    folium.CircleMarker([lat,long],
    color = rainbow[clusters-1],
    fill = True,
    fill_color = rainbow[clusters-1],
    fill_opacity = '0.8',
    popup = folium.Popup(str(name) + 'Cluster' + str(clusters), parse_html =True)
    ).add_to(toronto_map_2)

toronto_map_2

## Cluster-1 - restaurants and eateries are most commonly visited places

In [267]:
df_temp = df_toronto_updated[df_toronto_updated['Cluster Label']==0]
df_temp.loc[:][df_temp.columns[[1,5,6,7,8,9,10,11,12,13,14,15]]]

Unnamed: 0,Borough,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,0,Trail,Pub,Health Food Store,Neighborhood Category,Dog Run,Dessert Shop,Diner,Discount Store,Distribution Center,Yoga Studio
1,East Toronto,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Furniture / Home Store,Restaurant,Ice Cream Shop,Cosmetics Shop,Brewery,Bubble Tea Shop,Café
3,East Toronto,0,Café,Coffee Shop,American Restaurant,Bakery,Brewery,Gastropub,Yoga Studio,Fish Market,Pet Store,Park
6,Central Toronto,0,Clothing Store,Coffee Shop,Sporting Goods Shop,Health & Beauty Service,Ice Cream Shop,Fast Food Restaurant,Diner,Metro Station,Mexican Restaurant,Park
7,Central Toronto,0,Pizza Place,Sandwich Place,Dessert Shop,Sushi Restaurant,Gym,Coffee Shop,Café,Italian Restaurant,Park,Gourmet Shop
9,Central Toronto,0,Pub,Coffee Shop,Restaurant,Bank,Sushi Restaurant,Bagel Shop,Fried Chicken Joint,Sports Bar,Pizza Place,Supermarket
11,Downtown Toronto,0,Coffee Shop,Pizza Place,Italian Restaurant,Bakery,Restaurant,Market,Café,Pub,General Entertainment,Diner
12,Downtown Toronto,0,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Gay Bar,Restaurant,Pub,Men's Store,Mediterranean Restaurant,Hotel,Yoga Studio
13,Downtown Toronto,0,Coffee Shop,Bakery,Pub,Café,Park,Breakfast Spot,Theater,Yoga Studio,Farmers Market,Restaurant
14,Downtown Toronto,0,Clothing Store,Coffee Shop,Café,Cosmetics Shop,Bubble Tea Shop,Japanese Restaurant,Lingerie Store,Bookstore,Plaza,Pizza Place


## Cluster-2 - Indoor Recreational places are most visited in this cluster

In [249]:
df_toronto_updated.loc[df_toronto_updated['Cluster Label'] == 1 , df_toronto_updated.columns[[1] + list(range(4,df_toronto_updated.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,Central Toronto,-79.38316,1,Tennis Court,Yoga Studio,Department Store,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


## Cluster-3 - Outdoor area is visited most commonly and then followed by Recreational places

In [250]:
df_toronto_updated.loc[df_toronto_updated['Cluster Label'] == 2 , df_toronto_updated.columns[[1] + list(range(4,df_toronto_updated.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,Central Toronto,-79.416936,2,Garden,Yoga Studio,Department Store,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


## Cluster-4 - Pub, Park and transport stations are most visited in this cluster

In [251]:
df_toronto_updated.loc[df_toronto_updated['Cluster Label'] == 3 , df_toronto_updated.columns[[1] + list(range(4,df_toronto_updated.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,East Toronto,-79.315572,3,Park,Pub,Liquor Store,Sandwich Place,Burrito Place,Italian Restaurant,Fast Food Restaurant,Restaurant,Fish & Chips Shop,Steakhouse
4,Central Toronto,-79.38879,3,Lawyer,Swim School,Bus Line,Park,Gift Shop,Event Space,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
5,Central Toronto,-79.390197,3,Food & Drink Shop,Gym,Park,Breakfast Spot,Pizza Place,Department Store,Hotel,Sandwich Place,Yoga Studio,Diner
10,Downtown Toronto,-79.377529,3,Park,Playground,Trail,Deli / Bodega,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant
38,East Toronto,-79.321558,3,Light Rail Station,Garden,Comic Shop,Pizza Place,Restaurant,Burrito Place,Brewery,Skate Park,Farmers Market,Fast Food Restaurant
