## Business in the Neighborhood

####  Web Scraping and converting data into Dataframe

In [2]:
import pandas as pd
import numpy as np
import json
import wikipedia as wiki

# Obtain data https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
html = wiki.page("List of postal codes of Canada: M").html().encode("UTF-8")

# The first table on the page is parsed; its first row is used as the header.
df = pd.read_html(html, header=0)[0]

# Ignore cells with a borough - Not assigned
df = df[df.Borough != 'Not assigned']

# Combine all neighbourhoods that share a postcode/borough into one
# comma-separated row.
df = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)

# Where the neighbourhood is 'Not assigned', fall back to the borough name.
# This must be a .loc assignment: the rows yielded by DataFrame.iterrows() are
# copies, so assigning to them never changes the frame itself.
not_assigned = df['Neighbourhood'] == 'Not assigned'
df.loc[not_assigned, 'Neighbourhood'] = df.loc[not_assigned, 'Borough']

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


####  Importing libraries and adding location data into dataframe

In [3]:
from geopy.geocoders import Nominatim
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium

In [4]:

# Lift pandas' display limits so frames are rendered without truncating
# columns or rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)


In [5]:
# Download the geospatial CSV (requires network access); the following cell
# reads its 'Latitude' and 'Longitude' columns.
postal_code=pd.read_csv('http://cocl.us/Geospatial_data')

In [6]:
# Attach the coordinate columns to the postcode table.
# NOTE(review): the values are matched purely by row index, not by postal
# code, so this is only correct if both frames list the postcodes in the same
# order -- TODO confirm, or join on the postal-code key instead.
df = df.assign(
    Latitude=postal_code['Latitude'],
    Longitude=postal_code['Longitude'],
)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


#### User input data (borough, business)

In [7]:
# The first answer is compared against the 'Borough' column later on, so the
# prompt asks for a borough. Surrounding whitespace is stripped because an
# accidental leading/trailing space would make that exact-match filter empty.
borough = input("Enter preferred borough in Toronto (e.g. North York): ").strip()
# Free-text search term forwarded to the venue search.
business = input("Enter business type (e.g. restaurant,mall) ").strip()


Enter preferred neighborhood in TorontoNorth York
Enter business type (e.g. restaurant,mall) Restaurant


#### Getting client details for the Foursquare API

In [15]:
import os

# Credentials are read from the environment so that they are never stored in
# the notebook; both fall back to an empty string when the variable is unset.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180604'  # value sent as the API's `v` parameter
LIMIT = 30  # value sent as the API's `limit` parameter
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Never echo the secret itself: cell outputs are saved with the notebook.
print('CLIENT_SECRET:' + ('********' if CLIENT_SECRET else ''))

Your credentails:
CLIENT_ID: 
CLIENT_SECRET:


In [9]:
# Keep only the rows of the user-selected borough, renumbered from zero.
in_selected_borough = df['Borough'].eq(borough)
borough_data = df.loc[in_selected_borough].reset_index(drop=True)

#### Using geopy to get the location coordinates of the user selected borough

In [10]:
address = borough+', Toronto'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
# geocode() yields None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
# Report the borough the user actually entered rather than a fixed name.
print('The geograpical coordinate of {} are {}, {}.'.format(borough, latitude, longitude))

The geograpical coordinate of North York are 43.7708175, -79.4132998.


#### Obtaining the venues data with respect to every location in the borough

In [11]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect Foursquare venues around each of the given locations.

    For every (name, latitude, longitude) triple one request is sent to the
    Foursquare ``venues/explore`` endpoint, using the module-level
    ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION``, ``LIMIT`` and ``business``
    values as request parameters.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location to search around.
    radius : int, default 500
        Value passed as the request's ``radius`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        is empty (but still has these columns) when nothing is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    columns = ['Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build the query string so that the user-supplied
        # search term is URL-encoded correctly.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'query': business,
            'radius': radius,
            'limit': LIMIT,
        }

        # A timeout keeps an unresponsive server from hanging the notebook;
        # raise_for_status surfaces API errors instead of a later KeyError.
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()

        # Tolerate responses that contain no groups or items.
        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may come back without any category
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works for zero rows,
    # whereas assigning `.columns` on an empty frame raises ValueError.
    return pd.DataFrame(rows, columns=columns)

In [12]:
# Look up venues around every neighbourhood of the selected borough
# (the default search radius of getNearbyVenues is used).
borough_venues = getNearbyVenues(
    borough_data['Neighbourhood'],
    borough_data['Latitude'],
    borough_data['Longitude'],
)

Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
Silver Hills, York Mills
Newtonbrook, Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park, Don Mills South
Bathurst Manor, Downsview North, Wilson Heights
Northwood Park, York University
CFB Toronto, Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Bedford Park, Lawrence Manor East
Lawrence Heights, Lawrence Manor
Glencairn
Downsview, North Park, Upwood Park
Humber Summit
Emery, Humberlea


In [13]:
# `bv` is a second name for the same DataFrame object (no copy is made); it is
# used further down when averaging the coordinates per neighbourhood.
bv=borough_venues
borough_venues.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Hillcrest Village,43.803762,-79.363452,New York Fries,43.803664,-79.363905,Fast Food Restaurant
1,Hillcrest Village,43.803762,-79.363452,Villa Madina,43.801685,-79.363938,Mediterranean Restaurant
2,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,Hero Certified Burgers,43.777295,-79.344584,Burger Joint
3,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,Michel's Baguette,43.777082,-79.344557,Bakery
4,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,New York Fries,43.778298,-79.343267,Fast Food Restaurant


#### Getting the frequency of the searched venue type in each neighbourhood

In [14]:
# Count the non-null entries of every column per neighbourhood, then turn the
# group key back into an ordinary column.
final = (
    borough_venues
    .groupby('Neighbourhood')
    .count()
    .reset_index()
)
final.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Bathurst Manor, Downsview North, Wilson Heights",9,9,9,9,9,9
1,Bayview Village,3,3,3,3,3,3
2,"Bedford Park, Lawrence Manor East",21,21,21,21,21,21
3,"CFB Toronto, Downsview East",1,1,1,1,1,1
4,Don Mills North,4,4,4,4,4,4


#### Data Modelling using k-means algorithm

In [16]:
from sklearn.cluster import KMeans

In [55]:
# set number of clusters
kclusters = 4

# Cluster on the count columns only. `columns=` is used because the positional
# axis argument of DataFrame.drop (`drop(label, 1)`) was removed in pandas 2.0.
n_grouped_clustering = final.drop(columns='Neighbourhood')

# run k-means clustering (fixed random_state for reproducible labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(n_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:20] 

array([2, 0, 3, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0])

#### Adding cluster label to the original data set

In [65]:
# Average the coordinates per neighbourhood. numeric_only=True skips the text
# columns ('Venue', 'Venue Category'); without it pandas 2.x raises a
# TypeError when asked for the mean of strings.
borough_grouped = bv.groupby('Neighbourhood').mean(numeric_only=True).reset_index()

# add clustering labels
# The labels line up with these rows because both this frame and the frame
# the model was fitted on come from a groupby on 'Neighbourhood' of the same
# data, which yields the groups in the same sorted order.
borough_grouped.insert(3,'Cluster Labels',kmeans.labels_)

borough_grouped

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Cluster Labels,Venue Latitude,Venue Longitude
0,"Bathurst Manor, Downsview North, Wilson Heights",43.754328,-79.442259,2,43.755316,-79.440895
1,Bayview Village,43.786947,-79.385975,0,43.787845,-79.381025
2,"Bedford Park, Lawrence Manor East",43.733282,-79.41975,3,43.733725,-79.419436
3,"CFB Toronto, Downsview East",43.737473,-79.464763,0,43.737632,-79.469056
4,Don Mills North,43.745906,-79.352188,0,43.746091,-79.346766
5,Downsview Central,43.728496,-79.495697,0,43.725474,-79.497566
6,Downsview Northwest,43.761631,-79.520999,0,43.758039,-79.51997
7,"Downsview, North Park, Upwood Park",43.713756,-79.490074,0,43.715481,-79.490085
8,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,1,43.77782,-79.344125
9,"Flemingdon Park, Don Mills South",43.7259,-79.340923,2,43.726201,-79.34069


#### Mapping the borough along with all the clusters in all neighborhood

In [68]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(borough_grouped['Neighbourhood Latitude'], borough_grouped['Neighbourhood Longitude'], borough_grouped['Neighbourhood'],borough_grouped['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the palette directly; the
    # former `cluster-1` made label 0 wrap around to the last colour.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [72]:
# Show every column of the neighbourhoods that were assigned to cluster 0.
borough_grouped[borough_grouped['Cluster Labels'] == 0]

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Cluster Labels,Venue Latitude,Venue Longitude
1,Bayview Village,43.786947,-79.385975,0,43.787845,-79.381025
3,"CFB Toronto, Downsview East",43.737473,-79.464763,0,43.737632,-79.469056
4,Don Mills North,43.745906,-79.352188,0,43.746091,-79.346766
5,Downsview Central,43.728496,-79.495697,0,43.725474,-79.497566
6,Downsview Northwest,43.761631,-79.520999,0,43.758039,-79.51997
7,"Downsview, North Park, Upwood Park",43.713756,-79.490074,0,43.715481,-79.490085
10,Glencairn,43.709577,-79.445073,0,43.708275,-79.443492
11,Hillcrest Village,43.803762,-79.363452,0,43.802674,-79.363922
12,Humber Summit,43.756303,-79.565963,0,43.757754,-79.569311
13,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,0,43.7195,-79.465477


#### Eliminating the rows with the cluster of highest frequency and lowest frequency data

In [89]:
# `bn` refers to the same object as `borough_grouped`, so the in-place drops
# below also remove the rows from `borough_grouped`.
bn = borough_grouped
# NOTE(review): cluster labels 0 and 1 are hard-coded here -- confirm they are
# still the clusters to discard whenever the clustering is re-run.
for discarded_label in (0, 1):
    bn.drop(bn.index[bn['Cluster Labels'] == discarded_label], inplace=True)
bn

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Cluster Labels,Venue Latitude,Venue Longitude
0,"Bathurst Manor, Downsview North, Wilson Heights",43.754328,-79.442259,2,43.755316,-79.440895
2,"Bedford Park, Lawrence Manor East",43.733282,-79.41975,3,43.733725,-79.419436
9,"Flemingdon Park, Don Mills South",43.7259,-79.340923,2,43.726201,-79.34069


#### Final output

In [90]:
# create map of the remaining neighbourhoods
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(borough_grouped['Neighbourhood Latitude'], borough_grouped['Neighbourhood Longitude'], borough_grouped['Neighbourhood'],borough_grouped['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the palette directly; the
    # former `cluster-1` made label 0 wrap around to the last colour.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters