<a href="https://colab.research.google.com/github/jayeshmanani/Coursera_Capstone/blob/master/Coursera_Capstone_week_3_neighborhoods_in_Toronto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

## Scraping the Data From Wiki Page

In [0]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [0]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes
res = requests.get(url)
res.raise_for_status()  # stop here on an HTTP error rather than parsing an error page

In [0]:
# Parse the downloaded HTML text with the lxml parser
soup = BeautifulSoup(res.text, 'lxml')

In [0]:
# Keep the first <table> element found in the page
table = soup.find_all('table')[0]

In [0]:
# Pre-allocate an empty three-column frame with one row per <tr> of the scraped table
new_table = pd.DataFrame(columns=['Postcode','Borough','Neighbourhood'],index=range(len(table.find_all('tr')))) # row count is known up front


In [60]:
new_table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,,,
1,,,
2,,,
3,,,
4,,,


In [0]:
# Copy the stripped text of every <td> into the matching (row, column) slot of new_table.
# Rows without any <td> cells are simply left untouched.
for row_idx, table_row in enumerate(table.find_all('tr')):
  for col_idx, cell in enumerate(table_row.find_all('td')):
    new_table.iat[row_idx, col_idx] = cell.get_text().strip()

In [0]:
# Drop the first row. NOTE(review): presumably this is the header <tr>, which has no <td> cells — confirm.
# Re-running this cell drops one more row each time, because the result is bound to the same name.
new_table = new_table[1:]

In [64]:
new_table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


#### Filtering out the rows where the **Borough** is **Not assigned**

In [0]:
# Keep only the rows whose Borough is something other than 'Not assigned'
new_table2 = new_table[new_table['Borough']!='Not assigned']

In [0]:
# Renumber the rows from 0 and discard the old index instead of keeping it as a column
new_table2 = new_table2.reset_index(drop=True)

In [75]:
new_table2.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Not assigned
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


#### Assigning the Borough name to the **"Not assigned"** values in the Neighbourhood column

In [0]:
# Rows with no neighbourhood inherit their borough name.
# A single .loc assignment writes into new_table2 itself; the previous chained
# `iloc[x]['Neighbourhood'] = ...` assigned into a temporary row object and is
# not guaranteed to change the frame.
unassigned = new_table2['Neighbourhood'] == 'Not assigned'
new_table2.loc[unassigned, 'Neighbourhood'] = new_table2.loc[unassigned, 'Borough']

In [98]:
new_table2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


#### Check out the shape of the data we have now

In [88]:
new_table2.shape

(210, 3)

#### Now get the latitude and longitude of each postal code from the CSV file

In [0]:
# Load the coordinates table from a CSV file in the working directory
geo_code_csv = pd.read_csv('Geospatial_Coordinates.csv')

In [97]:
geo_code_csv.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [102]:
geo_code_csv.shape

(103, 3)

#### Now map each postal code in our borough table to its coordinates

In [0]:
# Build one postal-code-indexed lookup instead of filtering the whole CSV for every row.
# drop_duplicates keeps the first entry per code, matching the previous `.values[0]` choice.
# A postcode missing from the CSV now yields NaN rather than raising IndexError.
geo_lookup = geo_code_csv.drop_duplicates('Postal Code').set_index('Postal Code')
new_table2['latitude'] = new_table2['Postcode'].map(geo_lookup['Latitude'])
new_table2['Longitude'] = new_table2['Postcode'].map(geo_lookup['Longitude'])

In [108]:
## New dataframe with Latitude and Longitude of the Pincode

new_table2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Heights,43.718518,-79.464763
4,M6A,North York,Lawrence Manor,43.718518,-79.464763


#### Analyse the data to find some insights

In [111]:
# Report the number of distinct boroughs and the number of rows (one row per neighbourhood entry)
borough_count = len(new_table2['Borough'].unique())
neighbourhood_count = new_table2.shape[0]
print('The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, neighbourhood_count))

The dataframe has 11 boroughs and 210 neighborhoods.


In [114]:
from geopy.geocoders import Nominatim
address = 'Toronto, Ontario'

# Geocode the address with Nominatim; the result is used as the map centre further down
geolocator = Nominatim(user_agent="canada_explorer")
location = geolocator.geocode(address)
# NOTE(review): presumably geocode() can return None when nothing matches — confirm and guard if so
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Create a map of Toronto with neighborhoods superimposed on top.

In [121]:
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium

# create map of Toronto centred on the geocoded latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per row, with a "<neighbourhood>, <borough>" popup
for lat, lng, borough, neighborhood in zip(new_table2['latitude'], new_table2['Longitude'], new_table2['Borough'], new_table2['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    # NOTE(review): parse_html is also passed to CircleMarker below — confirm it is a supported option there
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [123]:
new_table2['Borough'].unique()

array(['North York', 'Downtown Toronto', "Queen's Park", 'Scarborough',
       'East York', 'Etobicoke', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

### Exploring the **Mississauga** Borough of Ontario

In [124]:
from geopy.geocoders import Nominatim
address = 'Mississauga, Ontario'

# Geocode Mississauga with Nominatim
geolocator = Nominatim(user_agent="canada_explorer")
location = geolocator.geocode(address)
# latitude/longitude are reused on purpose: the Mississauga map cell reads these names
latitude = location.latitude
longitude = location.longitude
# The message now names the place that was actually geocoded (it previously said Toronto)
print('The geograpical coordinate of Mississauga are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.590338, -79.645729.


In [0]:
# Subset of rows belonging to the Mississauga borough, renumbered from 0
mississauga_data = new_table2[new_table2['Borough']=='Mississauga'].reset_index(drop=True)

In [137]:
# create map of Mississauga using the latitude and longitude values geocoded for it
map_mississauga = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per Mississauga row, using the neighbourhood name as the popup
for lat, lng, label in zip(mississauga_data['latitude'], mississauga_data['Longitude'], mississauga_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    # NOTE(review): parse_html is also passed to CircleMarker below — confirm it is a supported option there
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_mississauga)  
    
map_mississauga

### From here onward we use the Foursquare API for place exploration

In [131]:
import os
from getpass import getpass

# Credentials are no longer stored in the notebook source. They are read from the
# FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET environment variables, and the
# user is prompted (without echo) for any that are missing.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare client ID: ') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare client secret: ') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
print("Credential for Foursquare is all set to go")

Credential for Foursquare is all set to go


In [214]:
# List every borough in the full table. `data` is only defined in a later cell,
# so referring to it here fails on a fresh top-to-bottom run.
new_table2['Borough'].unique()

array(['North York', 'Downtown Toronto', "Queen's Park", 'Scarborough',
       'East York', 'Etobicoke', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

#### Starting with the exploration of the first neighbourhood

In [0]:
# Working subset: rows of the Central Toronto borough, renumbered from 0
data = new_table2[new_table2['Borough']=='Central Toronto'].reset_index(drop=True)

In [216]:
data.loc[0, 'Neighbourhood']

'Lawrence Park'

In [217]:
# Read the name and coordinates of the first row of `data` in one pass
neighborhood_name, latitude_to_explore, longitude_to_explore = (
    data.loc[0, field] for field in ('Neighbourhood', 'latitude', 'Longitude')
)

message = 'Latitude and longitude values of {} are {}, {}.'
print(message.format(neighborhood_name, latitude_to_explore, longitude_to_explore))

Latitude and longitude values of Lawrence Park are 43.7280205, -79.3887901.


In [218]:
LIMIT = 100  # value sent as the `limit` query parameter
radius = 500  # value sent as the `radius` query parameter

# Build the venues/explore request URL for the coordinates picked above.
# NOTE(review): the client id and secret are embedded in this URL, and displaying
# `url` as the cell result stores them in the saved output — consider not echoing it.
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude_to_explore, 
    longitude_to_explore, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=<REDACTED>&client_secret=<REDACTED>&v=20180605&ll=43.7280205,-79.3887901&radius=500&limit=100'

In [219]:
# Send the request and decode the JSON body; the full payload is displayed below
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e2f42e698205d001b1e173e'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-50e6da19e4b0d8a78a0e9794-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/parks_outdoors/park_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d163941735',
         'name': 'Park',
         'pluralName': 'Parks',
         'primary': True,
         'shortName': 'Park'}],
       'id': '50e6da19e4b0d8a78a0e9794',
       'location': {'address': '3055 Yonge Street',
        'cc': 'CA',
        'city': 'Toronto',
        'country': 'Canada',
        'crossStreet': 'Lawrence Avenue East',
        'distance': 465,
        'formattedAddress': ['3055 Yonge Street (Lawrence Avenue East)',
         'Toronto ON',
         'Canada'],
        'labeledLatLngs': [{

In [0]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category attached to a venue record.

    Parameters
    ----------
    row : mapping-like (dict or pandas Series)
        Must expose the category list under either 'categories' or
        'venue.categories'.

    Returns
    -------
    The 'name' of the first category, or None when the category list is empty.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; other errors are not swallowed.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [0]:
import json
from pandas.io.json import json_normalize

In [222]:
# Venue items of the first result group in the API response
venues = results['response']['groups'][0]['items']

# Flatten the nested JSON and keep only name, categories and coordinates
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = json_normalize(venues).loc[:, filtered_columns]

# Replace each category list by the name of its first entry
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# Keep only the last dotted component of every column label, e.g. 'venue.location.lat' -> 'lat'
nearby_venues = nearby_venues.rename(columns=lambda label: label.split(".")[-1])

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Lawrence Park Ravine,Park,43.726963,-79.394382
1,Zodiac Swim School,Swim School,43.728532,-79.38286
2,TTC Bus #162 - Lawrence-Donway,Bus Line,43.728026,-79.382805


In [223]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

3 venues were returned by Foursquare.


### Let's create a function to repeat the same process for all the neighborhoods in Central Toronto

In [0]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare venues/explore endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Iterated in lock-step with zip(); one API request is made per triple.
    radius : value sent as the `radius` query parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue, with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but keeps these columns) when no venue is returned at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT, and
    prints each name as it is processed.
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request and surface HTTP failures before digging into the payload
        response = requests.get(url)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category gets None instead of raising IndexError
                categories[0]['name'] if categories else None))

    # Passing the columns to the constructor keeps the schema even with zero rows;
    # assigning them afterwards fails with a length mismatch on an empty frame.
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

In [225]:
# Fetch the venues around every neighbourhood in `data` (default radius)
toronto_venues = getNearbyVenues(names=data['Neighbourhood'],
                                   latitudes=data['latitude'],
                                   longitudes=data['Longitude']
                                  )

Lawrence Park
Roselawn
Davisville North
Forest Hill North
Forest Hill West
North Toronto West
The Annex
North Midtown
Yorkville
Davisville
Moore Park
Summerhill East
Deer Park
Forest Hill SE
Rathnelly
South Hill
Summerhill West


### Let's check the size of the resulting dataframe

In [226]:
print(toronto_venues.shape)
toronto_venues.head()

(218, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Lawrence Park,43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,Lawrence Park,43.72802,-79.38879,Zodiac Swim School,43.728532,-79.38286,Swim School
2,Lawrence Park,43.72802,-79.38879,TTC Bus #162 - Lawrence-Donway,43.728026,-79.382805,Bus Line
3,Roselawn,43.711695,-79.416936,Rosalind's Garden Oasis,43.712189,-79.411978,Garden
4,Davisville North,43.712751,-79.390197,Sherwood Park,43.716551,-79.387776,Park


In [227]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Davisville,35,35,35,35,35,35
Davisville North,8,8,8,8,8,8
Deer Park,14,14,14,14,14,14
Forest Hill North,4,4,4,4,4,4
Forest Hill SE,14,14,14,14,14,14
Forest Hill West,4,4,4,4,4,4
Lawrence Park,3,3,3,3,3,3
Moore Park,3,3,3,3,3,3
North Midtown,22,22,22,22,22,22
North Toronto West,21,21,21,21,21,21


### Let's find out how many unique categories can be curated from all the returned venues

In [228]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 59 uniques categories.


## Analyze Each Neighborhood

In [229]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,BBQ Joint,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,Chinese Restaurant,Clothing Store,Coffee Shop,Cosmetics Shop,Dance Studio,Department Store,Dessert Shop,Diner,Farmers Market,Fast Food Restaurant,Flower Shop,Food & Drink Shop,Fried Chicken Joint,Garden,Gas Station,Gourmet Shop,Greek Restaurant,Gym,History Museum,Hotel,Indian Restaurant,Italian Restaurant,Japanese Restaurant,Jewelry Store,Light Rail Station,Liquor Store,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Park,Pharmacy,Pizza Place,Pub,Rental Car Location,Restaurant,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Spa,Sporting Goods Shop,Sports Bar,Supermarket,Sushi Restaurant,Swim School,Tennis Court,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,Lawrence Park,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Roselawn,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Davisville North,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [230]:
# shape of new dataframe (one-hot coded)
toronto_onehot.shape

(218, 60)

In [231]:

# Average the one-hot rows per neighbourhood: each value becomes the share of that
# neighbourhood's venues that fall in the category
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,BBQ Joint,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,Chinese Restaurant,Clothing Store,Coffee Shop,Cosmetics Shop,Dance Studio,Department Store,Dessert Shop,Diner,Farmers Market,Fast Food Restaurant,Flower Shop,Food & Drink Shop,Fried Chicken Joint,Garden,Gas Station,Gourmet Shop,Greek Restaurant,Gym,History Museum,Hotel,Indian Restaurant,Italian Restaurant,Japanese Restaurant,Jewelry Store,Light Rail Station,Liquor Store,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Park,Pharmacy,Pizza Place,Pub,Rental Car Location,Restaurant,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Spa,Sporting Goods Shop,Sports Bar,Supermarket,Sushi Restaurant,Swim School,Tennis Court,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Davisville,0.0,0.0,0.0,0.0,0.028571,0.0,0.0,0.057143,0.0,0.0,0.057143,0.0,0.028571,0.0,0.085714,0.028571,0.028571,0.0,0.028571,0.0,0.0,0.0,0.028571,0.028571,0.028571,0.057143,0.0,0.0,0.028571,0.057143,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.028571,0.028571,0.057143,0.028571,0.0,0.028571,0.0,0.085714,0.028571,0.0,0.0,0.0,0.0,0.057143,0.0,0.0,0.028571,0.028571,0.0,0.0,0.0,0.0
1,Davisville North,0.0,0.125,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Deer Park,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.071429,0.0,0.0,0.0,0.0,0.0,0.071429,0.142857,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.071429,0.071429,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0
3,Forest Hill North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0
4,Forest Hill SE,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.071429,0.0,0.0,0.0,0.0,0.0,0.071429,0.142857,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.071429,0.071429,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0
5,Forest Hill West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0
6,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Moore Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.333333,0.0,0.0,0.0
8,North Midtown,0.045455,0.0,0.045455,0.0,0.0,0.045455,0.0,0.136364,0.0,0.0,0.136364,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.045455,0.0,0.0,0.0,0.0,0.045455,0.0,0.045455,0.0,0.045455,0.045455,0.045455,0.045455,0.0,0.0,0.0,0.136364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.0
9,North Toronto West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.047619,0.142857,0.095238,0.0,0.0,0.0,0.047619,0.047619,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.047619,0.047619,0.0,0.0,0.0,0.047619,0.047619,0.047619,0.0,0.0,0.047619,0.095238,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.047619


In [232]:
toronto_grouped.shape

(17, 60)

## Let's print each neighborhood along with the top 5 most common venues

In [233]:
num_top_venues = 5

# For every neighbourhood, print its num_top_venues most frequent venue categories
for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    # transpose the single matching row so that each column becomes a (venue, freq) row
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    # skip the first entry, which comes from the 'Neighborhood' column rather than a category
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Davisville----
              venue  freq
0    Sandwich Place  0.09
1      Dessert Shop  0.09
2              Café  0.06
3  Sushi Restaurant  0.06
4               Gym  0.06


----Davisville North----
               venue  freq
0                Gym  0.12
1  Food & Drink Shop  0.12
2     Breakfast Spot  0.12
3   Asian Restaurant  0.12
4              Hotel  0.12


----Deer Park----
                 venue  freq
0          Coffee Shop  0.14
1                  Pub  0.14
2  American Restaurant  0.07
3         Liquor Store  0.07
4          Pizza Place  0.07


----Forest Hill North----
                venue  freq
0       Jewelry Store  0.25
1               Trail  0.25
2                Park  0.25
3    Sushi Restaurant  0.25
4  Salon / Barbershop  0.00


----Forest Hill SE----
                 venue  freq
0          Coffee Shop  0.14
1                  Pub  0.14
2  American Restaurant  0.07
3         Liquor Store  0.07
4          Pizza Place  0.07


----Forest Hill West----
                venu

In [0]:
## function for finding the most common venue
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    The first entry of `row` is skipped, the remaining values are sorted in
    descending order, and the index labels of the first `num_top_venues` of
    them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [235]:
import numpy as np

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# build the column headers: 'Neighborhood' followed by one ranked venue column per position;
# the first len(indicators) ranks use their own suffix, every later rank uses 'th'
columns = ['Neighborhood']
for rank in range(1, num_top_venues + 1):
    if rank <= len(indicators):
        header = '{}{} Most Common Venue'.format(rank, indicators[rank - 1])
    else:
        header = '{}th Most Common Venue'.format(rank)
    columns.append(header)

# create a new dataframe, one row per neighbourhood of toronto_grouped
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# fill the ranked venue columns row by row
for position in range(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[position, 1:] = return_most_common_venues(toronto_grouped.iloc[position, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Davisville,Sandwich Place,Dessert Shop,Italian Restaurant,Sushi Restaurant,Café,Pizza Place,Coffee Shop,Gym,Japanese Restaurant,Indian Restaurant
1,Davisville North,Department Store,Food & Drink Shop,Asian Restaurant,Gym,Breakfast Spot,Hotel,Sandwich Place,Park,Fast Food Restaurant,Flower Shop
2,Deer Park,Pub,Coffee Shop,American Restaurant,Sushi Restaurant,Restaurant,Liquor Store,Light Rail Station,Supermarket,Sports Bar,Vietnamese Restaurant
3,Forest Hill North,Trail,Jewelry Store,Sushi Restaurant,Park,Yoga Studio,Dessert Shop,Greek Restaurant,Gourmet Shop,Gas Station,Garden
4,Forest Hill SE,Pub,Coffee Shop,American Restaurant,Sushi Restaurant,Restaurant,Liquor Store,Light Rail Station,Supermarket,Sports Bar,Vietnamese Restaurant


## **Cluster Neighborhoods**

In [236]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

# feature matrix: every column except the neighbourhood name.
# The keyword form is used because pandas 2.x no longer accepts the axis as a
# positional argument of drop().
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering (fixed random_state for repeatable labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:10] 

array([1, 1, 1, 3, 1, 3, 4, 0, 1, 1], dtype=int32)

#### Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood

In [237]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,4,Swim School,Bus Line,Park,Yoga Studio,Dessert Shop,Gym,Greek Restaurant,Gourmet Shop,Gas Station,Garden
1,M5N,Central Toronto,Roselawn,43.711695,-79.416936,2,Garden,Yoga Studio,Dessert Shop,History Museum,Gym,Greek Restaurant,Gourmet Shop,Gas Station,Fried Chicken Joint,Food & Drink Shop
2,M4P,Central Toronto,Davisville North,43.712751,-79.390197,1,Department Store,Food & Drink Shop,Asian Restaurant,Gym,Breakfast Spot,Hotel,Sandwich Place,Park,Fast Food Restaurant,Flower Shop
3,M5P,Central Toronto,Forest Hill North,43.696948,-79.411307,3,Trail,Jewelry Store,Sushi Restaurant,Park,Yoga Studio,Dessert Shop,Greek Restaurant,Gourmet Shop,Gas Station,Garden
4,M5P,Central Toronto,Forest Hill West,43.696948,-79.411307,3,Trail,Jewelry Store,Sushi Restaurant,Park,Yoga Studio,Dessert Shop,Greek Restaurant,Gourmet Shop,Gas Station,Garden


In [238]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Examine Clusters

**Cluster 1**

In [240]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,Central Toronto,0,Gym,Trail,Tennis Court,Yoga Studio,Department Store,Greek Restaurant,Gourmet Shop,Gas Station,Garden,Fried Chicken Joint
11,Central Toronto,0,Gym,Trail,Tennis Court,Yoga Studio,Department Store,Greek Restaurant,Gourmet Shop,Gas Station,Garden,Fried Chicken Joint


**Cluster 2**

In [241]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Central Toronto,1,Department Store,Food & Drink Shop,Asian Restaurant,Gym,Breakfast Spot,Hotel,Sandwich Place,Park,Fast Food Restaurant,Flower Shop
5,Central Toronto,1,Clothing Store,Coffee Shop,Sporting Goods Shop,Yoga Studio,Fast Food Restaurant,Diner,Mexican Restaurant,Dessert Shop,Miscellaneous Shop,Park
6,Central Toronto,1,Sandwich Place,Coffee Shop,Café,American Restaurant,Middle Eastern Restaurant,Pharmacy,Pizza Place,Pub,Liquor Store,Cosmetics Shop
7,Central Toronto,1,Sandwich Place,Coffee Shop,Café,American Restaurant,Middle Eastern Restaurant,Pharmacy,Pizza Place,Pub,Liquor Store,Cosmetics Shop
8,Central Toronto,1,Sandwich Place,Coffee Shop,Café,American Restaurant,Middle Eastern Restaurant,Pharmacy,Pizza Place,Pub,Liquor Store,Cosmetics Shop
9,Central Toronto,1,Sandwich Place,Dessert Shop,Italian Restaurant,Sushi Restaurant,Café,Pizza Place,Coffee Shop,Gym,Japanese Restaurant,Indian Restaurant
12,Central Toronto,1,Pub,Coffee Shop,American Restaurant,Sushi Restaurant,Restaurant,Liquor Store,Light Rail Station,Supermarket,Sports Bar,Vietnamese Restaurant
13,Central Toronto,1,Pub,Coffee Shop,American Restaurant,Sushi Restaurant,Restaurant,Liquor Store,Light Rail Station,Supermarket,Sports Bar,Vietnamese Restaurant
14,Central Toronto,1,Pub,Coffee Shop,American Restaurant,Sushi Restaurant,Restaurant,Liquor Store,Light Rail Station,Supermarket,Sports Bar,Vietnamese Restaurant
15,Central Toronto,1,Pub,Coffee Shop,American Restaurant,Sushi Restaurant,Restaurant,Liquor Store,Light Rail Station,Supermarket,Sports Bar,Vietnamese Restaurant


**Cluster 3**

In [242]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Central Toronto,2,Garden,Yoga Studio,Dessert Shop,History Museum,Gym,Greek Restaurant,Gourmet Shop,Gas Station,Fried Chicken Joint,Food & Drink Shop


**Cluster 4**

In [243]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Central Toronto,3,Trail,Jewelry Store,Sushi Restaurant,Park,Yoga Studio,Dessert Shop,Greek Restaurant,Gourmet Shop,Gas Station,Garden
4,Central Toronto,3,Trail,Jewelry Store,Sushi Restaurant,Park,Yoga Studio,Dessert Shop,Greek Restaurant,Gourmet Shop,Gas Station,Garden


**Cluster 5**

In [244]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,4,Swim School,Bus Line,Park,Yoga Studio,Dessert Shop,Gym,Greek Restaurant,Gourmet Shop,Gas Station,Garden
