# Capstone Project - The Battle of Neighborhoods 

##  Recommendation for Opening a Fitness Center in North York, Toronto 

In [1]:
# importing libraries
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
from bs4 import BeautifulSoup
import requests # library to handle requests
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
import geopy.geocoders # convert an address into latitude and longitude values

# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries are imported.')

Libraries are imported.


In [2]:
# Download the Wikipedia page that lists Toronto ("M") postal codes.
# A timeout keeps the cell from hanging forever, and raise_for_status() stops
# the notebook early on an HTTP error instead of parsing an error page.
wiki_response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
wiki_response.raise_for_status()
website_url = wiki_response.text

### Locate Table and use tags to find postal code by Borough and Neighbourhood

In [3]:
# Parse the downloaded page and pick out the sortable wiki table.
soup = BeautifulSoup(website_url, 'lxml')
My_table = soup.find('table', attrs={'class': 'wikitable sortable'})
links = My_table.find_all('tr')

# One list of stripped cell texts per table row; rows without <td> cells
# (e.g. the header row) become empty lists and therefore all-null rows.
data = [[cell.text.strip() for cell in table_row.find_all('td')] for table_row in links]

df = pd.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])
df = df[df['PostalCode'].notnull()]  # keep only rows that actually carry a postal code
print(df.shape)
print(df.columns)
df.head(10)

(288, 3)
Index(['PostalCode', 'Borough', 'Neighbourhood'], dtype='object')


Unnamed: 0,PostalCode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
10,M8A,Not assigned,Not assigned


### Remove "Not assigned" and then Aggregate


In [4]:
# Keep only the postal codes whose borough has been assigned.
df = df.loc[df['Borough'] != 'Not assigned']
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


### Combining the rows for same postal code

In [5]:
# Collapse rows sharing a postal code and borough into a single row whose
# neighbourhood names are joined with ", ". reset_index() already yields the
# columns PostalCode, Borough, Neighbourhood in that order.
df_pbn = (
    df.groupby(['PostalCode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)
df_pbn.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


### Replacing "Not assigned" Neighbourhood Values with the Borough Name

In [6]:
# A neighbourhood that is "Not assigned" takes the name of its borough.
# Writing through .loc on the frame itself (instead of an in-place replace on
# a column selection) guarantees that df_pbn is really updated, and using the
# Borough column avoids hard-coding a single borough name.
unassigned = df_pbn['Neighbourhood'] == 'Not assigned'
df_pbn.loc[unassigned, 'Neighbourhood'] = df_pbn.loc[unassigned, 'Borough']
df_pbn.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
# Load the postal-code coordinates and rename the columns so that the key
# column matches the 'PostalCode' column used by the scraped table.
df_geo = pd.read_csv('http://cocl.us/Geospatial_data').set_axis(
    ['PostalCode', 'Latitude', 'Longitude'], axis='columns'
)

In [8]:
# Attach coordinates to every postal code that appears in both tables.
df_pos = df_pbn.merge(df_geo, how='inner', on='PostalCode')

# Reorder the columns, then persist the base table for the later sections.
column_order = ['Borough', 'Neighbourhood', 'PostalCode', 'Latitude', 'Longitude']
df_tor = df_pos.loc[:, column_order].copy()
df_tor.to_csv('toronto_base.csv')
df_tor.head()

Unnamed: 0,Borough,Neighbourhood,PostalCode,Latitude,Longitude
0,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,Scarborough,Woburn,M1G,43.770992,-79.216917
4,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


### Postal Codes in Toronto

In [9]:
# Load the postal-code dataset for Toronto.
# It is read from the 'toronto_base.csv' file written earlier in this notebook
# (relative path), so the cell works on any machine rather than only where a
# user-specific absolute path exists. The saved row index comes back as the
# extra 'Unnamed: 0' column.
df_toronto = pd.read_csv('toronto_base.csv')
print(df_toronto.shape)
df_toronto.head()

(103, 6)


Unnamed: 0.1,Unnamed: 0,Borough,Neighbourhood,PostalCode,Latitude,Longitude
0,0,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,1,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,2,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,3,Scarborough,Woburn,M1G,43.770992,-79.216917
4,4,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


### Geographical Coordinates of the City of Toronto

In [10]:
from geopy.geocoders import Nominatim
address = 'Toronto, Canada'

# Nominatim requires an explicit user agent that identifies the application;
# calling Nominatim() without one is deprecated and rejected by newer geopy.
geolocator = Nominatim(user_agent='north_york_fitness_center_notebook')
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the address cannot be resolved.
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of the City of Toronto are {}, {}.'.format(latitude, longitude))

  after removing the cwd from sys.path.


The geograpical coordinate of the City of Toronto are 43.653963, -79.387207.


### Create a Map of Toronto City (with its Postal Codes' Regions)

In [11]:
# Map centre for Toronto; the coordinates were looked up manually.
toronto_latitude = 43.6932
toronto_longitude = -79.3832
map_toronto = folium.Map(location=[toronto_latitude, toronto_longitude], zoom_start=10.7)

# Draw one circle marker per postal code, labelled "<neighbourhoods>, <borough>".
marker_rows = df_toronto[['Latitude', 'Longitude', 'Borough', 'Neighbourhood']].itertuples(index=False, name=None)
for lat, lng, borough, neighborhood in marker_rows:
    popup_text = '{}, {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(map_toronto)

map_toronto

### Focusing on the "North York" Borough in Toronto (its neighborhoods)

In [12]:
# Keep only the postal codes that belong to the "North York" borough and
# discard the 'Unnamed: 0' column left over from the CSV round trip.
is_north_york = df_toronto['Borough'] == 'North York'
North_York_data = (
    df_toronto.loc[is_north_york]
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
)
print(North_York_data.shape)
North_York_data

(24, 5)


Unnamed: 0,Borough,Neighbourhood,PostalCode,Latitude,Longitude
0,North York,Hillcrest Village,M2H,43.803762,-79.363452
1,North York,"Fairview, Henry Farm, Oriole",M2J,43.778517,-79.346556
2,North York,Bayview Village,M2K,43.786947,-79.385975
3,North York,"Silver Hills, York Mills",M2L,43.75749,-79.374714
4,North York,"Newtonbrook, Willowdale",M2M,43.789053,-79.408493
5,North York,Willowdale South,M2N,43.77012,-79.408493
6,North York,York Mills West,M2P,43.752758,-79.400049
7,North York,Willowdale West,M2R,43.782736,-79.442259
8,North York,Parkwoods,M3A,43.753259,-79.329656
9,North York,Don Mills North,M3B,43.745906,-79.352188


### Create a Map of North York and Its Neighbourhoods

In [13]:
from geopy.geocoders import Nominatim
address = 'North York, Toronto'

# Nominatim requires an explicit user agent that identifies the application;
# calling Nominatim() without one is deprecated and rejected by newer geopy.
geolocator = Nominatim(user_agent='north_york_fitness_center_notebook')
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the address cannot be resolved.
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
# The message now names the place that was actually geocoded (North York).
print('The geograpical coordinate of North York are {}, {}.'.format(latitude, longitude))

  after removing the cwd from sys.path.


The geograpical coordinate of the City of Toronto are 43.7708175, -79.4132998.


In [14]:
# Centre of the North York map (fixed coordinates for 'North York, Toronto').
address_scar = 'North York, Toronto'
latitude_scar, longitude_scar = 43.7708175, -79.4132998
print('The geograpical coordinate of "North York" are: {}, {}.'.format(latitude_scar, longitude_scar))

map_North_York = folium.Map(location=[latitude_scar, longitude_scar], zoom_start=11.5)

# One circle marker per North York postal code, labelled with its neighbourhoods.
marker_rows = North_York_data[['Latitude', 'Longitude', 'Neighbourhood']].itertuples(index=False, name=None)
for lat, lng, neighbourhood_names in marker_rows:
    marker = folium.CircleMarker(
        [lat, lng],
        radius=10,
        popup=folium.Popup(neighbourhood_names, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(map_North_York)

map_North_York

The geograpical coordinate of "North York" are: 43.7708175, -79.4132998.


In [15]:
def foursquare_crawler(postal_code_list, neighborhood_list, lat_list, lng_list, LIMIT = 500, radius = 1000):
    """Query the Foursquare "explore" endpoint once per postal code.

    Parameters
    ----------
    postal_code_list, neighborhood_list, lat_list, lng_list : sequences
        Parallel sequences; element i of each describes one location.
    LIMIT : int
        Value sent as the ``limit`` query parameter.
    radius : int
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    list of dict
        One dict per location with the keys 'Postal Code', 'Neighborhood(s)',
        'Latitude', 'Longitude' and 'Crawling_result' (the venue items
        returned for that location; an empty list if no group came back).

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an HTTP error status.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET and VERSION.
    """
    endpoint = 'https://api.foursquare.com/v2/venues/explore'
    result_ds = []
    locations = zip(postal_code_list, neighborhood_list, lat_list, lng_list)
    for counter, (postal_code, neighborhood, lat, lng) in enumerate(locations, start=1):
        # Let requests build and escape the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # A timeout prevents a stalled connection from blocking the notebook;
        # raise_for_status() surfaces quota/credential errors immediately.
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()

        # Guard against a response without any venue group.
        groups = response.json()['response'].get('groups') or []
        results = groups[0]['items'] if groups else []

        result_ds.append({
            'Postal Code': postal_code,
            'Neighborhood(s)': neighborhood,
            'Latitude': lat,
            'Longitude': lng,
            'Crawling_result': results,
        })
        print('{}.'.format(counter))
        print('Data is Obtained, for the Postal Code {} (and Neighborhoods {}) SUCCESSFULLY.'.format(postal_code, neighborhood))
    return result_ds


In [16]:
# @hidden_cell
# Credentials are read from the environment so that they never live in the
# notebook file. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel.
# NOTE(review): the key pair that used to be hard-coded here has been exposed
# and should be treated as compromised (rotate it in the Foursquare console).
import os

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

if not CLIENT_ID or not CLIENT_SECRET:
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; Foursquare requests will fail.')


### Crawling Internet (in fact only Foursquare database) for 
### Venues in the Neighborhoods inside "North York"

In [17]:
print('Crawling different neighborhoods inside "North York"')

# Pass the four location columns to the crawler as plain lists, in the order
# postal code, neighbourhood, latitude, longitude.
crawl_columns = ['PostalCode', 'Neighbourhood', 'Latitude', 'Longitude']
crawl_arguments = [list(North_York_data[column]) for column in crawl_columns]
North_York_foursquare_dataset = foursquare_crawler(*crawl_arguments)


Crawling different neighborhoods inside "North York"
1.
Data is Obtained, for the Postal Code M2H (and Neighborhoods Hillcrest Village) SUCCESSFULLY.
2.
Data is Obtained, for the Postal Code M2J (and Neighborhoods Fairview, Henry Farm, Oriole) SUCCESSFULLY.
3.
Data is Obtained, for the Postal Code M2K (and Neighborhoods Bayview Village) SUCCESSFULLY.
4.
Data is Obtained, for the Postal Code M2L (and Neighborhoods Silver Hills, York Mills) SUCCESSFULLY.
5.
Data is Obtained, for the Postal Code M2M (and Neighborhoods Newtonbrook, Willowdale) SUCCESSFULLY.
6.
Data is Obtained, for the Postal Code M2N (and Neighborhoods Willowdale South) SUCCESSFULLY.
7.
Data is Obtained, for the Postal Code M2P (and Neighborhoods York Mills West) SUCCESSFULLY.
8.
Data is Obtained, for the Postal Code M2R (and Neighborhoods Willowdale West) SUCCESSFULLY.
9.
Data is Obtained, for the Postal Code M3A (and Neighborhoods Parkwoods) SUCCESSFULLY.
10.
Data is Obtained, for the Postal Code M3B (and Neighborhoods 

# Breakpoint:
## Saving the Foursquare results so that we do not need to query Foursquare every time (and use up our API quota).

In [18]:
import pickle

# Serialise the raw crawl result to disk (binary pickle, despite the .txt name).
serialized_dataset = pickle.dumps(North_York_foursquare_dataset)
with open("North_York_foursquare_dataset.txt", "wb") as fp:
    fp.write(serialized_dataset)
print('Received Data from Internet is Saved to Computer.')

Received Data from Internet is Saved to Computer.


In [19]:
# Restore the crawl result saved by the previous cell.
# NOTE(review): unpickling executes arbitrary code; only load a file that this
# notebook produced itself.
with open("North_York_foursquare_dataset.txt", "rb") as fp:
    serialized_dataset = fp.read()
North_York_foursquare_dataset = pickle.loads(serialized_dataset)

### Cleaning the RAW Data Received from Foursquare Database

In [20]:
def get_venue_dataset(foursquare_dataset):
    """Flatten the saved Foursquare crawl into one DataFrame row per venue.

    Parameters
    ----------
    foursquare_dataset : list of dict
        One dict per postal code with the keys 'Postal Code',
        'Neighborhood(s)', 'Latitude', 'Longitude' and 'Crawling_result'
        (the list of venue items returned for that location).

    Returns
    -------
    pandas.DataFrame
        Columns 'Postal Code', 'Neighborhood', 'Neighborhood Latitude',
        'Neighborhood Longitude', 'Venue', 'Venue Summary', 'Venue Category'
        and 'Distance'. 'Venue Summary' / 'Venue Category' are missing (None)
        for a venue that comes without any reason / category entry.

    Notes
    -----
    Prints the number of venues found for every postal code.
    """
    columns = ['Postal Code', 'Neighborhood',
               'Neighborhood Latitude', 'Neighborhood Longitude',
               'Venue', 'Venue Summary', 'Venue Category', 'Distance']

    # Collect plain records and build the frame once at the end: appending to
    # a DataFrame row by row is quadratic, and DataFrame.append no longer
    # exists in pandas 2.x.
    records = []
    for neigh_dict in foursquare_dataset:
        postal_code = neigh_dict['Postal Code']
        neigh = neigh_dict['Neighborhood(s)']
        lat = neigh_dict['Latitude']
        lng = neigh_dict['Longitude']
        print('Number of Venuse in Coordination "{}" Posal Code and "{}" Negihborhood(s) is:'.format(postal_code, neigh))
        print(len(neigh_dict['Crawling_result']))

        for venue_dict in neigh_dict['Crawling_result']:
            venue = venue_dict['venue']
            # Either list may be empty; fall back to None instead of raising
            # IndexError on [0].
            reasons = (venue_dict.get('reasons') or {}).get('items') or []
            categories = venue.get('categories') or []

            records.append({
                'Postal Code': postal_code,
                'Neighborhood': neigh,
                'Neighborhood Latitude': lat,
                'Neighborhood Longitude': lng,
                'Venue': venue['name'],
                'Venue Summary': reasons[0]['summary'] if reasons else None,
                'Venue Category': categories[0]['name'] if categories else None,
                'Distance': venue['location']['distance'],
            })

    # Passing `columns` keeps the column set and order even for empty input.
    return pd.DataFrame(records, columns=columns)

In [21]:
# Flatten the saved Foursquare crawl into a table with one row per venue.
North_York_venues = get_venue_dataset(North_York_foursquare_dataset)

Number of Venuse in Coordination "M2H" Posal Code and "Hillcrest Village" Negihborhood(s) is:
21
Number of Venuse in Coordination "M2J" Posal Code and "Fairview, Henry Farm, Oriole" Negihborhood(s) is:
44
Number of Venuse in Coordination "M2K" Posal Code and "Bayview Village" Negihborhood(s) is:
13
Number of Venuse in Coordination "M2L" Posal Code and "Silver Hills, York Mills" Negihborhood(s) is:
4
Number of Venuse in Coordination "M2M" Posal Code and "Newtonbrook, Willowdale" Negihborhood(s) is:
30
Number of Venuse in Coordination "M2N" Posal Code and "Willowdale South" Negihborhood(s) is:
100
Number of Venuse in Coordination "M2P" Posal Code and "York Mills West" Negihborhood(s) is:
20
Number of Venuse in Coordination "M2R" Posal Code and "Willowdale West" Negihborhood(s) is:
11
Number of Venuse in Coordination "M3A" Posal Code and "Parkwoods" Negihborhood(s) is:
29
Number of Venuse in Coordination "M3B" Posal Code and "Don Mills North" Negihborhood(s) is:
30
Number of Venuse in Coo

### Showing Venues for Each Neighborhood in North York

In [22]:
# Size of the venue table (rows, columns), followed by a preview of its first rows.
print(North_York_venues.shape)
North_York_venues.head()

(626, 8)


Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Venue Category,Distance
0,M2H,Hillcrest Village,43.803762,-79.363452,Tastee,This spot is popular,Bakery,692
1,M2H,Hillcrest Village,43.803762,-79.363452,고려삼계탕 Korean Ginseng Chicken Soup & Bibimbap,This spot is popular,Korean Restaurant,754
2,M2H,Hillcrest Village,43.803762,-79.363452,Cummer Park,This spot is popular,Park,776
3,M2H,Hillcrest Village,43.803762,-79.363452,Galati,This spot is popular,Grocery Store,815
4,M2H,Hillcrest Village,43.803762,-79.363452,Tim Hortons,This spot is popular,Coffee Shop,731


In [23]:
# Preview the last rows of the venue table.
North_York_venues.tail()

Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Venue Category,Distance
621,M9M,"Emery, Humberlea",43.724766,-79.532242,Fiera Foods,This spot is popular,Bakery,823
622,M9M,"Emery, Humberlea",43.724766,-79.532242,Hwy 401 & Hwy 400,This spot is popular,Intersection,875
623,M9M,"Emery, Humberlea",43.724766,-79.532242,Joseph Bannon Park,This spot is popular,Park,889
624,M9M,"Emery, Humberlea",43.724766,-79.532242,U-Haul at Weston Rd,This spot is popular,Storage Facility,917
625,M9M,"Emery, Humberlea",43.724766,-79.532242,Multimodal Website Marketing,This spot is popular,Business Service,924


# Breakpoint: 
## End of Processing the Retrieved Information from Foursquare
## Saving a Cleaned Version of DataFrame as the Results from Foursquare 

In [24]:
# Persist the cleaned venue table. The row index is written as well, so it
# comes back as an extra first column ('Unnamed: 0') when the file is read
# again; later cells drop or skip that column.
North_York_venues.to_csv('North_York_venues.csv')

### Loading Data from File (Saved "Foursquare " DataFrame for Venues)

In [25]:
# Reload the saved venue table so the analysis below can start from the file
# instead of querying Foursquare again.
North_York_venues = pd.read_csv('North_York_venues.csv')

### Some Summary Information about Neighborhoods inside "North York"

In [26]:
# Distinct neighbourhood labels, in order of first appearance in the venue table.
neigh_list = North_York_venues['Neighborhood'].unique().tolist()
print('Number of Neighborhoods inside North York:', len(neigh_list), sep='\n')
print('List of Neighborhoods inside North York:')
neigh_list

Number of Neighborhoods inside North York:
24
List of Neighborhoods inside North York:


['Hillcrest Village',
 'Fairview, Henry Farm, Oriole',
 'Bayview Village',
 'Silver Hills, York Mills',
 'Newtonbrook, Willowdale',
 'Willowdale South',
 'York Mills West',
 'Willowdale West',
 'Parkwoods',
 'Don Mills North',
 'Flemingdon Park, Don Mills South',
 'Bathurst Manor, Downsview North, Wilson Heights',
 'Northwood Park, York University',
 'CFB Toronto, Downsview East',
 'Downsview West',
 'Downsview Central',
 'Downsview Northwest',
 'Victoria Village',
 'Bedford Park, Lawrence Manor East',
 'Lawrence Heights, Lawrence Manor',
 'Glencairn',
 'Downsview, North Park, Upwood Park',
 'Humber Summit',
 'Emery, Humberlea']

### Some Summary Information about Neighborhoods inside "North York" Cont'd

In [27]:
# Count the non-null entries of every column per neighbourhood, then show the
# counts without the leftover CSV index column.
venues_by_neighborhood = North_York_venues.groupby(by='Neighborhood')
neigh_venue_summary = venues_by_neighborhood.count()
neigh_venue_summary.drop('Unnamed: 0', axis=1).head()

Unnamed: 0_level_0,Postal Code,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Venue Category,Distance
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"Bathurst Manor, Downsview North, Wilson Heights",27,27,27,27,27,27,27
Bayview Village,13,13,13,13,13,13,13
"Bedford Park, Lawrence Manor East",40,40,40,40,40,40,40
"CFB Toronto, Downsview East",22,22,22,22,22,22,22
Don Mills North,30,30,30,30,30,30,30


In [28]:
# Distinct venue categories, in order of first appearance.
unique_categories = list(North_York_venues['Venue Category'].unique())
print('There are {} uniques categories.'.format(len(unique_categories)))
print('Here is the list of different categories:')
unique_categories

There are 153 uniques categories.
Here is the list of different categories:


['Bakery',
 'Korean Restaurant',
 'Park',
 'Grocery Store',
 'Coffee Shop',
 'Bank',
 'Pizza Place',
 'Sandwich Place',
 'Pharmacy',
 'Housing Development',
 'Chinese Restaurant',
 'Ice Cream Shop',
 'Shopping Mall',
 'Recreation Center',
 'Pool',
 'Residential Building (Apartment / Condo)',
 'Diner',
 'Convenience Store',
 'Toy / Game Store',
 'Movie Theater',
 'Burger Joint',
 'Electronics Store',
 'Salon / Barbershop',
 'Tea Room',
 'Candy Store',
 'American Restaurant',
 'Fast Food Restaurant',
 'Department Store',
 'Juice Bar',
 'Smoothie Shop',
 'Theater',
 'Clothing Store',
 'Liquor Store',
 'Caribbean Restaurant',
 'Food Court',
 'Japanese Restaurant',
 'Restaurant',
 'Cosmetics Shop',
 'Sporting Goods Shop',
 'Beer Store',
 'Fried Chicken Joint',
 'Café',
 'Skating Rink',
 'Intersection',
 'Trail',
 'Hookah Bar',
 'Middle Eastern Restaurant',
 'Dessert Shop',
 'Hot Dog Joint',
 'Indian Restaurant',
 'Supermarket',
 'Ramen Restaurant',
 'Pet Store',
 'Steakhouse',
 'Seafood Res

In [29]:
# Side note: selecting with a list of column names yields a DataFrame, while
# selecting with a single name yields a Series.
for selection in (North_York_venues[['Venue Category']], North_York_venues['Venue Category']):
    print(type(selection))


<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


### One-hot Encoding the "Venue Category" Column into One Column per Unique Category

In [30]:
# One-hot encode 'Venue Category': every category becomes its own indicator
# column, named exactly like the category (empty prefix and separator).
North_York_onehot = pd.get_dummies(
    North_York_venues,
    columns=['Venue Category'],
    prefix="",
    prefix_sep="",
    drop_first=False,
)
North_York_onehot.head()

Unnamed: 0.1,Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Distance,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Baby Store,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Beer Store,Bike Shop,Boutique,Bowling Alley,Boxing Gym,Breakfast Spot,Bridal Shop,Bubble Tea Shop,Burger Joint,Burrito Place,Bus Line,Bus Stop,Business Service,Butcher,Cafeteria,Café,Candy Store,Caribbean Restaurant,Cheese Shop,Chinese Restaurant,Clothing Store,Coffee Shop,Comfort Food Restaurant,Community Center,Concert Hall,Convenience Store,Cosmetics Shop,Creperie,Dance Studio,Deli / Bodega,Dentist's Office,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Empanada Restaurant,Event Space,Fabric Shop,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Fireworks Store,Fish & Chips Shop,Flea Market,Food & Drink Shop,Food Court,Frame Store,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Furniture / Home Store,Golf Course,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,History Museum,Hockey Arena,Hookah Bar,Hot Dog Joint,Hotel,Housing Development,Ice Cream Shop,Indian Restaurant,Indonesian Restaurant,Intersection,Italian Restaurant,Japanese Restaurant,Juice Bar,Karaoke Bar,Kitchen Supply Store,Korean Restaurant,Latin American Restaurant,Laundry Service,Liquor Store,Lounge,Massage Studio,Mediterranean Restaurant,Men's Store,Metro Station,Middle Eastern Restaurant,Miscellaneous Shop,Mobile Phone Shop,Movie Theater,Office,Other Repair Shop,Paper / Office Supplies Store,Park,Pet Store,Pharmacy,Photography Lab,Pizza Place,Playground,Plaza,Pool,Portuguese Restaurant,Pub,Ramen Restaurant,Recreation Center,Residential Building (Apartment / Condo),Restaurant,Road,Salad Place,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Shop & Service,Shopping Mall,Skating Rink,Ski 
Area,Ski Chalet,Smoothie Shop,Snack Place,Soccer Field,Spa,Sporting Goods Shop,Sports Bar,Sports Club,Steakhouse,Storage Facility,Supermarket,Sushi Restaurant,Tea Room,Tennis Court,Thai Restaurant,Theater,Toy / Game Store,Trail,Train Station,Turkish Restaurant,Video Store,Vietnamese Restaurant,Wings Joint,Women's Store,Yoga Studio
0,0,M2H,Hillcrest Village,43.803762,-79.363452,Tastee,This spot is popular,692,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,M2H,Hillcrest Village,43.803762,-79.363452,고려삼계탕 Korean Ginseng Chicken Soup & Bibimbap,This spot is popular,754,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,M2H,Hillcrest Village,43.803762,-79.363452,Cummer Park,This spot is popular,776,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,M2H,Hillcrest Village,43.803762,-79.363452,Galati,This spot is popular,815,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,M2H,Hillcrest Village,43.803762,-79.363452,Tim Hortons,This spot is popular,731,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Manually Selecting (Subsetting) Related Features for the Fitness Center

In [31]:
# Hand-picked columns of the one-hot encoded venue table that are kept for the
# fitness-center analysis. This list is created manually.
important_list_of_features = [

 # grouping key and neighbourhood coordinates
 'Neighborhood',
 'Neighborhood Latitude',
 'Neighborhood Longitude',

 # venue-category indicator columns
 'Park',
 'Shopping Mall',
 'Recreation Center',
 'Pool',
 'Residential Building (Apartment / Condo)',
 'Convenience Store',
 'Toy / Game Store',
 'Salon / Barbershop',
 'Department Store',
 'Sporting Goods Shop',
 'Skating Rink',
 'Trail',
 'Supermarket',
 'Sports Bar',
 'Lounge',
 'Yoga Studio',
 'Gym',
 'Intersection',
 'Bowling Alley',
 'Tennis Court',
 'Golf Course',
 'Playground',
 'Road',
 'Gym / Fitness Center',
 'Athletics & Sports',
 'Video Store',
 'Ski Chalet',
 'Community Center',
 'Ski Area',
 'Massage Studio',
 'Soccer Field',
 'Baseball Field',
 'Hockey Arena',
 "Men's Store",
 'Boxing Gym',
 'Sports Club']


### Updating the One-hot Encoded DataFrame and
### Grouping the Data by Neighborhoods

In [32]:
# Keep the neighbourhood key plus the selected category indicators (the two
# coordinate columns are left out), then add the indicators up per
# neighbourhood to get venue counts per category.
coordinate_columns = ('Neighborhood Latitude', 'Neighborhood Longitude')
count_columns = [name for name in important_list_of_features if name not in coordinate_columns]
North_York_onehot = North_York_onehot[count_columns].groupby('Neighborhood').sum()
North_York_onehot.head()

Unnamed: 0_level_0,Park,Shopping Mall,Recreation Center,Pool,Residential Building (Apartment / Condo),Convenience Store,Toy / Game Store,Salon / Barbershop,Department Store,Sporting Goods Shop,Skating Rink,Trail,Supermarket,Sports Bar,Lounge,Yoga Studio,Gym,Intersection,Bowling Alley,Tennis Court,Golf Course,Playground,Road,Gym / Fitness Center,Athletics & Sports,Video Store,Ski Chalet,Community Center,Ski Area,Massage Studio,Soccer Field,Baseball Field,Hockey Arena,Men's Store,Boxing Gym,Sports Club
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
"Bathurst Manor, Downsview North, Wilson Heights",1,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,1,0,0
Bayview Village,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"Bedford Park, Lawrence Manor East",1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
"CFB Toronto, Downsview East",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
Don Mills North,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


### Showing the Fully-Processed DataFrame about Neighborhoods inside North York.
### This Dataset is Ready for any Machine Learning Algorithm.

# Run k-means to Cluster Neighborhoods into 4 Clusters

In [38]:
# k-means implementation from scikit-learn
from sklearn.cluster import KMeans

# Cluster the per-neighbourhood category counts into 4 groups; the fixed
# random_state makes the initialisation repeatable.
clusterer = KMeans(n_clusters=4, random_state=0)
kmeans = clusterer.fit(North_York_onehot)


## Showing Centers of Each Cluster

In [39]:
# One row per cluster centre (labelled G1..G4), one column per category.
means_df = pd.DataFrame(
    kmeans.cluster_centers_,
    index=['G1', 'G2', 'G3', 'G4'],
    columns=North_York_onehot.columns,
)
# Row-wise total of the centre coordinates, used to rank the clusters.
means_df['Total Sum'] = means_df.sum(axis=1)
means_df.sort_values(by='Total Sum', ascending=False)

Unnamed: 0,Park,Shopping Mall,Recreation Center,Pool,Residential Building (Apartment / Condo),Convenience Store,Toy / Game Store,Salon / Barbershop,Department Store,Sporting Goods Shop,Skating Rink,Trail,Supermarket,Sports Bar,Lounge,Yoga Studio,Gym,Intersection,Bowling Alley,Tennis Court,Golf Course,Playground,Road,Gym / Fitness Center,Athletics & Sports,Video Store,Ski Chalet,Community Center,Ski Area,Massage Studio,Soccer Field,Baseball Field,Hockey Arena,Men's Store,Boxing Gym,Sports Club,Total Sum
G4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
G2,2.5,0.666667,0.166667,0.333333,0.166667,0.666667,0.0,0.0,0.0,0.0,0.166667,0.0,0.166667,0.0,0.0,0.0,0.0,0.166667,0.166667,0.333333,0.333333,0.166667,0.166667,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.166667,0.166667,0.0,6.833333
G1,1.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.333333,0.0,0.0,0.0,0.0,1.0,0.0,0.333333,0.333333,0.0,0.666667,0.0,0.666667,0.666667,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,6.0
G3,0.571429,0.5,0.0,0.071429,0.0,0.285714,0.071429,0.071429,0.142857,0.071429,0.214286,0.214286,0.285714,0.142857,0.071429,0.071429,0.071429,0.214286,0.0,0.0,0.071429,0.0,0.071429,0.142857,0.142857,0.142857,0.071429,0.071429,0.071429,0.071429,0.0,0.071429,0.0,0.071429,0.0,0.071429,4.142857


## Results and scores of the cluster:
### Best Group is G4 with total sum of 8;
### Second Best Group is G2 with total sum of 6.833;
### Third Best Group is G1 with total sum of 6.0;

### Inserting "kmeans.labels_" into the Original North York DataFrame
#### Finding the Corresponding Group for Each Neighborhood.

In [48]:
# Pair every neighbourhood with its cluster number (labels shifted to 1..4 so
# that they line up with the G1..G4 names of the cluster centres).
# Building the frame column by column keeps 'Group' an integer column; the
# list-plus-transpose construction turned both columns into generic objects.
neigh_summary = pd.DataFrame({
    'Neighborhood': list(North_York_onehot.index),
    'Group': kmeans.labels_ + 1,
})
neigh_summary

Unnamed: 0,Neighborhood,Group
0,"Bathurst Manor, Downsview North, Wilson Heights",3
1,Bayview Village,3
2,"Bedford Park, Lawrence Manor East",3
3,"CFB Toronto, Downsview East",1
4,Don Mills North,3
5,Downsview Central,3
6,Downsview Northwest,3
7,Downsview West,2
8,"Downsview, North Park, Upwood Park",3
9,"Emery, Humberlea",3


## After analyzing the results:
### The Best Neighborhood Is...

In [49]:
# Neighbourhoods assigned to group 4.
neigh_summary.loc[neigh_summary['Group'] == 4]

Unnamed: 0,Neighborhood,Group
11,"Flemingdon Park, Don Mills South",4


In [50]:
# Postal code and coordinates of the first neighbourhood in group 4.
# The columns are selected by name: a positional slice silently depends on
# the extra 'Unnamed: 0' column being the first column of the table.
location_columns = ['Postal Code', 'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude']
name_of_neigh = neigh_summary.loc[neigh_summary['Group'] == 4, 'Neighborhood'].iloc[0]
North_York_venues.loc[North_York_venues['Neighborhood'] == name_of_neigh, location_columns].iloc[0].to_dict()

{'Postal Code': 'M3C',
 'Neighborhood': 'Flemingdon Park, Don Mills South',
 'Neighborhood Latitude': 43.72589970000001,
 'Neighborhood Longitude': -79.340923}

## Second Best Neighborhoods

In [51]:
# Neighbourhoods assigned to group 2.
neigh_summary.loc[neigh_summary['Group'] == 2]

Unnamed: 0,Neighborhood,Group
7,Downsview West,2
13,Hillcrest Village,2
18,Parkwoods,2
19,"Silver Hills, York Mills",2
20,Victoria Village,2
23,York Mills West,2


In [52]:
# Postal code and coordinates of the first neighbourhood in group 2.
# The columns are selected by name: a positional slice silently depends on
# the extra 'Unnamed: 0' column being the first column of the table.
location_columns = ['Postal Code', 'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude']
name_of_neigh = neigh_summary.loc[neigh_summary['Group'] == 2, 'Neighborhood'].iloc[0]
North_York_venues.loc[North_York_venues['Neighborhood'] == name_of_neigh, location_columns].iloc[0].to_dict()

{'Postal Code': 'M3L',
 'Neighborhood': 'Downsview West',
 'Neighborhood Latitude': 43.7390146,
 'Neighborhood Longitude': -79.5069436}

## Third Best Neighborhood

In [53]:
# Neighbourhoods assigned to group 1.
neigh_summary.loc[neigh_summary['Group'] == 1]

Unnamed: 0,Neighborhood,Group
3,"CFB Toronto, Downsview East",1
12,Glencairn,1
15,"Lawrence Heights, Lawrence Manor",1


In [54]:
# Postal code and coordinates of the first neighbourhood in group 1.
# The columns are selected by name: a positional slice silently depends on
# the extra 'Unnamed: 0' column being the first column of the table.
location_columns = ['Postal Code', 'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude']
name_of_neigh = neigh_summary.loc[neigh_summary['Group'] == 1, 'Neighborhood'].iloc[0]
North_York_venues.loc[North_York_venues['Neighborhood'] == name_of_neigh, location_columns].iloc[0].to_dict()

{'Postal Code': 'M3K',
 'Neighborhood': 'CFB Toronto, Downsview East',
 'Neighborhood Latitude': 43.737473200000004,
 'Neighborhood Longitude': -79.46476329999999}

### Observations:-
#### Flemingdon Park, Don Mills South is the best place to open a fitness center in North York borough of Toronto.