# Capstone Project Week-5 Notebook

<p> Importing the necessary libraries, Using BeautifulSoup to scrape the wikipedia page <p>

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

import matplotlib.cm as cm
import matplotlib.colors as colors

import folium

from sklearn.cluster import KMeans

#### Getting the web page using requests module

In [2]:
page=requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

#### Using BeautifulSoup to parse the page we requested

In [3]:
soup=BeautifulSoup(page.content,'html.parser')

#### Finding all the rows in the page using find_all method

In [4]:
trows=soup.find_all('tr')

In [5]:
#Empty lists to store the cells of each row
postal_code=[]
borough=[]
Neighbourhood=[]

<p> Looping over each row and reading each cell and then adding them to the corresponding list. Limiting the rows to 289 since that's where the postal code data table ends<p>

In [6]:
for tr in trows[1:289]:
    td=tr.find_all('td')
    postal_code.append(td[0].string)
    borough.append(td[1].string)
    Neighbourhood.append(td[2].get_text().strip("\n"))

#### Converting the data scraped into a DataFrame

In [7]:
df_pc=pd.DataFrame(list(zip(postal_code,borough,Neighbourhood)),columns=['PostalCode','Borough','Neighbourhood'])

In [8]:
df_pc.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Removing the Boroughs that are having value as Not assigned

In [9]:
df_pc['Borough'].replace('Not assigned',np.nan,inplace=True)
df_pc.dropna(axis=0,inplace=True)

#### Replacing Not assigned Neighbourhoods with the Borough

In [10]:
df_pc.loc[df_pc.Neighbourhood=='Not assigned','Neighbourhood']=df_pc.loc[df_pc.Neighbourhood=='Not assigned','Borough'].item()

In [11]:
df_pc.loc[8]

PostalCode                M7A
Borough          Queen's Park
Neighbourhood    Queen's Park
Name: 8, dtype: object

#### Combining the Neighbourhoods that are from same Postal Code

In [12]:
pc_clean=df_pc.groupby(['PostalCode','Borough'])['Neighbourhood'].apply(','.join).reset_index()

In [13]:
pc_clean.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [14]:
pc_clean.shape

(103, 3)

#### Reading the location from the csv provided at the url

In [15]:
df_locs=pd.read_csv('http://cocl.us/Geospatial_data')

In [16]:
df_locs.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


<p> Verifying that postal codes are in the same order in both the dataframes

In [17]:
(pc_clean.PostalCode==df_locs['Postal Code']).value_counts()

True    103
dtype: int64

#### Adding the Latitude and Longitude columns from the locations DataFrame to Postal Codes DataFrame

In [18]:
pc_clean[['Latitude','Longitude']]=df_locs[['Latitude','Longitude']]

In [19]:
pc_clean.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Gathering the data containing Toronto in the Borough of the Postal Code DataFrame

In [20]:
toronto_df=pc_clean[pc_clean['Borough'].str.contains("Toronto")][['Borough','Neighbourhood','Latitude','Longitude']]

In [21]:
toronto_df.shape

(38, 4)

<p> Removed the Foursquare API Keys<p>

In [23]:
LIMIT=100
radius=500

### Writing a function to get the top 100 venues that are in every neighbourhood within a radius of 500 meters.

In [24]:
def getNearbyVenues(names_list, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for names, lat, lng in zip(names_list, latitudes, longitudes):
        
        for name in names.split(","):
            
            print(name)
            
            url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
            results = requests.get(url).json()["response"]['groups'][0]['items']
            
            venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
            
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### writing the code to run the above function on each neighborhood and create a new dataframe called *toronto_venues*.

In [25]:
toronto_venues = getNearbyVenues(names_list=toronto_df['Neighbourhood'],
                                   latitudes=toronto_df['Latitude'],
                                   longitudes=toronto_df['Longitude']
                                  )


The Beaches
The Danforth West
Riverdale
The Beaches West
India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park
Summerhill East
Deer Park
Forest Hill SE
Rathnelly
South Hill
Summerhill West
Rosedale
Cabbagetown
St. James Town
Church and Wellesley
Harbourfront
Regent Park
Ryerson
Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide
King
Richmond
Harbourfront East
Toronto Islands
Union Station
Design Exchange
Toronto Dominion Centre
Commerce Court
Victoria Hotel
Roselawn
Forest Hill North
Forest Hill West
The Annex
North Midtown
Yorkville
Harbord
University of Toronto
Chinatown
Grange Park
Kensington Market
CN Tower
Bathurst Quay
Island airport
Harbourfront West
King and Spadina
Railway Lands
South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place
Underground city
Christie
Dovercourt Village
Dufferin
Little Portugal
Trinity
Brockton
Exhibition Place
Parkdale Village
High Park
The Junction South
Parkdale
Roncesvall

In [26]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
4,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood


In [51]:
toronto_locations=toronto_venues[['Neighborhood','Neighborhood Latitude','Neighborhood Longitude']].groupby(['Neighborhood','Neighborhood Latitude','Neighborhood Longitude']).count().reset_index()

### Checking the size of the toronto_venues dataframe

In [27]:
toronto_venues.shape

(3294, 7)

Let's check how many venues were returned for each neighborhood

In [28]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adelaide,100,100,100,100,100,100
Bathurst Quay,16,16,16,16,16,16
Berczy Park,55,55,55,55,55,55
Brockton,22,22,22,22,22,22
Business Reply Mail Processing Centre 969 Eastern,19,19,19,19,19,19
CN Tower,16,16,16,16,16,16
Cabbagetown,46,46,46,46,46,46
Central Bay Street,88,88,88,88,88,88
Chinatown,100,100,100,100,100,100
Christie,15,15,15,15,15,15


## Analyzing each Neighborhood using One Hot Encoding


In [29]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 
mylt=list(toronto_onehot.columns)
mylt.remove('Neighborhood')
new_col=['Neighborhood']
new_col.extend(mylt)
new_col
toronto_onehot=toronto_onehot[new_col]

In [30]:
toronto_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


####  Grouping rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [31]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Adelaide,0.01,0.000000,0.000000,0.0000,0.0000,0.0000,0.000,0.000,0.000,...,0.00000,0.00,0.010000,0.000000,0.000000,0.000000,0.010000,0.000000,0.010000,0.000000
1,Bathurst Quay,0.00,0.000000,0.000000,0.0625,0.0625,0.0625,0.125,0.125,0.125,...,0.00000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,Berczy Park,0.00,0.000000,0.000000,0.0000,0.0000,0.0000,0.000,0.000,0.000,...,0.00000,0.00,0.018182,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,Brockton,0.00,0.000000,0.000000,0.0000,0.0000,0.0000,0.000,0.000,0.000,...,0.00000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.045455
4,Business Reply Mail Processing Centre 969 Eastern,0.00,0.000000,0.000000,0.0000,0.0000,0.0000,0.000,0.000,0.000,...,0.00000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.052632
5,CN Tower,0.00,0.000000,0.000000,0.0625,0.0625,0.0625,0.125,0.125,0.125,...,0.00000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,Cabbagetown,0.00,0.000000,0.000000,0.0000,0.0000,0.0000,0.000,0.000,0.000,...,0.00000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,Central Bay Street,0.00,0.000000,0.000000,0.0000,0.0000,0.0000,0.000,0.000,0.000,...,0.00000,0.00,0.011364,0.000000,0.011364,0.000000,0.011364,0.000000,0.000000,0.011364
8,Chinatown,0.00,0.000000,0.000000,0.0000,0.0000,0.0000,0.000,0.000,0.000,...,0.00000,0.00,0.060000,0.000000,0.000000,0.030000,0.010000,0.000000,0.000000,0.000000
9,Christie,0.00,0.000000,0.000000,0.0000,0.0000,0.0000,0.000,0.000,0.000,...,0.00000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [32]:
toronto_grouped.shape

(73, 240)

### printing each neighborhood along with the top 5 most common venues

In [33]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide----
                 venue  freq
0          Coffee Shop  0.06
1                 Café  0.05
2                  Bar  0.04
3  American Restaurant  0.04
4      Thai Restaurant  0.04


----Bathurst Quay----
              venue  freq
0  Airport Terminal  0.12
1    Airport Lounge  0.12
2   Airport Service  0.12
3               Bar  0.06
4     Boat or Ferry  0.06


----Berczy Park----
                venue  freq
0         Coffee Shop  0.09
1        Cocktail Bar  0.05
2  Italian Restaurant  0.04
3      Farmers Market  0.04
4                Café  0.04


----Brockton----
            venue  freq
0  Breakfast Spot  0.09
1            Café  0.09
2     Coffee Shop  0.09
3     Yoga Studio  0.05
4          Bakery  0.05


----Business Reply Mail Processing Centre 969 Eastern----
           venue  freq
0    Yoga Studio  0.05
1     Restaurant  0.05
2            Spa  0.05
3  Burrito Place  0.05
4     Smoke Shop  0.05


----CN Tower----
              venue  freq
0  Airport Terminal  0.12
1    Ai

## Creating a dataframe to show the top 10 venues for each neighborhood

In [34]:
# function to sort the venues in desceding order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [52]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Café,American Restaurant,Steakhouse,Thai Restaurant,Bar,Restaurant,Burger Joint,Gym,Cosmetics Shop
1,Bathurst Quay,Airport Service,Airport Terminal,Airport Lounge,Boutique,Coffee Shop,Sculpture Garden,Boat or Ferry,Plane,Bar,Airport Gate
2,Berczy Park,Coffee Shop,Cocktail Bar,Cheese Shop,Italian Restaurant,Beer Bar,Steakhouse,Bakery,Seafood Restaurant,Café,Farmers Market
3,Brockton,Coffee Shop,Café,Breakfast Spot,Yoga Studio,Italian Restaurant,Pet Store,Climbing Gym,Restaurant,Caribbean Restaurant,Burrito Place
4,Business Reply Mail Processing Centre 969 Eastern,Yoga Studio,Auto Workshop,Pizza Place,Gym / Fitness Center,Recording Studio,Restaurant,Butcher,Burrito Place,Brewery,Skate Park


## Clustering the neighborhoods using K-Means

In [53]:
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 2, 0, 0, 0, 2, 0, 0, 0, 0])

Now creating a dataframe that includes the cluster as well as the top 10 venues for each neighborhood

In [54]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_locations

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() 

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,43.650571,-79.384568,0,Coffee Shop,Café,American Restaurant,Steakhouse,Thai Restaurant,Bar,Restaurant,Burger Joint,Gym,Cosmetics Shop
1,Bathurst Quay,43.628947,-79.39442,2,Airport Service,Airport Terminal,Airport Lounge,Boutique,Coffee Shop,Sculpture Garden,Boat or Ferry,Plane,Bar,Airport Gate
2,Berczy Park,43.644771,-79.373306,0,Coffee Shop,Cocktail Bar,Cheese Shop,Italian Restaurant,Beer Bar,Steakhouse,Bakery,Seafood Restaurant,Café,Farmers Market
3,Brockton,43.636847,-79.428191,0,Coffee Shop,Café,Breakfast Spot,Yoga Studio,Italian Restaurant,Pet Store,Climbing Gym,Restaurant,Caribbean Restaurant,Burrito Place
4,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,0,Yoga Studio,Auto Workshop,Pizza Place,Gym / Fitness Center,Recording Studio,Restaurant,Butcher,Burrito Place,Brewery,Skate Park


### Visualizing the clusters on Map

In [57]:
map_clusters = folium.Map(location=[43.65, -79.38], zoom_start=11) #Location of Toronto

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Neighborhood Latitude'], toronto_merged['Neighborhood Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster))
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters


## Gathering New York City Locations and Neighborhood Data

Using the newyork_data.json that was used in Week 3 of the capstone

In [58]:
import wget
wget.download('https://cocl.us/new_york_dataset')
print('Data downloaded!')

Data downloaded!


In [60]:
import json
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [61]:
ny_neighborhoods_data = newyork_data['features']

#### Transforming the newyork city neighborhood data into a pandas dataframe

In [62]:
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
ny_neighborhoods = pd.DataFrame(columns=column_names)

for data in ny_neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    ny_neighborhoods = ny_neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)
    
ny_neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


#### Getting the Brooklyn neighborhood data from Newyork dataframe, to cluster the neighborhoods in brooklyn

In [69]:
brooklyn_data = ny_neighborhoods[ny_neighborhoods['Borough'] == 'Brooklyn'].reset_index(drop=True)
brooklyn_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Brooklyn,Bay Ridge,40.625801,-74.030621
1,Brooklyn,Bensonhurst,40.611009,-73.99518
2,Brooklyn,Sunset Park,40.645103,-74.010316
3,Brooklyn,Greenpoint,40.730201,-73.954241
4,Brooklyn,Gravesend,40.59526,-73.973471


#### Creating a dataframe of venues in all the brooklyn neighborhoods

In [70]:
brooklyn_venues = getNearbyVenues(names_list=brooklyn_data['Neighborhood'],
                                   latitudes=brooklyn_data['Latitude'],
                                   longitudes=brooklyn_data['Longitude']
                                  )
print(brooklyn_venues.shape)
brooklyn_venues.head()

Bay Ridge
Bensonhurst
Sunset Park
Greenpoint
Gravesend
Brighton Beach
Sheepshead Bay
Manhattan Terrace
Flatbush
Crown Heights
East Flatbush
Kensington
Windsor Terrace
Prospect Heights
Brownsville
Williamsburg
Bushwick
Bedford Stuyvesant
Brooklyn Heights
Cobble Hill
Carroll Gardens
Red Hook
Gowanus
Fort Greene
Park Slope
Cypress Hills
East New York
Starrett City
Canarsie
Flatlands
Mill Island
Manhattan Beach
Coney Island
Bath Beach
Borough Park
Dyker Heights
Gerritsen Beach
Marine Park
Clinton Hill
Sea Gate
Downtown
Boerum Hill
Prospect Lefferts Gardens
Ocean Hill
City Line
Bergen Beach
Midwood
Prospect Park South
Georgetown
East Williamsburg
North Side
South Side
Ocean Parkway
Fort Hamilton
Ditmas Park
Wingate
Rugby
Remsen Village
New Lots
Paerdegat Basin
Mill Basin
Fulton Ferry
Vinegar Hill
Weeksville
Broadway Junction
Dumbo
Homecrest
Highland Park
Madison
Erasmus
(2837, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Bay Ridge,40.625801,-74.030621,Pilo Arts Day Spa and Salon,40.624748,-74.030591,Spa
1,Bay Ridge,40.625801,-74.030621,Bagel Boy,40.627896,-74.029335,Bagel Shop
2,Bay Ridge,40.625801,-74.030621,Cocoa Grinder,40.623967,-74.030863,Juice Bar
3,Bay Ridge,40.625801,-74.030621,Pegasus Cafe,40.623168,-74.031186,Breakfast Spot
4,Bay Ridge,40.625801,-74.030621,Leo's Casa Calamari,40.623348,-74.031082,Pizza Place


Now, Let's Analyze each neighborhood

In [76]:
brooklyn_onehot = pd.get_dummies(brooklyn_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
brooklyn_onehot['Neighborhood'] = brooklyn_venues['Neighborhood'] 

# move neighborhood column to the first column
mylt=list(brooklyn_onehot.columns)
mylt.remove('Neighborhood')
new_col=['Neighborhood']
new_col.extend(mylt)
new_col
brooklyn_onehot=brooklyn_onehot[new_col]

brooklyn_onehot.head()

Unnamed: 0,Neighborhood,Adult Boutique,American Restaurant,Antique Shop,Arepa Restaurant,Argentinian Restaurant,Art Gallery,Arts & Crafts Store,Arts & Entertainment,Asian Restaurant,...,Video Game Store,Video Store,Vietnamese Restaurant,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Bay Ridge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Bay Ridge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Bay Ridge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Bay Ridge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Bay Ridge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Making sure that toronto data frame and brooklyn dataframe have the same columns

To use the same kmeans model we used to cluster Toronto data the neighborhood data columns need to be similar, so removing the extra columns and adding missing columns

In [86]:

for col in toronto_onehot.columns:
    if col in brooklyn_onehot.columns:
        pass
    else:
        brooklyn_onehot[col]=0


In [91]:
for col in brooklyn_onehot.columns:
    if col not in toronto_onehot.columns:
        brooklyn_onehot.drop(col,axis=1,inplace=True)

In [92]:
brooklyn_onehot.shape

(2837, 240)

Grouping the brooklyn neighborhood data

In [93]:
brooklyn_grouped = brooklyn_onehot.groupby('Neighborhood').mean().reset_index()
brooklyn_grouped

Unnamed: 0,Neighborhood,Adult Boutique,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bagel Shop,Bakery,...,Skate Park,Smoothie Shop,Soup Place,Stationery Store,Strip Club,Swim School,Tailor Shop,Tanning Salon,Theme Restaurant,Train Station
0,Bath Beach,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.020833,0.020833,...,0,0,0,0,0,0,0,0,0,0
1,Bay Ridge,0.00,0.035294,0.000000,0.000000,0.011765,0.000000,0.000000,0.035294,0.000000,...,0,0,0,0,0,0,0,0,0,0
2,Bedford Stuyvesant,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.033333,0.033333,0.000000,...,0,0,0,0,0,0,0,0,0,0
3,Bensonhurst,0.00,0.027027,0.000000,0.000000,0.000000,0.027027,0.000000,0.027027,0.027027,...,0,0,0,0,0,0,0,0,0,0
4,Bergen Beach,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
5,Boerum Hill,0.00,0.010870,0.010870,0.010870,0.021739,0.000000,0.000000,0.000000,0.021739,...,0,0,0,0,0,0,0,0,0,0
6,Borough Park,0.00,0.047619,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.047619,...,0,0,0,0,0,0,0,0,0,0
7,Brighton Beach,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.023256,...,0,0,0,0,0,0,0,0,0,0
8,Broadway Junction,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
9,Brooklyn Heights,0.00,0.020000,0.000000,0.000000,0.000000,0.020000,0.000000,0.010000,0.020000,...,0,0,0,0,0,0,0,0,0,0


#### Printing each neighborhood with top 5 most common venues

In [94]:
num_top_venues = 5

for hood in brooklyn_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = brooklyn_grouped[brooklyn_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Bath Beach----
                  venue  freq
0              Pharmacy  0.06
1       Bubble Tea Shop  0.04
2    Chinese Restaurant  0.04
3  Fast Food Restaurant  0.04
4      Sushi Restaurant  0.04


----Bay Ridge----
                venue  freq
0                 Spa  0.07
1  Italian Restaurant  0.06
2         Pizza Place  0.05
3    Greek Restaurant  0.05
4          Bagel Shop  0.04


----Bedford Stuyvesant----
           venue  freq
0  Deli / Bodega  0.13
1    Pizza Place  0.07
2           Café  0.07
3    Coffee Shop  0.07
4            Bar  0.07


----Bensonhurst----
                venue  freq
0  Chinese Restaurant  0.11
1    Sushi Restaurant  0.05
2          Donut Shop  0.05
3         Pizza Place  0.05
4      Ice Cream Shop  0.05


----Bergen Beach----
             venue  freq
0  Harbor / Marina  0.33
1       Playground  0.17
2       Donut Shop  0.17
3   Adult Boutique  0.00
4       Shoe Store  0.00


----Boerum Hill----
            venue  freq
0    Dance Studio  0.04
1     Coffee 

Creating a DataFrame to see the top 10 most common venues in the neighborhood

In [95]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
br_neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
br_neighborhoods_venues_sorted['Neighborhood'] = brooklyn_grouped['Neighborhood']

for ind in np.arange(brooklyn_grouped.shape[0]):
    br_neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(brooklyn_grouped.iloc[ind, :], num_top_venues)

br_neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Bath Beach,Pharmacy,Fast Food Restaurant,Sushi Restaurant,Pizza Place,Chinese Restaurant,Italian Restaurant,Bubble Tea Shop,Diner,Dessert Shop,Deli / Bodega
1,Bay Ridge,Spa,Italian Restaurant,Greek Restaurant,Pizza Place,Bagel Shop,Bar,American Restaurant,Thai Restaurant,Sandwich Place,Playground
2,Bedford Stuyvesant,Deli / Bodega,Pizza Place,Coffee Shop,Café,Bar,Boutique,Park,Japanese Restaurant,Wine Bar,Vietnamese Restaurant
3,Bensonhurst,Chinese Restaurant,Sushi Restaurant,Donut Shop,Pizza Place,Ice Cream Shop,Smoke Shop,Noodle House,Grocery Store,Coffee Shop,Dessert Shop
4,Bergen Beach,Harbor / Marina,Donut Shop,Playground,Train Station,Garden Center,Gym / Fitness Center,Gym,Grocery Store,Greek Restaurant,Gourmet Shop


### Now Using the KMeans model used to cluster Toronto Neighborhood data, we need to predict the cluster labels for Brooklyn data 

In [97]:
brooklyn_grouped_clustering = brooklyn_grouped.drop('Neighborhood', 1)
br_cluster_labels=kmeans.predict(brooklyn_grouped_clustering)
br_cluster_labels[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [98]:
br_neighborhoods_venues_sorted.insert(0, 'Cluster Labels',br_cluster_labels)

brooklyn_merged = brooklyn_data

brooklyn_merged = brooklyn_merged.join(br_neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

brooklyn_merged.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Brooklyn,Bay Ridge,40.625801,-74.030621,0,Spa,Italian Restaurant,Greek Restaurant,Pizza Place,Bagel Shop,Bar,American Restaurant,Thai Restaurant,Sandwich Place,Playground
1,Brooklyn,Bensonhurst,40.611009,-73.99518,0,Chinese Restaurant,Sushi Restaurant,Donut Shop,Pizza Place,Ice Cream Shop,Smoke Shop,Noodle House,Grocery Store,Coffee Shop,Dessert Shop
2,Brooklyn,Sunset Park,40.645103,-74.010316,0,Mexican Restaurant,Pizza Place,Latin American Restaurant,Bakery,Bank,Pharmacy,Gym,Record Shop,Stadium,Grocery Store
3,Brooklyn,Greenpoint,40.730201,-73.954241,0,Bar,Pizza Place,Coffee Shop,Cocktail Bar,Record Shop,French Restaurant,Yoga Studio,Bakery,Mexican Restaurant,Furniture / Home Store
4,Brooklyn,Gravesend,40.59526,-73.973471,0,Pizza Place,Italian Restaurant,Lounge,Bakery,Hookah Bar,Donut Shop,Gym,Chinese Restaurant,Gourmet Shop,Deli / Bodega


### Visualizing the Clusters to see which Neighborhoods in Brooklyn are similar to those in Toronto

In [99]:
br_map_clusters = folium.Map(location=[40.7127281, -74.0060152], zoom_start=11) #40.7127281, -74.0060152 for NY

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(brooklyn_merged['Latitude'], brooklyn_merged['Longitude'], brooklyn_merged['Neighborhood'], brooklyn_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(br_map_clusters)
       
br_map_clusters

#### With the visualization we can conclude that the neighborhoods in brooklyn are very similar to neighborhoods in Toronto's cluster 0