# Clustering Districts from São Paulo based on venues categories (map visualization)

# Retriving Coordinates data of São Paulo neighbourhood from Wikipedia and tranforming it to apply Clustering techiniques for further analysis of neighbourhoods clusters based on venues categories (from Foursquare API) and map location plot

### Extracting table from Url (Wikipedia Page) using Panda

In [50]:
# importing libraries

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files


from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans


import folium # map rendering library


In [39]:
import pandas as pd
url = "https://pt.wikipedia.org/wiki/Lista_dos_distritos_de_S%C3%A3o_Paulo_por_popula%C3%A7%C3%A3o"
tables = pd.read_html(url) 
tables[1]

Unnamed: 0,Posição,Distrito,População 2010,Unnamed: 3
0,1.0,Grajaú,360.787,
1,2.0,Jardim Ângela,295.434,
2,3.0,Sapopemba,284.524,
3,4.0,Capão Redondo,268.729,
4,5.0,Jardim São Luís,267.871,
...,...,...,...,...
92,93.0,Sé,23.651,
93,94.0,Pari,17.299,
94,95.0,Barra Funda,14.383,
95,96.0,Marsilac,8.258,


### Dropping unwanted columns and NaN values

In [40]:
bairros_sp=pd.DataFrame(tables[1])
bairros_sp

Unnamed: 0,Posição,Distrito,População 2010,Unnamed: 3
0,1.0,Grajaú,360.787,
1,2.0,Jardim Ângela,295.434,
2,3.0,Sapopemba,284.524,
3,4.0,Capão Redondo,268.729,
4,5.0,Jardim São Luís,267.871,
...,...,...,...,...
92,93.0,Sé,23.651,
93,94.0,Pari,17.299,
94,95.0,Barra Funda,14.383,
95,96.0,Marsilac,8.258,


In [41]:
sp_neighborhood=bairros_sp.drop(['População 2010', 'Unnamed: 3', 'Posição'], axis=1)
sp_neighborhood

Unnamed: 0,Distrito
0,Grajaú
1,Jardim Ângela
2,Sapopemba
3,Capão Redondo
4,Jardim São Luís
...,...
92,Sé
93,Pari
94,Barra Funda
95,Marsilac


In [43]:
sp_neighborhood.rename(columns = {'Distrito':'Neighborhood'}, inplace = True) #renaming for english language

In [44]:
sp_neighborhood


Unnamed: 0,Neighborhood
0,Grajaú
1,Jardim Ângela
2,Sapopemba
3,Capão Redondo
4,Jardim São Luís
...,...
92,Sé
93,Pari
94,Barra Funda
95,Marsilac


In [46]:
sp_neighborhood.dropna(subset=["Neighborhood"], axis=0, inplace=True) # Excluding invalid entrys

In [48]:
sp_neighborhood.shape

(96, 1)

## Creating the correct dataframe with coordinates 

## Making a dataframe of the postal code of each neighborhood. In order to utilize the Foursquare location data, getting the latitude and the longitude coordinates of each neighborhood

### Extracting São Paulo location coordinates using Geolocator

In [51]:
address = 'São Paulo, SP'

geolocator = Nominatim(user_agent="sp_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of São Paulo are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of São Paulo are -23.5506507, -46.6333824.


### Creating columns for the final dataframe including Longitude and Latitude

In [53]:
sp_neighborhood['Latitude']=''
sp_neighborhood['Longitude']=''
sp_neighborhood

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Grajaú,,
1,Jardim Ângela,,
2,Sapopemba,,
3,Capão Redondo,,
4,Jardim São Luís,,
5,Cidade Ademar,,
6,Brasilândia,,
7,Sacomã,,
8,Itaim Paulista,,
9,Jabaquara,,


## Looping trough all the neighborhood, retriving Coordinates from Geolocator and filling the sp_neighborhood DataFrame with the data

In [73]:
geolocator = Nominatim(user_agent="saop_explorer")
for x in range(96):
    adress = (sp_neighborhood['Neighborhood'][x]+' São Paulo, SP')

    
    location = geolocator.geocode(sp_neighborhood['Neighborhood'][x])
    latitude = location.latitude
    longitude = location.longitude
    sp_neighborhood['Latitude'][x]=latitude
    sp_neighborhood['Longitude'][x]=longitude

In [74]:
sp_neighborhood

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Grajaú,-5.8154,-46.1361
1,Jardim Ângela,-23.7125,-46.7687
2,Sapopemba,-23.6043,-46.5099
3,Capão Redondo,-23.6719,-46.7794
4,Jardim São Luís,-23.6836,-46.7378
5,Cidade Ademar,-23.673,-46.6553
6,Brasilândia,-21.2556,-52.0366
7,Sacomã,-23.6013,-46.6026
8,Itaim Paulista,-23.5018,-46.3996
9,Jabaquara,-23.6521,-46.65


### Refining data to restrict analysis onto the City of São Paulo (refining the retrieved coordinates)

#### Geolocator returned some wrong values. I'll refine the retrieved data

In [118]:
# Excluding errors from geolocator - wrong coordinates
sp=sp_neighborhood[sp_neighborhood.Latitude < (-23)]

In [119]:
sp=sp[sp.Latitude > (-23.8)]

In [120]:
sp=sp[sp.Longitude > (-47)]

In [121]:
sp=sp[sp.Longitude < (-46)]
sp

Unnamed: 0,Neighborhood,Latitude,Longitude
1,Jardim Ângela,-23.7125,-46.7687
2,Sapopemba,-23.6043,-46.5099
3,Capão Redondo,-23.6719,-46.7794
4,Jardim São Luís,-23.6836,-46.7378
5,Cidade Ademar,-23.673,-46.6553
7,Sacomã,-23.6013,-46.6026
8,Itaim Paulista,-23.5018,-46.3996
9,Jabaquara,-23.6521,-46.65
10,Cidade Tiradentes,-23.5825,-46.4092
11,Campo Limpo,-23.6326,-46.7597


## Creating a map of São Paulo with neighborhoods superimposed on top, using Folium and the refined coordinates from Geolocator


In [122]:
# create map of New York using latitude and longitude values
map_saopaulo = folium.Map(location=[-23.5506507, -46.6333824], zoom_start=10)

# add markers to map
for lat, lng, neighborhood in zip(sp['Latitude'], sp['Longitude'], sp['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_saopaulo)  
    
map_saopaulo

## Utilizing the Foursquare API to explore the neighborhoods and segment them


In [123]:
CLIENT_ID = 'SD3E0JFKCWQLDOK30ON5MCPQ3AA2ZEPW4OBH4VYIF55EA4UP' # your Foursquare ID
CLIENT_SECRET = '1QRRNYZ4TKLQCM3ING0X4PDYOA10CYEKDGZ5NAMR12FAVKWH' # your Foursquare Secret
ACCESS_TOKEN = 'L2EOSFH0TP2JYBFNGPVQSPU35LPH4W2W2CA0RA1SH2OFDVK2' # your FourSquare Access Token
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: SD3E0JFKCWQLDOK30ON5MCPQ3AA2ZEPW4OBH4VYIF55EA4UP
CLIENT_SECRET:1QRRNYZ4TKLQCM3ING0X4PDYOA10CYEKDGZ5NAMR12FAVKWH


### Creating a function to repeat the same process to all the neighborhoods in São Paulo

- Get the neighborhood's latitude and longitude values
- Get the top 100 venues that are in every Neighbourhood within a radius of 500 meters
- Create the GET request URL
- Send the GET request and examine the resutls
- Clean the json and structure it into a _pandas_ dataframe
- Analizing data


In [124]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [125]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [127]:
sp_venues = getNearbyVenues(names=sp['Neighborhood'],
                                   latitudes=sp['Latitude'],
                                   longitudes=sp['Longitude']
                                  )

Jardim Ângela
Sapopemba
Capão Redondo
Jardim São Luís
Cidade Ademar
Sacomã
Itaim Paulista
Jabaquara
Cidade Tiradentes
Campo Limpo
Itaquera
Cidade Dutra
Pirituba
Vila Curuçá
Vila Jacuí
São Lucas
Freguesia do Ó
Cangaíba
Jardim Helena
Vila Mariana
Vila Medeiros
Vila Andrade
Cidade Líder
José Bonifácio
Rio Pequeno
Ermelino Matarazzo
Cursino
Vila Sônia
Mandaqui
Artur Alvim
Vila Matilde
Vila Prudente
Guaianases
Raposo Tavares
Tucuruvi
Vila Formosa
Ponte Rasa
Itaim Bibi
São Miguel Paulista
Tatuapé
Jardim Paulista
Casa Verde
Água Rasa
Carrão
Limão
Parque do Carmo
Pinheiros
Mooca
Vila Guilherme
Butantã
República
Morumbi
Alto de Pinheiros
Vila Leopoldina
Brás
Jaguara


In [128]:
sp_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Jardim Ângela,-23.712528,-46.76872,Cida Manicure,-23.715485,-46.769722,Health & Beauty Service
1,Jardim Ângela,-23.712528,-46.76872,Pastéis Suely,-23.716364,-46.769401,Pastelaria
2,Jardim Ângela,-23.712528,-46.76872,Padaria Nova Aracati,-23.716672,-46.767894,Bakery
3,Sapopemba,-23.604326,-46.509885,Academia Vigor,-23.604081,-46.509578,Gym
4,Sapopemba,-23.604326,-46.509885,Bar 1 Conto,-23.60767,-46.510774,Gastropub


In [129]:
sp_venues.shape

(1019, 7)

In [130]:
sp_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alto de Pinheiros,11,11,11,11,11,11
Artur Alvim,18,18,18,18,18,18
Brás,21,21,21,21,21,21
Butantã,23,23,23,23,23,23
Campo Limpo,13,13,13,13,13,13
Cangaíba,8,8,8,8,8,8
Capão Redondo,4,4,4,4,4,4
Carrão,30,30,30,30,30,30
Casa Verde,23,23,23,23,23,23
Cidade Ademar,6,6,6,6,6,6


In [131]:
print('There are {} uniques categories.'.format(len(sp_venues['Venue Category'].unique()))) #unique categories of venues

There are 199 uniques categories.


## Analyzing Each Neighborhood and preparing with one_hot_encoding for clustering the data

In [132]:
# one hot encoding
sp_onehot = pd.get_dummies(sp_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
sp_onehot['Neighborhood'] = sp_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [sp_onehot.columns[-1]] + list(sp_onehot.columns[:-1])
sp_onehot = sp_onehot[fixed_columns]

sp_onehot.head()

Unnamed: 0,Neighborhood,Acai House,Accessories Store,American Restaurant,Arcade,Argentinian Restaurant,Art Studio,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auditorium,Auto Dealership,BBQ Joint,Bagel Shop,Bakery,Bar,Bed & Breakfast,Beer Bar,Beer Garden,Beer Store,Big Box Store,Bike Rental / Bike Share,Bistro,Bookstore,Boutique,Bowling Alley,Brazilian Restaurant,Breakfast Spot,Brewery,Buffet,Building,Burger Joint,Bus Station,Butcher,Café,Cajun / Creole Restaurant,Candy Store,Carpet Store,Cheese Shop,Chinese Restaurant,Chocolate Shop,Churrascaria,Clothing Store,Coffee Shop,College Gym,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Cupcake Shop,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Diner,Dive Bar,Dog Run,Donut Shop,Drugstore,Dumpling Restaurant,Electronics Store,Empanada Restaurant,Escape Room,Event Space,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Flea Market,Food,Food & Drink Shop,Food Truck,French Restaurant,Fried Chicken Joint,Fruit & Vegetable Store,Furniture / Home Store,Gaming Cafe,Garden Center,Gastropub,General Entertainment,Gift Shop,Gourmet Shop,Greek Restaurant,Grocery Store,Gun Range,Gym,Gym / Fitness Center,Gym Pool,Gymnastics Gym,Hardware Store,Health & Beauty Service,Health Food Store,High School,Historic Site,History Museum,Hostel,Hot Dog Joint,Hotel,IT Services,Ice Cream Shop,Intersection,Italian Restaurant,Japanese Restaurant,Jazz Club,Jewelry Store,Juice Bar,Karaoke Bar,Latin American Restaurant,Liquor Store,Lottery Retailer,Lounge,Market,Martial Arts School,Mattress Store,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Mineiro Restaurant,Miscellaneous Shop,Mobile Phone Shop,Motel,Motorcycle Shop,Movie Theater,Museum,Music Venue,Newsstand,Nightclub,Noodle House,Northeastern Brazilian Restaurant,Northern Brazilian Restaurant,Office,Optical Shop,Outdoors & Recreation,Paper / Office Supplies Store,Park,Pastelaria,Pastry Shop,Pedestrian Plaza,Performing Arts Venue,Persian Restaurant,Peruvian Restaurant,Pet Store,Pharmacy,Pizza Place,Planetarium,Playground,Plaza,Pool Hall,Portuguese Restaurant,Racetrack,Record Shop,Rental Service,Residential Building (Apartment / Condo),Rest Area,Restaurant,Rock Club,Salad Place,Salon / Barbershop,Samba School,Sandwich Place,School,Science Museum,Seafood Restaurant,Shoe Store,Shop & Service,Shopping Mall,Skate Park,Snack Place,Soccer Field,Soccer Stadium,Spa,Spanish Restaurant,Speakeasy,Sporting Goods Shop,Sports Bar,Stadium,Stationery Store,Steakhouse,Street Art,Supermarket,Sushi Restaurant,Taco Place,Tapas Restaurant,Tapiocaria,Tattoo Parlor,Tea Room,Tennis Court,Theater,Theme Park,Thrift / Vintage Store,Toy / Game Store,Track,Trail,Travel Agency,Vegetarian / Vegan Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Jardim Ângela,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Jardim Ângela,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Jardim Ângela,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Sapopemba,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Sapopemba,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [133]:
sp_onehot.shape

(1019, 200)

#### Grouping rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [134]:
sp_grouped = sp_onehot.groupby('Neighborhood').mean().reset_index()
sp_grouped

Unnamed: 0,Neighborhood,Acai House,Accessories Store,American Restaurant,Arcade,Argentinian Restaurant,Art Studio,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auditorium,Auto Dealership,BBQ Joint,Bagel Shop,Bakery,Bar,Bed & Breakfast,Beer Bar,Beer Garden,Beer Store,Big Box Store,Bike Rental / Bike Share,Bistro,Bookstore,Boutique,Bowling Alley,Brazilian Restaurant,Breakfast Spot,Brewery,Buffet,Building,Burger Joint,Bus Station,Butcher,Café,Cajun / Creole Restaurant,Candy Store,Carpet Store,Cheese Shop,Chinese Restaurant,Chocolate Shop,Churrascaria,Clothing Store,Coffee Shop,College Gym,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Cupcake Shop,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Diner,Dive Bar,Dog Run,Donut Shop,Drugstore,Dumpling Restaurant,Electronics Store,Empanada Restaurant,Escape Room,Event Space,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Flea Market,Food,Food & Drink Shop,Food Truck,French Restaurant,Fried Chicken Joint,Fruit & Vegetable Store,Furniture / Home Store,Gaming Cafe,Garden Center,Gastropub,General Entertainment,Gift Shop,Gourmet Shop,Greek Restaurant,Grocery Store,Gun Range,Gym,Gym / Fitness Center,Gym Pool,Gymnastics Gym,Hardware Store,Health & Beauty Service,Health Food Store,High School,Historic Site,History Museum,Hostel,Hot Dog Joint,Hotel,IT Services,Ice Cream Shop,Intersection,Italian Restaurant,Japanese Restaurant,Jazz Club,Jewelry Store,Juice Bar,Karaoke Bar,Latin American Restaurant,Liquor Store,Lottery Retailer,Lounge,Market,Martial Arts School,Mattress Store,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Mineiro Restaurant,Miscellaneous Shop,Mobile Phone Shop,Motel,Motorcycle Shop,Movie Theater,Museum,Music Venue,Newsstand,Nightclub,Noodle House,Northeastern Brazilian Restaurant,Northern Brazilian Restaurant,Office,Optical Shop,Outdoors & Recreation,Paper / Office Supplies Store,Park,Pastelaria,Pastry Shop,Pedestrian Plaza,Performing Arts Venue,Persian Restaurant,Peruvian Restaurant,Pet Store,Pharmacy,Pizza Place,Planetarium,Playground,Plaza,Pool Hall,Portuguese Restaurant,Racetrack,Record Shop,Rental Service,Residential Building (Apartment / Condo),Rest Area,Restaurant,Rock Club,Salad Place,Salon / Barbershop,Samba School,Sandwich Place,School,Science Museum,Seafood Restaurant,Shoe Store,Shop & Service,Shopping Mall,Skate Park,Snack Place,Soccer Field,Soccer Stadium,Spa,Spanish Restaurant,Speakeasy,Sporting Goods Shop,Sports Bar,Stadium,Stationery Store,Steakhouse,Street Art,Supermarket,Sushi Restaurant,Taco Place,Tapas Restaurant,Tapiocaria,Tattoo Parlor,Tea Room,Tennis Court,Theater,Theme Park,Thrift / Vintage Store,Toy / Game Store,Track,Trail,Travel Agency,Vegetarian / Vegan Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Alto de Pinheiros,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0
1,Artur Alvim,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.055556,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.055556,0.055556,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Brás,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.190476,0.0,0.0,0.047619,0.0,0.047619,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095238,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Butantã,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.043478,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.043478,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.130435,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0
4,Campo Limpo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.076923,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Cangaíba,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Capão Redondo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Carrão,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.033333,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.033333,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.066667,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Casa Verde,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.086957,0.043478,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.043478,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.043478,0.0,0.0,0.086957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478
9,Cidade Ademar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [137]:
sp_grouped.shape

(56, 200)

#### Printing each neighborhood along with the top 5 most common venues

In [173]:
num_top_venues = 5

for hood in sp_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = sp_grouped[sp_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Alto de Pinheiros----
     venue  freq
0    Plaza  0.45
1  Dog Run  0.09
2      Spa  0.09
3    Trail  0.09
4     Café  0.09


----Artur Alvim----
              venue  freq
0  Department Store  0.11
1       Pizza Place  0.11
2               Bar  0.06
3    Cosmetics Shop  0.06
4          Pharmacy  0.06


----Brás----
                  venue  freq
0  Brazilian Restaurant  0.19
1        Clothing Store  0.10
2                Buffet  0.05
3     Convenience Store  0.05
4                  Café  0.05


----Butantã----
                    venue  freq
0          Science Museum  0.13
1   Performing Arts Venue  0.04
2          History Museum  0.04
3            Soccer Field  0.04
4  Furniture / Home Store  0.04


----Campo Limpo----
            venue  freq
0           Diner  0.08
1  Cosmetics Shop  0.08
2     Supermarket  0.08
3      Street Art  0.08
4             Gym  0.08


----Cangaíba----
            venue  freq
0          Bakery  0.25
1        Pharmacy  0.25
2  Chocolate Shop  0.12
3      S

####  Converting into a _pandas_ dataframe and

In [174]:
#a function to sort the venues in descending order

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

#### creating dataframe for top 5 venues for each neighbourhood

In [175]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = sp_grouped['Neighborhood']

for ind in np.arange(sp_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(sp_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Alto de Pinheiros,Plaza,Trail,Bike Rental / Bike Share,Dog Run,Café
1,Artur Alvim,Pizza Place,Department Store,Pharmacy,Beer Garden,Sports Bar
2,Brás,Brazilian Restaurant,Clothing Store,Hot Dog Joint,Gaming Cafe,Dessert Shop
3,Butantã,Science Museum,History Museum,Mattress Store,Fruit & Vegetable Store,Music Venue
4,Campo Limpo,Food Truck,Dessert Shop,Big Box Store,Gym,Restaurant


## Clustering Neighborhoods using KMeans algorithm

In [176]:
# set number of clusters
kclusters = 3

sp_grouped_clustering = sp_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(sp_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 1, 0, 0, 1, 1, 2, 1, 1, 1], dtype=int32)

#### Creating a new dataframe that includes the cluster as well as the top 5 venues for each neighborhood.

In [177]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

sp_merged = sp

# merge toronto_grouped with manhattan_data to add latitude/longitude for each neighborhood
sp_merged = sp_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

sp_merged.head() # check the last columns!

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,Jardim Ângela,-23.7125,-46.7687,1,Pastelaria,Bakery,Health & Beauty Service,French Restaurant,Food Truck
2,Sapopemba,-23.6043,-46.5099,1,Gym,Market,Gastropub,Falafel Restaurant,Metro Station
3,Capão Redondo,-23.6719,-46.7794,2,Electronics Store,Plaza,Flea Market,Park,Empanada Restaurant
4,Jardim São Luís,-23.6836,-46.7378,0,Playground,Department Store,Japanese Restaurant,Pizza Place,Bus Station
5,Cidade Ademar,-23.673,-46.6553,1,Bakery,Gymnastics Gym,Soccer Field,Mobile Phone Shop,Grocery Store


### Visualizing the resulting clusters

In [178]:
# create map
map_clusters = folium.Map(location=[-23.5506507, -46.6333824], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(sp_merged['Latitude'], sp_merged['Longitude'], sp_merged['Neighborhood'], sp_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examining clusters

### Can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, we can then assign a name to each cluster

In [186]:
cluster0 = sp_merged.loc[sp_merged['Cluster Labels'] == 0, sp_merged.columns[:]]

In [187]:
cluster0

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
4,Jardim São Luís,-23.6836,-46.7378,0,Playground,Department Store,Japanese Restaurant,Pizza Place,Bus Station
7,Sacomã,-23.6013,-46.6026,0,Brazilian Restaurant,Bar,Pharmacy,Chocolate Shop,Farmers Market
14,Cidade Dutra,-23.714,-46.6991,0,Pharmacy,Bar,Burger Joint,Brazilian Restaurant,Gym
35,Cidade Líder,-23.5628,-46.4943,0,Brazilian Restaurant,Gym / Fitness Center,Salad Place,Samba School,Burger Joint
36,José Bonifácio,-23.5641,-46.4348,0,Racetrack,Comfort Food Restaurant,General Entertainment,Women's Store,Empanada Restaurant
44,Mandaqui,-23.4839,-46.6346,0,Bar,Pizza Place,Brewery,Brazilian Restaurant,Market
47,Vila Matilde,-23.5362,-46.5246,0,Ice Cream Shop,Bar,Pizza Place,Gym / Fitness Center,Farmers Market
52,Tucuruvi,-23.4801,-46.6033,0,Dessert Shop,Fast Food Restaurant,Health Food Store,Pizza Place,Coffee Shop
55,Ponte Rasa,-23.511,-46.4871,0,Pharmacy,Pizza Place,Diner,Paper / Office Supplies Store,Market
56,Itaim Bibi,-23.5844,-46.6784,0,Japanese Restaurant,Brazilian Restaurant,Restaurant,Sushi Restaurant,Gym / Fitness Center


In [189]:
cluster1 = sp_merged.loc[sp_merged['Cluster Labels'] == 1, sp_merged.columns[:]]

In [191]:
cluster1

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,Jardim Ângela,-23.7125,-46.7687,1,Pastelaria,Bakery,Health & Beauty Service,French Restaurant,Food Truck
2,Sapopemba,-23.6043,-46.5099,1,Gym,Market,Gastropub,Falafel Restaurant,Metro Station
5,Cidade Ademar,-23.673,-46.6553,1,Bakery,Gymnastics Gym,Soccer Field,Mobile Phone Shop,Grocery Store
8,Itaim Paulista,-23.5018,-46.3996,1,Japanese Restaurant,Bakery,Dessert Shop,Bowling Alley,Gym / Fitness Center
9,Jabaquara,-23.6521,-46.65,1,Soccer Field,Convenience Store,Brazilian Restaurant,Juice Bar,Miscellaneous Shop
10,Cidade Tiradentes,-23.5825,-46.4092,1,Wings Joint,Clothing Store,Furniture / Home Store,Bus Station,Electronics Store
11,Campo Limpo,-23.6326,-46.7597,1,Food Truck,Dessert Shop,Big Box Store,Gym,Restaurant
12,Itaquera,-23.5361,-46.4555,1,Convenience Store,Pharmacy,Clothing Store,Chocolate Shop,Café
17,Pirituba,-23.4855,-46.7219,1,Soccer Field,Hot Dog Joint,Fast Food Restaurant,Grocery Store,Tea Room
19,Vila Curuçá,-23.5102,-46.4179,1,Cosmetics Shop,Health & Beauty Service,Pizza Place,Farmers Market,Snack Place


In [192]:
cluster2 = sp_merged.loc[sp_merged['Cluster Labels'] == 2, sp_merged.columns[:]]

In [193]:
cluster2

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
3,Capão Redondo,-23.6719,-46.7794,2,Electronics Store,Plaza,Flea Market,Park,Empanada Restaurant
51,Raposo Tavares,-23.5916,-46.7804,2,Plaza,Food Truck,Women's Store,Empanada Restaurant,French Restaurant
85,Alto de Pinheiros,-23.5495,-46.7123,2,Plaza,Trail,Bike Rental / Bike Share,Dog Run,Café


In [200]:
cluster0.groupby(by='1st Most Common Venue').agg('count')


Unnamed: 0_level_0,Neighborhood,Latitude,Longitude,Cluster Labels,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1st Most Common Venue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Bar,2,2,2,2,2,2,2,2
Brazilian Restaurant,6,6,6,6,6,6,6,6
Coffee Shop,1,1,1,1,1,1,1,1
Dessert Shop,1,1,1,1,1,1,1,1
IT Services,1,1,1,1,1,1,1,1
Ice Cream Shop,1,1,1,1,1,1,1,1
Italian Restaurant,1,1,1,1,1,1,1,1
Japanese Restaurant,1,1,1,1,1,1,1,1
Pharmacy,2,2,2,2,2,2,2,2
Playground,1,1,1,1,1,1,1,1


In [195]:
cluster0.groupby(by='2nd Most Common Venue').agg('count')

Unnamed: 0_level_0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
2nd Most Common Venue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Bar,3,3,3,3,3,3,3,3
Brazilian Restaurant,1,1,1,1,1,1,1,1
Burger Joint,2,2,2,2,2,2,2,2
Clothing Store,1,1,1,1,1,1,1,1
Comfort Food Restaurant,1,1,1,1,1,1,1,1
Department Store,1,1,1,1,1,1,1,1
Dessert Shop,1,1,1,1,1,1,1,1
Fast Food Restaurant,1,1,1,1,1,1,1,1
Gym / Fitness Center,1,1,1,1,1,1,1,1
History Museum,1,1,1,1,1,1,1,1


In [196]:
cluster0.groupby(by='3rd Most Common Venue').agg('count')

Unnamed: 0_level_0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,4th Most Common Venue,5th Most Common Venue
3rd Most Common Venue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Bar,2,2,2,2,2,2,2,2
Brewery,1,1,1,1,1,1,1,1
Burger Joint,1,1,1,1,1,1,1,1
Coffee Shop,1,1,1,1,1,1,1,1
Diner,1,1,1,1,1,1,1,1
General Entertainment,1,1,1,1,1,1,1,1
Health Food Store,1,1,1,1,1,1,1,1
Hot Dog Joint,1,1,1,1,1,1,1,1
Hotel,1,1,1,1,1,1,1,1
Ice Cream Shop,1,1,1,1,1,1,1,1


In [201]:
cluster1.groupby(by='1st Most Common Venue').agg('count')

Unnamed: 0_level_0,Neighborhood,Latitude,Longitude,Cluster Labels,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1st Most Common Venue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Bakery,9,9,9,9,9,9,9,9
Convenience Store,1,1,1,1,1,1,1,1
Cosmetics Shop,1,1,1,1,1,1,1,1
Food Truck,2,2,2,2,2,2,2,2
Fruit & Vegetable Store,1,1,1,1,1,1,1,1
Gym,1,1,1,1,1,1,1,1
Gym / Fitness Center,1,1,1,1,1,1,1,1
Ice Cream Shop,1,1,1,1,1,1,1,1
Japanese Restaurant,1,1,1,1,1,1,1,1
Market,1,1,1,1,1,1,1,1


In [202]:
cluster1.groupby(by='2nd Most Common Venue').agg('count')

Unnamed: 0_level_0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
2nd Most Common Venue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BBQ Joint,1,1,1,1,1,1,1,1
Bakery,4,4,4,4,4,4,4,4
Brazilian Restaurant,2,2,2,2,2,2,2,2
Butcher,1,1,1,1,1,1,1,1
Café,1,1,1,1,1,1,1,1
Chocolate Shop,1,1,1,1,1,1,1,1
Clothing Store,1,1,1,1,1,1,1,1
Comfort Food Restaurant,1,1,1,1,1,1,1,1
Convenience Store,1,1,1,1,1,1,1,1
Cosmetics Shop,1,1,1,1,1,1,1,1


In [203]:
cluster1.groupby(by='3rd Most Common Venue').agg('count')

Unnamed: 0_level_0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,4th Most Common Venue,5th Most Common Venue
3rd Most Common Venue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Athletics & Sports,1,1,1,1,1,1,1,1
BBQ Joint,2,2,2,2,2,2,2,2
Bakery,1,1,1,1,1,1,1,1
Bar,1,1,1,1,1,1,1,1
Big Box Store,1,1,1,1,1,1,1,1
Bistro,1,1,1,1,1,1,1,1
Brazilian Restaurant,1,1,1,1,1,1,1,1
Burger Joint,2,2,2,2,2,2,2,2
Clothing Store,2,2,2,2,2,2,2,2
Dessert Shop,1,1,1,1,1,1,1,1


In [204]:
cluster2.groupby(by='1st Most Common Venue').agg('count')

Unnamed: 0_level_0,Neighborhood,Latitude,Longitude,Cluster Labels,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1st Most Common Venue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Electronics Store,1,1,1,1,1,1,1,1
Plaza,2,2,2,2,2,2,2,2


In [205]:
cluster2.groupby(by='2nd Most Common Venue').agg('count')

Unnamed: 0_level_0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
2nd Most Common Venue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Food Truck,1,1,1,1,1,1,1,1
Plaza,1,1,1,1,1,1,1,1
Trail,1,1,1,1,1,1,1,1


In [206]:
cluster2.groupby(by='3rd Most Common Venue').agg('count')

Unnamed: 0_level_0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,4th Most Common Venue,5th Most Common Venue
3rd Most Common Venue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Bike Rental / Bike Share,1,1,1,1,1,1,1,1
Flea Market,1,1,1,1,1,1,1,1
Women's Store,1,1,1,1,1,1,1,1


## After the analysis, we got the 3 clusters and respective most common venues

- Cluster 0: 1st Most Common Venue: Brazilian Restaurant; 2nd Most Common: Bar; Venue 3rd Most Common Venue: Bar
- Cluster 1: 1st Most Common Venue: Bakery; 2nd Most Common: Bakery Venue 3rd Most Common Venue: Restaurant
- Cluster 2: 1st Most Common Venue: Plaza; 2nd Most Common: Plaza Venue 3rd Most Common Venue: Flea Market

#### Assigning cluster according to its caracteristics

In [208]:
BarAndRestaurant_cluster = cluster0

In [209]:
Bakery_cluster = cluster1

In [210]:
Hotel_cluster = cluster2