# Data scraping

In [1]:
# -*- coding: utf-8 -*-
import sys
import pandas as pd
#from bs4 import BeautifulSoup
import numpy as np
import requests
import json
from geopy.geocoders import Nominatim
import folium  # map rendering module
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

coding = sys.stdout.encoding

## Jump to next item!

In this section it I have extracted all information I needed from Wikipedia, wranggled and cleaned the data.
Now there is a CSV file already for next times.

In [3]:
# Scraping all mesoregions, microregions, and cities of São Paulo state from Wikipedia
url = 'https://pt.wikipedia.org/wiki/Lista_de_mesorregi%C3%B5es_e_microrregi%C3%B5es_de_S%C3%A3o_Paulo'
table_list = pd.read_html(url)

mesoregions_df = table_list[0]

microregions_df = table_list[1]
microregions_len = microregions_df.shape[0]
for i in range(2,len(table_list)-1):
    microregions_df = microregions_df.append(table_list[i], ignore_index=True)
    microregions_len += table_list[i].shape[0]

print(microregions_len)
microregions_df.shape

645


(645, 4)

In [4]:
# Cleaning mesoregions dataframe
mesoregions_df.drop(columns=['Código', 'Número de municípios', 'Localização', 'Código.1'], inplace=True)
mesoregions_df.columns = ['Mesoregion', 'Microregion']
mesoregions_df.head(3)

Unnamed: 0,Mesoregion,Microregion
0,São José do Rio Preto,Jales
1,São José do Rio Preto,Fernandópolis
2,São José do Rio Preto,Votuporanga


In [5]:
# Cleaning microregions dataframe.
microregions_df.drop(columns=['Código', 'Localização'], inplace=True)
microregions_df.columns = ['Microregion', 'City']
microregions_df.head(3)

Unnamed: 0,Microregion,City
0,Jales,Aparecida d'Oeste
1,Jales,Aspásia
2,Jales,Dirce Reis


In [6]:
# Merging mesoregions with microregions into one dataframe.
cities_df = mesoregions_df.merge(microregions_df, left_on='Microregion', right_on='Microregion')
print("Shape of cities_df: ", cities_df.shape)
cities_df.head(3)

Shape of cities_df:  (645, 3)


Unnamed: 0,Mesoregion,Microregion,City
0,São José do Rio Preto,Jales,Aparecida d'Oeste
1,São José do Rio Preto,Jales,Aspásia
2,São José do Rio Preto,Jales,Dirce Reis


In [7]:
# Collect population data of all cities in Brazil
population_url = 'https://pt.wikipedia.org/wiki/Lista_de_munic%C3%ADpios_do_Brasil_por_popula%C3%A7%C3%A3o'
population_list = pd.read_html(population_url)

# Drop all irrelevant data, keeping only cities from São Paulo state
population_df = population_list[0]
popsp_df = population_df[population_df['Unidade federativa'] == 'São Paulo'].iloc[:,2:].reset_index(drop=True)
popsp_df.drop(columns=['Unidade federativa'], inplace=True)
popsp_df.columns = ['City', 'Population']
popsp_df.head(3)

Unnamed: 0,City,Population
0,São Paulo,12252023
1,Guarulhos,1379182
2,Campinas,1204073


In [8]:
# merging city geographical information with population
spcities_df = cities_df.merge(popsp_df, left_on='City', right_on='City')
spcities_df.head(3)

Unnamed: 0,Mesoregion,Microregion,City,Population
0,São José do Rio Preto,Jales,Aparecida d'Oeste,4196
1,São José do Rio Preto,Jales,Aspásia,1822
2,São José do Rio Preto,Jales,Dirce Reis,1793


In [9]:
# Retrieving each city's geographical coordinates.
geolocator = Nominatim(user_agent='it_is_me')
longitude = []
latitude = []
i=0
for city in spcities_df['City']:
    city = city + ', SP'
    
    location = geolocator.geocode(city)
    longitude.append(location.longitude)
    latitude.append(location.latitude)
    i+=1
    print("{}/{}".format(i, spcities_df.shape[0]))

spcities_df['Longitude'] = longitude
spcities_df['Latitude'] = latitude

1/643
2/643
3/643
4/643
5/643
6/643
7/643
8/643
9/643
10/643
11/643
12/643
13/643
14/643
15/643
16/643
17/643
18/643
19/643
20/643
21/643
22/643
23/643
24/643
25/643
26/643
27/643
28/643
29/643
30/643
31/643
32/643
33/643
34/643
35/643
36/643
37/643
38/643
39/643
40/643
41/643
42/643
43/643
44/643
45/643
46/643
47/643
48/643
49/643
50/643
51/643
52/643
53/643
54/643
55/643
56/643
57/643
58/643
59/643
60/643
61/643
62/643
63/643
64/643
65/643
66/643
67/643
68/643
69/643
70/643
71/643
72/643
73/643
74/643
75/643
76/643
77/643
78/643
79/643
80/643
81/643
82/643
83/643
84/643
85/643
86/643
87/643
88/643
89/643
90/643
91/643
92/643
93/643
94/643
95/643
96/643
97/643
98/643
99/643
100/643
101/643
102/643
103/643
104/643
105/643
106/643
107/643
108/643
109/643
110/643
111/643
112/643
113/643
114/643
115/643
116/643
117/643
118/643
119/643
120/643
121/643
122/643
123/643
124/643
125/643
126/643
127/643
128/643
129/643
130/643
131/643
132/643
133/643
134/643
135/643
136/643
137/643
138/643
139/

## Return activity from here

Just read the prepared CSV file with all geographical data from São Paulo state.

In [2]:
spcities_df = pd.read_csv('saopaulo_cities.csv', index_col=0)
total_cities = spcities_df.shape[0]
print("Shape of spcities_df: ", spcities_df.shape)
spcities_df.head()

Shape of spcities_df:  (643, 6)


Unnamed: 0,Mesoregion,Microregion,City,Population,Longitude,Latitude
0,São José do Rio Preto,Jales,Aparecida d'Oeste,4196,-50.880871,-20.449811
1,São José do Rio Preto,Jales,Aspásia,1822,-50.728046,-20.160028
2,São José do Rio Preto,Jales,Dirce Reis,1793,-50.606276,-20.466407
3,São José do Rio Preto,Jales,Dolcinópolis,2115,-50.513261,-20.123074
4,São José do Rio Preto,Jales,Jales,49107,-50.549244,-20.267177


In [3]:
# Collecting state of São Paulo coordinates
geolocator = Nominatim(user_agent='it_is_me')
saopaulo = geolocator.geocode('São Paulo, Brazil')
latitude, longitude = (saopaulo.latitude+1, saopaulo.longitude-1)

# Create map of São Paulo using latitude and longitude values
map_sp = folium.Map(location=[latitude, longitude], zoom_start=7)

# add markers to map
max_population = spcities_df['Population'].max()
for lat, lng, population, city in zip(spcities_df['Latitude'], spcities_df['Longitude'], spcities_df['Population'], spcities_df['City']):
    label = '{}, {}'.format(city, population)
    label = folium.Popup(label, parse_html=True, max_width=100)
    if(city != 'São Paulo'):
        rad = 6*(population/1370000)
    else:
        rad = 7    
    folium.CircleMarker(
        [lat, lng],
        radius=rad,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_sp)  
    
map_sp

**JUMP TO NEXT STEP!**

In [4]:
CLIENT_ID = 'YOUR_CLIENT_ID' # your Foursquare ID
CLIENT_SECRET = 'YOUR_CLIENT_SECRET' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
ACCESS_TOKEN = 'YOUR_ACCESS_TOKEN' # your access token

In [11]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    counter = 1
    max_city = len(names)
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(counter, '/', max_city)
        counter+=1
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        
        try:
            # make the GET request
            results = requests.get(url).json()["response"]['groups'][0]['items']

            # return only relevant information for each nearby venue
            venues_list.append([(
                name, 
                lat, 
                lng, 
                v['venue']['name'], 
                v['venue']['location']['lat'], 
                v['venue']['location']['lng'],  
                v['venue']['categories'][0]['name']) for v in results])
            
            print(name)
        except:
            venues_list.append([
                name, 
                lat, 
                lng, 
                '', 
                np.nan, 
                np.nan,  
                ''])
            
            print("No venue found at " + name)

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['City', 
                  'Latitude', 
                  'Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [12]:
saopaulo_venues = getNearbyVenues(names=spcities_df['City'],
                                 latitudes=spcities_df['Latitude'],
                                 longitudes=spcities_df['Longitude'],
                                 radius=5000,
                                 LIMIT=200)

1 / 643
Aparecida d'Oeste
2 / 643
Aspásia
3 / 643
Dirce Reis
4 / 643
Dolcinópolis
5 / 643
Jales
6 / 643
Marinópolis
7 / 643
Mesópolis
8 / 643
Nova Canaã Paulista
9 / 643
Palmeira d'Oeste
10 / 643
Paranapuã
11 / 643
Pontalinda
12 / 643
Populina
13 / 643
Rubineia
14 / 643
Santa Albertina
15 / 643
Santa Clara d'Oeste
16 / 643
Santa Fé do Sul
17 / 643
Santa Rita d'Oeste
18 / 643
Santa Salete
19 / 643
Santana da Ponte Pensa
20 / 643
São Francisco
21 / 643
Três Fronteiras
22 / 643
Urânia
23 / 643
Vitória Brasil
24 / 643
Estrela d'Oeste
25 / 643
Fernandópolis
26 / 643
Guarani d'Oeste
27 / 643
Indiaporã
28 / 643
Macedônia
29 / 643
Meridiano
30 / 643
Mira Estrela
31 / 643
Ouroeste
32 / 643
Pedranópolis
33 / 643
São João das Duas Pontes
34 / 643
Turmalina
35 / 643
Álvares Florence
36 / 643
Américo de Campos
37 / 643
Cardoso
38 / 643
Cosmorama
39 / 643
Parisi
40 / 643
Pontes Gestal
41 / 643
Riolândia
42 / 643
Valentim Gentil
43 / 643
Votuporanga
44 / 643
Adolfo
45 / 643
Altair
46 / 643
Bady Bassi

In [14]:
#saopaulo_venues.to_csv('1_saopaulo_venues.csv')
#saopaulo_venues = pd.read_csv('1_saopaulo_venues.csv')
print("Shape of saopaulo_venues: ", saopaulo_venues.shape)
saopaulo_venues.head(3)

Shape of saopaulo_venues:  (14757, 7)


Unnamed: 0,City,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Aparecida d'Oeste,-20.449811,-50.880871,Farmácia do Pedro,-20.451159,-50.881847,Pharmacy
1,Aparecida d'Oeste,-20.449811,-50.880871,Bar do Fabião,-20.45006,-50.886469,African Restaurant
2,Dirce Reis,-20.466407,-50.606276,Padaria Doce Pao,-20.46483,-50.606962,Bakery


In [15]:
print("There are {} different categories!".format(len(saopaulo_venues['Venue Category'].unique())))

There are 392 different categories!


In [19]:
# one hot encoding
saopaulo_onehot = pd.get_dummies(saopaulo_venues[['Venue Category']], prefix="", prefix_sep="")

# add city column back to dataframe
saopaulo_onehot['City'] = saopaulo_venues['City'] 

# move City column to the first column
fixed_columns = [saopaulo_onehot.columns[-1]] + list(saopaulo_onehot.columns[:-1])
saopaulo_onehot = saopaulo_onehot[fixed_columns]

saopaulo_onehot.to_csv('2_saopaulo_onehot.csv')
print('saopaulo_onehot.shape: ', saopaulo_onehot.shape)
saopaulo_onehot

saopaulo_onehot.shape:  (14757, 392)


Unnamed: 0,Zoo Exhibit,ATM,Acai House,Accessories Store,Adult Boutique,African Restaurant,Airport,Airport Lounge,Airport Terminal,American Restaurant,...,Waterfall,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14752,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14753,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14754,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14755,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# List and sort all categories
columns = list(saopaulo_onehot.columns)
columns.remove('City')
columns.sort()

# New categories dataframe
spcities_onehot = pd.DataFrame()

# Finding similar categories, by finding similar words, to merge them
for col in columns:
    spcities_onehot[col] = saopaulo_onehot[col]
    key = col.split(' ')[0]
    matching = [s for s in columns if (key) in s]
    for i in range (1, len(matching)):
        spcities_onehot[col] += saopaulo_onehot[matching[i]]
        columns.remove(matching[i])

spcities_onehot.insert(0, 'City', saopaulo_onehot['City'])
#spcities_onehot.to_csv('3_spcities_onehot.csv')
print("spcities_onehot.shape: ", spcities_onehot.shape)
spcities_onehot.head(3)

spcities_onehot.shape:  (14757, 229)


Unnamed: 0,City,ATM,Acai House,Accessories Store,Adult Boutique,African Restaurant,Airport,American Restaurant,Antique Shop,Aquarium,...,Volleyball Court,Warehouse Store,Watch Shop,Waterfall,Waterfront,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,Aparecida d'Oeste,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Aparecida d'Oeste,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Dirce Reis,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**CONTINUE FROM HERE!**

In [23]:
spcities_onehot = pd.read_csv('3_spcities_onehot.csv', index_col=0)
print("spcities_onehot.shape: ", spcities_onehot.shape)
spcities_onehot.head(3)

spcities_onehot.shape:  (14757, 229)


Unnamed: 0,City,ATM,Acai House,Accessories Store,Adult Boutique,African Restaurant,Airport,American Restaurant,Antique Shop,Aquarium,...,Volleyball Court,Warehouse Store,Watch Shop,Waterfall,Waterfront,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,Aparecida d'Oeste,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Aparecida d'Oeste,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Dirce Reis,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Eliminate categories with too few or too much occurences.
categories_counter = spcities_onehot.sum()
categories_counter = categories_counter[categories_counter<630]  # too much means most of cities have, so it does not differenciate cities
categories_counter = categories_counter[categories_counter>50]  # too few means too few cities have it, so it also does not differenciate them
relevant_categories = list(categories_counter.index)
relevant_categories

['Art Gallery',
 'BBQ Joint',
 'Beach',
 'Bed & Breakfast',
 'Beer Garden',
 'Breakfast Spot',
 'Burger Joint',
 'Bus Line',
 'Café',
 'Chocolate Shop',
 'Churrascaria',
 'Clothing Store',
 'Coffee Shop',
 'Convenience Store',
 'Cosmetics Shop',
 'Department Store',
 'Dessert Shop',
 'Diner',
 'Dog Run',
 'Eastern European Restaurant',
 'Farm',
 'Fast Food Restaurant',
 'Fish & Chips Shop',
 'Food',
 'Go Kart Track',
 'Grocery Store',
 'Hot Spring',
 'Ice Cream Shop',
 'Italian Restaurant',
 'Japanese Restaurant',
 'Market',
 'Movie Theater',
 'Music Venue',
 'Nightclub',
 'Park',
 'Pet Service',
 'Pharmacy',
 'Plaza',
 'Pub',
 'Resort',
 'Theater']

In [47]:
# Calculating mean frequency of occurence of each venue on each city = cities' profiles
saopaulo_grouped = saopaulo_onehot.groupby('City').mean().reset_index()
print('saopaulo_grouped.shape: ', saopaulo_grouped.shape)
saopaulo_grouped.head(3)

saopaulo_grouped.shape:  (631, 397)


Unnamed: 0,City,Zoo Exhibit,ATM,Acai House,Accessories Store,Adult Boutique,African Restaurant,Airport,Airport Lounge,Airport Terminal,...,Waterfall,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo
0,Adamantina,0.0,0.0,0.0,0.0,0.0,0.0,0.019231,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Adolfo,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Aguaí,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [87]:
# Merge cities' profile to population info
spcities_profile = saopaulo_grouped.merge(spcities_df[['City','Population']], left_on='City', right_on='City')

# Normalize Population data
min_population = spcities_profile['Population'].min()
max_population = spcities_profile['Population'].max()
spcities_profile['Population'] = (spcities_profile['Population'] - min_population)/(max_population - min_population)

print("spcities_profile.shape = ", spcities_profile.shape)
spcities_profile.head(3)

spcities_profile.shape =  (631, 398)


Unnamed: 0,City,Zoo Exhibit,ATM,Acai House,Accessories Store,Adult Boutique,African Restaurant,Airport,Airport Lounge,Airport Terminal,...,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Population
0,Adamantina,0.0,0.0,0.0,0.0,0.0,0.0,0.019231,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1164.997138
1,Adolfo,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1164.999709
2,Aguaí,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1164.997037


In [None]:
#spcities_profile = pd.read_csv('spcities_profile.csv')

In [59]:
# Function to sort the first 'num_top_venues' most common venues on a row.
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:-1]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10  # considerate the first 10 most common venues.
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['City']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
cities_venues_sorted = pd.DataFrame(columns=columns)
cities_venues_sorted['City'] = saopaulo_grouped['City']

for ind in np.arange(saopaulo_grouped.shape[0]):
    cities_venues_sorted.iloc[ind, 1:] = return_most_common_venues(saopaulo_grouped.iloc[ind, :], num_top_venues)

print("cities_venues_sorted.shape: ", cities_venues_sorted.shape)
cities_venues_sorted.head(3)

cities_venues_sorted.shape:  (631, 11)


Unnamed: 0,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adamantina,Ice Cream Shop,Bar,Hotel,Restaurant,Bakery,Pizza Place,Park,Burger Joint,Japanese Restaurant,Sandwich Place
1,Adolfo,Adult Boutique,Ice Cream Shop,Pizza Place,Yoga Studio,Fishing Spot,Factory,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant
2,Aguaí,Bakery,Restaurant,Pizza Place,Bus Station,Soccer Stadium,Fish & Chips Shop,Gas Station,Park,Grocery Store,Market


## Clustering

In [49]:
# Import K-Means from clustering stage
from sklearn.cluster import KMeans

In [88]:
# set number of clusters
kclusters = 6

spcities_clustering = spcities_profile.drop('City', 1)

# run K-Means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(spcities_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 4, 0, 0, 0, 0])

In [89]:
# add clustering labels
#cities_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
cities_venues_sorted['Cluster Labels'] = kmeans.labels_

# copy neighborhoods_df on toronto_merged
spcities_merged = spcities_df[['City','Latitude','Longitude','Population']]

# merge toronto neighborhoods dataframe with neighborhoods profile dataframe
spcities_merged = spcities_merged.merge(cities_venues_sorted.set_index('City'), left_on='City', right_on='City')

print("shape of spcities_merged: ", spcities_merged.shape)
spcities_merged.head(3) # check first rows

shape of spcities_merged:  (631, 15)


Unnamed: 0,City,Latitude,Longitude,Population,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Aparecida d'Oeste,-20.449811,-50.880871,4196,0,Boutique,Pharmacy,Yoga Studio,Fishing Spot,Factory,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant,Field
1,Dirce Reis,-20.466407,-50.606276,1793,0,Park,Yoga Studio,Event Space,Factory,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant,Field,Film Studio
2,Dolcinópolis,-20.123074,-50.513261,2115,1,Ice Cream Shop,Tree,Yoga Studio,Fish Market,Exhibit,Factory,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant


In [80]:
#spcities_merged = pd.read_csv('spcities_merged.csv')

In [90]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude+0.3, longitude-0.8], zoom_start=7.3)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
min_population = spcities_merged['Population'].min()
for lat, lon, poi, cluster, population in zip(spcities_merged['Latitude'], spcities_merged['Longitude'], spcities_merged['City'], spcities_merged['Cluster Labels'], spcities_merged['Population']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    if(poi != 'São Paulo'):
        rad = 4*(population/(1370000-min_population)) + 2.5
    else:
        rad = 10 
    folium.CircleMarker(
        [lat, lon],
        radius=rad,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters