### Read html and have a quick look at it. The table we are interested is the first one so we choose table [0]

In [1]:
import pandas as pd

In [2]:
hoods = pd.read_html("https://www.asuntojenhinnat.fi/myytyjen-asuntojen-tilastot/kunta/helsinki", encoding="utf-8")

In [3]:
hoods = hoods[0]

In [4]:
hoods.head()

Unnamed: 0,#,Kunta,Q2´19 (€/m²),Q1´19 (€/m²),+/- (€/m²),Ennuste 2018 (%)
0,1,Kaivopuisto,8376,8474,-98,"-0,39%"
1,2,Kaartinkaupunki,8205,8205,0,"0,18%"
2,3,Punavuori,7813,7481,332,"-0,55%"
3,4,Eira,7765,8051,-286,"-0,25%"
4,5,Ruoholahti,7645,7595,50,"0,43%"


### Clean the data

In [5]:
hoods.drop(['#','Q1´19 (€/m²)', '+/- (€/m²)','Ennuste 2018 (%)'], axis=1, inplace=True)

In [6]:
hoods.rename(columns={'Kunta':'Neighborhood','Q2´19 (€/m²)':'Price'}, inplace=True)

In [7]:
hoods.tail()

Unnamed: 0,Neighborhood,Price
70,Pihlajamäki,2748
71,Suurmetsä,2728
72,Mellunkylä,2477
73,Kontula,2324
74,Jakomäki,2126


### Building the Geocoder (Nominatim) to retrieve latitude and longitude values

In [8]:
from tqdm import tqdm
tqdm.pandas()
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from geopy.extra.rate_limiter import RateLimiter

address = 'Helsinki, Finland'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
hoods['Location'] = hoods['Neighborhood'].progress_apply(geocode)

hoods['point'] = hoods['Location'].apply(lambda loc: tuple(loc.point) if loc else None)

100%|██████████| 75/75 [01:19<00:00,  1.07s/it]


In [9]:
hoods.head()

Unnamed: 0,Neighborhood,Price,Location,point
0,Kaivopuisto,8376,"(Kaivopuisto, Eteläinen suurpiiri, Helsinki, H...","(60.1568425, 24.9567212, 0.0)"
1,Kaartinkaupunki,8205,"(Kaartinkaupunki, Eteläinen suurpiiri, Helsink...","(60.1652138, 24.9472225, 0.0)"
2,Punavuori,7813,"(Punavuori, Eteläinen suurpiiri, Helsinki, Hel...","(60.1612371, 24.9365046, 0.0)"
3,Eira,7765,"(Eira, Eteläinen suurpiiri, Helsinki, Helsingi...","(60.1561911, 24.9383747, 0.0)"
4,Ruoholahti,7645,"(Ruoholahti, Eteläinen suurpiiri, Helsinki, He...","(60.162925, 24.9114974, 0.0)"


### Checking for neighborhoods that didn't get location values. Change their names a bit so that Nominatim can find the values

In [10]:
nan_rows = hoods[hoods.isnull().T.any().T]
nan_rows

Unnamed: 0,Neighborhood,Price,Location,point
9,Pohjois-Meilahti,6469,,
19,Toukola-Vanhakaupunki,5671,,
60,Itäkeskus-Marjaniemi,3244,,


In [11]:
import numpy as np
hoods['Neighborhood'].replace("Pohjois-Meilahti", "Meilahti", inplace = True)
hoods['Neighborhood'].replace("Toukola-Vanhakaupunki", "Toukola", inplace = True)
hoods['Neighborhood'].replace("Itäkeskus-Marjaniemi", "Itäkeskus", inplace = True)

In [12]:
tqdm.pandas()

geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
hoods['Location'] = hoods['Neighborhood'].progress_apply(geocode)

hoods['point'] = hoods['Location'].apply(lambda loc: tuple(loc.point) if loc else None)

100%|██████████| 75/75 [01:18<00:00,  1.08s/it]


In [13]:
nan_rows = hoods[hoods.isnull().T.any().T]
nan_rows

Unnamed: 0,Neighborhood,Price,Location,point


### Format the latitude and longitude values into a cleaner look

In [14]:
hoods['point'] = hoods['point'].astype(str)

In [15]:
hoods[['Latitude','Longitude','GetOff']] = hoods.point.str.split(expand=True) 
hoods.head()

Unnamed: 0,Neighborhood,Price,Location,point,Latitude,Longitude,GetOff
0,Kaivopuisto,8376,"(Kaivopuisto, Eteläinen suurpiiri, Helsinki, H...","(60.1568425, 24.9567212, 0.0)","(60.1568425,","24.9567212,",0.0)
1,Kaartinkaupunki,8205,"(Kaartinkaupunki, Eteläinen suurpiiri, Helsink...","(60.1652138, 24.9472225, 0.0)","(60.1652138,","24.9472225,",0.0)
2,Punavuori,7813,"(Punavuori, Eteläinen suurpiiri, Helsinki, Hel...","(60.1612371, 24.9365046, 0.0)","(60.1612371,","24.9365046,",0.0)
3,Eira,7765,"(Eira, Eteläinen suurpiiri, Helsinki, Helsingi...","(60.1561911, 24.9383747, 0.0)","(60.1561911,","24.9383747,",0.0)
4,Ruoholahti,7645,"(Ruoholahti, Eteläinen suurpiiri, Helsinki, He...","(60.162925, 24.9114974, 0.0)","(60.162925,","24.9114974,",0.0)


In [16]:
hoods[['Latitude']] = hoods.Latitude.str.replace(',', '')
hoods[['Latitude']] = hoods.Latitude.str.replace('(', '')
hoods[['Longitude']] = hoods.Longitude.str.replace(',', '')

hoods.drop(columns =['point', 'GetOff'], inplace=True)

hoods[['Latitude']] = hoods.Latitude.astype(float)
hoods[['Longitude']] = hoods.Longitude.astype(float)

hoods.head()

Unnamed: 0,Neighborhood,Price,Location,Latitude,Longitude
0,Kaivopuisto,8376,"(Kaivopuisto, Eteläinen suurpiiri, Helsinki, H...",60.156843,24.956721
1,Kaartinkaupunki,8205,"(Kaartinkaupunki, Eteläinen suurpiiri, Helsink...",60.165214,24.947222
2,Punavuori,7813,"(Punavuori, Eteläinen suurpiiri, Helsinki, Hel...",60.161237,24.936505
3,Eira,7765,"(Eira, Eteläinen suurpiiri, Helsinki, Helsingi...",60.156191,24.938375
4,Ruoholahti,7645,"(Ruoholahti, Eteläinen suurpiiri, Helsinki, He...",60.162925,24.911497


### Install Folium to do some serious mapping

In [17]:
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-3.2.0               |           py36_0         770 KB  conda-forge
    certifi-2019.9.11          |           py36_0         147 KB  conda-forge
    ca-certificates-2019.9.11  |       hecc5488_0         144 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.3 MB

The following NEW packages will be 

### Get the coordinates of Helsinki and map all the neighborhoods

In [18]:
address = 'Helsinki'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Helsinki are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Helsinki are 60.1674086, 24.9425683.


In [19]:
map_hki = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, neighborhood in zip(hoods['Latitude'], hoods['Longitude'], hoods['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_hki)  
    
map_hki

### Setting up Foursquare and using it to find out what the neughborhoods are like.

In [20]:
CLIENT_ID = 'YOURFOURSQUAREID' # Foursquare ID
CLIENT_SECRET = 'YOURFOURSQUARESECRET' # Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [21]:
hoods.loc[0, 'Neighborhood']

'Kaivopuisto'

In [22]:
neighborhood_latitude = hoods.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = hoods.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = hoods.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Kaivopuisto are 60.1568425, 24.9567212.


In [23]:
radius = 500
LIMIT = 50

url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude, VERSION, radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?client_id=IRNOGRDYYJ11E54M55G1EFFTIOJ30SGHJCDIE4KMDKJHEHVH&client_secret=VLP3HC3BNNZH0BIPH0KFHXSFN30QZGFA2ZERZONEY3DYUEQM&ll=60.1568425,24.9567212&v=20180605&radius=500&limit=50'

In [24]:
import requests
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5d87875892e7a9002ce30c36'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Kaivopuisto',
  'headerFullLocation': 'Kaivopuisto, Helsinki',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 26,
  'suggestedBounds': {'ne': {'lat': 60.161342504500006,
    'lng': 24.965747234279576},
   'sw': {'lat': 60.1523424955, 'lng': 24.947695165720425}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4adcdb22f964a520826021e3',
       'name': 'Kaivopuisto / Brunnsparken (Kaivopuisto)',
       'location': {'lat': 60.15680978091396,
        'lng': 24.95673179626465,
        'labeledLatLngs': [{'label': 'display',
          'lat': 60.15680978091396

In [25]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [26]:
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Kaivopuisto / Brunnsparken (Kaivopuisto),Park,60.15681,24.956732
1,Kaivopuiston ranta,Waterfront,60.153937,24.955273
2,Mattolaituri,Wine Bar,60.153767,24.955871
3,Ullanlinnanmäki,Park,60.155718,24.955471
4,Helsingin Jäätelötehdas,Ice Cream Shop,60.155643,24.950891


In [27]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Hood', 
                  'Hood Latitude', 
                  'Hood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [28]:
dt_venues = getNearbyVenues(names=hoods['Neighborhood'],
                                   latitudes=hoods['Latitude'],
                                   longitudes=hoods['Longitude']
                                  )

Kaivopuisto
Kaartinkaupunki
Punavuori
Eira
Ruoholahti
Kruununhaka
Etu-Töölö
Keski-Töölö
Taka-Töölö
Meilahti
Sörnäinen
Etu-Vallila
Vattuniemi
Kallio
Katajanokka
Kalasatama
Lauttasaari
Munkkiniemi
Ruskeasuo
Toukola
Kulosaari
Vallila
Käpylä
Pikku Huopalahti
Pitäjänmäen teollisuusalue
Etelä-Haaga
Herttoniemi
Itä-Pasila
Länsi-Herttoniemi
Länsi-Pasila
Aurinkolahti
Viikki
Pajamäki
Kivihaka
Munkkivuori-Niemenmäki
Kuusisaari-Lehtisaari
Länsi-Pakila
Pohjois-Haaga
Metsälä-Etelä-Oulunkylä
Lassila
Vartioharju
Roihuvuori
Laajasalo
Oulunkylä-Patola
Jollas
Tapanila
Paloheinä
Tammisalo
Itä-Pakila
Reimarla
Maununneva
Maunula-Suursuo
Konala
Kannelmäki
Puotila
Etelä-Vuosaari
Pukinmäki-Savela
Etelä-Laajasalo
Koskela-Helsinki
Veräjämäki
Itäkeskus
Tapaninvainio
Myllypuro
Puotinharju
Malmi
Tuomarinkylä-Torpparinmäki
Puistola
Pohjois-Vuosaari
Malminkartano
Siltamäki
Pihlajamäki
Suurmetsä
Mellunkylä
Kontula
Jakomäki


In [29]:
print(dt_venues.shape)
dt_venues.head()

(1349, 7)


Unnamed: 0,Hood,Hood Latitude,Hood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Kaivopuisto,60.156843,24.956721,Kaivopuisto / Brunnsparken (Kaivopuisto),60.15681,24.956732,Park
1,Kaivopuisto,60.156843,24.956721,Kaivopuiston ranta,60.153937,24.955273,Waterfront
2,Kaivopuisto,60.156843,24.956721,Mattolaituri,60.153767,24.955871,Wine Bar
3,Kaivopuisto,60.156843,24.956721,Ullanlinnanmäki,60.155718,24.955471,Park
4,Kaivopuisto,60.156843,24.956721,Helsingin Jäätelötehdas,60.155643,24.950891,Ice Cream Shop


In [30]:
dt_venues.groupby('Hood').count()

Unnamed: 0_level_0,Hood Latitude,Hood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Hood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aurinkolahti,19,19,19,19,19,19
Eira,36,36,36,36,36,36
Etelä-Haaga,13,13,13,13,13,13
Etelä-Laajasalo,4,4,4,4,4,4
Etelä-Vuosaari,28,28,28,28,28,28
Etu-Töölö,39,39,39,39,39,39
Etu-Vallila,34,34,34,34,34,34
Herttoniemi,37,37,37,37,37,37
Itä-Pakila,8,8,8,8,8,8
Itä-Pasila,29,29,29,29,29,29


In [31]:
# one hot encoding
dt_onehot = pd.get_dummies(dt_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
dt_onehot['Hood'] = dt_venues['Hood'] 

# move neighborhood column to the first column
fixed_columns = [dt_onehot.columns[-1]] + list(dt_onehot.columns[:-1])
dt_onehot = dt_onehot[fixed_columns]

dt_onehot.head()

Unnamed: 0,Hood,ATM,African Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auditorium,...,Video Store,Vietnamese Restaurant,Warehouse Store,Waterfront,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,Kaivopuisto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Kaivopuisto,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,Kaivopuisto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,Kaivopuisto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Kaivopuisto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
dt_grouped = dt_onehot.groupby('Hood').mean().reset_index()
dt_grouped

Unnamed: 0,Hood,ATM,African Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auditorium,...,Video Store,Vietnamese Restaurant,Warehouse Store,Waterfront,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,Aurinkolahti,0.0,0.00000,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,Eira,0.0,0.00000,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.027778,0.027778,0.000000,0.000000,0.000000,0.000000,0.000000
2,Etelä-Haaga,0.0,0.00000,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,Etelä-Laajasalo,0.0,0.00000,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,Etelä-Vuosaari,0.0,0.00000,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,Etu-Töölö,0.0,0.00000,0.00,0.00,0.000000,0.000000,0.025641,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.025641,0.000000
6,Etu-Vallila,0.0,0.00000,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.029412,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.029412
7,Herttoniemi,0.0,0.00000,0.00,0.00,0.000000,0.027027,0.000000,0.000000,0.000000,...,0.027027,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.027027,0.000000
8,Itä-Pakila,0.0,0.00000,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,Itä-Pasila,0.0,0.00000,0.00,0.00,0.000000,0.000000,0.034483,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


### I want to know what the top 7 venues in each neighborhood are

In [33]:
num_top_venues = 7

for hood in dt_grouped['Hood']:
    print("----"+hood+"----")
    temp = dt_grouped[dt_grouped['Hood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Aurinkolahti----
                   venue  freq
0          Grocery Store  0.16
1               Bus Stop  0.11
2                   Park  0.11
3                 Resort  0.05
4     Salon / Barbershop  0.05
5  Sri Lankan Restaurant  0.05
6   Gym / Fitness Center  0.05


----Eira----
                     venue  freq
0                     Park  0.08
1  Scandinavian Restaurant  0.08
2        French Restaurant  0.06
3               Playground  0.06
4       Italian Restaurant  0.06
5            Boat or Ferry  0.06
6                     Café  0.06


----Etelä-Haaga----
                venue  freq
0  Chinese Restaurant  0.15
1           Cafeteria  0.08
2        Soccer Field  0.08
3                Café  0.08
4                Park  0.08
5         Gas Station  0.08
6   Indian Restaurant  0.08


----Etelä-Laajasalo----
                     venue  freq
0                 Bus Stop  0.50
1              Flower Shop  0.25
2               Playground  0.25
3       Persian Restaurant  0.00
4              

In [34]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [35]:
num_top_venues = 7

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Hood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Hood'] = dt_grouped['Hood']

for ind in np.arange(dt_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(dt_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Hood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue
0,Aurinkolahti,Grocery Store,Bus Stop,Park,Bridge,Gym / Fitness Center,Canal,Beach
1,Eira,Scandinavian Restaurant,Park,French Restaurant,Bakery,Boat or Ferry,Café,Italian Restaurant
2,Etelä-Haaga,Chinese Restaurant,Intersection,Grocery Store,Pizza Place,Park,Skate Park,Soccer Field
3,Etelä-Laajasalo,Bus Stop,Playground,Flower Shop,Zoo,Flea Market,Gastropub,Gas Station
4,Etelä-Vuosaari,Bus Stop,Pizza Place,Discount Store,Grocery Store,Recreation Center,Café,Cafeteria


### Now that we know what the neighborhoods are like, we can group the neighborhoods based on thei characteristics

In [36]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

dt_grouped_clustering = dt_grouped.drop('Hood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(dt_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:5] 

array([1, 1, 1, 0, 1], dtype=int32)

In [37]:
neighborhoods_venues_sorted.head(1)

Unnamed: 0,Hood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue
0,Aurinkolahti,Grocery Store,Bus Stop,Park,Bridge,Gym / Fitness Center,Canal,Beach


In [38]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

dt_merged = hoods

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
dt_merged = dt_merged.join(neighborhoods_venues_sorted.set_index('Hood'), on='Neighborhood')

dt_merged.head() # check the last columns!

Unnamed: 0,Neighborhood,Price,Location,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue
0,Kaivopuisto,8376,"(Kaivopuisto, Eteläinen suurpiiri, Helsinki, H...",60.156843,24.956721,1.0,Coffee Shop,Grocery Store,Ice Cream Shop,Park,Pier,Nightclub,Boat or Ferry
1,Kaartinkaupunki,8205,"(Kaartinkaupunki, Eteläinen suurpiiri, Helsink...",60.165214,24.947222,1.0,Scandinavian Restaurant,Cocktail Bar,Hotel,Restaurant,French Restaurant,Pizza Place,Music Venue
2,Punavuori,7813,"(Punavuori, Eteläinen suurpiiri, Helsinki, Hel...",60.161237,24.936505,1.0,Coffee Shop,Bakery,Park,Sushi Restaurant,Beer Bar,Japanese Restaurant,Wine Bar
3,Eira,7765,"(Eira, Eteläinen suurpiiri, Helsinki, Helsingi...",60.156191,24.938375,1.0,Scandinavian Restaurant,Park,French Restaurant,Bakery,Boat or Ferry,Café,Italian Restaurant
4,Ruoholahti,7645,"(Ruoholahti, Eteläinen suurpiiri, Helsinki, He...",60.162925,24.911497,1.0,Gym,Museum,Restaurant,Historic Site,Food Truck,Middle Eastern Restaurant,Sandwich Place


### Look for neighborhoods that didn't get labeled. As there is only one such neighborhood, it is easiest to just exclude it (Pihlajamäki) from further analysis

In [39]:
dt_merged.isnull().sum().sum()

16

In [40]:
nan_rows = dt_merged[dt_merged.isnull().T.any().T]
nan_rows

Unnamed: 0,Neighborhood,Price,Location,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue
35,Kuusisaari-Lehtisaari,4374,"(Lehtisaari, Bennäsvägen, Pedersöre, Pietarsaa...",63.608082,22.774918,,,,,,,,
70,Pihlajamäki,2748,"(Pihlajamäki, Kuusiokuntien seutukunta, Länsi-...",62.626649,23.294745,,,,,,,,


In [41]:
dt_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 13 columns):
Neighborhood             75 non-null object
Price                    75 non-null int64
Location                 75 non-null object
Latitude                 75 non-null float64
Longitude                75 non-null float64
Cluster Labels           73 non-null float64
1st Most Common Venue    73 non-null object
2nd Most Common Venue    73 non-null object
3rd Most Common Venue    73 non-null object
4th Most Common Venue    73 non-null object
5th Most Common Venue    73 non-null object
6th Most Common Venue    73 non-null object
7th Most Common Venue    73 non-null object
dtypes: float64(3), int64(1), object(9)
memory usage: 7.7+ KB


In [42]:
dt_merged.dropna(subset=["Cluster Labels"], axis=0, inplace=True)

In [43]:
dt_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73 entries, 0 to 74
Data columns (total 13 columns):
Neighborhood             73 non-null object
Price                    73 non-null int64
Location                 73 non-null object
Latitude                 73 non-null float64
Longitude                73 non-null float64
Cluster Labels           73 non-null float64
1st Most Common Venue    73 non-null object
2nd Most Common Venue    73 non-null object
3rd Most Common Venue    73 non-null object
4th Most Common Venue    73 non-null object
5th Most Common Venue    73 non-null object
6th Most Common Venue    73 non-null object
7th Most Common Venue    73 non-null object
dtypes: float64(3), int64(1), object(9)
memory usage: 8.0+ KB


In [44]:
dt_merged['Cluster Labels']=dt_merged['Cluster Labels'].astype(int)
dt_merged.head(1)

Unnamed: 0,Neighborhood,Price,Location,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue
0,Kaivopuisto,8376,"(Kaivopuisto, Eteläinen suurpiiri, Helsinki, H...",60.156843,24.956721,1,Coffee Shop,Grocery Store,Ice Cream Shop,Park,Pier,Nightclub,Boat or Ferry


### In the next cell I check each cluster one by one for their characteristics

In [45]:
dt_merged[dt_merged.values == 4]

Unnamed: 0,Neighborhood,Price,Location,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue
9,Meilahti,6469,"(Meilahti, Läntinen suurpiiri, Helsinki, Helsi...",60.191348,24.902664,4,Bus Stop,Café,Scandinavian Restaurant,Gym / Fitness Center,Bar,Tram Station,Pizza Place
16,Lauttasaari,5980,"(Lauttasaari, Eteläinen suurpiiri, Helsinki, H...",60.159369,24.875304,4,Bus Stop,Playground,Coffee Shop,Bar,Pizza Place,Supermarket,Sushi Restaurant
18,Ruskeasuo,5792,"(Ruskeasuo, Läntinen suurpiiri, Helsinki, Hels...",60.204803,24.905621,4,Bus Stop,Garden,Sports Club,Recreation Center,Himalayan Restaurant,Gym,Pharmacy
20,Kulosaari,5652,"(Kulosaari, Helsinki, Helsingin seutukunta, Uu...",60.185513,25.003549,4,Park,Bus Stop,Restaurant,Badminton Court,Grocery Store,Taxi Stand,Gym
22,Käpylä,5465,"(Käpylä, Helsinki, Helsingin seutukunta, Uusim...",60.215684,24.952786,4,Bus Stop,Grocery Store,Scenic Lookout,Flower Shop,Coffee Shop,Gym Pool,Thai Restaurant
26,Herttoniemi,5051,"(Herttoniemi, Helsinki, Helsingin seutukunta, ...",60.195525,25.029063,4,Bus Stop,Gym / Fitness Center,Supermarket,Convenience Store,Pharmacy,Flower Shop,Gym
28,Länsi-Herttoniemi,5006,"(Länsi-Herttoniemi, Helsinki, Helsingin seutuk...",60.209119,25.040027,4,Bus Stop,Scenic Lookout,Himalayan Restaurant,Recreation Center,Flea Market,Pizza Place,Food Truck
29,Länsi-Pasila,4907,"(Länsi-Pasila, Pasila, Helsinki, Helsingin seu...",60.198701,24.924254,4,Bus Stop,Gym / Fitness Center,Plaza,Dog Run,Café,Dance Studio,Massage Studio
36,Länsi-Pakila,4187,"(Länsi-Pakila, Helsinki, Helsingin seutukunta,...",60.243538,24.926913,4,Bus Stop,Soccer Field,Pizza Place,Hockey Rink,Flower Shop,Zoo,Flea Market
37,Pohjois-Haaga,4039,"(Pohjois-Haaga, Kuparitie, Lassila, Haaga, Län...",60.230377,24.883545,4,Bus Stop,Platform,Sporting Goods Shop,Cafeteria,Furniture / Home Store,Chinese Restaurant,Gym


### The clusters are as follows:

#### Cluster 0: Bus stops as the most common venue.
#### Cluster 1: LIVING. Restaurants, Coffee shops, Parks, etc.
#### Cluster 2: Only Kuusisaari-Lehtisaari (Grocery Store,	Zoo,	Fast Food Restaurant,	Garden,	Furniture / Home Store,	French Restaurant,	Forest)
#### Cluster 3: Only Tapanila (Sports Club,	Zoo,	Fast Food Restaurant,	Garden,	Furniture / Home Store,	French Restaurant,	Forest)
#### Cluster 4: Bus stops again

#### This means I will focus on Cluster 1 for the remainder of this project because that is where downtown Helsinki is.

In [46]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(dt_merged['Latitude'], dt_merged['Longitude'], dt_merged['Neighborhood'], dt_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ', Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [47]:
placetobe = dt_merged[dt_merged.values == 1]
placetobe.head()

Unnamed: 0,Neighborhood,Price,Location,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue
0,Kaivopuisto,8376,"(Kaivopuisto, Eteläinen suurpiiri, Helsinki, H...",60.156843,24.956721,1,Coffee Shop,Grocery Store,Ice Cream Shop,Park,Pier,Nightclub,Boat or Ferry
1,Kaartinkaupunki,8205,"(Kaartinkaupunki, Eteläinen suurpiiri, Helsink...",60.165214,24.947222,1,Scandinavian Restaurant,Cocktail Bar,Hotel,Restaurant,French Restaurant,Pizza Place,Music Venue
2,Punavuori,7813,"(Punavuori, Eteläinen suurpiiri, Helsinki, Hel...",60.161237,24.936505,1,Coffee Shop,Bakery,Park,Sushi Restaurant,Beer Bar,Japanese Restaurant,Wine Bar
3,Eira,7765,"(Eira, Eteläinen suurpiiri, Helsinki, Helsingi...",60.156191,24.938375,1,Scandinavian Restaurant,Park,French Restaurant,Bakery,Boat or Ferry,Café,Italian Restaurant
4,Ruoholahti,7645,"(Ruoholahti, Eteläinen suurpiiri, Helsinki, He...",60.162925,24.911497,1,Gym,Museum,Restaurant,Historic Site,Food Truck,Middle Eastern Restaurant,Sandwich Place


### Mapping only Cluster 1 with price data

In [48]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map

map_prices = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster, hinta in zip(placetobe['Latitude'], placetobe['Longitude'], placetobe['Neighborhood'], placetobe['Cluster Labels'], placetobe['Price']):
    label = folium.Popup(str(poi) + ', ' + str(hinta) + ' €/m2', parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_prices)
       
map_prices

In [49]:
placetobe.head()

Unnamed: 0,Neighborhood,Price,Location,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue
0,Kaivopuisto,8376,"(Kaivopuisto, Eteläinen suurpiiri, Helsinki, H...",60.156843,24.956721,1,Coffee Shop,Grocery Store,Ice Cream Shop,Park,Pier,Nightclub,Boat or Ferry
1,Kaartinkaupunki,8205,"(Kaartinkaupunki, Eteläinen suurpiiri, Helsink...",60.165214,24.947222,1,Scandinavian Restaurant,Cocktail Bar,Hotel,Restaurant,French Restaurant,Pizza Place,Music Venue
2,Punavuori,7813,"(Punavuori, Eteläinen suurpiiri, Helsinki, Hel...",60.161237,24.936505,1,Coffee Shop,Bakery,Park,Sushi Restaurant,Beer Bar,Japanese Restaurant,Wine Bar
3,Eira,7765,"(Eira, Eteläinen suurpiiri, Helsinki, Helsingi...",60.156191,24.938375,1,Scandinavian Restaurant,Park,French Restaurant,Bakery,Boat or Ferry,Café,Italian Restaurant
4,Ruoholahti,7645,"(Ruoholahti, Eteläinen suurpiiri, Helsinki, He...",60.162925,24.911497,1,Gym,Museum,Restaurant,Historic Site,Food Truck,Middle Eastern Restaurant,Sandwich Place


### I'm curious about the average price per square meter of Cluster 1. I will also assign all the remaining neighborhoods into bins based on their price

In [50]:
placetobe["Price"].mean()

5299.861111111111

In [51]:
placetobe['marker_color'] = pd.cut(placetobe['Price'], bins=4, 
                              labels=[1, 2, 3, 4])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [52]:
placetobe.head()

Unnamed: 0,Neighborhood,Price,Location,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,marker_color
0,Kaivopuisto,8376,"(Kaivopuisto, Eteläinen suurpiiri, Helsinki, H...",60.156843,24.956721,1,Coffee Shop,Grocery Store,Ice Cream Shop,Park,Pier,Nightclub,Boat or Ferry,4
1,Kaartinkaupunki,8205,"(Kaartinkaupunki, Eteläinen suurpiiri, Helsink...",60.165214,24.947222,1,Scandinavian Restaurant,Cocktail Bar,Hotel,Restaurant,French Restaurant,Pizza Place,Music Venue,4
2,Punavuori,7813,"(Punavuori, Eteläinen suurpiiri, Helsinki, Hel...",60.161237,24.936505,1,Coffee Shop,Bakery,Park,Sushi Restaurant,Beer Bar,Japanese Restaurant,Wine Bar,4
3,Eira,7765,"(Eira, Eteläinen suurpiiri, Helsinki, Helsingi...",60.156191,24.938375,1,Scandinavian Restaurant,Park,French Restaurant,Bakery,Boat or Ferry,Café,Italian Restaurant,4
4,Ruoholahti,7645,"(Ruoholahti, Eteläinen suurpiiri, Helsinki, He...",60.162925,24.911497,1,Gym,Museum,Restaurant,Historic Site,Food Truck,Middle Eastern Restaurant,Sandwich Place,4


### Below you can see the final map with different price ranges

In [53]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map

map_prices = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.hot(np.linspace(0, 1, len(ys)))
hot = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, markercolor, hinta in zip(placetobe['Latitude'], placetobe['Longitude'], placetobe['Neighborhood'], placetobe['marker_color'], placetobe['Price']):
    label = folium.Popup(str(poi) + ', ' + str(hinta) + ' €/m2', parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color='black',
#        color=placetobe['marker_color'],
        fill=True,
        fill_color=hot[markercolor-3],
        fill_opacity=0.7).add_to(map_prices)
       
map_prices