# 1. Built dataframe of the postal code of each neighborhood along with the borough name and neighborhood name

In [1]:
import pandas as pd

In [2]:
# Manually create table of Borough for Warsaw since information not readily available, reference https://en.wikipedia.org/wiki/Category:Districts_of_Warsaw

warsaw_boroughs = {'Borough': ['Bemowo','Białołęka','Bielany','Mokotów','Ochota','Praga-Północ', 'Praga-Południe', 'Rembertów', 'Śródmieście', 'Targówek', 'Ursus', 'Ursynów', 'Wawer', 'Wesoła', 'Wilanów','Włochy','Wola','Żoliborz']}

df = pd.DataFrame(warsaw_boroughs, columns = ['Borough'])
df

Unnamed: 0,Borough
0,Bemowo
1,Białołęka
2,Bielany
3,Mokotów
4,Ochota
5,Praga-Północ
6,Praga-Południe
7,Rembertów
8,Śródmieście
9,Targówek


# 2. Get the latitude and the longitude coordinates of each neighborhood

In [3]:
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [4]:
# Test on one district to see if geolocater works
address = 'Bemowo, Warsaw'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

52.2389738 20.9132881


In [5]:
# Create loop to append latitude and longitude to table
lat_list = []
long_list= []

for ind in df.index:
    address = df['Borough'][ind],', Warsaw'

    geolocator = Nominatim(user_agent="foursquare_agent")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    #print(latitude, longitude)
    lat_list.append(latitude)
    long_list.append(longitude)

In [6]:
df['Latitude'] = lat_list
df['Longitude'] = long_list
warsaw_data = df
warsaw_data

Unnamed: 0,Borough,Latitude,Longitude
0,Bemowo,52.238974,20.913288
1,Białołęka,52.319665,21.021177
2,Bielany,52.285043,20.943949
3,Mokotów,52.193987,21.045781
4,Ochota,52.212225,20.97263
5,Praga-Północ,52.264884,21.027344
6,Praga-Południe,52.237396,21.071258
7,Rembertów,52.261415,21.162819
8,Śródmieście,52.23281,21.019067
9,Targówek,52.275192,21.058085


# 3. Explore and cluster the boroughs in Warsaw

Download all the dependencies that we will need.

In [7]:
conda install -c conda-forge folium

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [8]:
import numpy as np # library to handle data in a vectorized manner

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

import requests # library to handle requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


#### Use geopy library to get the latitude and longitude values of Warsaw.

In [9]:
address = 'Warsaw, Poland'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinate of Warsaw are {}, {}.'.format(latitude, longitude))

The geographical coordinate of Warsaw are 52.2319581, 21.0067249.


#### Create a map of Warsaw with districts superimposed on top

In [10]:
# create map of Warsaw using latitude and longitude values
map_warsaw = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough in zip(warsaw_data['Latitude'], warsaw_data['Longitude'], warsaw_data['Borough']):
    label = '{}'.format(borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_warsaw)
    
map_warsaw

In [11]:
# create map of Warsaw using latitude and longitude values
map_warsaw = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough in zip(warsaw_data['Latitude'], warsaw_data['Longitude'], warsaw_data['Borough']):
    label = '{}'.format(borough)
    label = folium.Popup(label, parse_html=True)
    folium.Circle(
        [lat, lng],
        radius=2000,
        ).add_to(map_warsaw)

map_warsaw

#### Define Foursquare Credentials and Version

In [12]:
CLIENT_ID = 'RZ3ECL15BKXU1CIGMALQQBMEVHPRH0HKM1TRTHKA0YZ0JMUH' # your Foursquare ID
CLIENT_SECRET = '2NHUCTUY5T5SV40U5PPX02QCLWHJMQLZET3Y1VM41WIVBW25' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: RZ3ECL15BKXU1CIGMALQQBMEVHPRH0HKM1TRTHKA0YZ0JMUH
CLIENT_SECRET:2NHUCTUY5T5SV40U5PPX02QCLWHJMQLZET3Y1VM41WIVBW25


#### Explore one of my districts in my dataframe.

Get the district's name.

In [13]:
warsaw_data.loc[0, 'Borough']

'Bemowo'

Get the neighborhood's latitude and longitude values.

In [14]:
borough_latitude = warsaw_data.loc[0, 'Latitude'] # neighborhood latitude value
borough_longitude = warsaw_data.loc[0, 'Longitude'] # neighborhood longitude value

borough_name = warsaw_data.loc[0, 'Borough'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(borough_name, 
                                                               borough_latitude, 
                                                               borough_longitude))

Latitude and longitude values of Bemowo are 52.2389738, 20.9132881.


#### Now, let's get the top 100 venues that are in Regent Park, Harbourfront within a radius of 500 meters.

In [15]:
LIMIT = 200 # limit of number of venues returned by Foursquare API
radius = 2000 # define radius
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    borough_latitude, 
    borough_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=RZ3ECL15BKXU1CIGMALQQBMEVHPRH0HKM1TRTHKA0YZ0JMUH&client_secret=2NHUCTUY5T5SV40U5PPX02QCLWHJMQLZET3Y1VM41WIVBW25&v=20180605&ll=52.2389738,20.9132881&radius=2000&limit=200'

Send the GET request and examine the resutls

In [16]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5f241d58bca31a705eaff2f7'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Jelonki Północne',
  'headerFullLocation': 'Jelonki Północne, Warsaw',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 52,
  'suggestedBounds': {'ne': {'lat': 52.25697381800001,
    'lng': 20.94262726844129},
   'sw': {'lat': 52.22097378199998, 'lng': 20.88394893155871}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4f119381e4b01eefee464165',
       'name': 'Zdrofit Bemowo',
       'location': {'address': 'Dywizjonu 303 129, 01-001 Warszawa',
        'lat': 52.245641222846984,
        'lng': 20.906136121137468,
        'labeledLatLngs': [{'label': 'di

Borrow the **get_category_type** function from the Foursquare lab.

In [17]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Clean the json and structure it into a *pandas* dataframe.

In [18]:
venues = results['response']['groups'][0]['items']
nearby_venues = pd.json_normalize(venues) # flatten JSON
nearby_venues.head()

Unnamed: 0,referralId,reasons.count,reasons.items,venue.id,venue.name,venue.location.address,venue.location.lat,venue.location.lng,venue.location.labeledLatLngs,venue.location.distance,venue.location.cc,venue.location.city,venue.location.state,venue.location.country,venue.location.formattedAddress,venue.categories,venue.photos.count,venue.photos.groups,venue.location.crossStreet,venue.location.postalCode,venue.location.neighborhood
0,e-0-4f119381e4b01eefee464165-0,0,"[{'summary': 'This spot is popular', 'type': '...",4f119381e4b01eefee464165,Zdrofit Bemowo,"Dywizjonu 303 129, 01-001 Warszawa",52.245641,20.906136,"[{'label': 'display', 'lat': 52.24564122284698...",887,PL,Warszawa,Województwo mazowieckie,Polska,"[Dywizjonu 303 129, 01-001 Warszawa, Warszawa,...","[{'id': '4bf58dd8d48988d175941735', 'name': 'G...",0,[],,,
1,e-0-4b9d1702f964a520ad8e36e3-1,0,"[{'summary': 'This spot is popular', 'type': '...",4b9d1702f964a520ad8e36e3,Park Gorczewska,Park Gorczewska,52.233976,20.905266,"[{'label': 'display', 'lat': 52.23397635455908...",780,PL,Warszawa,Województwo mazowieckie,Polska,"[Park Gorczewska, Warszawa, Polska]","[{'id': '4bf58dd8d48988d163941735', 'name': 'P...",0,[],,,
2,e-0-5038b770e4b0307ba345612f-2,0,"[{'summary': 'This spot is popular', 'type': '...",5038b770e4b0307ba345612f,CieKawa,Powstańców Śląskich 80D,52.242059,20.913374,"[{'label': 'display', 'lat': 52.24205920976705...",343,PL,Warszawa,Województwo mazowieckie,Polska,"[Powstańców Śląskich 80D, Warszawa, Polska]","[{'id': '4bf58dd8d48988d16d941735', 'name': 'C...",0,[],,,
3,e-0-4c949cb003413704477a73ef-3,0,"[{'summary': 'This spot is popular', 'type': '...",4c949cb003413704477a73ef,Lidl,Powstańców Śląskich,52.239942,20.913861,"[{'label': 'display', 'lat': 52.23994195303825...",114,PL,Warszawa,mazwowieckie,Polska,"[Powstańców Śląskich (Górczewska), Warszawa, P...","[{'id': '4bf58dd8d48988d1f9941735', 'name': 'F...",0,[],Górczewska,,
4,e-0-4b799ec6f964a5200d062fe3-4,0,"[{'summary': 'This spot is popular', 'type': '...",4b799ec6f964a5200d062fe3,La Fiaccola,Gorczewska 200,52.240811,20.912208,"[{'label': 'display', 'lat': 52.24081101073666...",217,PL,Warszawa,Województwo mazowieckie,Polska,"[Gorczewska 200, Warszawa, Polska]","[{'id': '4bf58dd8d48988d110941735', 'name': 'I...",0,[],,,


In [19]:
# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Zdrofit Bemowo,Gym / Fitness Center,52.245641,20.906136
1,Park Gorczewska,Park,52.233976,20.905266
2,CieKawa,Café,52.242059,20.913374
3,Lidl,Food & Drink Shop,52.239942,20.913861
4,La Fiaccola,Italian Restaurant,52.240811,20.912208


Check number of venues returned by Foursquare.

In [20]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

52 venues were returned by Foursquare.


#### Borrow function to repeat the same process to all the districts in Warsaw

In [21]:
def getNearbyVenues(names, latitudes, longitudes, radius=2000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Borough', 
                  'Borough Latitude', 
                  'Borough Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Run the function on each neighborhood and create a new dataframe called *warsaw_venues*.

In [22]:
warsaw_venues = getNearbyVenues(names=warsaw_data['Borough'],
                                   latitudes=warsaw_data['Latitude'],
                                   longitudes=warsaw_data['Longitude']
                                  )

Bemowo
Białołęka
Bielany
Mokotów
Ochota
Praga-Północ
Praga-Południe
Rembertów
Śródmieście
Targówek
Ursus
Ursynów
Wawer
Wesoła
Wilanów
Włochy
Wola
Żoliborz


Analyze each neighborhood

In [23]:
# one hot encoding
warsaw_onehot = pd.get_dummies(warsaw_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
warsaw_onehot['Borough'] = warsaw_venues['Borough'] 

# move neighborhood column to the first column
fixed_columns = [warsaw_onehot.columns[-1]] + list(warsaw_onehot.columns[:-1])
warsaw_onehot = warsaw_onehot[fixed_columns]

warsaw_onehot.head()

Unnamed: 0,Borough,Accessories Store,Airport Lounge,American Restaurant,Aquarium,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bar,Beach,Beach Bar,Bed & Breakfast,Beer Bar,Bistro,Board Shop,Bookstore,Boutique,Breakfast Spot,Burger Joint,Burrito Place,Bus Line,Bus Station,Bus Stop,Business Service,Butcher,Café,Cantonese Restaurant,Caribbean Restaurant,Caucasian Restaurant,Cemetery,Chinese Restaurant,Chocolate Shop,Circus,Climbing Gym,Clothing Store,Cocktail Bar,Coffee Shop,College Academic Building,College Library,Comedy Club,Comfort Food Restaurant,Concert Hall,Convenience Store,Cosmetics Shop,Creperie,Cupcake Shop,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dive Bar,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Event Space,Exhibit,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Field,Flea Market,Flower Shop,Food,Food & Drink Shop,Forest,Frozen Yogurt Shop,Furniture / Home Store,Garden,Gas Station,Gastropub,General Entertainment,German Restaurant,Go Kart Track,Golf Course,Greek Restaurant,Grocery Store,Gun Range,Gym,Gym / Fitness Center,Gym Pool,Hardware Store,Health & Beauty Service,Historic Site,History Museum,Hobby Shop,Hostel,Hotel,Hotel Bar,Hungarian Restaurant,Ice Cream Shop,Indian Restaurant,Indie Movie Theater,Israeli Restaurant,Italian Restaurant,Japanese Restaurant,Jewelry Store,Juice Bar,Kebab Restaurant,Korean Restaurant,Kosher Restaurant,Lake,Laser Tag,Light Rail Station,Liquor Store,Lounge,Market,Martial Arts Dojo,Mediterranean Restaurant,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Modern European Restaurant,Monument / Landmark,Moroccan Restaurant,Motel,Mountain,Movie Theater,Multiplex,Museum,Music Store,Music Venue,Neighborhood,New American Restaurant,Night Market,Nightclub,Noodle House,Optical Shop,Other Great Outdoors,Other Nightlife,Outdoors & Recreation,Outlet Mall,Outlet Store,Paintball Field,Park,Pedestrian Plaza,Performing Arts Venue,Pharmacy,Pizza Place,Playground,Plaza,Polish Restaurant,Pool,Pool Hall,Pub,Public Art,Racetrack,Ramen Restaurant,Recreation Center,Rest Area,Restaurant,Road,Rock Climbing Spot,Rock Club,Russian Restaurant,Salad Place,Sandwich Place,Scandinavian Restaurant,Scenic Lookout,Science Museum,Seafood Restaurant,Shopping Mall,Skating Rink,Ski Area,Smoke Shop,Snack Place,Soccer Field,Soccer Stadium,Spa,Spanish Restaurant,Sporting Goods Shop,Steakhouse,Street Food Gathering,Supermarket,Sushi Restaurant,Tapas Restaurant,Tea Room,Tennis Court,Tennis Stadium,Thai Restaurant,Theater,Theme Park,Tiki Bar,Toy / Game Store,Train Station,Tram Station,Turkish Restaurant,Udon Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Water Park,Wine Bar,Wine Shop,Yoga Studio,Zoo,Zoo Exhibit
0,Bemowo,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Bemowo,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Bemowo,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Bemowo,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Bemowo,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [24]:
list(warsaw_onehot.columns)

['Borough',
 'Accessories Store',
 'Airport Lounge',
 'American Restaurant',
 'Aquarium',
 'Arcade',
 'Art Gallery',
 'Art Museum',
 'Arts & Crafts Store',
 'Asian Restaurant',
 'Athletics & Sports',
 'Bakery',
 'Bar',
 'Beach',
 'Beach Bar',
 'Bed & Breakfast',
 'Beer Bar',
 'Bistro',
 'Board Shop',
 'Bookstore',
 'Boutique',
 'Breakfast Spot',
 'Burger Joint',
 'Burrito Place',
 'Bus Line',
 'Bus Station',
 'Bus Stop',
 'Business Service',
 'Butcher',
 'Café',
 'Cantonese Restaurant',
 'Caribbean Restaurant',
 'Caucasian Restaurant',
 'Cemetery',
 'Chinese Restaurant',
 'Chocolate Shop',
 'Circus',
 'Climbing Gym',
 'Clothing Store',
 'Cocktail Bar',
 'Coffee Shop',
 'College Academic Building',
 'College Library',
 'Comedy Club',
 'Comfort Food Restaurant',
 'Concert Hall',
 'Convenience Store',
 'Cosmetics Shop',
 'Creperie',
 'Cupcake Shop',
 'Dance Studio',
 'Deli / Bodega',
 'Department Store',
 'Dessert Shop',
 'Dim Sum Restaurant',
 'Diner',
 'Discount Store',
 'Dive Bar',
 'D

In [25]:
# List out categories that impact housing decision
important_categories = ['Borough', 
                        'Bakery', 
                        'Bus Line',
                        'Bus Station', 
                        'Bus Stop', 
                        'Beach', 
                        'Convenience Store', 
                        'Deli / Bodega',  
                        'Department Store', 
                        'Drugstore', 
                        'Farmers Market', 
                        'Flower Shop', 
                        'Gas Station', 
                        'Garden', 
                        'Grocery Store', 
                        'Gym',
                        'Gym / Fitness Center',
                        'Gym Pool', 
                        'Ice Cream Shop', 
                        'Lake', 
                        'Market', 
                        'Metro Station', 
                        'Movie Theater', 
                        'Multiplex', 
                        'Outlet Mall', 
                        'Outlet Store', 
                        'Park', 
                        'Outdoors & Recreation', 
                        'Pharmacy', 
                        'Scenic Lookout', 
                        'Shopping Mall', 
                        'Supermarket', 
                        'Tennis Court', 
                        'Tennis Stadium', 
                        'Train Station', 
                        'Tram Station']
print(*important_categories, sep = ", ")

Borough, Bakery, Bus Line, Bus Station, Bus Stop, Beach, Convenience Store, Deli / Bodega, Department Store, Drugstore, Farmers Market, Flower Shop, Gas Station, Garden, Grocery Store, Gym, Gym / Fitness Center, Gym Pool, Ice Cream Shop, Lake, Market, Metro Station, Movie Theater, Multiplex, Outlet Mall, Outlet Store, Park, Outdoors & Recreation, Pharmacy, Scenic Lookout, Shopping Mall, Supermarket, Tennis Court, Tennis Stadium, Train Station, Tram Station


In [26]:
# Filtering only important venue categories that impact housing decision
warsaw_onehot_filtered = warsaw_onehot[important_categories]
warsaw_onehot_filtered.head(5)

Unnamed: 0,Borough,Bakery,Bus Line,Bus Station,Bus Stop,Beach,Convenience Store,Deli / Bodega,Department Store,Drugstore,Farmers Market,Flower Shop,Gas Station,Garden,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Ice Cream Shop,Lake,Market,Metro Station,Movie Theater,Multiplex,Outlet Mall,Outlet Store,Park,Outdoors & Recreation,Pharmacy,Scenic Lookout,Shopping Mall,Supermarket,Tennis Court,Tennis Stadium,Train Station,Tram Station
0,Bemowo,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Bemowo,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,Bemowo,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Bemowo,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Bemowo,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [27]:
warsaw_onehot_filtered['Gym Amenities'] = warsaw_onehot_filtered.loc[:,"Gym"] + warsaw_onehot_filtered.loc[:,"Gym / Fitness Center"] + warsaw_onehot_filtered.loc[:,"Gym Pool"]
warsaw_onehot_filtered['Supermarket Amenities'] = warsaw_onehot_filtered.loc[:,"Grocery Store"] + warsaw_onehot_filtered.loc[:,"Supermarket"] + warsaw_onehot_filtered.loc[:,"Market"]
warsaw_onehot_filtered['Tennis Amenities'] = warsaw_onehot_filtered.loc[:,"Tennis Court"] + warsaw_onehot_filtered.loc[:,"Tennis Stadium"]
warsaw_onehot_filtered['Public Transportation'] = warsaw_onehot_filtered.loc[:,"Train Station"] + warsaw_onehot_filtered.loc[:,"Tram Station"] + warsaw_onehot_filtered.loc[:,"Bus Line"] + warsaw_onehot_filtered.loc[:,"Bus Stop"] + warsaw_onehot_filtered.loc[:,"Bus Station"] + warsaw_onehot_filtered.loc[:,"Metro Station"]
warsaw_onehot_filtered['Nature'] = warsaw_onehot_filtered.loc[:,"Park"] + warsaw_onehot_filtered.loc[:,"Garden"] + warsaw_onehot_filtered.loc[:,"Lake"] + warsaw_onehot_filtered.loc[:,"Outdoors & Recreation"] + warsaw_onehot_filtered.loc[:,"Scenic Lookout"] 
warsaw_onehot_filtered['Cinema'] = warsaw_onehot_filtered.loc[:,"Movie Theater"] + warsaw_onehot_filtered.loc[:,"Multiplex"]
warsaw_onehot_filtered['Shopping'] = warsaw_onehot_filtered.loc[:,"Outlet Mall"] + warsaw_onehot_filtered.loc[:,"Outlet Store"] + warsaw_onehot_filtered.loc[:,"Shopping Mall"] + warsaw_onehot_filtered.loc[:,"Department Store"]
warsaw_onehot_filtered['Pharmacy Amenities'] = warsaw_onehot_filtered.loc[:,"Drugstore"] + warsaw_onehot_filtered.loc[:,"Pharmacy"]
warsaw_onehot_filtered['Bakery Amenities'] = warsaw_onehot_filtered.loc[:,"Bakery"] + warsaw_onehot_filtered.loc[:,"Deli / Bodega"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  warsaw_onehot_filtered['Gym Amenities'] = warsaw_onehot_filtered.loc[:,"Gym"] + warsaw_onehot_filtered.loc[:,"Gym / Fitness Center"] + warsaw_onehot_filtered.loc[:,"Gym Pool"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  warsaw_onehot_filtered['Supermarket Amenities'] = warsaw_onehot_filtered.loc[:,"Grocery Store"] + warsaw_onehot_filtered.loc[:,"Supermarket"] + warsaw_onehot_filtered.loc[:,"Market"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = va

In [28]:
warsaw_onehot_filtered_drop = warsaw_onehot_filtered.drop(columns=['Gym', 
                                     'Gym / Fitness Center',
                                     'Gym Pool',
                                     'Grocery Store',
                                     'Supermarket',
                                     'Market',
                                     'Tennis Court',
                                     'Tennis Stadium',
                                     'Train Station',
                                     'Tram Station',
                                     'Bus Station',
                                     'Bus Line',
                                     'Bus Stop',
                                     'Metro Station',
                                     'Park',
                                     'Garden',
                                     'Scenic Lookout',
                                     'Movie Theater',
                                     'Multiplex',
                                     'Outlet Mall',
                                     'Outlet Store',
                                     'Shopping Mall',
                                     'Department Store',
                                     'Drugstore',
                                     'Pharmacy',
                                     'Bakery',
                                     'Deli / Bodega',
                                    ])

Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [29]:
warsaw_grouped = warsaw_onehot_filtered_drop.groupby('Borough').mean().reset_index()
warsaw_grouped.head()

Unnamed: 0,Borough,Beach,Convenience Store,Farmers Market,Flower Shop,Gas Station,Ice Cream Shop,Lake,Outdoors & Recreation,Gym Amenities,Supermarket Amenities,Tennis Amenities,Public Transportation,Nature,Cinema,Shopping,Pharmacy Amenities,Bakery Amenities
0,Bemowo,0.0,0.0,0.0,0.0,0.019231,0.019231,0.0,0.0,0.057692,0.134615,0.0,0.057692,0.019231,0.019231,0.076923,0.019231,0.0
1,Białołęka,0.0,0.0,0.083333,0.083333,0.0,0.0,0.0,0.083333,0.0,0.0,0.083333,0.083333,0.083333,0.083333,0.0,0.0,0.0
2,Bielany,0.0,0.0,0.0,0.0,0.0,0.015385,0.0,0.0,0.107692,0.123077,0.0,0.092308,0.061538,0.0,0.015385,0.030769,0.015385
3,Mokotów,0.0,0.02,0.0,0.0,0.0,0.02,0.01,0.0,0.08,0.07,0.01,0.03,0.08,0.01,0.0,0.0,0.01
4,Ochota,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.08,0.01,0.01,0.01,0.07,0.0,0.0,0.01,0.01


Borrow function to sort the venues in descending order.

In [30]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Create the new dataframe and display the top 10 venues for each district.

In [31]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Borough']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
districts_venues_sorted = pd.DataFrame(columns=columns)
districts_venues_sorted['Borough'] = warsaw_grouped['Borough']

for ind in np.arange(warsaw_grouped.shape[0]):
   districts_venues_sorted.iloc[ind, 1:] = return_most_common_venues(warsaw_grouped.iloc[ind, :], num_top_venues)

districts_venues_sorted.head()

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Bemowo,Supermarket Amenities,Shopping,Gym Amenities,Public Transportation,Gas Station
1,Białołęka,Cinema,Nature,Public Transportation,Tennis Amenities,Farmers Market
2,Bielany,Supermarket Amenities,Gym Amenities,Public Transportation,Nature,Pharmacy Amenities
3,Mokotów,Gym Amenities,Nature,Supermarket Amenities,Public Transportation,Convenience Store
4,Ochota,Gym Amenities,Nature,Supermarket Amenities,Farmers Market,Ice Cream Shop


## Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [32]:
# set number of clusters
kclusters = 4

warsaw_grouped_clustering = warsaw_grouped.drop('Borough', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(warsaw_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 3, 1, 0, 0, 0, 0, 1, 1, 1])

In [33]:
warsaw_grouped_clustering.head()

Unnamed: 0,Beach,Convenience Store,Farmers Market,Flower Shop,Gas Station,Ice Cream Shop,Lake,Outdoors & Recreation,Gym Amenities,Supermarket Amenities,Tennis Amenities,Public Transportation,Nature,Cinema,Shopping,Pharmacy Amenities,Bakery Amenities
0,0.0,0.0,0.0,0.0,0.019231,0.019231,0.0,0.0,0.057692,0.134615,0.0,0.057692,0.019231,0.019231,0.076923,0.019231,0.0
1,0.0,0.0,0.083333,0.083333,0.0,0.0,0.0,0.083333,0.0,0.0,0.083333,0.083333,0.083333,0.083333,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.015385,0.0,0.0,0.107692,0.123077,0.0,0.092308,0.061538,0.0,0.015385,0.030769,0.015385
3,0.0,0.02,0.0,0.0,0.0,0.02,0.01,0.0,0.08,0.07,0.01,0.03,0.08,0.01,0.0,0.0,0.01
4,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.08,0.01,0.01,0.01,0.07,0.0,0.0,0.01,0.01


Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [34]:
# add clustering labels
districts_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

warsaw_merged = warsaw_data

# merge warsaw_grouped with warsaw_data to add latitude/longitude for each neighborhood
warsaw_merged = warsaw_merged.join(districts_venues_sorted.set_index('Borough'), on='Borough')

warsaw_merged.head() # check the last columns!

Unnamed: 0,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Bemowo,52.238974,20.913288,1,Supermarket Amenities,Shopping,Gym Amenities,Public Transportation,Gas Station
1,Białołęka,52.319665,21.021177,3,Cinema,Nature,Public Transportation,Tennis Amenities,Farmers Market
2,Bielany,52.285043,20.943949,1,Supermarket Amenities,Gym Amenities,Public Transportation,Nature,Pharmacy Amenities
3,Mokotów,52.193987,21.045781,0,Gym Amenities,Nature,Supermarket Amenities,Public Transportation,Convenience Store
4,Ochota,52.212225,20.97263,0,Gym Amenities,Nature,Supermarket Amenities,Farmers Market,Ice Cream Shop


Visualize the resulting clusters

In [35]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.jet(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(warsaw_merged['Latitude'], warsaw_merged['Longitude'], warsaw_merged['Borough'], warsaw_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine Clusters

Cluster 1

In [36]:
warsaw_merged.loc[warsaw_merged['Cluster Labels'] == 0, warsaw_merged.columns[[0] + list(range(4, warsaw_merged.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
3,Mokotów,Gym Amenities,Nature,Supermarket Amenities,Public Transportation,Convenience Store
4,Ochota,Gym Amenities,Nature,Supermarket Amenities,Farmers Market,Ice Cream Shop
5,Praga-Północ,Nature,Ice Cream Shop,Gym Amenities,Supermarket Amenities,Convenience Store
6,Praga-Południe,Nature,Gym Amenities,Ice Cream Shop,Supermarket Amenities,Beach
8,Śródmieście,Nature,Bakery Amenities,Ice Cream Shop,Gym Amenities,Shopping
11,Ursynów,Supermarket Amenities,Convenience Store,Gym Amenities,Nature,Tennis Amenities
16,Wola,Supermarket Amenities,Gym Amenities,Nature,Shopping,Public Transportation
17,Żoliborz,Gym Amenities,Nature,Ice Cream Shop,Bakery Amenities,Cinema


Cluster 2

In [37]:
warsaw_merged.loc[warsaw_merged['Cluster Labels'] == 1, warsaw_merged.columns[[0] + list(range(4, warsaw_merged.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Bemowo,Supermarket Amenities,Shopping,Gym Amenities,Public Transportation,Gas Station
2,Bielany,Supermarket Amenities,Gym Amenities,Public Transportation,Nature,Pharmacy Amenities
7,Rembertów,Public Transportation,Supermarket Amenities,Gym Amenities,Nature,Lake
9,Targówek,Supermarket Amenities,Public Transportation,Gym Amenities,Lake,Convenience Store
10,Ursus,Supermarket Amenities,Gym Amenities,Nature,Public Transportation,Ice Cream Shop
14,Wilanów,Supermarket Amenities,Gym Amenities,Tennis Amenities,Gas Station,Pharmacy Amenities
15,Włochy,Supermarket Amenities,Public Transportation,Gym Amenities,Convenience Store,Nature


Cluster 3

In [38]:
warsaw_merged.loc[warsaw_merged['Cluster Labels'] == 2, warsaw_merged.columns[[0] + list(range(4, warsaw_merged.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
12,Wawer,Public Transportation,Bakery Amenities,Flower Shop,Gym Amenities,Nature
13,Wesoła,Public Transportation,Bakery Amenities,Outdoors & Recreation,Convenience Store,Farmers Market


Cluster 4

In [39]:
warsaw_merged.loc[warsaw_merged['Cluster Labels'] == 3, warsaw_merged.columns[[0] + list(range(4, warsaw_merged.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,Białołęka,Cinema,Nature,Public Transportation,Tennis Amenities,Farmers Market


Cluster 5

In [40]:
warsaw_merged.loc[warsaw_merged['Cluster Labels'] == 4, warsaw_merged.columns[[0] + list(range(4, warsaw_merged.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
