In [1]:
import pandas as pd
import numpy as np

In [2]:
MO_zip_codes= "https://public.opendatasoft.com/explore/dataset/us-zip-code-latitude-and-longitude/download/?format=csv&refine.state=MO&timezone=America/Chicago&lang=en&use_labels_for_header=true&csv_separator=%3B"
df_MO=pd.read_csv(MO_zip_codes, sep=';')


In [3]:
df_MO.head()

Unnamed: 0,Zip,City,State,Latitude,Longitude,Timezone,Daylight savings time flag,geopoint
0,65807,Springfield,MO,37.168435,-93.31297,-6,1,"37.168435,-93.31297"
1,65582,Vienna,MO,38.175146,-91.94655,-6,1,"38.175146,-91.94655"
2,65236,Brunswick,MO,39.436768,-93.11175,-6,1,"39.436768,-93.11175"
3,63103,Saint Louis,MO,38.631451,-90.21415,-6,1,"38.631451,-90.21415"
4,63931,Briar,MO,36.66101,-90.850785,-6,1,"36.66101,-90.850785"


## Filter the data with City=Saint Louis

In [4]:
df_SL=df_MO[(df_MO.City=="Saint Louis")]

In [5]:
df_SL.head()

Unnamed: 0,Zip,City,State,Latitude,Longitude,Timezone,Daylight savings time flag,geopoint
3,63103,Saint Louis,MO,38.631451,-90.21415,-6,1,"38.631451,-90.21415"
11,63124,Saint Louis,MO,38.645802,-90.37687,-6,1,"38.645802,-90.37687"
12,63133,Saint Louis,MO,38.679684,-90.30186,-6,1,"38.679684,-90.30186"
14,63180,Saint Louis,MO,38.6531,-90.243462,-6,1,"38.6531,-90.243462"
72,63196,Saint Louis,MO,38.6531,-90.243462,-6,1,"38.6531,-90.243462"


## Drop unwanted columns

In [6]:
cols = ['Timezone','Daylight savings time flag', 'geopoint']
df_SL = df_SL.drop(cols, axis=1)
df_SL.head()

Unnamed: 0,Zip,City,State,Latitude,Longitude
3,63103,Saint Louis,MO,38.631451,-90.21415
11,63124,Saint Louis,MO,38.645802,-90.37687
12,63133,Saint Louis,MO,38.679684,-90.30186
14,63180,Saint Louis,MO,38.6531,-90.243462
72,63196,Saint Louis,MO,38.6531,-90.243462


In [7]:
df_SL.shape
## Our final dataframe has 71 samples and 5 columns or fields

(71, 5)

### Import libraries to visualise the Saint Louis City locations on a map

In [8]:
import json # library to handle JSON files

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.8.3
  latest version: 4.8.4

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-2.0.0                |     pyh9f0ad1d_0          63 KB  conda-forge
    openssl-1.1.1g             |       h516909a_1         2.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.2 MB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geo

### Find the geographical coordinates of our target city of Saint Louis

In [9]:
address = 'Saint Louis, MO'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Saint Louis City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Saint Louis City are 38.6268039, -90.1994097.


In [10]:
# Finding the datatypes of our features
df_SL.dtypes

Zip            int64
City          object
State         object
Latitude     float64
Longitude    float64
dtype: object

In [11]:
#df_SL[['Zip', 'Latitude', 'Longitude']] = df_SL[['Zip', 'Latitude', 'Longitude']].apply(pd.to_numeric) 
#df_SL.dtypes

In [12]:
df_SL.shape

(71, 5)

### Map of Saint Louis City

In [13]:
# create map of Saint Louis using latitude and longitude values
map_SL = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df_SL['Latitude'], df_SL['Longitude'], df_SL['Zip']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_SL)  
    
map_SL

### FourSquare credentials

In [14]:
CLIENT_ID = 'LIQHFZZVREGLWEHRS45FYKUXTTYXAXBI1GBEAWJN3AQERBDU' # your Foursquare ID
CLIENT_SECRET = '50B11GFYZ45GOHPN04GF0TUND0L352T5BI3NSW0R33GA11CZ'
VERSION = '20180605' # Foursquare API version

print('Kalpana Joshi')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Kalpana Joshi
CLIENT_ID: LIQHFZZVREGLWEHRS45FYKUXTTYXAXBI1GBEAWJN3AQERBDU
CLIENT_SECRET:50B11GFYZ45GOHPN04GF0TUND0L352T5BI3NSW0R33GA11CZ


### Let's Analyse the first zip code in our dataframe

In [15]:
neighborhood_latitude = df_SL.iloc[0,3] # neighborhood latitude value
neighborhood_longitude = df_SL.iloc[0,4] # neighborhood longitude value

neighborhood_name = df_SL.iloc[0,0] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of 63103 are 38.631451, -90.21415.


### Create a url for the API call to Foursquare to extract upto a 100 venues in a radius of 500km of the first Zip

In [16]:
radius=500
LIMIT=100

url= 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=LIQHFZZVREGLWEHRS45FYKUXTTYXAXBI1GBEAWJN3AQERBDU&client_secret=50B11GFYZ45GOHPN04GF0TUND0L352T5BI3NSW0R33GA11CZ&v=20180605&ll=38.631451,-90.21415&radius=500&limit=100'

### Make the API call to get all the possible venues in the surroundings of the first Zip

In [17]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5f4d2bda2f43ee5d572d2c92'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': '$-$$$$', 'key': 'price'},
    {'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Downtown West',
  'headerFullLocation': 'Downtown West, St Louis',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 23,
  'suggestedBounds': {'ne': {'lat': 38.6359510045, 'lng': -90.20840021813892},
   'sw': {'lat': 38.626950995499996, 'lng': -90.21989978186109}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4e9708288231e0b8aeb87ba9',
       'name': 'Sam Light Loan Company',
       'location': {'address': '2601 Olive St',
        'crossStreet': 'Jefferson',
        'lat': 38.633457,
        'lng': -90.214346,
   

In [18]:
#def get_category_type1(row):
 # categories_list = row['venue.categories']
  #return categories_list[0]['name']

### Clean the json and structure it into a pandas dataframe.

In [19]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

nearby_venues.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,venue.name,venue.categories,venue.location.lat,venue.location.lng
0,Sam Light Loan Company,"[{'id': '52f2ab2ebcbc57f1066b8b34', 'name': 'P...",38.633457,-90.214346
1,The Schlafly Tap Room,"[{'id': '50327c8591d4c4b30a586d5d', 'name': 'B...",38.632944,-90.209796
2,Go Gyro Go,"[{'id': '4bf58dd8d48988d1cb941735', 'name': 'F...",38.632902,-90.216862
3,Schlafly's HOP in the City,"[{'id': '4bf58dd8d48988d117941735', 'name': 'B...",38.633086,-90.210092
4,Firebird,"[{'id': '4bf58dd8d48988d1e9931735', 'name': 'R...",38.633444,-90.216817


In [20]:
# We can observe the details of the Fist venue in the above dataframe
nearby_venues1=nearby_venues.iloc[0,1]
nearby_venues1

[{'id': '52f2ab2ebcbc57f1066b8b34',
  'name': 'Pawn Shop',
  'pluralName': 'Pawn Shops',
  'shortName': 'Pawn Shop',
  'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/shops/default_',
   'suffix': '.png'},
  'primary': True}]

In [21]:
# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Sam Light Loan Company,"[{'id': '52f2ab2ebcbc57f1066b8b34', 'name': 'P...",38.633457,-90.214346
1,The Schlafly Tap Room,"[{'id': '50327c8591d4c4b30a586d5d', 'name': 'B...",38.632944,-90.209796
2,Go Gyro Go,"[{'id': '4bf58dd8d48988d1cb941735', 'name': 'F...",38.632902,-90.216862
3,Schlafly's HOP in the City,"[{'id': '4bf58dd8d48988d117941735', 'name': 'B...",38.633086,-90.210092
4,Firebird,"[{'id': '4bf58dd8d48988d1e9931735', 'name': 'R...",38.633444,-90.216817


### Let us create a function to find nearby venues of all the Zip codes across Saint Louis

In [22]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Now, let's create a dataframe with the nearby venues for all the zip codes in Saint Louis by calling the above function

In [23]:
SaintLouis_venues= getNearbyVenues(names=df_SL['Zip'],
                                   latitudes=df_SL['Latitude'],
                                   longitudes=df_SL['Longitude']
                                  )

63103
63124
63133
63180
63196
63177
63178
63113
63134
63144
63121
63136
63151
63182
63125
63188
63101
63102
63118
63198
63120
63150
63106
63109
63164
63179
63111
63160
63135
63114
63140
63132
63126
63116
63115
63104
63166
63167
63117
63156
63143
63127
63145
63107
63169
63146
63131
63141
63155
63112
63195
63147
63197
63123
63130
63153
63138
63105
63110
63157
63137
63119
63158
63163
63128
63199
63129
63122
63171
63139
63108


### We can observe the first few rows of our dataframe with Saint Louis Venues across different Zip codes

In [24]:
SaintLouis_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,63103,38.631451,-90.21415,Sam Light Loan Company,38.633457,-90.214346,Pawn Shop
1,63103,38.631451,-90.21415,The Schlafly Tap Room,38.632944,-90.209796,Brewery
2,63103,38.631451,-90.21415,Go Gyro Go,38.632902,-90.216862,Food Truck
3,63103,38.631451,-90.21415,Schlafly's HOP in the City,38.633086,-90.210092,Beer Garden
4,63103,38.631451,-90.21415,Firebird,38.633444,-90.216817,Rock Club


In [25]:
SaintLouis_venues.shape

(630, 7)

#### We can see that there are a total of 566 venues across different Zip codes. Now, we can group our dataframe by the Zip codes to see how many venues were extracted across each Zip

In [26]:
SaintLouis_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
63101,86,86,86,86,86,86
63102,23,23,23,23,23,23
63103,23,23,23,23,23,23
63104,20,20,20,20,20,20
63105,17,17,17,17,17,17
...,...,...,...,...,...,...
63195,7,7,7,7,7,7
63196,7,7,7,7,7,7
63197,7,7,7,7,7,7
63198,2,2,2,2,2,2


### Unique categories of venues across all the zip codes are as follows:

In [27]:
print('There are {} uniques categories.'.format(len(SaintLouis_venues['Venue Category'].unique())))

There are 169 uniques categories.


### Analyse each neighborhood by using one hot coding
#### One hot coding is essential in performing the Clustering exercise which will help us identify the different types of venues across different Zips

In [28]:
# one hot encoding
SL_onehot = pd.get_dummies(SaintLouis_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
SL_onehot['Neighborhood'] = SaintLouis_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [SL_onehot.columns[-1]] + list(SL_onehot.columns[:-1])
SL_onehot = SL_onehot[fixed_columns]

SL_onehot.head()

Unnamed: 0,Neighborhood,ATM,Accessories Store,Advertising Agency,Afghan Restaurant,American Restaurant,Antique Shop,Arcade,Art Gallery,Art Museum,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Store,Volleyball Court,Whisky Bar,Wine Bar,Wine Shop,Women's Store,Zoo
0,63103,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,63103,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,63103,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,63103,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,63103,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# New Dataframe size
SL_onehot.shape

(630, 170)

### grouping rows by mean of frequency of each category

In [30]:
SL_grouped = SL_onehot.groupby('Neighborhood').mean().reset_index()
SL_grouped

Unnamed: 0,Neighborhood,ATM,Accessories Store,Advertising Agency,Afghan Restaurant,American Restaurant,Antique Shop,Arcade,Art Gallery,Art Museum,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Store,Volleyball Court,Whisky Bar,Wine Bar,Wine Shop,Women's Store,Zoo
0,63101,0.011628,0.0,0.011628,0.0,0.023256,0.0,0.000000,0.000000,0.011628,...,0.0,0.0,0.000000,0.011628,0.011628,0.0,0.011628,0.000000,0.000000,0.0
1,63102,0.000000,0.0,0.000000,0.0,0.043478,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0
2,63103,0.000000,0.0,0.000000,0.0,0.043478,0.0,0.000000,0.043478,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0
3,63104,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.050000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0
4,63105,0.000000,0.0,0.000000,0.0,0.058824,0.0,0.058824,0.000000,0.000000,...,0.0,0.0,0.058824,0.000000,0.000000,0.0,0.000000,0.058824,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,63195,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.142857,0.0
66,63196,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.142857,0.0
67,63197,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.142857,0.0
68,63198,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0


### Let's analyse each Neighborhood/Zip with top 5 venues

In [31]:
num_top_venues = 5

for hood in SL_grouped['Neighborhood']:
    print("----",hood, "----")
    temp = SL_grouped[SL_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

---- 63101 ----
                venue  freq
0                 Bar  0.06
1      Sandwich Place  0.06
2               Hotel  0.06
3         Coffee Shop  0.05
4  Mexican Restaurant  0.03


---- 63102 ----
                venue  freq
0               Hotel  0.17
1  Italian Restaurant  0.09
2              Casino  0.09
3          Steakhouse  0.09
4          Restaurant  0.09


---- 63103 ----
          venue  freq
0    Food Truck  0.22
1  Intersection  0.13
2         Hotel  0.09
3       Brewery  0.04
4   Beer Garden  0.04


---- 63104 ----
                venue  freq
0        Intersection  0.20
1  Chinese Restaurant  0.10
2          Steakhouse  0.05
3         Gas Station  0.05
4         Pizza Place  0.05


---- 63105 ----
                           venue  freq
0                            Bar  0.06
1  Vegetarian / Vegan Restaurant  0.06
2                  Deli / Bodega  0.06
3                   Home Service  0.06
4                          Hotel  0.06


---- 63106 ----
            venue  freq


### Let's create a dataframe of the top 10 venues across each of the Neighborhoods or Zips

In [32]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [33]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = SL_grouped['Neighborhood']

for ind in np.arange(SL_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(SL_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,63101,Hotel,Bar,Sandwich Place,Coffee Shop,Italian Restaurant,Mexican Restaurant,Sports Bar,American Restaurant,Pizza Place,Boutique
1,63102,Hotel,Casino,Restaurant,Italian Restaurant,Steakhouse,Bar,Dive Bar,Coffee Shop,Cocktail Bar,Outdoor Sculpture
2,63103,Food Truck,Intersection,Hotel,Sandwich Place,Beer Garden,American Restaurant,Bus Line,Art Gallery,Pawn Shop,Brewery
3,63104,Intersection,Chinese Restaurant,Pharmacy,Brewery,Photography Studio,Steakhouse,Supermarket,Pub,Print Shop,Gas Station
4,63105,Home Service,Bar,Business Service,Automotive Shop,Lawyer,Italian Restaurant,Steakhouse,Gym,Arcade,Seafood Restaurant


### Cluster Neighborhoods using K Means

In [34]:
# import k-means from clustering stage
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 5

SL_grouped_clustering = SL_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(SL_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 2, 0, 0, 0, 1, 0,
       0, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 1, 2], dtype=int32)

In [35]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

SL_merged = df_SL

In [36]:
neighborhoods_venues_sorted.head()

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,63101,Hotel,Bar,Sandwich Place,Coffee Shop,Italian Restaurant,Mexican Restaurant,Sports Bar,American Restaurant,Pizza Place,Boutique
1,0,63102,Hotel,Casino,Restaurant,Italian Restaurant,Steakhouse,Bar,Dive Bar,Coffee Shop,Cocktail Bar,Outdoor Sculpture
2,0,63103,Food Truck,Intersection,Hotel,Sandwich Place,Beer Garden,American Restaurant,Bus Line,Art Gallery,Pawn Shop,Brewery
3,0,63104,Intersection,Chinese Restaurant,Pharmacy,Brewery,Photography Studio,Steakhouse,Supermarket,Pub,Print Shop,Gas Station
4,0,63105,Home Service,Bar,Business Service,Automotive Shop,Lawyer,Italian Restaurant,Steakhouse,Gym,Arcade,Seafood Restaurant


In [37]:
neighborhoods_venues_sorted.dtypes

Cluster Labels             int32
Neighborhood               int64
1st Most Common Venue     object
2nd Most Common Venue     object
3rd Most Common Venue     object
4th Most Common Venue     object
5th Most Common Venue     object
6th Most Common Venue     object
7th Most Common Venue     object
8th Most Common Venue     object
9th Most Common Venue     object
10th Most Common Venue    object
dtype: object

In [38]:
neighborhoods_venues_sorted.rename(columns = {'Neighborhood':'Zip'}, inplace = True) 

In [39]:
neighborhoods_venues_sorted.head()

Unnamed: 0,Cluster Labels,Zip,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,63101,Hotel,Bar,Sandwich Place,Coffee Shop,Italian Restaurant,Mexican Restaurant,Sports Bar,American Restaurant,Pizza Place,Boutique
1,0,63102,Hotel,Casino,Restaurant,Italian Restaurant,Steakhouse,Bar,Dive Bar,Coffee Shop,Cocktail Bar,Outdoor Sculpture
2,0,63103,Food Truck,Intersection,Hotel,Sandwich Place,Beer Garden,American Restaurant,Bus Line,Art Gallery,Pawn Shop,Brewery
3,0,63104,Intersection,Chinese Restaurant,Pharmacy,Brewery,Photography Studio,Steakhouse,Supermarket,Pub,Print Shop,Gas Station
4,0,63105,Home Service,Bar,Business Service,Automotive Shop,Lawyer,Italian Restaurant,Steakhouse,Gym,Arcade,Seafood Restaurant


In [40]:
# merge SL_grouped with SL_data to add latitude/longitude for each neighborhood
SL_merged1= pd.merge(SL_merged, neighborhoods_venues_sorted, on='Zip', how='right')
SL_merged1.head() # check the last columns!

Unnamed: 0,Zip,City,State,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,63103,Saint Louis,MO,38.631451,-90.21415,0,Food Truck,Intersection,Hotel,Sandwich Place,Beer Garden,American Restaurant,Bus Line,Art Gallery,Pawn Shop,Brewery
1,63124,Saint Louis,MO,38.645802,-90.37687,4,Farm,Zoo,Factory,Food & Drink Shop,Food,Flower Shop,Flea Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
2,63133,Saint Louis,MO,38.679684,-90.30186,0,Music Store,Farm,Food Court,Food & Drink Shop,Food,Flower Shop,Flea Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
3,63180,Saint Louis,MO,38.6531,-90.243462,2,Women's Store,Diner,Discount Store,Food,Bar,Business Service,Grocery Store,Fast Food Restaurant,Food & Drink Shop,Flower Shop
4,63196,Saint Louis,MO,38.6531,-90.243462,2,Women's Store,Diner,Discount Store,Food,Bar,Business Service,Grocery Store,Fast Food Restaurant,Food & Drink Shop,Flower Shop


In [41]:
SL_merged1.shape


(70, 16)

In [42]:
SL_merged1.dtypes

Zip                         int64
City                       object
State                      object
Latitude                  float64
Longitude                 float64
Cluster Labels              int32
1st Most Common Venue      object
2nd Most Common Venue      object
3rd Most Common Venue      object
4th Most Common Venue      object
5th Most Common Venue      object
6th Most Common Venue      object
7th Most Common Venue      object
8th Most Common Venue      object
9th Most Common Venue      object
10th Most Common Venue     object
dtype: object

### Visualising clusters

In [43]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]



In [44]:
# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(SL_merged1['Latitude'], SL_merged1['Longitude'], SL_merged1['Zip'], SL_merged1['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
map_clusters

### Cluster 1

In [45]:
SL_merged1.loc[SL_merged1['Cluster Labels'] == 0, SL_merged1.columns[[0] + list(range(5, SL_merged1.shape[1]))]]

Unnamed: 0,Zip,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,63103,0,Food Truck,Intersection,Hotel,Sandwich Place,Beer Garden,American Restaurant,Bus Line,Art Gallery,Pawn Shop,Brewery
2,63133,0,Music Store,Farm,Food Court,Food & Drink Shop,Food,Flower Shop,Flea Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
8,63144,0,Pharmacy,Italian Restaurant,Coffee Shop,Rental Car Location,Donut Shop,Chinese Restaurant,Bank,Salon / Barbershop,Zoo,Factory
9,63121,0,Chinese Restaurant,Thrift / Vintage Store,Pizza Place,American Restaurant,Fast Food Restaurant,Event Service,Food,Flower Shop,Flea Market,Fish & Chips Shop
10,63136,0,Cosmetics Shop,Dive Bar,Park,Farm,Food & Drink Shop,Food,Flower Shop,Flea Market,Fish & Chips Shop,Fast Food Restaurant
15,63101,0,Hotel,Bar,Sandwich Place,Coffee Shop,Italian Restaurant,Mexican Restaurant,Sports Bar,American Restaurant,Pizza Place,Boutique
16,63102,0,Hotel,Casino,Restaurant,Italian Restaurant,Steakhouse,Bar,Dive Bar,Coffee Shop,Cocktail Bar,Outdoor Sculpture
17,63118,0,Mexican Restaurant,Fried Chicken Joint,Grocery Store,Bar,Bakery,Tea Room,Pizza Place,Coffee Shop,Noodle House,Music Store
19,63120,0,American Restaurant,Cafeteria,Café,Performing Arts Venue,Food Court,Zoo,Farmers Market,Food,Flower Shop,Flea Market
22,63109,0,Pizza Place,Liquor Store,Ice Cream Shop,Gym / Fitness Center,Public Art,Record Shop,Flower Shop,Automotive Shop,Mexican Restaurant,Furniture / Home Store


In [80]:
SL1=SL_merged1.loc[SL_merged1['Cluster Labels'] == 0, SL_merged1.columns[[0] + list(range(5, SL_merged1.shape[1]))]]
SL1.shape

(38, 12)

In [78]:

SL1['1st Most Common Venue'].value_counts()

Pizza Place            4
Hotel                  3
Chinese Restaurant     3
Mexican Restaurant     2
Food Truck             2
Football Stadium       2
Pharmacy               2
American Restaurant    2
Ice Cream Shop         2
Pool                   2
Garden Center          1
Home Service           1
BBQ Joint              1
Italian Restaurant     1
Event Service          1
Cosmetics Shop         1
Fish & Chips Shop      1
Wine Bar               1
Park                   1
Dance Studio           1
Intersection           1
Music Store            1
Brewery                1
Bar                    1
Name: 1st Most Common Venue, dtype: int64

In [79]:
SL1['2nd Most Common Venue'].value_counts()

Chinese Restaurant                 3
Bar                                2
American Restaurant                2
Fried Chicken Joint                2
Intersection                       2
Zoo                                2
Italian Restaurant                 1
Surf Spot                          1
Arcade                             1
College Administrative Building    1
Sports Bar                         1
Outdoor Supply Store               1
New American Restaurant            1
Farm                               1
Dive Bar                           1
Museum                             1
Soccer Field                       1
Liquor Store                       1
Salon / Barbershop                 1
Speakeasy                          1
Construction & Landscaping         1
Home Service                       1
Park                               1
Flea Market                        1
Wine Bar                           1
Cafeteria                          1
Thrift / Vintage Store             1
L

In [94]:
SL1['3rd Most Common Venue'].value_counts()

Factory               5
Pharmacy              2
Zoo                   2
Soccer Field          1
Plaza                 1
Hotel                 1
Basketball Court      1
Ice Cream Shop        1
Café                  1
Bakery                1
Food Court            1
Playground            1
Gourmet Shop          1
Sandwich Place        1
Grocery Store         1
Dive Bar              1
Business Service      1
Beer Garden           1
Hobby Shop            1
Tour Provider         1
Dog Run               1
Gym                   1
Breakfast Spot        1
Park                  1
Greek Restaurant      1
ATM                   1
Italian Restaurant    1
Pizza Place           1
Hardware Store        1
Coffee Shop           1
Restaurant            1
Sushi Restaurant      1
Name: 3rd Most Common Venue, dtype: int64

In [95]:
SL1['4th Most Common Venue'].value_counts()

Food & Drink Shop        5
Farm                     3
Convenience Store        2
Pharmacy                 2
American Restaurant      2
Food                     2
Zoo                      2
Sandwich Place           2
Rental Car Location      1
Bar                      1
Gym / Fitness Center     1
Café                     1
Automotive Shop          1
Grocery Store            1
Gift Shop                1
Coffee Shop              1
Dog Run                  1
Gym                      1
Performing Arts Venue    1
Brewery                  1
Food Truck               1
Nightlife Spot           1
Moving Target            1
Italian Restaurant       1
Lingerie Store           1
Factory                  1
Name: 4th Most Common Venue, dtype: int64

In [96]:
SL1['5th Most Common Venue'].value_counts()

Food                    6
Food & Drink Shop       5
Fast Food Restaurant    4
Factory                 3
Flower Shop             3
Italian Restaurant      1
Donut Shop              1
American Restaurant     1
Breakfast Spot          1
Photography Studio      1
Public Art              1
Lawyer                  1
Farm                    1
Café                    1
Beer Garden             1
Steakhouse              1
Farmers Market          1
Zoo                     1
Trail                   1
Locksmith               1
Bakery                  1
Food Court              1
Name: 5th Most Common Venue, dtype: int64

In [82]:
# List of Zip codes in this Cluster
SL1['Zip'].values

array([63103, 63133, 63144, 63121, 63136, 63101, 63102, 63118, 63120,
       63109, 63111, 63135, 63114, 63132, 63126, 63116, 63115, 63104,
       63117, 63143, 63127, 63107, 63146, 63131, 63141, 63112, 63147,
       63123, 63130, 63105, 63110, 63137, 63119, 63128, 63129, 63122,
       63139, 63108])

In [56]:
# Find the number of Zip codes in Cluster 1
SL_merged1.loc[SL_merged1['Cluster Labels'] == 0, SL_merged1.columns[[0] + list(range(5, SL_merged1.shape[1]))]].shape

(38, 12)

### Cluster 2

In [46]:
SL_merged1.loc[SL_merged1['Cluster Labels'] == 1, SL_merged1.columns[[0] + list(range(5, SL_merged1.shape[1]))]]

Unnamed: 0,Zip,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
11,63151,1,Fast Food Restaurant,Resort,Zoo,Event Service,Food,Flower Shop,Flea Market,Fish & Chips Shop,Farmers Market,Farm
18,63198,1,Fast Food Restaurant,Resort,Zoo,Event Service,Food,Flower Shop,Flea Market,Fish & Chips Shop,Farmers Market,Farm
36,63167,1,Fast Food Restaurant,Resort,Zoo,Event Service,Food,Flower Shop,Flea Market,Fish & Chips Shop,Farmers Market,Farm
41,63145,1,Fast Food Restaurant,Resort,Zoo,Event Service,Food,Flower Shop,Flea Market,Fish & Chips Shop,Farmers Market,Farm


In [84]:
# Find the Zip Codes in Cluster 2
SL_merged1.loc[SL_merged1['Cluster Labels'] == 1, SL_merged1.columns[[0] + list(range(5, SL_merged1.shape[1]))]]["Zip"].values

array([63151, 63198, 63167, 63145])

In [57]:
# Find the number of Zip Codes in Cluster 2
(SL_merged1.loc[SL_merged1['Cluster Labels'] == 1, SL_merged1.columns[[0] + list(range(5, SL_merged1.shape[1]))]]).shape

(4, 12)

## Cluster 3

In [47]:
SL_merged1.loc[SL_merged1['Cluster Labels'] == 2, SL_merged1.columns[[0] + list(range(5, SL_merged1.shape[1]))]]

Unnamed: 0,Zip,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,63180,2,Women's Store,Diner,Discount Store,Food,Bar,Business Service,Grocery Store,Fast Food Restaurant,Food & Drink Shop,Flower Shop
4,63196,2,Women's Store,Diner,Discount Store,Food,Bar,Business Service,Grocery Store,Fast Food Restaurant,Food & Drink Shop,Flower Shop
5,63177,2,Women's Store,Diner,Discount Store,Food,Bar,Business Service,Grocery Store,Fast Food Restaurant,Food & Drink Shop,Flower Shop
6,63178,2,Women's Store,Diner,Discount Store,Food,Bar,Business Service,Grocery Store,Fast Food Restaurant,Food & Drink Shop,Flower Shop
7,63113,2,Discount Store,Zoo,Factory,Food & Drink Shop,Food,Flower Shop,Flea Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
12,63182,2,Women's Store,Diner,Discount Store,Food,Bar,Business Service,Grocery Store,Fast Food Restaurant,Food & Drink Shop,Flower Shop
14,63188,2,Women's Store,Diner,Discount Store,Food,Bar,Business Service,Grocery Store,Fast Food Restaurant,Food & Drink Shop,Flower Shop
20,63150,2,Women's Store,Diner,Discount Store,Food,Bar,Business Service,Grocery Store,Fast Food Restaurant,Food & Drink Shop,Flower Shop
21,63106,2,Bar,Park,Zoo,Food Truck,Food & Drink Shop,Food,Flower Shop,Flea Market,Fish & Chips Shop,Fast Food Restaurant
23,63164,2,Women's Store,Diner,Discount Store,Food,Bar,Business Service,Grocery Store,Fast Food Restaurant,Food & Drink Shop,Flower Shop


In [85]:
# Find the Zip codes in Cluster 3
SL_merged1.loc[SL_merged1['Cluster Labels'] == 2, SL_merged1.columns[[0] + list(range(5, SL_merged1.shape[1]))]]["Zip"].values

array([63180, 63196, 63177, 63178, 63113, 63182, 63188, 63150, 63106,
       63164, 63179, 63160, 63140, 63166, 63156, 63169, 63155, 63195,
       63197, 63153, 63157, 63158, 63163, 63199, 63171])

In [88]:
# Find the number of Zip codes in Cluster 3
SL3=SL_merged1.loc[SL_merged1['Cluster Labels'] == 2, SL_merged1.columns[[0] + list(range(5, SL_merged1.shape[1]))]]
SL3.shape

(25, 12)

In [89]:
SL3["1st Most Common Venue"].value_counts()

Women's Store     22
Bar                2
Discount Store     1
Name: 1st Most Common Venue, dtype: int64

In [90]:
SL3["2nd Most Common Venue"].value_counts()

Diner          22
Zoo             1
Park            1
Shoe Repair     1
Name: 2nd Most Common Venue, dtype: int64

In [91]:
SL3["3rd Most Common Venue"].value_counts()

Discount Store    22
Zoo                2
Factory            1
Name: 3rd Most Common Venue, dtype: int64

In [92]:
SL3["4th Most Common Venue"].value_counts()

Food                 22
Food Truck            2
Food & Drink Shop     1
Name: 4th Most Common Venue, dtype: int64

In [93]:
SL3["5th Most Common Venue"].value_counts()

Bar                  22
Food & Drink Shop     2
Food                  1
Name: 5th Most Common Venue, dtype: int64

## Cluster 4

In [48]:
SL_merged1.loc[SL_merged1['Cluster Labels'] == 3, SL_merged1.columns[[0] + list(range(5, SL_merged1.shape[1]))]]

Unnamed: 0,Zip,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
13,63125,3,Home Service,Theater,Zoo,Factory,Food,Flower Shop,Flea Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
55,63138,3,Home Service,Zoo,Event Service,Food,Flower Shop,Flea Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm


In [59]:
# Find the number of Zip codes in Cluster 4
(SL_merged1.loc[SL_merged1['Cluster Labels'] == 3, SL_merged1.columns[[0] + list(range(5, SL_merged1.shape[1]))]]).shape

(2, 12)

## Cluster 5

In [49]:
SL_merged1.loc[SL_merged1['Cluster Labels'] == 4, SL_merged1.columns[[0] + list(range(5, SL_merged1.shape[1]))]]

Unnamed: 0,Zip,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,63124,4,Farm,Zoo,Factory,Food & Drink Shop,Food,Flower Shop,Flea Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market


In [61]:
# Find the number of Zip codes in cluster 5
(SL_merged1.loc[SL_merged1['Cluster Labels'] == 4, SL_merged1.columns[[0] + list(range(5, SL_merged1.shape[1]))]]).shape

(1, 12)