In [4]:
# Read the file HVA_FedCodes_20190301.csv into pandas dataframe
#The above file is downloaded for the state of Virginia, USA from the 'State Files with Federal Codes’ section of the website:
#https://geonames.usgs.gov/domestic/download_data.htm
# Import pandas 
import pandas as pd 
  
# reading csv file  
df_file=pd.read_csv("HVA_FedCodes_20190301.csv") 
df_file.head()

Unnamed: 0,FEATURE_ID,FEATURE_NAME,FEATURE_CLASS,CENSUS_CODE,CENSUS_CLASS_CODE,GSA_CODE,OPM_CODE,STATE_NUMERIC,STATE_ALPHA,COUNTY_SEQUENCE,COUNTY_NUMERIC,COUNTY_NAME,PRIMARY_LATITUDE,PRIMARY_LONGITUDE,DATE_CREATED,DATE_EDITED
0,1314481,Waycross,Populated Place,78570,U6,,,47,TN,1,73,Hawkins,36.593155,-82.64849,5/19/80,
1,1314481,Waycross,Populated Place,78570,U6,,,51,VA,2,169,Scott,36.593155,-82.64849,5/19/80,
2,1462355,Achilles,Populated Place,244,U6,26.0,510026073.0,51,VA,1,73,Gloucester,37.280142,-76.440226,9/28/79,
3,1462375,Adria,Populated Place,420,U6,,,51,VA,1,185,Tazewell,37.168726,-81.545943,9/28/79,
4,1462395,Alfonso,Populated Place,1016,U6,45.0,510045103.0,51,VA,1,103,Lancaster,37.808749,-76.508008,9/28/79,


In [5]:
# Keep only the five columns this project needs
required_columns = [
    "STATE_ALPHA",
    "COUNTY_NAME",
    "FEATURE_NAME",
    "PRIMARY_LATITUDE",
    "PRIMARY_LONGITUDE",
]
df_file_selcols = df_file.loc[:, required_columns]
df_file_selcols.head()

Unnamed: 0,STATE_ALPHA,COUNTY_NAME,FEATURE_NAME,PRIMARY_LATITUDE,PRIMARY_LONGITUDE
0,TN,Hawkins,Waycross,36.593155,-82.64849
1,VA,Scott,Waycross,36.593155,-82.64849
2,VA,Gloucester,Achilles,37.280142,-76.440226
3,VA,Tazewell,Adria,37.168726,-81.545943
4,VA,Lancaster,Alfonso,37.808749,-76.508008


In [6]:
# Rename the columns to meaningful names.
# An explicit old-name -> new-name mapping is used instead of positional
# assignment so labels cannot be attached to the wrong column if the selection
# order changes; rename() returns a new frame, so re-running this cell is a no-op.
df_file_selcols = df_file_selcols.rename(columns={
    "STATE_ALPHA": "State",
    "COUNTY_NAME": "County",
    "FEATURE_NAME": "Area",
    "PRIMARY_LATITUDE": "Latitude",
    "PRIMARY_LONGITUDE": "Longitude",
})
df_file_selcols.head()

Unnamed: 0,State,County,Area,Latitude,Longitude
0,TN,Hawkins,Waycross,36.593155,-82.64849
1,VA,Scott,Waycross,36.593155,-82.64849
2,VA,Gloucester,Achilles,37.280142,-76.440226
3,VA,Tazewell,Adria,37.168726,-81.545943
4,VA,Lancaster,Alfonso,37.808749,-76.508008


In [7]:
# Filter the 5 columns down to the 3 localities considered for the project:
# Henrico, Chesterfield and Richmond City.
# The source file also carries rows tagged with other states (see the TN row in
# the first preview), so the state is checked as well as the county name.
target_counties = ['Henrico', 'Chesterfield', 'Richmond (city)']
df_area = (
    df_file_selcols
    .loc[(df_file_selcols['State'] == 'VA')
         & df_file_selcols['County'].isin(target_counties)]
    .reset_index(drop=True)  # new frame instead of an in-place reset on a slice
)

df_area.head()

Unnamed: 0,State,County,Area,Latitude,Longitude
0,VA,Chesterfield,Beulah,37.424593,-77.47054
1,VA,Henrico,Capitol View,37.47598,-77.390537
2,VA,Chesterfield,Five Forks,37.407371,-77.566933
3,VA,Henrico,Hunton,37.688478,-77.499707
4,VA,Henrico,Laurel,37.642923,-77.508874


In [8]:
#Verifying that the selection involves just the 3 counties that are considered for the project
df_area['County'].unique()

array(['Chesterfield', 'Henrico', 'Richmond (city)'], dtype=object)

In [9]:
# Understanding how many Areas are selected for analysis in the 3 counties
df_area.shape

(190, 5)

In [10]:
# Importing the packages and libraries required for the data analysis

import numpy as np # library to handle data in a vectorized manner

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [11]:
# Find the geographical coordinates of Richmond, Virginia; they are used as the
# center of every map drawn below.
address = 'Richmond, VA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() yields None when the service finds no match; fail with a clear
# message here instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Richmond Virginia are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Richmond Virginia are 37.5385087, -77.43428.


In [12]:
# Base map centred on the geocoded Richmond, VA coordinates
map_richmond = folium.Map(location=[latitude, longitude], zoom_start=10.25)

# One blue circle marker per area, with an "<area>, <county>" popup
marker_fields = df_area[['Latitude', 'Longitude', 'County', 'Area']]
for marker_lat, marker_lng, county_name, area_name in marker_fields.itertuples(index=False, name=None):
    popup = folium.Popup('{}, {}'.format(area_name, county_name), parse_html=True)
    marker = folium.CircleMarker(
        [marker_lat, marker_lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_richmond)

map_richmond


In [13]:
# Load the Foursquare credentials into variables.
# The values are read from the environment variables FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET so that no secret is stored in the notebook source,
# and they are deliberately not printed so they cannot leak via the cell output.
import os

_missing_credentials = [
    env_name
    for env_name in ('FOURSQUARE_CLIENT_ID', 'FOURSQUARE_CLIENT_SECRET')
    if not os.environ.get(env_name)
]
if _missing_credentials:
    raise RuntimeError(
        'Set these environment variables before running the notebook: '
        + ', '.join(_missing_credentials))

CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Foursquare credentials loaded from the environment.')

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


In [14]:
# Function to find the nearby venues using the Foursquare API, given area names, latitudes and longitudes.
# At most LIMIT (100) venues are requested per area, within `radius` (default 500) of the area's coordinates.
LIMIT=100
radius=500
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare `venues/explore` endpoint once per area.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Area name and coordinates; iterated together with ``zip``.
    radius : number, default 500
        Search radius forwarded to the API as the ``radius`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Area', 'Area Latitude',
        'Area Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. The frame has these columns even when no venue was
        found. 'Venue Category' is None for a venue that lists no category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT, and
    prints each area name as it is processed.
    """
    venue_columns = ['Area',
                     'Area Latitude',
                     'Area Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build and escape the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30)  # never hang the notebook on a stalled connection
        response.raise_for_status()

        # an area with no recommendations may come back without any group
        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing the columns up front keeps the schema stable for an empty result
    return pd.DataFrame(venue_rows, columns=venue_columns)

In [15]:
# Fetching the venues for each area in the 3 counties in question
df_area_venues = getNearbyVenues(names=df_area['Area'],
                                   latitudes=df_area['Latitude'],
                                   longitudes=df_area['Longitude']
                                  )

Beulah
Capitol View
Five Forks
Hunton
Laurel
Mount Nebo
Elko
Glendale
Hallsboro
Lorraine
Manbur
Meadowville
Robious
Chesterfield County
Henrico County
Beaufont Hills
Bellwood
Bellwood Manor
Bon Air
Bosher
Cambridge
Canterbury
Centralia
Chester
Chimney Corner
Crestview
Deep Bottom
Deerfield Estates
Dorset Woods
Drewrys Bluff
Drouin Hill
Dutch Gap
Fair Hill
Falling Creek Farms
Fort Brady (historical)
Fort Gregg (historical)
Fort Harrison (historical)
Fort Lee
Gayton
Glenbrook Hills
Granite
Gravel Hill
Holiday Hills
Hylton Park
Jessup Farm Acres
Lake Crystal Farms
Land O'Pines
Longwood Acres
Mooreland
Mooreland Farms
Nottingham
Oakland
Old Gun
Oxford
Penn Acres
Pickadat Corner
Richmond Heights
River Road Hills
Roslyn Hills
Ziontown
Sheffield Court
Skinquarter
Staffordshire
Stratford Hills
Sweet Briar Park
Tuckahoe
Tuckahoe Village
Walthall
Wayland
Wedgewood
Westbriar
Westchester
Westham
Westhampton
Westover Heights
Wilkinson Terrace
Woodmont
Varina Grove
Glen Allen
Short Pump
Cameron Hill

In [16]:
# Understanding the total number of venues selected for all the areas of the 3 counties and viewing the format of the data
print(df_area_venues.shape)
df_area_venues.head()

(947, 7)


Unnamed: 0,Area,Area Latitude,Area Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Beulah,37.424593,-77.47054,Domino's Pizza,37.423347,-77.470602,Pizza Place
1,Beulah,37.424593,-77.47054,Rite Aid,37.423519,-77.469569,Pharmacy
2,Beulah,37.424593,-77.47054,Lin's Garden,37.424043,-77.47085,Asian Restaurant
3,Beulah,37.424593,-77.47054,Rose's Discount Store,37.423689,-77.469982,Department Store
4,Hunton,37.688478,-77.499707,Hunton Sports Complex,37.68841,-77.50104,Baseball Field


In [17]:
# Keep only the venues whose category is listed as 'Pool'
is_pool = df_area_venues["Venue Category"].eq("Pool")
df_pools = df_area_venues[is_pool]
df_pools

Unnamed: 0,Area,Area Latitude,Area Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
82,Bon Air,37.52487,-77.557765,Bon Air Community Association (BACA),37.524066,-77.560999,Pool
83,Bon Air,37.52487,-77.557765,Bon Air Community Center,37.52402,-77.560478,Pool
86,Canterbury,37.601813,-77.602489,Canterbury Recreation Association,37.599829,-77.607465,Pool
145,Glenbrook Hills,37.580424,-77.562765,Kanawha Recreation Association,37.579252,-77.557388,Pool
146,Glenbrook Hills,37.580424,-77.562765,Gate Guard Kingdom,37.579266,-77.55736,Pool
150,Granite,37.532925,-77.507208,Willow Oaks Pool,37.536965,-77.507234,Pool
155,Gravel Hill,37.536536,-77.514708,Granite Swim and Tennis Club,37.533123,-77.513366,Pool
171,Mooreland Farms,37.57348,-77.590544,Mooreland Farms Pool,37.572846,-77.591697,Pool
250,Westbriar,37.623201,-77.562487,Chestnut Oaks Recreation Association,37.621147,-77.559575,Pool
253,Westham,37.589035,-77.539987,Ridgetop Pool,37.587662,-77.535428,Pool


In [18]:
# Understanding the total number of pools in all the areas of the 3 counties
df_pools.shape

(18, 7)

In [19]:
# Count the pools per Area and expose the count under the name 'Pool Count'
df_pool_cnt = (
    df_pools
    .groupby('Area')[['Venue']]
    .count()
    .rename(columns={'Venue': 'Pool Count'})
)
df_pool_cnt

Unnamed: 0_level_0,Pool Count
Area,Unnamed: 1_level_1
Bon Air,2
Bon Air Census Designated Place,1
Canterbury,1
Chamberlayne,1
Chamberlayne Census Designated Place,1
Chamberlayne Heights,1
Glenbrook Hills,2
Granite,1
Gravel Hill,1
Meadowbrook Census Designated Place,1


In [20]:
#Going back to the venues dataframe, grouping the venues based on the Area they belong
df_area_venues.groupby('Area').count()

Unnamed: 0_level_0,Area Latitude,Area Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Acca,2,2,2,2,2,2
Bellwood,8,8,8,8,8,8
Bellwood Census Designated Place,3,3,3,3,3,3
Bellwood Manor,5,5,5,5,5,5
Bensley,4,4,4,4,4,4
Bensley Census Designated Place,5,5,5,5,5,5
Bermuda District,3,3,3,3,3,3
Beulah,4,4,4,4,4,4
Biltmore,4,4,4,4,4,4
Bon Air,2,2,2,2,2,2


In [21]:
#Understanding the number of unique venue categories from the foursquare API data for each area
print('There are {} uniques categories.'.format(len(df_area_venues['Venue Category'].unique())))

There are 194 uniques categories.


In [None]:
# Prepare the dataframe for clustering by one-hot encoding the venue category:
# one column per category, one row per venue, 1 where the venue has that category and 0 elsewhere.
# dtype=int keeps the indicators as 0/1 regardless of the pandas version's default.
venue_dummies = pd.get_dummies(df_area_venues[['Venue Category']], prefix="", prefix_sep="", dtype=int)

# Put the Area key in the first column. A category that happened to be named
# 'Area' would collide with the key, so it is dropped first (the key always
# wins) rather than assuming the key ends up as the last column.
df_area_onehot = venue_dummies.drop(columns='Area', errors='ignore')
df_area_onehot.insert(0, 'Area', df_area_venues['Area'])

df_area_onehot.head()

Unnamed: 0,Area,Accessories Store,Airport,Airport Terminal,American Restaurant,Amphitheater,Antique Shop,Arcade,Art Gallery,Arts & Crafts Store,...,Train Station,Transportation Service,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Weight Loss Center,Wine Shop,Wings Joint,Women's Store
0,Beulah,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Beulah,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Beulah,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Beulah,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Hunton,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#Understanding the number of rows and columns of the one hot output
df_area_onehot.shape

(947, 195)

In [None]:
# Average the one-hot rows per Area: each value becomes the share of the area's
# venues that fall in that category. This frame is the input to k-means clustering.
df_area_grouped = df_area_onehot.groupby('Area', as_index=False).mean()
df_area_grouped

Unnamed: 0,Area,Accessories Store,Airport,Airport Terminal,American Restaurant,Amphitheater,Antique Shop,Arcade,Art Gallery,Arts & Crafts Store,...,Train Station,Transportation Service,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Weight Loss Center,Wine Shop,Wings Joint,Women's Store
0,Acca,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.000,0.000000,0.00,0.00,0.0,0.00,0.0
1,Bellwood,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.000,0.000000,0.00,0.00,0.0,0.00,0.0
2,Bellwood Census Designated Place,0.0,0.0,0.0,0.333333,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.000,0.000000,0.00,0.00,0.0,0.00,0.0
3,Bellwood Manor,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.000,0.000000,0.00,0.00,0.0,0.00,0.0
4,Bensley,0.0,0.0,0.0,0.250000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.000,0.000000,0.00,0.00,0.0,0.00,0.0
5,Bensley Census Designated Place,0.0,0.0,0.0,0.200000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.000,0.000000,0.00,0.00,0.0,0.00,0.0
6,Bermuda District,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.333333,0.0,0.000,0.000000,0.00,0.00,0.0,0.00,0.0
7,Beulah,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.000,0.000000,0.00,0.00,0.0,0.00,0.0
8,Biltmore,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.000,0.000000,0.00,0.00,0.0,0.00,0.0
9,Bon Air,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.000,0.000000,0.00,0.00,0.0,0.00,0.0


In [None]:
#Below code can be used to check how the one hot encoding is working in sequencing the area with top 5 commmon 
#venue types for each area in the dataframe
#Run as necessary - Currently the code is commented
num_top_venues = 5

'''
for hood in df_area_grouped['Area']:
    print("----"+hood+"----")
    temp = df_area_grouped[df_area_grouped['Area'] == hood].T.reset_index()
    #print(temp.head())
    temp.columns = ['venue','freq']
    #print(temp.head())
    temp = temp.iloc[1:]
    #print(temp.head())
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    #print(temp.head())
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')
'''

'\nfor hood in df_area_grouped[\'Area\']:\n    print("----"+hood+"----")\n    temp = df_area_grouped[df_area_grouped[\'Area\'] == hood].T.reset_index()\n    #print(temp.head())\n    temp.columns = [\'venue\',\'freq\']\n    #print(temp.head())\n    temp = temp.iloc[1:]\n    #print(temp.head())\n    temp[\'freq\'] = temp[\'freq\'].astype(float)\n    temp = temp.round({\'freq\': 2})\n    #print(temp.head())\n    print(temp.sort_values(\'freq\', ascending=False).reset_index(drop=True).head(num_top_venues))\n    print(\'\n\')\n'

In [None]:
# Helper that ranks the venue categories of one area from most to least frequent.

def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first element of `row` is skipped (the caller passes the area name
    there); the remaining entries are sorted in descending order and the index
    labels of the leading ones are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.head(num_top_venues).index.values

In [None]:
# Build a table with the top venue types of each Area, most common first,
# using return_most_common_venues defined above.
num_top_venues = 20

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Area']
for ind in np.arange(num_top_venues):
    # only the first three ranks take a special suffix; an explicit bounds
    # check replaces the bare `except:` that could hide unrelated errors
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue Type'.format(ind+1, suffix))

# collect every row first, then build the dataframe once
top_venue_rows = [
    [area_row.iloc[0], *return_most_common_venues(area_row, num_top_venues)]
    for _, area_row in df_area_grouped.iterrows()
]
df_area_venues_sorted = pd.DataFrame(top_venue_rows, columns=columns)

df_area_venues_sorted.head()

Unnamed: 0,Area,1st Most Common Venue Type,2nd Most Common Venue Type,3rd Most Common Venue Type,4th Most Common Venue Type,5th Most Common Venue Type,6th Most Common Venue Type,7th Most Common Venue Type,8th Most Common Venue Type,9th Most Common Venue Type,...,11th Most Common Venue Type,12th Most Common Venue Type,13th Most Common Venue Type,14th Most Common Venue Type,15th Most Common Venue Type,16th Most Common Venue Type,17th Most Common Venue Type,18th Most Common Venue Type,19th Most Common Venue Type,20th Most Common Venue Type
0,Acca,Playground,Platform,Women's Store,Doctor's Office,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,...,Electronics Store,Donut Shop,Dog Run,Discount Store,Food,Diner,Dessert Shop,Department Store,Deli / Bodega,Cuban Restaurant
1,Bellwood,Discount Store,Convenience Store,Mexican Restaurant,Chinese Restaurant,Carpet Store,Fish & Chips Shop,Fast Food Restaurant,Women's Store,Electronics Store,...,Flea Market,Fish Market,Farmers Market,Farm,Doctor's Office,Donut Shop,Dog Run,Food Truck,Diner,Dessert Shop
2,Bellwood Census Designated Place,Motel,Latin American Restaurant,American Restaurant,Food,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant,...,Farm,Electronics Store,Donut Shop,Women's Store,Food Truck,Doctor's Office,Discount Store,Diner,Dessert Shop,Department Store
3,Bellwood Manor,Southern / Soul Food Restaurant,Cuban Restaurant,Restaurant,Diner,Breakfast Spot,Women's Store,Donut Shop,Flea Market,Fish Market,...,Fast Food Restaurant,Farmers Market,Farm,Electronics Store,Doctor's Office,Dog Run,Comfort Food Restaurant,Discount Store,Dessert Shop,Department Store
4,Bensley,Flea Market,American Restaurant,Supermarket,Donut Shop,Women's Store,Dog Run,Flower Shop,Fish Market,Fish & Chips Shop,...,Farmers Market,Farm,Electronics Store,Doctor's Office,Food Truck,Discount Store,Diner,Dessert Shop,Department Store,Deli / Bodega


In [None]:
#Understanding the size of the dataframe
df_area_venues_sorted.shape

(150, 21)

In [None]:
# set number of clusters and apply kmeans clustering algorithm
kclusters = 5
# Drop the label column by keyword: pandas 2.x no longer accepts the axis as a
# second positional argument to DataFrame.drop().
df_area_grouped_clustering = df_area_grouped.drop(columns='Area')

# run k-means clustering
# n_init is pinned so the number of restarts does not depend on the installed
# scikit-learn version's default.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(df_area_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:200] 

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 3, 1, 4, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 3, 2, 1, 1, 1, 4, 1, 1, 1, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [None]:
# add clustering labels
# insert() raises if the column already exists, so remove any copy left by a
# previous run of this cell first; this makes the cell safe to re-run.
if 'Cluster Labels' in df_area_venues_sorted.columns:
    df_area_venues_sorted = df_area_venues_sorted.drop(columns='Cluster Labels')
df_area_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# attach the cluster label and top venue types to each area's state/county/coordinates;
# areas without any venue get NaN in the joined columns
df_area_venues_merged = df_area.join(df_area_venues_sorted.set_index('Area'), on='Area')

df_area_venues_merged.head() # check the last columns!

Unnamed: 0,State,County,Area,Latitude,Longitude,Cluster Labels,1st Most Common Venue Type,2nd Most Common Venue Type,3rd Most Common Venue Type,4th Most Common Venue Type,...,11th Most Common Venue Type,12th Most Common Venue Type,13th Most Common Venue Type,14th Most Common Venue Type,15th Most Common Venue Type,16th Most Common Venue Type,17th Most Common Venue Type,18th Most Common Venue Type,19th Most Common Venue Type,20th Most Common Venue Type
0,VA,Chesterfield,Beulah,37.424593,-77.47054,1.0,Pizza Place,Pharmacy,Department Store,Asian Restaurant,...,Fast Food Restaurant,Farmers Market,Farm,Electronics Store,Discount Store,Dog Run,Doctor's Office,Food Truck,Diner,Dessert Shop
1,VA,Henrico,Capitol View,37.47598,-77.390537,,,,,,...,,,,,,,,,,
2,VA,Chesterfield,Five Forks,37.407371,-77.566933,,,,,,...,,,,,,,,,,
3,VA,Henrico,Hunton,37.688478,-77.499707,1.0,Baseball Field,Women's Store,Donut Shop,Food,...,Farm,Electronics Store,Dog Run,Football Stadium,Doctor's Office,Discount Store,Diner,Dessert Shop,Department Store,Deli / Bodega
4,VA,Henrico,Laurel,37.642923,-77.508874,1.0,Sandwich Place,Light Rail Station,Coffee Shop,Asian Restaurant,...,Frozen Yogurt Shop,Big Box Store,Video Game Store,Golf Course,Farm,Donut Shop,Gym,Dog Run,Doctor's Office,Gym / Fitness Center


In [None]:
# Areas for which no venues were returned have NaN in the joined columns;
# discard those rows, renumber the index and store the cluster label as an integer.
df_area_venues_merged = (
    df_area_venues_merged
    .dropna()
    .reset_index(drop=True)
    .astype({"Cluster Labels": int})
)
df_area_venues_merged.head(20)

Unnamed: 0,State,County,Area,Latitude,Longitude,Cluster Labels,1st Most Common Venue Type,2nd Most Common Venue Type,3rd Most Common Venue Type,4th Most Common Venue Type,...,11th Most Common Venue Type,12th Most Common Venue Type,13th Most Common Venue Type,14th Most Common Venue Type,15th Most Common Venue Type,16th Most Common Venue Type,17th Most Common Venue Type,18th Most Common Venue Type,19th Most Common Venue Type,20th Most Common Venue Type
0,VA,Chesterfield,Beulah,37.424593,-77.47054,1,Pizza Place,Pharmacy,Department Store,Asian Restaurant,...,Fast Food Restaurant,Farmers Market,Farm,Electronics Store,Discount Store,Dog Run,Doctor's Office,Food Truck,Diner,Dessert Shop
1,VA,Henrico,Hunton,37.688478,-77.499707,1,Baseball Field,Women's Store,Donut Shop,Food,...,Farm,Electronics Store,Dog Run,Football Stadium,Doctor's Office,Discount Store,Diner,Dessert Shop,Department Store,Deli / Bodega
2,VA,Henrico,Laurel,37.642923,-77.508874,1,Sandwich Place,Light Rail Station,Coffee Shop,Asian Restaurant,...,Frozen Yogurt Shop,Big Box Store,Video Game Store,Golf Course,Farm,Donut Shop,Gym,Dog Run,Doctor's Office,Gym / Fitness Center
3,VA,Chesterfield,Mount Nebo,37.510703,-77.579433,1,Cosmetics Shop,Pub,Beer Store,Gym / Fitness Center,...,Asian Restaurant,Athletics & Sports,Food,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm
4,VA,Henrico,Glendale,37.445702,-77.233031,1,Gas Station,Women's Store,Food Truck,Flower Shop,...,Electronics Store,Donut Shop,Dog Run,Doctor's Office,Discount Store,Diner,Dessert Shop,Department Store,Deli / Bodega,Cuban Restaurant
5,VA,Chesterfield,Hallsboro,37.488482,-77.725272,1,Antique Shop,Women's Store,Dog Run,Food,...,Farm,Electronics Store,Donut Shop,Doctor's Office,Football Stadium,Discount Store,Diner,Dessert Shop,Department Store,Deli / Bodega
6,VA,Henrico,Manbur,37.534591,-77.342202,1,Social Club,Bed & Breakfast,Historic Site,Women's Store,...,Farm,Electronics Store,Donut Shop,Doctor's Office,Food,Discount Store,Diner,Dessert Shop,Department Store,Deli / Bodega
7,VA,Chesterfield,Robious,37.522092,-77.612212,1,Coffee Shop,Salon / Barbershop,Video Store,American Restaurant,...,Flower Shop,Food Truck,Diner,Sports Bar,Sushi Restaurant,Cosmetics Shop,Seafood Restaurant,Pet Store,Italian Restaurant,Weight Loss Center
8,VA,Henrico,Henrico County,37.457705,-77.296586,1,Home Service,Women's Store,Dog Run,Flower Shop,...,Electronics Store,Donut Shop,Doctor's Office,Food Truck,Discount Store,Diner,Dessert Shop,Department Store,Deli / Bodega,Cuban Restaurant
9,VA,Chesterfield,Bellwood,37.421815,-77.437483,1,Discount Store,Convenience Store,Mexican Restaurant,Chinese Restaurant,...,Flea Market,Fish Market,Farmers Market,Farm,Doctor's Office,Donut Shop,Dog Run,Food Truck,Diner,Dessert Shop


In [None]:
# create a map of Richmond showing the clusters obtained from k-means clustering
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(df_area_venues_merged['Latitude'], df_area_venues_merged['Longitude'], df_area_venues_merged['Area'], df_area_venues_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # labels run 0..kclusters-1, so they index the palette directly instead of
    # relying on rainbow[-1] wrapping around for cluster 0
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [None]:
# Show the areas of the 1st cluster (label 0): Area plus every column from 'Cluster Labels' onwards
in_cluster = df_area_venues_merged['Cluster Labels'].eq(0)
shown_positions = [2, *range(5, df_area_venues_merged.shape[1])]
df_area_venues_merged.loc[in_cluster].iloc[:, shown_positions]

In [None]:
# Show the areas of the 2nd cluster (label 1): Area plus every column from 'Cluster Labels' onwards
in_cluster = df_area_venues_merged['Cluster Labels'].eq(1)
shown_positions = [2, *range(5, df_area_venues_merged.shape[1])]
df_area_venues_merged.loc[in_cluster].iloc[:, shown_positions]

In [None]:
# Show the areas of the 3rd cluster (label 2): Area plus every column from 'Cluster Labels' onwards
in_cluster = df_area_venues_merged['Cluster Labels'].eq(2)
shown_positions = [2, *range(5, df_area_venues_merged.shape[1])]
df_area_venues_merged.loc[in_cluster].iloc[:, shown_positions]

In [None]:
# Show the areas of the 4th cluster (label 3): Area plus every column from 'Cluster Labels' onwards
in_cluster = df_area_venues_merged['Cluster Labels'].eq(3)
shown_positions = [2, *range(5, df_area_venues_merged.shape[1])]
df_area_venues_merged.loc[in_cluster].iloc[:, shown_positions]

In [None]:
# Show the areas of the 5th cluster (label 4): Area plus every column from 'Cluster Labels' onwards
in_cluster = df_area_venues_merged['Cluster Labels'].eq(4)
shown_positions = [2, *range(5, df_area_venues_merged.shape[1])]
df_area_venues_merged.loc[in_cluster].iloc[:, shown_positions]

In [None]:
# Drop the column 'Cluster Labels' from the dataframe so that 'Area' is the
# first column again. errors='ignore' makes the cell safe to run when the
# column has already been removed (or was never added).
df_area_venues_sorted = df_area_venues_sorted.drop(columns='Cluster Labels', errors='ignore')

df_area_venues_sorted.head()

In [None]:
# Preparation of a summary table with the Area Name, Cluster information, Ranking of pools in the area and Pool Count.
# Starting with Area Name and Ranking of Pools amongst other Common Venues in the Area

x_max=df_area_venues_sorted.shape[1]
y_max=df_area_venues_sorted.shape[0]
print(x_max, y_max)

# Only the "... Most Common Venue Type" columns are ranked; their left-to-right
# order gives rank 1, 2, ... This does not depend on 'Area' being the first
# column and never compares the area name itself against 'Pool'.
venue_rank_columns = [
    col for col in df_area_venues_sorted.columns
    if str(col).endswith('Most Common Venue Type')
]

# rows are collected rank by rank, keeping the table's row order within a rank
data = []
for rank, rank_column in enumerate(venue_rank_columns, start=1):
    has_pool_at_rank = df_area_venues_sorted[rank_column] == 'Pool'
    for area_name in df_area_venues_sorted.loc[has_pool_at_rank, 'Area']:
        data.append([area_name, rank])

# built once, with named columns, so the frame exists even when no area lists 'Pool'
df_pool_ranking = pd.DataFrame(
    data,
    columns=['Area Name', 'Ranking of Pools amongst other Common Venues in the Area'])

df_pool_ranking



In [None]:
#Count of the Pools for each Area where pools are there
df_pool_cnt

In [None]:
# Combine the per-area Pool Count with the pool ranking, expose the area as an
# 'Area Name' column and order the rows by the ranking.
df_pool_cnt_rank_merged = (
    df_pool_cnt
    .join(df_pool_ranking.set_index('Area Name'), on='Area')
    .reset_index()
    .rename(columns={'Area': 'Area Name'})
    .sort_values(by=['Ranking of Pools amongst other Common Venues in the Area'])
)
df_pool_cnt_rank_merged

In [None]:
# Build the master dataframe: Pool Count and Ranking joined with the cluster
# label, location and top venue types held in df_area_venues_merged.
area_details = df_area_venues_merged.set_index('Area')
df_pool_cnt_rank_cluster_merged = df_pool_cnt_rank_merged.join(area_details, on='Area Name')

df_pool_cnt_rank_cluster_merged # check the last columns!

In [None]:
# Select the relevant columns from the master dataframe for the results discussion and conclusion analysis.
selection_columns = [
    'Area Name',
    'Cluster Labels',
    'Ranking of Pools amongst other Common Venues in the Area',
    'Pool Count',
]
# A single multi-key sort: highest Pool Count first, ties ordered by cluster
# label. Two consecutive sort_values calls would let the second discard the first.
df_pool_selection = (
    df_pool_cnt_rank_cluster_merged[selection_columns]
    .sort_values(by=['Pool Count', 'Cluster Labels'], ascending=[False, True])
    .reset_index(drop=True)
)
df_pool_selection


In [None]:
# Replace the numeric cluster labels with descriptive names, based on the type of
# venues each cluster mostly represents, to form the final pool selection dataset.
# Labels without an entry in the mapping are left unchanged.
cluster_names = {0: 'Pool Cluster', 1: 'Restaurant Cluster'}
df_pool_selection = df_pool_selection.replace({'Cluster Labels': cluster_names})
df_pool_selection