<h1 align=center><font size = 6>Segmenting and Clustering Neighborhoods in Philadelphia</font></h1>

## 1. Get, Clean and Transform data into Dataframe

#### Install packages

In [None]:
!pip install lxml
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
!pip install beautifulsoup4
!pip install uszipcode
!pip install geocoder

#### Import libraries

In [None]:

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files


from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans


import folium # map rendering library


from bs4 import BeautifulSoup

import geocoder # import geocoder
from uszipcode import SearchEngine

### Scrape neighborhoods data from Wiki and transform to dataframe

In [None]:
# Download the Wikipedia page and parse it with the html5lib parser.
response = requests.get('https://en.wikipedia.org/wiki/Callowhill,_Philadelphia')
soup = BeautifulSoup(response.text, 'html5lib')

# Text of every <li> element on the page, without the first 14 entries.
all_li_tag = [li.text for li in soup.find_all('li')][14:]

# One row per remaining list item; 'Borough' starts out as an empty string.
philly_neighborhoods = pd.DataFrame({"Borough":'',"Neighborhoods":all_li_tag})

# Discard the last 79 rows of the table.
philly_neighborhoods = philly_neighborhoods.drop(index=philly_neighborhoods.index[-79:])
philly_neighborhoods

In [None]:
## Add Borough to philly_neighborhoods
# Each borough covers a contiguous, inclusive range of row labels.
# Assign through a single .loc[rows, column] call: the chained form
# .loc[rows]['Borough'] = ... may write to a temporary copy and leave the
# dataframe unchanged (and never writes through under pandas copy-on-write).
borough_row_ranges = [
    (slice(0, 12), "Center City"),
    (slice(13, 48), "South"),
    (slice(49, 59), "Southwest"),
    (slice(60, 111), "Lower North"),
    (slice(112, 120), "North, Uper North"),
    (slice(121, 128), "Olney-Oak Lane"),
    (slice(129, 134), "Lower Northwest"),
    (slice(135, 141), "Uper Northwest"),
    (slice(142, 157), "Near Northeast"),
    (slice(158, 172), "Far Northeast"),
    (slice(173, None), "River Wards"),
]
for rows, borough in borough_row_ranges:
    philly_neighborhoods.loc[rows, 'Borough'] = borough

### Get coordinates for neighborhoods - Add them to dataframe

In [None]:
# Coordinates start as NaN; rows that cannot be geocoded keep NaN.
philly_neighborhoods["lon"] = np.nan
philly_neighborhoods["lat"] = np.nan

geolocator = Nominatim(user_agent="philly_explorer")
MAX_GEOCODE_ATTEMPTS = 3  # the geocoder sometimes returns None; ask again a few times

# Iterate over the real row labels so the write below targets the right row
# even when the index is not 0..n-1.
for row_label, neighborhood in philly_neighborhoods['Neighborhoods'].items():
    address = neighborhood + ", Philadelphia, Pennsylvania"
    location = None
    for _ in range(MAX_GEOCODE_ATTEMPTS):
        location = geolocator.geocode(address)
        if location is not None:
            break

    if location is None:
        continue

    # .at[row, column] writes into the dataframe itself; the chained form
    # df["lat"][i] = ... may silently update a temporary copy instead.
    philly_neighborhoods.at[row_label, "lat"] = location.latitude
    philly_neighborhoods.at[row_label, "lon"] = location.longitude

In [None]:
# Save the dataframe to a CSV file, since the geolocator is not consistent and cannot be run too many times.
# The row index is written as the first, unnamed column of the file.
philly_neighborhoods.to_csv('philly_coordinates.csv')

#### Open dataframe from CSV, dataframe now includes: Borough, Neighborhoods, Latitude (lat) and Longitude (lon)

In [None]:
# Reload the saved table, keep the four working columns and drop every row
# that has a missing value in any of them.
philly_data = (
    pd.read_csv("philly_coordinates.csv")
    .loc[:, ["Borough","Neighborhoods","lat","lon"]]
    .dropna()
)
philly_data.head()

### Use uszipcode library to get population density and average income for each neighborhood

#### Get population_density, average household income for each neighborhood. Add them to dataframe

In [None]:
search = SearchEngine(simple_zipcode=True) # set simple_zipcode=False to use rich info database

# philly_data column name -> uszipcode attribute that is averaged into it
ZIP_ATTRIBUTES = {
    'average income': 'median_household_income',
    'population density': 'population_density',
    'average housing units': 'housing_units',
    'average home value': 'median_home_value',
}


def _zip_contains(zipcode, lat, lon):
    """Return True when (lat, lon) lies strictly inside the zipcode's bounding box.

    Signed coordinates are compared directly, so the check does not depend on the
    hemisphere. A zipcode with any missing bound never matches.
    """
    bounds = (zipcode.bounds_south, zipcode.bounds_north,
              zipcode.bounds_west, zipcode.bounds_east)
    if any(bound is None for bound in bounds):
        return False
    south, north, west, east = bounds
    return south < lat < north and west < lon < east


def _mean_or_nan(values):
    """Average the truthy entries of `values`; NaN when nothing is left.

    Falsy entries (None, and also 0) are treated as missing.
    """
    kept = [value for value in values if value]
    return float(np.mean(kept)) if kept else np.nan


# For every neighborhood: look up the zipcodes near its coordinates, keep those
# whose bounding box contains the point, and average each attribute over them.
zip_stats = {column: [] for column in ZIP_ATTRIBUTES}
for lat, lon in zip(philly_data['lat'], philly_data['lon']):
    nearby_zipcodes = search.by_coordinates(lat, lon, radius=20)
    matching = [z for z in nearby_zipcodes if _zip_contains(z, lat, lon)]
    for column, attribute in ZIP_ATTRIBUTES.items():
        zip_stats[column].append(_mean_or_nan(getattr(z, attribute) for z in matching))

# The lists are in row order, so they are assigned positionally.
for column, values in zip_stats.items():
    philly_data[column] = values

In [None]:
## Final dataframe: preview its first rows
philly_data.head()

### Create a map of Philly Neighborhoods

In [None]:
# Optional workaround, disabled by default: starting a notebook server from inside a
# cell blocks the kernel until it is interrupted, so "Run All" would never get past
# this point. If the geolocator package fails to resolve addresses, run the command
# below from a terminal instead.
# !jupyter notebook --ip=0.0.0.0 --allow-root

In [None]:
# Look up the city itself; latitude/longitude are reused as the map centre.
geolocator = Nominatim(user_agent="phil_explorer")
location = geolocator.geocode("Philadelphia, PA")
latitude, longitude = location.latitude, location.longitude

coordinate_message = 'The geograpical coordinate of Philadelphia are {}, {}.'
print(coordinate_message.format(latitude, longitude))

In [None]:
# Base map centred on the city coordinates.
map_philly = folium.Map(location=[latitude, longitude], zoom_start=11)

# Draw one blue circle per neighborhood; clicking it shows the neighborhood name.
marker_rows = philly_data[['lat', 'lon', 'Neighborhoods']].itertuples(index=False, name=None)
for lat, lng, hood_name in marker_rows:
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(hood_name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_philly)

map_philly

### Define Foursquare Credentials and Version

In [None]:
import os

# API credentials must not be stored in the notebook. Export FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET in the environment that starts Jupyter. Keys that were
# committed in an earlier version of this notebook should be revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20191012' # Foursquare API version

# Report only whether the credentials are present; never echo the secret itself.
if CLIENT_ID and CLIENT_SECRET:
    print('Foursquare credentials loaded from the environment.')
else:
    print('WARNING: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before calling the Foursquare API.')

radius = 500
LIMIT = 100

### Create a function to explore all neighborhoods in Philadelphia

In [None]:

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the Foursquare venues returned for each neighborhood.

    For every (name, latitude, longitude) triple one request is sent to the
    Foursquare ``venues/explore`` endpoint, using the module-level CLIENT_ID,
    CLIENT_SECRET, VERSION and LIMIT values.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Walked in parallel with ``zip``, so iteration stops at the shortest one.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhoods',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty, but still has these columns, when no venue is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    columns = ['Neighborhoods',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string instead of formatting it by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # do not hang forever on a stalled connection
        )
        response.raise_for_status()  # fail here rather than with a KeyError on the payload

        # A response without any group simply contributes no rows.
        groups = response.json()["response"].get('groups') or []
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without categories gets None instead of raising IndexError
                categories[0]['name'] if categories else None,
            ))

    # Passing the columns to the constructor keeps the schema even when there are no rows.
    return pd.DataFrame(rows, columns=columns)

### Get venues in Philadelphia neighbourhoods

In [None]:

# Query the venues around every neighborhood, using the function's default radius.
philly_venues = getNearbyVenues(
    philly_data['Neighborhoods'],
    philly_data['lat'],
    philly_data['lon'],
)


In [None]:
## Check the size of the philly_venues dataframe (rows, columns), then preview its first rows
print(philly_venues.shape)
philly_venues.head()

In [None]:
# Number of distinct venue categories among all the returned venues
category_count = philly_venues['Venue Category'].unique().size
print('There are {} uniques categories.'.format(category_count))

## 2. Analyze Each Neighborhood

In [None]:
# one hot encoding: one indicator column per venue category
philly_onehot = pd.get_dummies(philly_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
philly_onehot['Neighborhoods'] = philly_venues['Neighborhoods']

# move neighborhood column to the first column; the reordered frame has to be
# assigned back to philly_onehot, otherwise the reordering is never applied
fixed_columns = [philly_onehot.columns[-1]] + list(philly_onehot.columns[:-1])
philly_onehot = philly_onehot[fixed_columns]

philly_onehot.head()

In [None]:
# (rows, columns) of the one-hot encoded venue table
philly_onehot.shape

#### Group rows by neighborhood, taking the mean of the frequency of occurrence of each category


In [None]:

# Average every indicator column per neighborhood, then turn the group key
# back into an ordinary first column.
category_frequencies = philly_onehot.groupby('Neighborhoods').mean()
philly_grouped = category_frequencies.reset_index()
philly_grouped

In [None]:
# (rows, columns) of the per-neighborhood frequency table
philly_grouped.shape

#### Print each neighborhood along with the top 5 most common venues

In [None]:
num_top_venues = 5

# For each neighborhood row, print its most frequent venue categories.
for _, hood_row in philly_grouped.iterrows():
    print("----"+hood_row['Neighborhoods']+"----")
    frequencies = (
        hood_row.iloc[1:]            # skip the leading 'Neighborhoods' entry
        .astype(float)
        .round(2)
        .rename_axis('venue')
        .reset_index(name='freq')    # two columns: venue, freq
    )
    ranked = frequencies.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')


#### Put results into a pandas dataframe

Write a function to sort the venues in descending order.


In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of `row`, ignoring its first entry.

    The first entry of `row` is skipped, the remaining values are sorted in
    descending order, and the index labels of the first `num_top_venues` of them
    are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

Create the new dataframe and display the top 10 venues for each neighborhood.

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues: 1st, 2nd, 3rd, then 4th, 5th, ...
# The suffix is chosen with an explicit bounds check rather than a bare `except`,
# which would also hide unrelated errors.
columns = ['Neighborhoods']
for rank in range(1, num_top_venues + 1):
    suffix = indicators[rank - 1] if rank <= len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# create a new dataframe with one row per neighborhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhoods'] = philly_grouped['Neighborhoods']

# fill the ranking columns of each row with that neighborhood's top categories
for ind in range(philly_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(philly_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

## 4. Cluster Neighborhoods

#### Run *k*-means to cluster the neighborhoods into 10 clusters.

In [None]:
# set number of clusters
kclusters = 10

# Feature matrix: the per-category frequencies without the name column.
# `columns=` replaces the positional axis argument, which pandas 2 no longer accepts.
philly_grouped_clustering = philly_grouped.drop(columns='Neighborhoods')

# run k-means clustering; n_init is pinned so the result does not depend on the
# default of the installed scikit-learn version
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(philly_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

#### Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [None]:
# add clustering labels; a column left over from a previous run is dropped first,
# because insert() raises when the column already exists
neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# merge the cluster labels and top venues into philly_data, which carries the
# borough, coordinates and zipcode statistics of each neighborhood
philly_merged = philly_data.join(neighborhoods_venues_sorted.set_index('Neighborhoods'), on='Neighborhoods')

# Neighborhoods for which no venue was returned are missing from the clustered
# table, so the join leaves their label as NaN and makes the column float.
# Drop those rows and restore integer labels so they can be used as list indices.
philly_merged = philly_merged.dropna(subset=['Cluster Labels'])
philly_merged['Cluster Labels'] = philly_merged['Cluster Labels'].astype(int)

philly_merged.head() # check the last columns!

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(philly_merged['lat'], philly_merged['lon'], philly_merged['Neighborhoods'], philly_merged['Cluster Labels']):
    # rows without a cluster label (NaN after the join) cannot be colored
    if pd.isna(cluster):
        continue
    # a label column containing NaN is float; list indices must be integers
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## 5. Examine Clusters

In [None]:
# Cluster label 0: keep the column at position 1 plus every column from position 4 onward
cluster_1 = philly_merged[philly_merged['Cluster Labels'] == 0].iloc[:, [1, *range(4, len(philly_merged.columns))]]
cluster_1.sort_values(by=['population density','average home value'], ascending=False).head()

In [None]:
# Cluster label 1: keep the column at position 1 plus every column from position 4 onward
cluster_2 = philly_merged[philly_merged['Cluster Labels'] == 1].iloc[:, [1, *range(4, len(philly_merged.columns))]]
cluster_2.sort_values(by=['population density','average home value'], ascending=False).head()

In [None]:
# Cluster label 2: keep the column at position 1 plus every column from position 4 onward
cluster_3 = philly_merged[philly_merged['Cluster Labels'] == 2].iloc[:, [1, *range(4, len(philly_merged.columns))]]
cluster_3.sort_values(by=['population density','average income'], ascending=False).head()

In [None]:
# Cluster label 3: keep the column at position 1 plus every column from position 4 onward
cluster_4 = philly_merged[philly_merged['Cluster Labels'] == 3].iloc[:, [1, *range(4, len(philly_merged.columns))]]
cluster_4.sort_values(by=['population density','average income'], ascending=False).head()

In [None]:
# Cluster label 4: keep the column at position 1 plus every column from position 4 onward
cluster_5 = philly_merged[philly_merged['Cluster Labels'] == 4].iloc[:, [1, *range(4, len(philly_merged.columns))]]
cluster_5.sort_values(by=['population density','average income'], ascending=False).head()

In [None]:
# Cluster label 5: keep the column at position 1 plus every column from position 4 onward
cluster_6 = philly_merged[philly_merged['Cluster Labels'] == 5].iloc[:, [1, *range(4, len(philly_merged.columns))]]
cluster_6.sort_values(by=['population density','average income'], ascending=False).head()

In [None]:
# Cluster label 6: keep the column at position 1 plus every column from position 4 onward
cluster_7 = philly_merged[philly_merged['Cluster Labels'] == 6].iloc[:, [1, *range(4, len(philly_merged.columns))]]
cluster_7.sort_values(by=['population density','average home value'], ascending=False).head()

In [None]:
# Cluster label 7: keep the column at position 1 plus every column from position 4 onward
cluster_8 = philly_merged[philly_merged['Cluster Labels'] == 7].iloc[:, [1, *range(4, len(philly_merged.columns))]]
cluster_8.sort_values(by=['population density','average income'], ascending=False).head()

In [None]:
# Cluster label 8: keep the column at position 1 plus every column from position 4 onward
cluster_9 = philly_merged[philly_merged['Cluster Labels'] == 8].iloc[:, [1, *range(4, len(philly_merged.columns))]]
cluster_9.sort_values(by=['population density','average income'], ascending=False).head()

In [None]:
# Cluster label 9: keep the column at position 1 plus every column from position 4 onward
cluster_10 = philly_merged[philly_merged['Cluster Labels'] == 9].iloc[:, [1, *range(4, len(philly_merged.columns))]]
cluster_10.sort_values(by=['population density','average income'], ascending=False).head()