##### Installing Packages

In [1]:
# Installing conda packages
!conda install -c conda-forge googlemaps #For geo coordinates
!conda install -c conda-forge folium


Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - googlemaps


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    googlemaps-2.5.1           |             py_0          23 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.4 MB

The following NEW packages will be INSTALLED:

    googlemaps:      2.5.1-py_0        conda-forge

The following packages will be UPDATED:

    ca-certificates: 2019.11.27-0                  --> 2019.11.28-hecc5488_0 conda-forge
    certifi:         2019.11.28-py36_0    

In [59]:
##Importing necessary packages

# For data handling
import numpy as np
import pandas as pd 

#For extracting HTML information from URL
import requests
from bs4 import BeautifulSoup

#For Clustering
from sklearn.cluster import KMeans

#For Visulization
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import folium

#For Geo-Coordinates
from googlemaps import Client as GoogleMaps
from geopy.geocoders import Nominatim


In [60]:
##Url to San Francisco Neighborhood Wiki Page

url = 'https://en.wikipedia.org/wiki/List_of_neighborhoods_in_San_Francisco'

results_url = requests.get(url).text


In [61]:
#Creating soup object to extract HTML information

soup = BeautifulSoup(results_url, 'lxml' )
# print(soup.prettify())

In [62]:
#Initiating an empty neighborhood list and populating it

neighborhoods = []
for text in soup.find_all('span', class_='mw-headline'):
    neighborhoods.append(text.text)

neighborhoods = neighborhoods[:-4]

print(neighborhoods)    

['Alamo Square', 'Anza Vista', 'Ashbury Heights', 'Balboa Park', 'Balboa Terrace', 'Bayview', 'Belden Place', 'Bernal Heights', 'Buena Vista', 'Butchertown (Old and New)', 'Castro', 'Cathedral Hill', 'Cayuga Terrace', 'China Basin', 'Chinatown', 'Civic Center', 'Clarendon Heights', 'Cole Valley', 'Corona Heights', 'Cow Hollow', 'Crocker-Amazon', 'Design District', 'Diamond Heights', 'Dogpatch', 'Dolores Heights', 'Duboce Triangle', 'Embarcadero', 'Eureka Valley', 'Excelsior', 'Fillmore', 'Financial District', 'Financial District South', "Fisherman's Wharf", 'Forest Hill', 'Forest Knolls', 'Glen Park', 'Golden Gate Heights', 'Haight-Ashbury', 'Hayes Valley', 'Hunters Point', 'India Basin', 'Ingleside', 'Ingleside Terraces', 'Inner Sunset', 'Irish Hill', 'Islais Creek', 'Jackson Square', 'Japantown', 'Jordan Park', 'Laguna Honda', 'Lake Street', 'Lakeside', 'Lakeshore', 'Laurel Heights', 'Lincoln Manor', 'Little Hollywood', 'Little Russia', 'Little Saigon', 'Lone Mountain', 'Lower Haight

In [63]:
#Creating a df from the extracted neighborhood list

df  = pd.DataFrame(data=[neighborhoods]).T
df.columns = ['Neighborhood']
df.head()

Unnamed: 0,Neighborhood
0,Alamo Square
1,Anza Vista
2,Ashbury Heights
3,Balboa Park
4,Balboa Terrace


In [64]:
# The code was removed by Watson Studio for sharing.

In [65]:
#Adding lat & long columns to data frame

df['lat'] = ""
df['long']=""

#Creating an Address Column

df['Address'] = df['Neighborhood'].astype(str) + ', San Francisco'
df.head()

Unnamed: 0,Neighborhood,lat,long,Address
0,Alamo Square,,,"Alamo Square, San Francisco"
1,Anza Vista,,,"Anza Vista, San Francisco"
2,Ashbury Heights,,,"Ashbury Heights, San Francisco"
3,Balboa Park,,,"Balboa Park, San Francisco"
4,Balboa Terrace,,,"Balboa Terrace, San Francisco"


In [66]:
#Populate Latitude & Longitude from Google Maps API

for x in range(len(df)):
    geocode_result = gmaps.geocode(df['Address'][x])
    df['lat'][x] = geocode_result[0]['geometry']['location'] ['lat']
    df['long'][x] = geocode_result[0]['geometry']['location']['lng']
    
df.head()

Unnamed: 0,Neighborhood,lat,long,Address
0,Alamo Square,37.7775,-122.433,"Alamo Square, San Francisco"
1,Anza Vista,37.7809,-122.443,"Anza Vista, San Francisco"
2,Ashbury Heights,37.7653,-122.445,"Ashbury Heights, San Francisco"
3,Balboa Park,37.7246,-122.443,"Balboa Park, San Francisco"
4,Balboa Terrace,37.7313,-122.469,"Balboa Terrace, San Francisco"


In [67]:
#Export to CSV

df.to_csv('SanFranNeighborhood.csv')

##### San Francisco map

In [68]:
#Get Coordinates for San Francisco 

address = 'San Francisco, California'

geolocator = Nominatim(user_agent="sanfran_details")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

print("The longitude is {} and the latitude is {}".format(longitude, latitude))

The longitude is -122.4192363 and the latitude is 37.7792808


In [69]:
#Create Map

sanfran_map = folium.Map(location = [latitude, longitude], zoom_start = 12)

for lat, long,neighborhood in zip(df['lat'], df['long'], df['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
        [lat,long],
        radius = 5,
        popup = label,
        color = 'green',
        fill= True,
        fill_opacity = 0.5,
        parse_html = False).add_to(sanfran_map)
sanfran_map
        

##### Set up FourSqaure credentials

In [70]:
# The code was removed by Watson Studio for sharing.

In [71]:
#Declare global variables


limit = 500
radiustoexplore = 500

In [72]:
def getNearbyVenues(neighborhood, latitudes, longitudes, radius=radiustoexplore):
    
    venues_list=[]
    for neighborhood, lat, lng in zip(neighborhood, latitudes, longitudes):
        print(neighborhood)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        for v in results:
            
            venues_list.append((
                neighborhood, 
                lat, 
                lng, 
                v['venue']['name'], 
                v['venue']['location']['lat'], 
                v['venue']['location']['lng'],  
                v['venue']['categories'][0]['name']))

    nearby_venues = pd.DataFrame(venues_list)
    
    return(nearby_venues)

In [73]:

sanfran_venues = getNearbyVenues(  neighborhood=df['Neighborhood'],
                                   latitudes=df['lat'],
                                   longitudes=df['long']
                                  )


Alamo Square
Anza Vista
Ashbury Heights
Balboa Park
Balboa Terrace
Bayview
Belden Place
Bernal Heights
Buena Vista
Butchertown (Old and New)
Castro
Cathedral Hill
Cayuga Terrace
China Basin
Chinatown
Civic Center
Clarendon Heights


KeyError: 'groups'

In [None]:
sanfran_venues
sanfran_venues.columns = ['Neighborhood', 'NeighborhoodLatitude', 'NeighborhoodLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']
sanfran_venues

In [None]:
# #Explore 

placetoexplore = 'Restaurant'

print('There are ',len(sanfran_venues['VenueCategory'].unique()),' venue categories around San Francisco')

#How many restaturant categories 

uniquerestaurants = sanfran_venues[sanfran_venues['VenueCategory'].str.contains('{}'.format(placetoexplore))]['VenueCategory'].unique().tolist()

print('There are', len(uniquerestaurants), ' unique restaurants in SF area')
uniquerestaurants

##Explore Other Categories

sanfran_venues['VenueCategory'].unique()

##Create a df for restaurants

restaurantdf = sanfran_venues[sanfran_venues['VenueCategory'].str.contains('{}'.format(placetoexplore))]
restaurantdf.head()

In [None]:
#Encode VenueCategory column

sfrestaurant_onehot = pd.get_dummies(restaurantdf[['VenueCategory']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
sfrestaurant_onehot['Neighborhood'] = restaurantdf['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [sfrestaurant_onehot.columns[-1]] + list(sfrestaurant_onehot.columns[:-1])
sfrestaurant_onehot = sfrestaurant_onehot[fixed_columns]

sfrestaurant_onehot.head()

In [None]:
sfrestaurant_grouped = sfrestaurant_onehot.groupby('Neighborhood').mean().reset_index()
sfrestaurant_grouped

In [None]:
sfrestaurant_grouped.shape

In [None]:
num_top_venues = 5

for hood in sfrestaurant_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = sfrestaurant_grouped[sfrestaurant_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']



In [None]:
# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = sfrestaurant_grouped['Neighborhood']

for ind in np.arange(sfrestaurant_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(sfrestaurant_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

In [None]:
##Choosing the best value of K for Kmeans clustering 
sfrestaurant_grouped_clustering = sfrestaurant_grouped.drop('Neighborhood', 1)

cost =[] 
for i in range(1,10): 
    KM = KMeans(n_clusters = i, max_iter = 500) 
    KM.fit(sfrestaurant_grouped_clustering) 
      
    # calculates squared error 
    # for the clustered points 
    cost.append(KM.inertia_)   
    
# plot the cost against K values 
plt.figure(figsize= (12,8))
plt.plot(range(1, 10), cost, color ='g', linewidth ='2') 
plt.xlabel("Value of K") 
plt.ylabel("Sqaured Error (Cost)") 
plt.show() # clear the plot 



##### K=5 is the elbow point for inertia metric graph. The cluster size is set to 5

In [None]:
#Clustering

# set number of clusters
clusters = 5

sfrestaurant_grouped_clustering = sfrestaurant_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=clusters, random_state=0).fit(sfrestaurant_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

In [None]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

sanfran_merged = restaurantdf

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
sanfran_merged = sanfran_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

sanfran_merged.head() # check the last columns!

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(clusters)
ys = [i + x + (i*x)**2 for i in range(clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(sanfran_merged['NeighborhoodLatitude'], sanfran_merged['NeighborhoodLongitude'], sanfran_merged['Neighborhood'], sanfran_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.4).add_to(map_clusters)
       
map_clusters

#### Examine Clusters

In [None]:
#Cluster 1

sanfran_merged.loc[sanfran_merged['Cluster Labels'] == 0, sanfran_merged.columns[[1] + list(range(5, sanfran_merged.shape[1]))]]

In [None]:
#Cluster 2

sanfran_merged.loc[sanfran_merged['Cluster Labels'] == 1, sanfran_merged.columns[[1] + list(range(5, sanfran_merged.shape[1]))]]

In [None]:
#Cluster 3

sanfran_merged.loc[sanfran_merged['Cluster Labels'] == 2, sanfran_merged.columns[[1] + list(range(5, sanfran_merged.shape[1]))]]

In [None]:
#Cluster 4 
sanfran_merged.loc[sanfran_merged['Cluster Labels'] == 3, sanfran_merged.columns[[1] + list(range(5, sanfran_merged.shape[1]))]]

In [None]:
#Cluster 5 
sanfran_merged.loc[sanfran_merged['Cluster Labels'] == 4, sanfran_merged.columns[[1] + list(range(5, sanfran_merged.shape[1]))]]