### Part 1: Create the SF dataset with neighborhood coordinates

In [72]:
#import libraries
import json
import os
from io import StringIO

import folium
import geocoder
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

In [62]:
#scrape information from healthysf.org
response = requests.get("http://www.healthysf.org/bdi/outcomes/zipmap.htm", timeout = 30)
response.raise_for_status()  # stop here instead of parsing an HTTP error page
web = response.text
soup = BeautifulSoup(web, 'lxml')

# The zip-code table is selected by position (the fifth <table> on the page),
# so check that it exists to get a readable error if the page layout changes.
tables = soup.find_all('table')
if len(tables) < 5:
    raise ValueError('Expected at least 5 <table> elements on the page, found {}'.format(len(tables)))
table = tables[4]

# Recent pandas deprecates passing literal HTML to read_html; wrap it in a file-like object.
table_string = pd.read_html(StringIO(str(table)))

#put zip code and neighborhood name into a dataframe
# Columns are addressed by integer position (0 = zip code, 1 = neighborhood name).
zip_table = table_string[0]
df = pd.DataFrame({'Zip Code':zip_table[0], 'Neighborhood':zip_table[1]}, columns = ['Zip Code', 'Neighborhood'])
df.head(5)

Unnamed: 0,Zip Code,Neighborhood
0,Zip Code,Neighborhood
1,94102,Hayes Valley/Tenderloin/North of Market
2,94103,South of Market
3,94107,Potrero Hill
4,94108,Chinatown


In [63]:
###data cleaning

#remove the rows labelled 0 and len(df) - 1 (first and last), then renumber from 0
# Row 0 holds the header text 'Zip Code' / 'Neighborhood' rather than data.
# NOTE(review): the last row is presumably a footer/total row -- confirm against the page.
df = df.drop(index = [0, len(df) - 1])
df = df.reset_index(drop = True)
df

Unnamed: 0,Zip Code,Neighborhood
0,94102,Hayes Valley/Tenderloin/North of Market
1,94103,South of Market
2,94107,Potrero Hill
3,94108,Chinatown
4,94109,Polk/Russian Hill (Nob Hill)
5,94110,Inner Mission/Bernal Heights
6,94112,Ingelside-Excelsior/Crocker-Amazon
7,94114,Castro/Noe Valley
8,94115,Western Addition/Japantown
9,94116,Parkside/Forest Hill


In [64]:
#add empty-string placeholder columns that the geocoding step fills in
df = df.assign(Latitude = "", Longitude = "")

In [70]:
# Geocode every zip code and store the coordinates in df.
# One geocoder client is enough for all lookups, so it is created once outside the loop.
geolocator = Nominatim(user_agent='sflocator')

latitudes = []
longitudes = []
for zip_code in df['Zip Code']:
    loc = geolocator.geocode('{}, San Francisco, California'.format(zip_code))
    # geocode() returns None when nothing matches; fail with the offending zip code
    # instead of an AttributeError on None.
    if loc is None:
        raise ValueError('Could not geocode zip code {}'.format(zip_code))
    latitudes.append(loc.latitude)
    longitudes.append(loc.longitude)

# Assign whole columns at once. The previous chained form df['Latitude'][i] = ...
# writes through an intermediate object and is not guaranteed to update df
# (it does nothing under pandas Copy-on-Write).
df['Latitude'] = latitudes
df['Longitude'] = longitudes

In [71]:
#display the dataframe with the Latitude and Longitude columns filled in
df

Unnamed: 0,Zip Code,Neighborhood,Latitude,Longitude
0,94102,Hayes Valley/Tenderloin/North of Market,37.7795,-122.418
1,94103,South of Market,37.7744,-122.411
2,94107,Potrero Hill,37.7923,-122.409
3,94108,Chinatown,37.7911,-122.407
4,94109,Polk/Russian Hill (Nob Hill),37.7941,-122.421
5,94110,Inner Mission/Bernal Heights,37.7533,-122.417
6,94112,Ingelside-Excelsior/Crocker-Amazon,37.7234,-122.444
7,94114,Castro/Noe Valley,37.7614,-122.435
8,94115,Western Addition/Japantown,37.7839,-122.435
9,94116,Parkside/Forest Hill,37.7464,-122.473


### Part 2: Fetch data for housing price

In [121]:
#scrape information from Property Shark
# NOTE(review): these look like cookies captured from a browser session (analytics and
# session identifiers). They will presumably expire and should not live in the notebook --
# confirm whether the request works without them or load them from a local, untracked file.
cookies = {'_ga':'GA1.2.950836956.1559897414','_gid':'GA1.2.1335605111.1559897414', 'incap_ses_626_1731432':'dgAQBmovHDplcpyh1gCwCDsl+lwAAAAA8WONBTR8K5SUdynuGR2xqw==','session':'10.97.95.111.1559897414653316','visid_incap_1731432':'LsrpaOF+QvmAL7KxYndh1Tol+lwAAAAAQUIPAAAAAACsVPpMPTyICDVm+ng9Qpg7'}
responseh = requests.get("https://www.propertyshark.com/Real-Estate-Reports/2017/09/28/expensive-zip-codes-san-francisco/", cookies = cookies, timeout = 30)
responseh.raise_for_status()  # stop here instead of parsing an HTTP error page
webh = responseh.text
souph = BeautifulSoup(webh, 'lxml')
tableh = souph.find('table', {'class': 'tablepress tablepress-id-185'})
# find() returns None when the table is absent; without this check read_html would be
# handed the string "None" and fail with an unrelated-looking error.
if tableh is None:
    raise ValueError('Price table (class "tablepress tablepress-id-185") not found in the Property Shark response')
# Recent pandas deprecates passing literal HTML to read_html; wrap it in a file-like object.
tableh_string = pd.read_html(StringIO(str(tableh)))
#put house price data into a dataframe
house_price = tableh_string[0]
house_price.head(5)

Unnamed: 0,Rank,ZIP CODE,MEDIAN PRICE/SQFT 2017,MEDIAN PRICE/SQFT 2016,Y-o-Y Change
0,1,94105,"$1,209","$1,249",-3.31%
1,2,94108,"$1,189","$1,125",5.38%
2,3,94123,"$1,188","$1,226",-3.20%
3,4,94104,"$1,179",$706,40.12%
4,5,94114,"$1,157","$1,100",4.93%


In [124]:
#keep only the zip code and 2017 price columns by dropping the other three
unused_columns = ['Rank', 'MEDIAN PRICE/SQFT 2016', 'Y-o-Y Change']
house_price = house_price.drop(columns = unused_columns)
house_price = house_price.reset_index(drop = True)

In [125]:
#TODO: merge housing data with df -- no merge is performed in this cell as shown; the table below is house_price on its own


Unnamed: 0,ZIP CODE,MEDIAN PRICE/SQFT 2017
0,94105,"$1,209"
1,94108,"$1,189"
2,94123,"$1,188"
3,94104,"$1,179"
4,94114,"$1,157"


### Part 3: Fetch data for nearby venues

In [74]:
#look up the coordinates of San Francisco (used as the map centre)
address = 'San Francisco, California'
geolocator = Nominatim(user_agent = 'sflocator')
location = geolocator.geocode(address)

# unpack both coordinates from the geocoding result in one step
city_lat, city_lng = location.latitude, location.longitude
print('The geographical coordinates of San Francisco are {}, {}'.format(city_lat, city_lng))

The geographical coordinates of San Francisco are 37.7792808, -122.4192363


In [80]:
#map of San Francisco with neighborhoods superimposed on top
sf_map = folium.Map(location = [city_lat, city_lng], zoom_start = 12)

#add one circle marker per neighborhood, with the neighborhood name as its popup
for lat, lng, neighborhood in zip(df['Latitude'], df['Longitude'], df['Neighborhood']):
    # parse_html is an argument of Popup; it is no longer also passed to CircleMarker,
    # which does not define such an option.
    label = folium.Popup(str(neighborhood), parse_html = True)
    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = label,
        color = 'orange',
        fill = True,
        fill_color = '#f49842',
        fill_opacity = 0.6,
    ).add_to(sf_map)

sf_map

In [81]:
#define foursquare credentials and version
# Credentials are read from the environment so that secrets are not stored in the notebook.
# Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hardcoded here has been exposed in the file
# history and should be rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
VERSION = '20180605'  # sent as the `v` query parameter of each Foursquare request

if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; Foursquare requests are expected to fail.')

In [90]:
#explore popular venues in all neighborhoods
def getNearbyVenues(names, latitudes, longitudes, radius = 1000, limit = 750):
    """Query the Foursquare explore endpoint around each neighborhood and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences giving each neighborhood's name and coordinates;
        they are consumed together with zip().
    radius : int, default 1000
        Sent as the `radius` query parameter.
    limit : int, default 750
        Sent as the `limit` query parameter.
        NOTE(review): the API presumably caps results per request well below
        750 -- confirm against the Foursquare documentation.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty, with the same columns, when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If a request comes back with an error status.
    """
    columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # let requests build and escape the query string instead of formatting it by hand
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }
        response = requests.get(endpoint, params = params, timeout = 30)
        response.raise_for_status()  # surface API errors instead of a KeyError further down

        groups = response.json()['response']['groups']
        items = groups[0]['items'] if groups else []

        for item in items:
            venue = item['venue']
            # a venue may come back with an empty category list; record None rather than crash
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            rows.append((name, lat, lng, venue['name'], venue['location']['lat'], venue['location']['lng'], category))

    # passing columns to the constructor keeps the schema even when rows is empty
    nearby_venues = pd.DataFrame(rows, columns = columns)

    return(nearby_venues)

# fetch venues around every neighborhood in df, then display the result
sfvenues = getNearbyVenues(
    names = df['Neighborhood'],
    latitudes = df['Latitude'],
    longitudes = df['Longitude'],
)
sfvenues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Hayes Valley/Tenderloin/North of Market,37.779481,-122.418229,Asian Art Museum,37.780178,-122.416505,Art Museum
1,Hayes Valley/Tenderloin/North of Market,37.779481,-122.418229,Louise M. Davies Symphony Hall,37.777976,-122.420157,Concert Hall
2,Hayes Valley/Tenderloin/North of Market,37.779481,-122.418229,Herbst Theater,37.779548,-122.420953,Concert Hall
3,Hayes Valley/Tenderloin/North of Market,37.779481,-122.418229,War Memorial Opera House,37.778601,-122.420816,Opera House
4,Hayes Valley/Tenderloin/North of Market,37.779481,-122.418229,Philz Coffee,37.781433,-122.417073,Coffee Shop
5,Hayes Valley/Tenderloin/North of Market,37.779481,-122.418229,San Francisco Ballet,37.778580,-122.420798,Dance Studio
6,Hayes Valley/Tenderloin/North of Market,37.779481,-122.418229,SHN Orpheum Theatre,37.779315,-122.414790,Theater
7,Hayes Valley/Tenderloin/North of Market,37.779481,-122.418229,Siam Orchid Traditional Thai Massage,37.777111,-122.417967,Massage Studio
8,Hayes Valley/Tenderloin/North of Market,37.779481,-122.418229,Fermentation Lab,37.778368,-122.415313,Beer Bar
9,Hayes Valley/Tenderloin/North of Market,37.779481,-122.418229,The Nutcracker,37.778569,-122.420800,Performing Arts Venue
