In [164]:
#Load BeautifulSoup library
import requests
from bs4 import BeautifulSoup
url_to_scrape = 'https://en.wikipedia.org/wiki/Boston'

In [165]:
r = requests.get(url_to_scrape)

# We now have the source of the page, let's ask BeaultifulSoup
# to parse it for us.
soup = BeautifulSoup(r.text,'lxml')
soup.title #Verify if we get the page or not

<title>Boston - Wikipedia</title>

In [166]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [167]:
#Load the table into a data frame
data = []
table = soup.find('table', attrs={'class':'wikitable sortable'})

In [168]:
#Parse the table and get a list 
#Transfer the list and get the original data into data frame

table_body = table.find('tbody')
table_rows = table.find_all('tr')
l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td]
    l.append(row)
columnsList = ["Rank","ZipCode","Per capita income","Median household income","Median family income","Population","Number of households"]

df=pd.DataFrame(l, columns=columnsList)

df.head()

Unnamed: 0,Rank,ZipCode,Per capita income,Median household income,Median family income,Population,Number of households
0,,,,,,,
1,1.0,02110 (Financial District),"$152,007","$123,795","$196,518",1486.0,981.0
2,2.0,02199 (Prudential Center),"$151,060","$107,159","$146,786",1290.0,823.0
3,3.0,02210 (Fort Point),"$93,078","$111,061","$223,411",1905.0,1088.0
4,4.0,02109 (North End),"$88,921","$128,022","$162,045",4277.0,2190.0


In [169]:
#Function to find value between parentheses 
def extractArea(s):
    if(s == None):
        return None
    start = s.find('(')
    if start == -1:
        # No opening bracket found. Should this be an error?
        return ''
    start += 1  # skip the bracket, move to the next character
    end = s.find(')', start)
    if end == -1:
        # No closing bracket found after the opening bracket.
        # Should this be an error instead?
        return s[start:]
    else:
        return s[start:end]


In [170]:
#Function to find real zipcode 
def extractZip(s):
    if(s == None):
        return None
    start = s.find('(')
    if start == -1:
        # No opening bracket found. Should this be an error?
        return ''
    else:
        return s[0:start]

In [171]:
#Split the ZipCode column 
#Get ZipCode and Area into sepearte column
searchArea=[]
searchZip=[]

for values in df['ZipCode']:
    a=extractArea(values)
    z=s=extractZip(values)
    searchArea.append(a)
    searchZip.append(z)

df['Area']=searchArea
df['ZipCode']=searchZip

In [172]:
#Clean up the data
#Only process the cells that have zip code
df1=df.loc[(df['ZipCode'] != '') & (df['ZipCode'].notnull())].sort_values('Area').reset_index(drop=True)
df1.head()


Unnamed: 0,Rank,ZipCode,Per capita income,Median household income,Median family income,Population,Number of households,Area
0,19,2134,"$25,319","$37,638","$49,355",20478,8916,Allston
1,24,2163,"$21,915","$43,889","$91,190",1842,562,Allston-Harvard Business School
2,5,2116,"$81,458","$87,630","$134,875",21318,10938,Back Bay/Bay Village
3,25,2115,"$21,654","$23,677","$50,303",29178,9958,Back Bay/Fenway–Kenmore
4,6,2108,"$78,569","$95,753","$153,618",4155,2337,Beacon Hill/Financial District


In [173]:
import geocoder
from geopy.geocoders import Nominatim
geolocator = Nominatim()



In [174]:
#Get coordinates from zipCode
searchLatitude=[]
searchLongitude=[]

for zipCode in df1['ZipCode']:
    location = geolocator.geocode(zipCode)
    searchLatitude.append(location.latitude)
    searchLongitude.append(location.longitude)

df1['Latitude']=searchLatitude
df1['Longitude']=searchLongitude

In [175]:
df1.head()

Unnamed: 0,Rank,ZipCode,Per capita income,Median household income,Median family income,Population,Number of households,Area,Latitude,Longitude
0,19,2134,"$25,319","$37,638","$49,355",20478,8916,Allston,42.356341,-71.135159
1,24,2163,"$21,915","$43,889","$91,190",1842,562,Allston-Harvard Business School,54.66478,25.280405
2,5,2116,"$81,458","$87,630","$134,875",21318,10938,Back Bay/Bay Village,42.349825,-71.073294
3,25,2115,"$21,654","$23,677","$50,303",29178,9958,Back Bay/Fenway–Kenmore,42.341128,-71.095119
4,6,2108,"$78,569","$95,753","$153,618",4155,2337,Beacon Hill/Financial District,42.35766,-71.064266


In [176]:
#Fix the coordinates for 02122 since it's not accurate in geocoder
df2=df1.sort_values('Area').copy()
df2.loc[df2['ZipCode'].astype(int) ==2122, ['Latitude']] =42.2967
df2.loc[df2['ZipCode'].astype(int) == 2122, ['Longitude']] =-71.0527
df2.reindex()
#show full list of areas
df2

Unnamed: 0,Rank,ZipCode,Per capita income,Median household income,Median family income,Population,Number of households,Area,Latitude,Longitude
0,19,2134,"$25,319","$37,638","$49,355",20478,8916,Allston,42.356341,-71.135159
1,24,2163,"$21,915","$43,889","$91,190",1842,562,Allston-Harvard Business School,54.66478,25.280405
2,5,2116,"$81,458","$87,630","$134,875",21318,10938,Back Bay/Bay Village,42.349825,-71.073294
3,25,2115,"$21,654","$23,677","$50,303",29178,9958,Back Bay/Fenway–Kenmore,42.341128,-71.095119
4,6,2108,"$78,569","$95,753","$153,618",4155,2337,Beacon Hill/Financial District,42.35766,-71.064266
5,7,2114,"$65,865","$79,734","$169,107",11933,6752,Beacon Hill/West End,42.349411,-71.061651
6,16,2135,"$31,773","$50,291","$62,602",38839,18336,Brighton,42.358197,-71.144008
7,9,2129,"$56,267","$89,105","$98,445",17052,8083,Charlestown,42.377543,-71.061422
8,10,2467,"$53,382","$113,952","$148,396",22796,6351,Chestnut Hill,42.320017,-71.158139
9,8,2111,"$56,716","$44,758","$88,333",7616,3390,Chinatown/Financial District/Leather District,42.352139,-71.060927


In [177]:
# Draw a map of Boston and mark all neighborhood
address = 'Boston, MA'

location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Boston are {}, {}.'.format(latitude, longitude))
# create map of Boston using latitude and longitude values
map_Boston = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, neighborhood, zipCode in zip(df2['Latitude'], df2['Longitude'], df2['Area'],df2['ZipCode']):
    label = neighborhood + ' (' + zipCode + ')'
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Boston)  
    
map_Boston

The geograpical coordinate of Boston are 42.3602534, -71.0582912.


In [178]:
#Initial FourSqure
CLIENT_ID = '31ZXKBSCYIY1HTQKYQ02IQVYQKSYHQDYX0CMFPSK0MJKNI2S' # your Foursquare ID
CLIENT_SECRET = '3YTYLGJI5OYLMPNWNAQEIRY4XU11KXQFMQECWA1JA3FRC1FM' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
# type your answer here
radius=500
LIMIT=200
#CATEGORY_ID = '4d4b7105d754a06374d81259' #Category for food 
CATEGORY_ID = '4bf58dd8d48988d11f941735' #Night Clu

In [179]:
def getNearbyCategoryVenues(names, latitudes, longitudes, zipCode, radius=5000):
    
    venues_list=[]
    for name, lat, lng, zipCode in zip(names, latitudes, longitudes, zipCode):
        print(name + '(' + zipCode + ')')
        #print(name)
        #print(zip)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&categoryId={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT,
            CATEGORY_ID)
        print(url)    
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name + ' (' + zipCode + ')', 
            name,
            zipCode,
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['NeighborhoodFull', 
                  'Neighborhood',
                  'ZipCode',
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [180]:
Boston_venues = getNearbyCategoryVenues(names=df2['Area'],
                                   latitudes=df2['Latitude'],
                                   longitudes=df2['Longitude'],
                                    zipCode = df2['ZipCode']
                                  )

Allston(02134 )
https://api.foursquare.com/v2/venues/explore?&client_id=31ZXKBSCYIY1HTQKYQ02IQVYQKSYHQDYX0CMFPSK0MJKNI2S&client_secret=3YTYLGJI5OYLMPNWNAQEIRY4XU11KXQFMQECWA1JA3FRC1FM&v=20180605&ll=42.3563405104863,-71.1351593406643&radius=5000&limit=200&categoryId=4bf58dd8d48988d11f941735
Allston-Harvard Business School(02163 )
https://api.foursquare.com/v2/venues/explore?&client_id=31ZXKBSCYIY1HTQKYQ02IQVYQKSYHQDYX0CMFPSK0MJKNI2S&client_secret=3YTYLGJI5OYLMPNWNAQEIRY4XU11KXQFMQECWA1JA3FRC1FM&v=20180605&ll=54.6647798340753,25.2804054004782&radius=5000&limit=200&categoryId=4bf58dd8d48988d11f941735
Back Bay/Bay Village(02116 )
https://api.foursquare.com/v2/venues/explore?&client_id=31ZXKBSCYIY1HTQKYQ02IQVYQKSYHQDYX0CMFPSK0MJKNI2S&client_secret=3YTYLGJI5OYLMPNWNAQEIRY4XU11KXQFMQECWA1JA3FRC1FM&v=20180605&ll=42.3498252143051,-71.0732937169561&radius=5000&limit=200&categoryId=4bf58dd8d48988d11f941735
Back Bay/Fenway–Kenmore(02115 )
https://api.foursquare.com/v2/venues/explore?&client_id=31Z

In [181]:
Boston_venues.head()

Unnamed: 0,NeighborhoodFull,Neighborhood,ZipCode,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Allston (02134 ),Allston,2134,42.356341,-71.135159,Oberon,42.370759,-71.114599,Nightclub
1,Allston (02134 ),Allston,2134,42.356341,-71.135159,Havana Club,42.364659,-71.104323,Nightclub
2,Allston (02134 ),Allston,2134,42.356341,-71.135159,The Phoenix Landing,42.364178,-71.101817,Nightclub
3,Allston (02134 ),Allston,2134,42.356341,-71.135159,Wonder Bar,42.350876,-71.131267,Nightclub
4,Allston (02134 ),Allston,2134,42.356341,-71.135159,Club Passim,42.37433,-71.120083,Nightclub


In [182]:
#df3 = Boston_food_venues.groupby(['NeighborhoodFull','Neighborhood','ZipCode']).size()
df3=Boston_venues.groupby(['Neighborhood','ZipCode']).count().reset_index()
df3

Unnamed: 0,Neighborhood,ZipCode,NeighborhoodFull,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Allston,2134,20,20,20,20,20,20,20
1,Allston-Harvard Business School,2163,28,28,28,28,28,28,28
2,Back Bay/Bay Village,2116,46,46,46,46,46,46,46
3,Back Bay/Fenway–Kenmore,2115,48,48,48,48,48,48,48
4,Beacon Hill/Financial District,2108,43,43,43,43,43,43,43
5,Beacon Hill/West End,2114,41,41,41,41,41,41,41
6,Brighton,2135,18,18,18,18,18,18,18
7,Charlestown,2129,41,41,41,41,41,41,41
8,Chestnut Hill,2467,5,5,5,5,5,5,5
9,Chinatown/Financial District/Leather District,2111,40,40,40,40,40,40,40


In [183]:
dfAll = pd.merge(df3[['ZipCode', 'Neighborhood', 'Venue']],df2[["ZipCode","Area","Per capita income","Median household income","Median family income","Population","Number of households"]],
                       how='right', on=['ZipCode'])
dfAll.loc[dfAll['Area'] == 'Mission Hill', ['Neighborhood']] ='Mission Hill'
dfAll.loc[dfAll['Area'] == 'Mission Hill', ['Venue']] =0
dfAll

Unnamed: 0,ZipCode,Neighborhood,Venue,Area,Per capita income,Median household income,Median family income,Population,Number of households
0,2134,Allston,20.0,Allston,"$25,319","$37,638","$49,355",20478,8916
1,2163,Allston-Harvard Business School,28.0,Allston-Harvard Business School,"$21,915","$43,889","$91,190",1842,562
2,2116,Back Bay/Bay Village,46.0,Back Bay/Bay Village,"$81,458","$87,630","$134,875",21318,10938
3,2115,Back Bay/Fenway–Kenmore,48.0,Back Bay/Fenway–Kenmore,"$21,654","$23,677","$50,303",29178,9958
4,2108,Beacon Hill/Financial District,43.0,Beacon Hill/Financial District,"$78,569","$95,753","$153,618",4155,2337
5,2114,Beacon Hill/West End,41.0,Beacon Hill/West End,"$65,865","$79,734","$169,107",11933,6752
6,2135,Brighton,18.0,Brighton,"$31,773","$50,291","$62,602",38839,18336
7,2129,Charlestown,41.0,Charlestown,"$56,267","$89,105","$98,445",17052,8083
8,2467,Chestnut Hill,5.0,Chestnut Hill,"$53,382","$113,952","$148,396",22796,6351
9,2111,Chinatown/Financial District/Leather District,40.0,Chinatown/Financial District/Leather District,"$56,716","$44,758","$88,333",7616,3390


In [184]:
dfAll['Per capita income']=dfAll['Per capita income'].str.replace('$','').str.replace(',','').astype(int)
dfAll['Median household income']=dfAll['Median household income'].str.replace('$','').str.replace(',','').astype(int)
dfAll['Median family income']=dfAll['Median family income'].str.replace('$','').str.replace(',','').astype(int)
dfAll['Population']=dfAll['Population'].str.replace(',','').astype(int)
dfAll['Number of households']=dfAll['Number of households'].str.replace(',','').astype(int)
dfAll

Unnamed: 0,ZipCode,Neighborhood,Venue,Area,Per capita income,Median household income,Median family income,Population,Number of households
0,2134,Allston,20.0,Allston,25319,37638,49355,20478,8916
1,2163,Allston-Harvard Business School,28.0,Allston-Harvard Business School,21915,43889,91190,1842,562
2,2116,Back Bay/Bay Village,46.0,Back Bay/Bay Village,81458,87630,134875,21318,10938
3,2115,Back Bay/Fenway–Kenmore,48.0,Back Bay/Fenway–Kenmore,21654,23677,50303,29178,9958
4,2108,Beacon Hill/Financial District,43.0,Beacon Hill/Financial District,78569,95753,153618,4155,2337
5,2114,Beacon Hill/West End,41.0,Beacon Hill/West End,65865,79734,169107,11933,6752
6,2135,Brighton,18.0,Brighton,31773,50291,62602,38839,18336
7,2129,Charlestown,41.0,Charlestown,56267,89105,98445,17052,8083
8,2467,Chestnut Hill,5.0,Chestnut Hill,53382,113952,148396,22796,6351
9,2111,Chinatown/Financial District/Leather District,40.0,Chinatown/Financial District/Leather District,56716,44758,88333,7616,3390


In [185]:
dfRank=dfAll.copy()
dfRank['PerCapitaIncomeRanked'] = dfRank['Per capita income'].rank(ascending=1)
dfRank['HouseholdSizeRanked'] = (dfRank['Population']/dfRank['Number of households']).rank(ascending=0)
dfRank['VenuePerPopulationRanked'] = (dfRank['Venue']/dfRank['Population']).rank(ascending=0)
dfRank['Score']=dfRank['PerCapitaIncomeRanked']+dfRank['HouseholdSizeRanked']+dfRank['VenuePerPopulationRanked']
dfRank.sort_values(by='Score', ascending=False,inplace=True)
dfRank

Unnamed: 0,ZipCode,Neighborhood,Venue,Area,Per capita income,Median household income,Median family income,Population,Number of households,PerCapitaIncomeRanked,HouseholdSizeRanked,VenuePerPopulationRanked,Score
16,2110,Financial District,38.0,Financial District,152007,123795,196518,1486,981,30.0,30.0,2.0,62.0
2,2116,Back Bay/Bay Village,46.0,Back Bay/Bay Village,81458,87630,134875,21318,10938,26.0,24.0,11.0,61.0
28,2132,West Roxbury,3.0,West Roxbury,44306,82421,110219,27163,11013,19.0,15.0,27.0,61.0
5,2114,Beacon Hill/West End,41.0,Beacon Hill/West End,65865,79734,169107,11933,6752,24.0,26.0,9.0,59.0
23,2199,Prudential Center,48.0,Prudential Center,151060,107159,146786,1290,823,29.0,29.0,1.0,59.0
17,2210,Fort Point,37.0,Fort Point,93078,111061,223411,1905,1088,28.0,27.0,3.0,58.0
6,2135,Brighton,18.0,Brighton,31773,50291,62602,38839,18336,15.0,21.0,21.0,57.0
21,2109,North End,38.0,North End,88921,128022,162045,4277,2190,27.0,23.0,6.0,56.0
24,2131,Roslindale,3.0,Roslindale,29486,61099,70598,30370,11282,14.0,13.0,29.0,56.0
4,2108,Beacon Hill/Financial District,43.0,Beacon Hill/Financial District,78569,95753,153618,4155,2337,25.0,25.0,5.0,55.0


In [186]:
dfRank[['ZipCode','Neighborhood','Score']]

Unnamed: 0,ZipCode,Neighborhood,Score
16,2110,Financial District,62.0
2,2116,Back Bay/Bay Village,61.0
28,2132,West Roxbury,61.0
5,2114,Beacon Hill/West End,59.0
23,2199,Prudential Center,59.0
17,2210,Fort Point,58.0
6,2135,Brighton,57.0
21,2109,North End,56.0
24,2131,Roslindale,56.0
4,2108,Beacon Hill/Financial District,55.0


In [187]:
bins = [0,20,30,50,70]
labels=[3,2,1,0]
dfRank['bins'] = pd.cut(dfRank['Score'], bins=bins, labels=labels, include_lowest=True)
dfRank[['ZipCode','Neighborhood','Score','bins']]

Unnamed: 0,ZipCode,Neighborhood,Score,bins
16,2110,Financial District,62.0,0
2,2116,Back Bay/Bay Village,61.0,0
28,2132,West Roxbury,61.0,0
5,2114,Beacon Hill/West End,59.0,0
23,2199,Prudential Center,59.0,0
17,2210,Fort Point,58.0,0
6,2135,Brighton,57.0,0
21,2109,North End,56.0,0
24,2131,Roslindale,56.0,0
4,2108,Beacon Hill/Financial District,55.0,0


In [188]:
df2.head()

Unnamed: 0,Rank,ZipCode,Per capita income,Median household income,Median family income,Population,Number of households,Area,Latitude,Longitude
0,19,2134,"$25,319","$37,638","$49,355",20478,8916,Allston,42.356341,-71.135159
1,24,2163,"$21,915","$43,889","$91,190",1842,562,Allston-Harvard Business School,54.66478,25.280405
2,5,2116,"$81,458","$87,630","$134,875",21318,10938,Back Bay/Bay Village,42.349825,-71.073294
3,25,2115,"$21,654","$23,677","$50,303",29178,9958,Back Bay/Fenway–Kenmore,42.341128,-71.095119
4,6,2108,"$78,569","$95,753","$153,618",4155,2337,Beacon Hill/Financial District,42.35766,-71.064266


In [189]:
dfFinal = pd.merge(df2[['ZipCode','Latitude','Longitude']],dfRank[["Neighborhood","ZipCode","Score","bins"]],
                       how='left', on=['ZipCode'])
dfFinal

Unnamed: 0,ZipCode,Latitude,Longitude,Neighborhood,Score,bins
0,2134,42.356341,-71.135159,Allston,48.0,1
1,2163,54.66478,25.280405,Allston-Harvard Business School,13.0,3
2,2116,42.349825,-71.073294,Back Bay/Bay Village,61.0,0
3,2115,42.341128,-71.095119,Back Bay/Fenway–Kenmore,26.0,2
4,2108,42.35766,-71.064266,Beacon Hill/Financial District,55.0,0
5,2114,42.349411,-71.061651,Beacon Hill/West End,59.0,0
6,2135,42.358197,-71.144008,Brighton,57.0,0
7,2129,42.377543,-71.061422,Charlestown,54.0,0
8,2467,42.320017,-71.158139,Chestnut Hill,46.0,1
9,2111,42.352139,-71.060927,Chinatown/Financial District/Leather District,49.0,1


In [190]:
#### create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(4)
ys = [i+x+(i*x)**2 for i in range(4)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(dfFinal['Latitude'], dfFinal['Longitude'], dfFinal['Neighborhood'], dfFinal['bins']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters