## Data

Following data sources will be needed to extract/generate the required information:
* coordinate of San Antonio counties zip code will be obtained by scaping from website **http://sanantonio.areaconnect.com/zip2.htm?city=San%20Antonio&qs=TX&searchtype=bycity**
* population of San Antonio counties will be obtained by scaping from website **http://zipatlas.com/us/tx/san-antonio/zip-code-comparison/population-density.htm**
* number of venues and their categories and location in every zipcode will be obtained using **Foursquare API**

In [2]:
import numpy as np
import pandas as pd
import re
import seaborn as sns
from bs4 import BeautifulSoup
import json # library to handle JSON files
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium # map rendering library

### Zip code dataset

We will get the zip code and their corresponding latitude and longtitude for the city of San Antonio.

In [3]:
url='http://sanantonio.areaconnect.com/zip2.htm?city=San%20Antonio&qs=TX&searchtype=bycity'

source=requests.get(url).text

soup = BeautifulSoup(source, 'html5lib')


In [449]:
match=soup.find('div', class_='row')
print(match)

<div class="row header">
				<div class="block zip1 header">Zip
				</div>
				<div class="block zip2 header">City
				</div>
				<div class="block zip3 header">State
				</div>
				<div class="block zip4 header">Area Code
				</div>
				<div class="block zip5 header">County
				</div>
				<div class="block zip6 header">Latitude
				</div>
				<div class="block zip7 header">Longitude
				</div>
				 <br/>
			</div>


In [450]:
zipc=soup.find_all('div', class_='block zip1')
county=soup.find_all('div', class_='block zip5')
lat=soup.find_all('div', class_='block zip6')
lon=soup.find_all('div', class_='block zip7')

In [451]:
for i in range(len(zipc)-1):
    #zip
    zipc[i]=zipc[i].b.text
    #county
    county[i]=county[i].contents
    county[i]=''.join(str(e) for e in county[i])
    county[i]=re.sub('\s+','',county[i])
    #lat
    lat[i]=lat[i].contents
    lat[i]=''.join(str(e) for e in lat[i])
    lat[i]=re.sub('\s+','',lat[i])
    #lon
    lon[i]=lon[i].contents
    lon[i]=''.join(str(e) for e in lon[i])
    lon[i]=re.sub('\s+','',lon[i])

In [452]:
z=[]
c=[]
la=[]
lo=[]
for i in range(len(zipc)-1):
    z.append(zipc[i])
    c.append(county[i])
    la.append(lat[i])
    lo.append(lon[i])
    

In [453]:
dic={'zip_code':z, 'county':c, 'lat': la, 'long': lo}

df=pd.DataFrame(dic)
df.head()

Unnamed: 0,zip_code,county,lat,long
0,78201,Bexar,29.472,-98.537
1,78202,Bexar,29.422,-98.466
2,78203,Bexar,29.415,-98.462
3,78204,Bexar,29.397,-98.5
4,78205,Bexar,29.424,-98.487


In [454]:
df.shape

(88, 4)

### population dataset

In [457]:
url='http://zipatlas.com/us/tx/san-antonio/zip-code-comparison/population-density.htm'

source=requests.get(url).text

soup = BeautifulSoup(source, 'html5lib')

<html>
 <head>
  <title>
   Population Density in San Antonio, TX by Zip Code
  </title>
  <meta content="Population Density in San Antonio, TX with a color coded Zip Code Heat Map." name="description"/>
  <script src="/js/map/city/?e=QP6DoA3w1Mwo9pf3HCsy0Ksjoe3yqTv4q8JJE69Wun5oa2c7ui2MFnxHi9aMm5JIJ6K5oO%2fOhwAtxRntqqBC%2bWXZKWgqlm0Ip7BZzNSliOE%3d" type="text/javascript">
  </script>
  <script src="http://maps.google.com/maps/api/js?sensor=false" type="text/javascript">
  </script>
  <meta content="all,index,follow" name="robots"/>
  <meta content="general" name="rating"/>
  <meta content="ZipAtlas.com Development Team" name="author"/>
  <meta content="en-us" name="language"/>
  <meta content="Copyright 2011 ZipAtlas.com" name="copyright"/>
  <meta content="7 Days" name="revisit-after"/>
  <meta content="-1" http-equiv="Expires"/>
  <meta content="Global" http-equiv="Distribution"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="3cRw56ihbmZI3sma

In [458]:
match=soup.find_all('td', class_='report_data')
#print(match)
print(match[0])
print(match[1].a.text)
print(match[2].text)
print(match[3].a.text)
print(match[4].text)
print(match[5].text)
print(match[6])

<td align="right" class="report_data">1.</td>
78207
29.422300, -98.524805
San Antonio
56,348
7,554.55
<td align="right" class="report_data">#909</td>


In [459]:
zc=[]
la=[]
pop=[]
den=[]
for i in range(1,len(match),7):
    #print(i)
    zc.append(match[i].a.text)
    la.append(match[i+1].text)
    pop.append(match[i+3].text)
    den.append(match[i+4].text)

In [460]:
print(zc)
print(la)
print(pop)
print(den)

['78207', '78225', '78201', '78237', '78228', '78210', '78208', '78202', '78250', '78213', '78229', '78212', '78239', '78204', '78240', '78203', '78209', '78230', '78248', '78242', '78217', '78244', '78232', '78247', '78211', '78233', '78216', '78218', '78220', '78238', '78231', '78227', '78224', '78249', '78251', '78214', '78215', '78205', '78226', '78234', '78223', '78219', '78245', '78221', '78222', '78258', '78236', '78256', '78235', '78259', '78260', '78253', '78255', '78252', '78264', '78263', '78257', '78266', '78254', '78261']
['29.422300, -98.524805', '29.387937, -98.526571', '29.469087, -98.529395', '29.420924, -98.566465', '29.460646, -98.571279', '29.395737, -98.466616', '29.439930, -98.458718', '29.428207, -98.461236', '29.510075, -98.663687', '29.520879, -98.523369', '29.505502, -98.577033', '29.464611, -98.493653', '29.516184, -98.361849', '29.404404, -98.505028', '29.525292, -98.604382', '29.415255, -98.460204', '29.488728, -98.457602', '29.545289, -98.556564', '29.5900

In [461]:
pop_dat=list(zip(zc,pop,den))
pop_dat=pd.DataFrame(pop_dat)
pop_dat.columns=['zip_code','population','density']

pop_dat.population=[i.replace(',','') for i in pop_dat.population]
pop_dat.density=[i.replace(',','') for i in pop_dat.density]

pop_dat.population=pd.to_numeric(pop_dat.population)
pop_dat.density=pd.to_numeric(pop_dat.density)
pop_dat.head()

Unnamed: 0,zip_code,population,density
0,78207,56348,7554.55
1,78225,13553,6850.89
2,78201,47387,6664.58
3,78237,36273,5270.83
4,78228,58091,5240.63


Now we can merge the population data with the county dataset.

In [462]:
df=df.merge(pop_dat,on='zip_code')
df.head()
print(df.shape)

(60, 6)


In [484]:
df.head()

Unnamed: 0,zip_code,county,lat,long,population,density
0,78201,Bexar,29.472,-98.537,47387,6664.58
1,78202,Bexar,29.422,-98.466,11746,5026.9
2,78203,Bexar,29.415,-98.462,5845,4073.78
3,78204,Bexar,29.397,-98.5,11905,4332.32
4,78205,Bexar,29.424,-98.487,1564,1472.21


### Foursquare API

#### Define Foursquare Credentials and Version

In [465]:
CLIENT_ID = 'KNUBP2IFJGCWZYEMOXZV3GVPY20GIG4U2W5VSLJRTBPSKSEE' # your Foursquare ID
CLIENT_SECRET = 'IFXYEVUSWI4AY5X2PWIQQHMEQB1ZAQGLVZWAACDFRKQRY23W' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: KNUBP2IFJGCWZYEMOXZV3GVPY20GIG4U2W5VSLJRTBPSKSEE
CLIENT_SECRET:IFXYEVUSWI4AY5X2PWIQQHMEQB1ZAQGLVZWAACDFRKQRY23W


In [467]:
neighborhood_latitude = df.loc[0, 'lat'] # neighborhood latitude value
neighborhood_longitude = df.loc[0, 'long'] # neighborhood longitude value

neighborhood_name = df.loc[0, 'zip_code'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of 78201 are 29.472, -98.537.


#### Now, let's get the top 100 venues that are in zipcode 78201 within a radius of 500 meters.

In [468]:
# type your answer here
search_query = 'Store'
radius = 500
LIMIT=100

url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(
    CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude, VERSION, radius, LIMIT)
url


'https://api.foursquare.com/v2/venues/explore?client_id=KNUBP2IFJGCWZYEMOXZV3GVPY20GIG4U2W5VSLJRTBPSKSEE&client_secret=IFXYEVUSWI4AY5X2PWIQQHMEQB1ZAQGLVZWAACDFRKQRY23W&ll=29.472,-98.537&v=20180605&radius=500&limit=100'

In [469]:
results = requests.get(url).json()

In [470]:
results

{'meta': {'code': 200, 'requestId': '5dcc259e6001fe001ca63597'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Jefferson',
  'headerFullLocation': 'Jefferson, San Antonio',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 8,
  'suggestedBounds': {'ne': {'lat': 29.476500004500007,
    'lng': -98.5318407742955},
   'sw': {'lat': 29.467499995499995, 'lng': -98.54215922570451}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4b25b4fdf964a5208d7524e3',
       'name': 'Original Donut Shop',
       'location': {'address': '3307 Fredericksburg Rd',
        'lat': 29.472703481339845,
        'lng': -98.53459792742832,
        'labeledLatLngs': [{'label': 'display',
      

In [471]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [472]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Original Donut Shop,Donut Shop,29.472703,-98.534598
1,Restaurant Depot,Kitchen Supply Store,29.473163,-98.535505
2,Redbox,Video Store,29.473317,-98.534868
3,Walgreens,Pharmacy,29.47385,-98.534403
4,Bill Miller,BBQ Joint,29.472299,-98.533515


In [473]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

8 venues were returned by Foursquare.


In [474]:
nearby_venues.categories

0              Donut Shop
1    Kitchen Supply Store
2             Video Store
3                Pharmacy
4               BBQ Joint
5       Convenience Store
6      Mexican Restaurant
7              Donut Shop
Name: categories, dtype: object

#### Function for all neighbourhood

In [475]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['zipcode', 
                  'zipcode Latitude', 
                  'zipcode Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [476]:
sa_venues = getNearbyVenues(names=df['zip_code'],
                                   latitudes=df['lat'],
                                   longitudes=df['long'])

78201
78202
78203
78204
78205
78207
78208
78209
78210
78211
78212
78213
78214
78215
78216
78217
78218
78219
78220
78221
78222
78223
78224
78225
78226
78227
78228
78229
78230
78231
78232
78233
78234
78235
78236
78237
78238
78239
78240
78242
78244
78245
78247
78248
78249
78250
78251
78252
78253
78254
78255
78256
78257
78258
78259
78260
78261
78263
78264
78266


In [477]:
print(sa_venues.shape)
sa_venues.head()

(439, 7)


Unnamed: 0,zipcode,zipcode Latitude,zipcode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,78201,29.472,-98.537,Original Donut Shop,29.472703,-98.534598,Donut Shop
1,78201,29.472,-98.537,Restaurant Depot,29.473163,-98.535505,Kitchen Supply Store
2,78201,29.472,-98.537,Redbox,29.473317,-98.534868,Video Store
3,78201,29.472,-98.537,Walgreens,29.47385,-98.534403,Pharmacy
4,78201,29.472,-98.537,Bill Miller,29.472299,-98.533515,BBQ Joint


In [478]:
sa_venues.groupby('zipcode').count()

Unnamed: 0_level_0,zipcode Latitude,zipcode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
78201,8,8,8,8,8,8
78202,5,5,5,5,5,5
78203,4,4,4,4,4,4
78204,5,5,5,5,5,5
78205,100,100,100,100,100,100
78207,5,5,5,5,5,5
78208,9,9,9,9,9,9
78209,16,16,16,16,16,16
78210,6,6,6,6,6,6
78211,5,5,5,5,5,5


In [479]:
print('There are {} uniques categories.'.format(len(sa_venues['Venue Category'].unique())))
print(sa_venues['Venue Category'].unique())

There are 150 uniques categories.
['Donut Shop' 'Kitchen Supply Store' 'Video Store' 'Pharmacy' 'BBQ Joint'
 'Convenience Store' 'Mexican Restaurant' 'Grocery Store' 'Food'
 'Clothing Store' 'Electronics Store' 'Gym / Fitness Center'
 'Historic Site' 'Fried Chicken Joint' 'Park' 'Fast Food Restaurant'
 'Seafood Restaurant' 'Pedestrian Plaza' 'History Museum' 'Dessert Shop'
 'Plaza' 'Sandwich Place' 'New American Restaurant' 'Bistro'
 'Brazilian Restaurant' 'Art Museum' 'Steakhouse' 'Hotel' 'Sports Bar'
 'German Restaurant' 'Neighborhood' 'Theater' 'Cocktail Bar' 'Museum'
 'Ice Cream Shop' 'Restaurant' 'Nightclub' 'Piano Bar'
 'American Restaurant' 'Chocolate Shop' 'Hotel Bar' 'Concert Hall'
 'Theme Restaurant' 'Lingerie Store' 'Shopping Mall' 'Burger Joint'
 'Bakery' 'Bar' 'Movie Theater' 'Lounge' 'Latin American Restaurant' 'Pub'
 'Gift Shop' 'General Entertainment' 'Fountain' 'Italian Restaurant'
 'Wine Bar' 'Monument / Landmark' 'Asian Restaurant' 'Bridge'
 'Mediterranean Restaurant

Beside for restaurant, we also want to select some major categories from them. For now, we will choose hotel, gym, park, convenience store, grocery stor, fast food.

In [480]:
sa_hotel=sa_venues[sa_venues['Venue Category'].str.contains('Hotel')]
sa_gym=sa_venues[sa_venues['Venue Category'].str.contains('Gym')]
sa_park=sa_venues[sa_venues['Venue Category'].str.contains('Park')]
sa_cstore=sa_venues[sa_venues['Venue Category'].str.contains('Convenience Store')]
sa_gstore=sa_venues[sa_venues['Venue Category'].str.contains('Grocery Store')]
sa_rest=sa_venues[sa_venues['Venue Category'].str.contains('Restaurant')]
sa_fast=sa_venues[sa_venues['Venue Category'].str.contains('Fast Food')]

print(sa_hotel.shape, sa_gym.shape, sa_park.shape,sa_cstore.shape,sa_gstore.shape,sa_rest.shape,sa_fast.shape)

(23, 7) (16, 7) (5, 7) (10, 7) (8, 7) (78, 7) (17, 7)


In [482]:
dic={'gym':sa_gym,'hotel':sa_hotel,'park':sa_park,'cstore':sa_cstore,'gstore':sa_gstore,
     'rest':sa_rest,'fast':sa_fast}
df_all=df
for i in ['gym','hotel','park','cstore','gstore','rest','fast']:
    dic[i]=dic[i][['zipcode','Venue Category']].groupby('zipcode').count()

    dic[i].columns=dic[i].columns+'_'+i
    dic[i].reset_index(inplace=True)
    df_all=df_all.merge(dic[i],how='left',left_on='zip_code',right_on='zipcode')

df_all.head()

Unnamed: 0,zip_code,county,lat,long,population,density,zipcode_x,Venue Category_gym,zipcode_y,Venue Category_hotel,zipcode_x.1,Venue Category_park,zipcode_y.1,Venue Category_cstore,zipcode_x.2,Venue Category_gstore,zipcode_y.2,Venue Category_rest,zipcode,Venue Category_fast
0,78201,Bexar,29.472,-98.537,47387,6664.58,,,,,,,78201.0,1.0,,,78201.0,1.0,,
1,78202,Bexar,29.422,-98.466,11746,5026.9,,,,,,,,,78202.0,1.0,78202.0,1.0,,
2,78203,Bexar,29.415,-98.462,5845,4073.78,78203.0,1.0,,,78203.0,1.0,,,,,,,,
3,78204,Bexar,29.397,-98.5,11905,4332.32,,,,,,,78204.0,1.0,,,78204.0,2.0,78204.0,1.0
4,78205,Bexar,29.424,-98.487,1564,1472.21,,,78205.0,19.0,78205.0,1.0,,,,,78205.0,18.0,,


In [483]:
cols = [c for c in df_all.columns if c.lower()[:7] != 'zipcode']
df_all=df_all[cols]
df_all=df_all.fillna(0)
cols=[i.replace('Venue Category_','') for i in cols]
df_all.columns=cols
df_all.head(10)

Unnamed: 0,zip_code,county,lat,long,population,density,gym,hotel,park,cstore,gstore,rest,fast
0,78201,Bexar,29.472,-98.537,47387,6664.58,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,78202,Bexar,29.422,-98.466,11746,5026.9,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,78203,Bexar,29.415,-98.462,5845,4073.78,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,78204,Bexar,29.397,-98.5,11905,4332.32,0.0,0.0,0.0,1.0,0.0,2.0,1.0
4,78205,Bexar,29.424,-98.487,1564,1472.21,0.0,19.0,1.0,0.0,0.0,18.0,0.0
5,78207,Bexar,29.422,-98.523,56348,7554.55,1.0,0.0,0.0,0.0,0.0,4.0,2.0
6,78208,Bexar,29.438,-98.457,5079,5130.21,0.0,2.0,0.0,1.0,0.0,3.0,2.0
7,78209,Bexar,29.488,-98.457,40675,3848.79,1.0,0.0,0.0,0.0,0.0,1.0,0.0
8,78210,Bexar,29.399,-98.47,37345,5148.26,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9,78211,Bexar,29.357,-98.56,31214,2699.5,0.0,0.0,0.0,0.0,0.0,2.0,1.0
