First, import the libraries we will be using to acquire the information about amenities in the area:
* `requests`, for making HTTP GET requests
* `pandas`, for importing CSV files into a dataframe, and vice versa
* `json_normalize`, for parsing the JSON response from the server

In [2]:
import requests 
import pandas as pd 
from pandas.io.json import json_normalize 

Define the credentials to access the FourSquare API information

In [43]:
import os

# Read the Foursquare credentials from the environment so that secrets do not
# have to be saved inside the notebook. The empty-string fallbacks are the
# original placeholders and can still be filled in by hand.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

Define a function to acquire the venues near each ZCTA's coordinates

In [18]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100):
    """Collect the venues Foursquare reports around each coordinate pair.

    One GET request is sent to the ``venues/explore`` endpoint per
    (name, latitude, longitude) triple, authenticated with the module-level
    ``CLIENT_ID``, ``CLIENT_SECRET`` and ``VERSION`` values.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel sequences; they are walked together with ``zip``, so
        iteration stops at the shortest one.
    radius : int, optional
        Value sent as the ``radius`` query parameter.
    limit : int, optional
        Value sent as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns ``ZCTA5CE10``,
        ``INTPTLAT10``, ``INTPTLON10``, ``Venue``, ``Venue Latitude``,
        ``Venue Longitude`` and ``Venue Category``. The frame is empty (but
        still has these columns) when no venue was returned at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    columns = ['ZCTA5CE10',
               'INTPTLAT10',
               'INTPTLON10',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and encode the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        # A timeout keeps one stalled request from hanging the whole run, and
        # raise_for_status reports API errors directly instead of letting them
        # surface later as a KeyError on the missing 'groups' entry
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()

        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        # Keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # Some venues come back without any category; record None for
            # them rather than failing on categories[0]
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works when rows is
    # empty, where assigning .columns afterwards raises a length mismatch
    return pd.DataFrame(rows, columns=columns)

Now we load the list of ZCTAs from the CSV with the housing and commute times generated previously (in the `prepared_data` directory). However, we also need the coordinates of the internal point for each ZCTA, and that information is not in that CSV file. To solve this, we will also load the `zcta_md_dc_va.csv` file and merge both:

In [10]:
# Load the commute-time and housing table, then discard the STUSPS column,
# which is not used in this notebook
zcta_housing_ct = pd.read_csv("prepared_data/zcta_commute_and_housing.csv")
zcta_housing_ct.drop(columns="STUSPS", inplace=True)
zcta_housing_ct

Unnamed: 0,ZCTA5CE10,ct_06:00,ct_06:30,ct_07:00,ct_07:30,ct_08:00,ct_08:30,ct_09:00,ct_09:30,zhvi_sfh,zhvi_mfh,zhvi_1bd,zhvi_2bd,zhvi_3bd,zhvi_4bd,zhvi_5bd,zri_all
0,20903,35.9334,43.6932,46.8932,50.1866,49.9400,46.8766,41.2900,36.9100,411485.0,132811.0,97111.0,169494.0,370863.0,443196.0,453451.0,2004.0
1,20715,42.4998,48.8400,55.5534,57.2766,57.8668,53.1632,46.0534,41.8268,353359.0,151136.0,184149.0,231795.0,344916.0,357194.0,427007.0,1895.0
2,20716,39.6434,47.4834,54.2600,56.0366,56.8734,51.8068,44.6800,40.6600,332640.0,189470.0,138294.0,230586.0,304802.0,364215.0,448082.0,1943.0
3,20720,38.2502,46.2368,50.6466,53.3268,54.7566,51.6768,43.6000,39.7900,432368.0,272576.0,398887.0,247074.0,338757.0,463750.0,503876.0,2080.0
4,20774,38.0966,46.4934,52.1066,55.9198,56.8066,51.9134,44.3232,40.6934,367446.0,186530.0,97768.0,212789.0,307462.0,394240.0,504181.0,1770.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,20024,6.8034,6.9866,6.9200,7.2734,7.6834,7.8198,8.0866,7.7832,725311.0,448488.0,398143.0,648492.0,765953.0,1016410.0,1093813.0,2425.0
110,20001,9.1868,10.0800,10.4634,12.0100,11.9766,12.4466,11.6400,11.2066,859004.0,571394.0,496113.0,690211.0,843278.0,1000819.0,1132617.0,2610.0
111,20032,12.2464,14.0400,14.9034,18.8766,19.3468,20.7600,18.6498,15.5734,360497.0,157132.0,181843.0,286805.0,361422.0,381501.0,411846.0,1755.0
112,20002,16.1966,18.3000,19.8200,22.7666,22.7468,24.5600,24.6234,22.3664,752335.0,497733.0,429444.0,616577.0,751441.0,853421.0,1028867.0,2325.0


In [11]:
# Load the ZCTA internal-point coordinates, again without the STUSPS column
zcta_coordinates = pd.read_csv("prepared_data/zcta_md_dc_va.csv")
zcta_coordinates.drop(columns="STUSPS", inplace=True)
zcta_coordinates

Unnamed: 0,ZCTA5CE10,INTPTLAT10,INTPTLON10
0,23085,37.688954,-76.825645
1,23086,37.668573,-77.054414
2,23089,37.455796,-76.903501
3,23091,37.656264,-76.802492
4,23092,37.606769,-76.514748
...,...,...,...
1390,20903,39.021433,-76.980774
1391,20904,39.066616,-76.980935
1392,23608,37.147813,-76.543036
1393,20905,39.109866,-76.988077


In [16]:
# Inner join on the ZCTA code: keeps only the ZCTAs present in both tables.
# merge already returns a new dataframe, so no extra copy is needed.
zcta_housing_ct_coordinates = zcta_coordinates.merge(
    zcta_housing_ct,
    on="ZCTA5CE10",
    how="inner",
)
zcta_housing_ct_coordinates

Unnamed: 0,ZCTA5CE10,INTPTLAT10,INTPTLON10,ct_06:00,ct_06:30,ct_07:00,ct_07:30,ct_08:00,ct_08:30,ct_09:00,ct_09:30,zhvi_sfh,zhvi_mfh,zhvi_1bd,zhvi_2bd,zhvi_3bd,zhvi_4bd,zhvi_5bd,zri_all
0,20164,39.013299,-77.395067,36.2000,38.8666,42.0200,48.3900,51.4834,49.6534,45.4434,46.5032,437998.0,273952.0,180662.0,262610.0,398316.0,457702.0,475018.0,2029.0
1,20165,39.057538,-77.392009,40.4334,43.1200,46.4066,51.8166,53.9168,51.5100,48.1668,49.9798,559097.0,340492.0,241970.0,317720.0,461537.0,644571.0,714423.0,2264.0
2,20166,38.986137,-77.455694,32.8400,33.0300,34.2366,40.2800,42.4266,40.4834,36.0298,39.4132,463485.0,326109.0,580116.0,284557.0,427618.0,522393.0,780220.0,2065.0
3,20170,38.979842,-77.379750,34.9400,35.1798,36.2502,40.5002,42.0732,40.6868,37.8368,41.8502,530412.0,257090.0,215538.0,276406.0,429003.0,579337.0,650525.0,2055.0
4,20171,38.923779,-77.396546,32.1600,32.0466,33.7666,38.0202,40.3168,39.4066,37.0366,40.1232,665265.0,376354.0,244776.0,365192.0,507937.0,714999.0,806525.0,1988.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,20877,39.140601,-77.192060,38.7668,40.7466,46.4664,53.1832,55.8102,54.9700,49.4698,45.3830,394354.0,170665.0,104773.0,135073.0,338553.0,435559.0,473227.0,1755.0
110,20878,39.112637,-77.250436,39.1234,40.2400,44.7266,49.1232,52.8436,54.1300,49.1800,46.4900,605383.0,297805.0,247252.0,309852.0,422906.0,647031.0,753939.0,1919.0
111,20886,39.169869,-77.224963,40.7934,43.8300,49.8068,56.8200,59.1102,56.7732,51.4300,46.6766,301541.0,154159.0,114683.0,198016.0,279308.0,375455.0,470379.0,1645.0
112,20901,39.021495,-77.009779,39.3734,47.2900,50.2432,53.8532,53.5536,49.8500,44.4602,40.4766,467566.0,239004.0,191780.0,327811.0,471479.0,485990.0,503319.0,1817.0


Now we can query the Foursquare API and get a list of venues near each of those 114 coordinates

In [20]:
# Ask for up to 100 venues around the coordinates of every ZCTA in the merged table
zcta_venues = getNearbyVenues(
    zcta_housing_ct_coordinates['ZCTA5CE10'],
    zcta_housing_ct_coordinates['INTPTLAT10'],
    zcta_housing_ct_coordinates['INTPTLON10'],
    limit=100,
)
zcta_venues

Unnamed: 0,ZCTA5CE10,INTPTLAT10,INTPTLON10,Venue,Venue Latitude,Venue Longitude,Venue Category
0,20164,39.013299,-77.395067,Starbucks,39.010134,-77.392890,Coffee Shop
1,20164,39.013299,-77.395067,Redbox,39.013300,-77.394800,Video Store
2,20164,39.013299,-77.395067,Sterling House,39.016363,-77.392813,Bar
3,20165,39.057538,-77.392009,Volcano Island Water Park,39.060535,-77.388501,Pool
4,20166,38.986137,-77.455694,Summit Ropes,38.986198,-77.450059,Rock Climbing Spot
...,...,...,...,...,...,...,...
1532,20903,39.021433,-76.980774,Eastern Carryout,39.023087,-76.978047,Chinese Restaurant
1533,20903,39.021433,-76.980774,Unique Thrift Store,39.021681,-76.975096,Thrift / Vintage Store
1534,20903,39.021433,-76.980774,Brother's Sew & Vac,39.021705,-76.976325,Hobby Shop
1535,20903,39.021433,-76.980774,Hillandale Wine & Beer,39.021126,-76.975594,Wine Shop


We can see how many venues were returned for each ZCTA:

In [39]:
# Count the non-null entries of every column within each ZCTA group
zcta_venues_count = zcta_venues.groupby('ZCTA5CE10').agg('count')
zcta_venues_count

Unnamed: 0_level_0,INTPTLAT10,INTPTLON10,Venue,Venue Latitude,Venue Longitude,Venue Category
ZCTA5CE10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20001,31,31,31,31,31,31
20002,14,14,14,14,14,14
20003,61,61,61,61,61,61
20005,100,100,100,100,100,100
20007,7,7,7,7,7,7
...,...,...,...,...,...,...
22310,18,18,18,18,18,18
22311,13,13,13,13,13,13
22312,2,2,2,2,2,2
22314,72,72,72,72,72,72


 The next step is to convert this list of venues into a numerical form, via one-hot encoding:


In [27]:
# One indicator column per venue category, named after the category itself
zcta_venues_onehot = pd.get_dummies(zcta_venues[['Venue Category']], prefix="", prefix_sep="")

# Only the category column was encoded, so attach the ZCTA5CE10 column again
# and move it to the first position
zcta_venues_onehot['ZCTA5CE10'] = zcta_venues['ZCTA5CE10']
zctacolumn = zcta_venues_onehot.pop('ZCTA5CE10')
zcta_venues_onehot.insert(0, 'ZCTA5CE10', zctacolumn)

# The mean of the indicators within a ZCTA is each category's share
# (a fraction between 0 and 1) of the venues returned for that ZCTA
zcta_venues_grouped = zcta_venues_onehot.groupby('ZCTA5CE10').mean().reset_index()
zcta_venues_grouped

Unnamed: 0,ZCTA5CE10,Accessories Store,African Restaurant,American Restaurant,Amphitheater,Antique Shop,Arcade,Arepa Restaurant,Art Gallery,Art Museum,...,Warehouse Store,Watch Shop,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio
0,20001,0.0,0.0,0.000000,0.0,0.0,0.00,0.00,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.032258,0.032258,0.000000,0.0,0.0,0.00
1,20002,0.0,0.0,0.071429,0.0,0.0,0.00,0.00,0.142857,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.00
2,20003,0.0,0.0,0.016393,0.0,0.0,0.00,0.00,0.032787,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.016393,0.000000,0.0,0.0,0.00
3,20005,0.0,0.0,0.040000,0.0,0.0,0.01,0.01,0.010000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.00
4,20007,0.0,0.0,0.000000,0.0,0.0,0.00,0.00,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,22310,0.0,0.0,0.000000,0.0,0.0,0.00,0.00,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.00
98,22311,0.0,0.0,0.153846,0.0,0.0,0.00,0.00,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.00
99,22312,0.0,0.0,0.000000,0.0,0.0,0.00,0.00,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.50
100,22314,0.0,0.0,0.013889,0.0,0.0,0.00,0.00,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.013889,0.000000,0.013889,0.0,0.0,0.00


We will now save this data so we can perform the analysis without having to query the FourSquare API every single time. We will save 2 files:
  1. First, we will add the column with the number of venues returned for each ZCTA to the dataframe with the commute times and housing indexes and save that in `prepared_data/zcta_venuessummary_ct_housing.csv`.
  2. Then, the file `prepared_data/zcta_venues.csv` with the contents of the `zcta_venues_grouped` data frame.

In [40]:
# First output file: venue counts joined to the commute times and housing indexes

# Turn the ZCTA5CE10 group index back into a regular column so it can serve as the merge key
zcta_venues_count.reset_index(inplace=True)

# Only the "Venue" count is kept; the remaining count columns are not needed for the summary
zcta_venues_count.drop(
    columns=[
        "INTPTLAT10",
        "INTPTLON10",
        "Venue Latitude",
        "Venue Longitude",
        "Venue Category",
    ],
    inplace=True,
)

# Give the two remaining columns descriptive names
zcta_venues_count.columns = ["ZCTA5CE10", "venue_count"]

In [45]:
# Attach the venue count to the commute/housing table (inner join on the ZCTA code)
# and write the result without the row index
df_allsummary = zcta_housing_ct.merge(
    zcta_venues_count,
    on="ZCTA5CE10",
    how="inner",
)
df_allsummary.to_csv('prepared_data/zcta_venuessummary_ct_housing.csv', index=False)

In [46]:
# Next, the CSV file with the detailed venue information: the contents of
# zcta_venues_grouped (one row per ZCTA with the per-category means).
# index=False leaves the dataframe's row index out of the file.
zcta_venues_grouped.to_csv('prepared_data/zcta_venues.csv', index=False)