In [1]:
from typing import Tuple
import configparser
from foursquare import Foursquare
import numpy as np
import pandas as pd
import geopandas
import requests
from bs4 import BeautifulSoup
from time import sleep

In [2]:
%%capture
from tqdm.notebook import tqdm
tqdm().pandas()

# Foursquare API

In [3]:
# config.ini holds the authentication for the API
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed.  Fail here with a clear message rather than with
# a KeyError when the credentials are looked up in the next cell.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini was not found or could not be read")
config.sections()

['FOURSQUARE']

In [4]:
# Create a Foursquare API instance
fs = Foursquare(
    client_id= config['FOURSQUARE']['CLIENT_ID'],
    client_secret= config['FOURSQUARE']['CLIENT_SECRET'], 
    version= config['FOURSQUARE']['VERSION']
)

## Foursquare Categories

In [5]:
# Preview the first rows of the category table held by the API client
fs.categories.df.head()

Unnamed: 0,id,name,pluralName,shortName,Parent_0,Parent_1,Parent_2,Parent_3,Parent_4
0,4d4b7104d754a06370d81259,Arts & Entertainment,Arts & Entertainment,Arts & Entertainment,Top-Level,,,,
1,56aa371be4b08b9a8d5734db,Amphitheater,Amphitheaters,Amphitheater,Arts & Entertainment,Top-Level,,,
2,4fceea171983d5d06c3e9823,Aquarium,Aquariums,Aquarium,Arts & Entertainment,Top-Level,,,
3,4bf58dd8d48988d1e1931735,Arcade,Arcades,Arcade,Arts & Entertainment,Top-Level,,,
4,4bf58dd8d48988d1e2931735,Art Gallery,Art Galleries,Art Gallery,Arts & Entertainment,Top-Level,,,


Search for grocery store categories

In [6]:
# Look up categories for the keywords 'grocery' and 'market'
# (presumably matched against the category names -- confirm in the wrapper)
fs.categories.search(['grocery', 'market'])

Unnamed: 0,id,name,pluralName,shortName,Parent_0,Parent_1,Parent_2,Parent_3,Parent_4
105,52f2ab2ebcbc57f1066b8b3b,Christmas Market,Christmas Markets,Christmas Market,Event,Top-Level,,,
487,53e510b7498ebcb1801b55d4,Night Market,Night Markets,Night Market,Nightlife Spot,Top-Level,,,
772,4bf58dd8d48988d1f7941735,Flea Market,Flea Markets,Flea Market,Shop & Service,Top-Level,,,
773,56aa371be4b08b9a8d573505,Floating Market,Floating Markets,Floating Market,Shop & Service,Top-Level,,,
781,4bf58dd8d48988d1fa941735,Farmers Market,Farmers Markets,Farmer's Market,Shop & Service,Food & Drink Shop,Top-Level,,
782,4bf58dd8d48988d10e951735,Fish Market,Fish Markets,Fish Market,Shop & Service,Food & Drink Shop,Top-Level,,
785,4bf58dd8d48988d118951735,Grocery Store,Grocery Stores,Grocery Store,Shop & Service,Food & Drink Shop,Top-Level,,
789,52f2ab2ebcbc57f1066b8b45,Organic Grocery,Organic Groceries,Organic Grocery,Shop & Service,Food & Drink Shop,Top-Level,,
792,52f2ab2ebcbc57f1066b8b46,Supermarket,Supermarkets,Supermarket,Shop & Service,Food & Drink Shop,Top-Level,,
824,50be8ee891d4fa8dcc7199a7,Market,Markets,Market,Shop & Service,Top-Level,,,


From the list above, I think we can select the Supermarket and Grocery Store categories to search the DFW area for offsite food access.  Later we will look at onsite food access, such as the density of fast food restaurants.

In [7]:
# Keep only the two chosen categories; grocery['id'] is later passed to the
# venue search as the category filter
grocery = fs.categories.select(['Grocery Store', 'Supermarket'])

# Helper Functions to Collect Venues from the Foursquare API

In [8]:
def parse_response(resp: dict) -> list:
    """ Parses all the venues from the first group of an Explore response.

    From what I've seen so far there is typically only one group, which is
    the 'Recommended Places' group, so only the first group is read.

    Parameters
    ----------
    resp : dict
        Decoded API response; must contain the key 'response'.

    Returns
    -------
    list
        One flattened dict per venue (see parse_venue).  An empty list is
        returned when the response carries no groups or the first group has
        no items, so that a single empty area does not abort a long
        collection run with an IndexError.
    """
    groups = resp['response'].get('groups') or []
    if not groups:
        return []
    items = groups[0].get('items') or []
    return [parse_venue(item['venue']) for item in items]

def parse_venue(venue: dict) -> dict:
    """ Flattens a venue dict from the items list returned by the Explore resource.

    The nested 'location' dict is merged into the top level, the first entry
    of 'categories' is reduced to 'category_id' / 'category_name', and the
    'location', 'categories' and 'photos' keys themselves are removed.  The
    input dict is not modified.

    Parameters
    ----------
    venue : dict
        A single venue; must contain the key 'location'.

    Returns
    -------
    dict
        A single-level dict.  'category_id' and 'category_name' are None
        when the venue lists no category.
    """
    # Drop keys from the location dict that are not needed in the flat record
    loc = drop_keys(venue['location'], ['formattedAddress', 'labeledLatLngs'])

    # Only the first listed category is kept.  Its id and name are read by key
    # (not by position in the dict) and renamed, as they would otherwise
    # conflict with the venue's own 'id' and 'name'.
    categories = venue.get('categories') or [{}]
    first_category = categories[0]
    cat = {
        'category_id': first_category.get('id'),
        'category_name': first_category.get('name'),
    }

    # Create the venue dict with all relevant information
    venue = drop_keys(venue, ['location', 'categories', 'photos'])
    venue.update(loc)
    venue.update(cat)
    return venue


def drop_keys(x: dict, keys_to_drop: list) -> dict:
    """ Returns a new dict with every entry of x except the listed keys.

    x itself is left untouched; keys that are not present in x are ignored.
    """
    unwanted = set(keys_to_drop)
    return {key: value for key, value in x.items() if key not in unwanted}

def get_all_venues(api_object: Foursquare, zipcode: str, radius: float, categories: list,
                   pause: float = 0.5) -> Tuple[list, int, int]:
    """ Return all the venues around a location that match the category ids list.

    Parameters
    ----------
    api_object : Foursquare
        API client; its explore() method is called once per result page.
    zipcode : str
        Location string forwarded to explore() as `loc` with loc_type='near'.
    radius : float
        Search radius forwarded to explore().
    categories : list
        Category ids forwarded to explore() as `categories_id`.
    pause : float, optional
        Seconds to sleep after every API call.  The Foursquare tier only
        allows 2 calls per second, hence the default of 0.5.

    Returns
    -------
    Tuple[list, int, int]
        The parsed venues, the total number of results reported by the API,
        and the number of API calls made (always at least 1).
    """
    # Results per page assumed by the paging arithmetic.
    # NOTE(review): must match the page size used inside api_object.explore.
    page_size = 50

    def fetch(page: int) -> dict:
        """ Request a single page of results and throttle afterwards. """
        response = api_object.explore(loc=zipcode, loc_type='near', radius=radius,
                                      categories_id=categories, page=page)
        sleep(pause)
        return response

    first_response = fetch(1)
    venues = parse_response(first_response)

    # Figure out how many calls are needed: ceiling division, but never fewer
    # than the one call that has already been made (even for zero results).
    total_results = int(first_response['response']['totalResults'])
    pages = max(1, -(-total_results // page_size))

    # Make a call for each additional page needed to get all the venues
    for page in range(2, pages + 1):
        venues.extend(parse_response(fetch(page)))

    return venues, total_results, pages

# Find all the Grocery Stores in DFW

We will get all the zipcodes within the DFW area according to http://www.usa.com/dallas-fort-worth-arlington-tx-area-zip-code-and-maps.htm.  Then we will try to set up a search pattern for each zipcode based on the geojson data.  We will estimate the centroid and a search radius by finding a circle that will fit the entire zipcode polygon from the geojson file.

In [9]:
# get the DFW zipcodes from the webpage below
url = 'http://www.usa.com/dallas-fort-worth-arlington-tx-area-zip-code-and-maps.htm'
# requests never times out by default, so set a timeout to keep the notebook
# from hanging, and stop on an HTTP error instead of parsing an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
page = BeautifulSoup(r.text,'html.parser')
# The zipcodes are the <u> tags inside the div with id 'rList4'
zipcode_divs = page.select('div#rList4')
if not zipcode_divs:
    raise ValueError(f"Could not find the zipcode list (div#rList4) on {url}")
tags = zipcode_divs[0].find_all('u')
# Each tag's text is split on ',' and only the leading part (the zipcode) is kept
dfw_metro_zipcodes=[tag.text.strip().split(",")[0] for tag in tags]

In [10]:
# Load the GeoJSON zipcode boundaries for the whole of Texas
tx_geo = geopandas.read_file('data/State-zip-code-GeoJSON-master/tx_texas_zip_codes_geo.min.json')
# Flag the rows whose ZCTA5CE10 code is one of the scraped DFW zipcodes ...
mask_dfw = tx_geo['ZCTA5CE10'].isin(dfw_metro_zipcodes)
# ... and keep only those rows
dfw_geo = tx_geo.loc[mask_dfw]

In [11]:
# Create a rectangle that includes all points in the polygon
simple_boundary = dfw_geo.envelope
# Get the bounds of the rectangle in the EPSG 3857 projection so that they are
# in projected units rather than degrees.
# NOTE(review): Web Mercator units are only approximately meters away from the
# equator (distances are stretched), so the radius errs on the large side.
bounds = simple_boundary.to_crs('epsg:3857').bounds
# Calculate the euclidean length of the diagonal and divide by 2 to get the
# radius of a circle that fits the rectangle inside when centred on the
# rectangle.  Computed column-wise rather than with a row-by-row apply.
width = bounds['maxx'] - bounds['minx']
height = bounds['maxy'] - bounds['miny']
radius = np.sqrt(width**2 + height**2) / 2
radius.head()

53     6278.870696
54     4299.425855
55    18585.634632
56    16638.219130
57    14296.644067
dtype: float64

In [12]:
# Centre point of every zipcode polygon, from the geopandas centroid attribute
# NOTE(review): the radius above is sized for a circle centred on the bounding
# rectangle; a centroid can sit off that centre -- confirm coverage is adequate.
centroids = dfw_geo.centroid
# The Foursquare API call takes the location as a "lat,lon" string
centers = [f"{point.y},{point.x}" for point in centroids]

## Loop through every zipcode and find each grocery store.

The loop below will search at the centroid of each zipcode polygon with the radius calculated above from the rectangular bounding box.  This will produce a lot of duplicates, as the search circles overlap heavily.  I would rather make extra calls to the API to ensure we have gotten all stores within the area.

A possibly better way to accomplish this would be to segment the entire area into larger sections to reduce the amount of overlap and the number of API calls.  This method would require either looking at the area and picking ways to segment it, or writing a function that optimizes the area segmentation.  Writing the segmentation function seems to be too time consuming for the payoff, and I don't want a requirement of having to look at the map to subjectively pick how to segment.  So, I want an easy solution that could be applied to any area even if it isn't the most efficient.  Another possibility is to get city boundary geojson data and use the cities as larger segments.  This can also be tricky, as city boundaries can be very oddly shaped and will probably also have a lot of overlapping search areas.

In [17]:
# Collect the venues for every (center, radius) pair, tracking running totals
dfw_venues = []
total_results = total_calls = 0
# The category filter is the same for every search
category_ids = grocery['id'].values.tolist()
with tqdm(total=len(centers)) as progress_bar:
    for center, search_radius in zip(centers, radius):
        progress_bar.set_description(f"Results: {total_results}, Calls: {total_calls}")
        venues, n_results, n_calls = get_all_venues(fs, center, search_radius, category_ids)
        dfw_venues.extend(venues)
        # update the stats
        total_results += n_results
        total_calls += n_calls
        progress_bar.update(1)
    progress_bar.write(f'There were {total_results} expected results and received {len(dfw_venues)} actual resutls , although many will be duplicates. Total Calls: {total_calls}')

HBox(children=(FloatProgress(value=0.0, max=270.0), HTML(value='')))

There were 9880 expected results and received 9880 actual resutls , although many will be duplicates. Total Calls: 350



In [21]:
# One row per venue; overlapping searches return the same venue several times,
# so keep a single row per venue id
dfw_df = pd.DataFrame(dfw_venues).drop_duplicates(subset=['id'])
# Save to CSV so we don't have to make the API calls again
dfw_df.to_csv('dfw_grocery_stores.csv')

dfw_df.head()

Unnamed: 0,id,name,address,lat,lng,postalCode,cc,city,state,country,category_id,category_name,crossStreet,neighborhood,venuePage
0,4ad767ecf964a520190a21e3,Market Street,6100 Eldorado Pkwy,33.175257,-96.695618,75070,US,McKinney,TX,United States,52f2ab2ebcbc57f1066b8b46,Supermarket,,,
1,4b0716dcf964a52004f722e3,Kroger,2901 Lake Forest Dr,33.174645,-96.681799,75072,US,McKinney,TX,United States,52f2ab2ebcbc57f1066b8b46,Supermarket,at Eldorado Pkwy,,
2,4b5a2cd1f964a5201db228e3,Kroger Marketplace,12221 Custer Rd,33.177204,-96.735344,75035,US,Frisco,TX,United States,52f2ab2ebcbc57f1066b8b46,Supermarket,at Eldorado Pkwy,,
3,56942118498e39606cb92e87,Walmart Neighborhood Market,3400 Virginia Parkway,33.197264,-96.664704,75071,US,McKinney,TX,United States,52f2ab2ebcbc57f1066b8b46,Supermarket,,,
4,53d02a0e11d2867a3fa7e90a,Trader Joe's,2851 Craig Dr Ste 100,33.173674,-96.642839,75070,US,McKinney,TX,United States,52f2ab2ebcbc57f1066b8b46,Supermarket,,,


In [22]:
# Summarise the de-duplicated table: row count, dtypes and non-null counts per column
dfw_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 485 entries, 0 to 8386
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             485 non-null    object 
 1   name           485 non-null    object 
 2   address        478 non-null    object 
 3   lat            485 non-null    float64
 4   lng            485 non-null    float64
 5   postalCode     479 non-null    object 
 6   cc             485 non-null    object 
 7   city           484 non-null    object 
 8   state          485 non-null    object 
 9   country        485 non-null    object 
 10  category_id    485 non-null    object 
 11  category_name  485 non-null    object 
 12  crossStreet    208 non-null    object 
 13  neighborhood   6 non-null      object 
 14  venuePage      1 non-null      object 
dtypes: float64(2), object(13)
memory usage: 60.6+ KB


53       6278.870696
54       4299.425855
55      18585.634632
56      16638.219130
57      14296.644067
            ...     
1913     5523.643323
1914     5292.782197
1915    14719.473941
1916     6881.604123
1917     6999.514146
Length: 270, dtype: float64