<a href="https://colab.research.google.com/github/kywch/geo-colab/blob/master/SafeGraph-filter-places.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
colab_requirements = [
    "apt install python3-rtree=0.8.3+ds-1", # rtree v0.8.3
    "pip install geopandas==0.7.0" # geopandas v0.7.0 
]

In [2]:
# following the post, 'Making Jupyter notebooks Google Colab ready'
# https://timsainburg.com/google%20colab.html

import os, sys, subprocess

def run_subprocess_command(cmd):
    # run the command
    process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
    # print the output
    for line in process.stdout:
        print(line.decode().strip())
        
if 'COLAB_GPU' in os.environ: # a quick and dirty way to check whether this is COLAB
    for i in colab_requirements:
        run_subprocess_command(i)

Reading package lists...
Building dependency tree...
Reading state information...
The following package was automatically installed and is no longer required:
libnvidia-common-440
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
libspatialindex-c4v5 libspatialindex-dev libspatialindex4v5
python3-pkg-resources
Suggested packages:
python3-setuptools
The following NEW packages will be installed:
libspatialindex-c4v5 libspatialindex-dev libspatialindex4v5
python3-pkg-resources python3-rtree
0 upgraded, 5 newly installed, 0 to remove and 35 not upgraded.
Need to get 671 kB of archives.
After this operation, 3,948 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libspatialindex4v5 amd64 1.8.5-5 [219 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libspatialindex-c4v5 amd64 1.8.5-5 [51.7 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/main amd64 python3-pkg-resources all 39.0.1-2 [9

In [3]:
import pandas as pd

import geopandas as gpd
from fiona.crs import from_epsg
from shapely.geometry import Point

from glob import glob
import time

import os
from shutil import copyfile

## Load the Places dataset from Google Drive

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [5]:
!ls drive/My\ Drive/SafeGraph/places

brand_info.csv						  core_poi-part3.csv.gz
core_places_IL.csv					  core_poi-part4.csv.gz
core_places_IL.geojson					  core_poi-part5.csv.gz
CorePlacesMay2020Release-CORE_POI-2020_04-2020-05-06.zip  README.txt
core_poi-part1.csv.gz					  SquareFeet.csv.gz
core_poi-part2.csv.gz


In [6]:
# grab the SafeGraph core places
files = glob("drive/My Drive/SafeGraph/places/core*.csv.gz")
print(files)

['drive/My Drive/SafeGraph/places/core_poi-part4.csv.gz', 'drive/My Drive/SafeGraph/places/core_poi-part5.csv.gz', 'drive/My Drive/SafeGraph/places/core_poi-part1.csv.gz', 'drive/My Drive/SafeGraph/places/core_poi-part3.csv.gz', 'drive/My Drive/SafeGraph/places/core_poi-part2.csv.gz']


In [7]:
# copy these files from Google Drive to colab disk
os.mkdir('sg_data')
for f in files:
  copyfile(f, 'sg_data/' + os.path.basename(f))

In [8]:
#!ls sg_data
files = glob("sg_data/core*.csv.gz")
print(files)

['sg_data/core_poi-part4.csv.gz', 'sg_data/core_poi-part5.csv.gz', 'sg_data/core_poi-part1.csv.gz', 'sg_data/core_poi-part2.csv.gz', 'sg_data/core_poi-part3.csv.gz']


In [9]:
place_df = pd.read_csv(files[0])
for f in files[1:]:
    t = time.time()
    tmp_pd2 = pd.read_csv(f)
    place_df = place_df.append(tmp_pd2, ignore_index = True)
    print(f + ', new_len: ', str(len(place_df)) + ', proc time: ', str(time.time() - t))

sg_data/core_poi-part5.csv.gz, new_len:  2074033, proc time:  8.40689754486084
sg_data/core_poi-part1.csv.gz, new_len:  3143522, proc time:  9.984866380691528
sg_data/core_poi-part2.csv.gz, new_len:  4214042, proc time:  10.657033443450928
sg_data/core_poi-part3.csv.gz, new_len:  5284986, proc time:  11.22972559928894


In [10]:
place_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5284986 entries, 0 to 5284985
Data columns (total 18 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   safegraph_place_id         object 
 1   parent_safegraph_place_id  object 
 2   location_name              object 
 3   safegraph_brand_ids        object 
 4   brands                     object 
 5   top_category               object 
 6   sub_category               object 
 7   naics_code                 float64
 8   latitude                   float64
 9   longitude                  float64
 10  street_address             object 
 11  city                       object 
 12  region                     object 
 13  postal_code                int64  
 14  iso_country_code           object 
 15  phone_number               float64
 16  open_hours                 object 
 17  category_tags              object 
dtypes: float64(4), int64(1), object(13)
memory usage: 725.8+ MB


In [11]:
place_df.head()

Unnamed: 0,safegraph_place_id,parent_safegraph_place_id,location_name,safegraph_brand_ids,brands,top_category,sub_category,naics_code,latitude,longitude,street_address,city,region,postal_code,iso_country_code,phone_number,open_hours,category_tags
0,sg:00201f75b90b47d3be813562f946d97f,,First Presbyterian Church of Stamford,,,Religious Organizations,Religious Organizations,813110.0,41.062975,-73.538788,1101 Bedford St,Stamford,CT,6905,US,12033250000.0,"{ ""Mon"": [[""9:00"", ""17:00""]], ""Tue"": [[""9:00"",...",
1,sg:002b991d388c42a3a64db6066f8c4c0e,,El Gran Sabor,,,Restaurants and Other Eating Places,Full-Service Restaurants,722511.0,38.926747,-79.8484,413 Kerens Ave,Elkins,WV,26241,US,13046370000.0,,"Late Night,Latin American Food"
2,sg:002e6e7d2a0642a197e51a44bfce7a3e,,Public Storage,SG_BRAND_1a2e7806afb9c1c7d60fd00219204dde,Public Storage,Lessors of Real Estate,Lessors of Miniwarehouses and Self-Storage Units,531130.0,33.035779,-96.699591,2801 Avenue K,Plano,TX,75074,US,,,
3,sg:0045ce84ae2847a08abe59b4139f871e,,TravelCenters of America,SG_BRAND_27a15966382ba3e78015b40d2e1fa976,TravelCenters of America,Gasoline Stations,Gasoline Stations with Convenience Stores,447110.0,38.428077,-95.726311,2775 US Highway 75,Lebo,KS,66856,US,,,
4,sg:00a3a6674d95443c80e481bdf68e84d2,sg:6449a1a66fcc451abc993849f61445f0,Great American Cookie Company,,,Bakeries and Tortilla Manufacturing,Retail Bakeries,311811.0,30.257337,-97.807172,Barton Creek Mall,Austin,TX,78701,US,15123280000.0,,


In [12]:
# translate latitude and longitude to geopandas
place_gdf = gpd.GeoDataFrame(
    place_df, geometry = gpd.points_from_xy(place_df.longitude, place_df.latitude),
    crs = "epsg:4326" # https://geopandas.readthedocs.io/en/v0.6.0/projections.html#coordinate-reference-systems
).to_crs(epsg=3528) # https://spatialreference.org/ref/epsg/3528/

In [13]:
place_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 5284986 entries, 0 to 5284985
Data columns (total 19 columns):
 #   Column                     Dtype   
---  ------                     -----   
 0   safegraph_place_id         object  
 1   parent_safegraph_place_id  object  
 2   location_name              object  
 3   safegraph_brand_ids        object  
 4   brands                     object  
 5   top_category               object  
 6   sub_category               object  
 7   naics_code                 float64 
 8   latitude                   float64 
 9   longitude                  float64 
 10  street_address             object  
 11  city                       object  
 12  region                     object  
 13  postal_code                int64   
 14  iso_country_code           object  
 15  phone_number               float64 
 16  open_hours                 object  
 17  category_tags              object  
 18  geometry                   geometry
dtypes: float64(4)

In [14]:
# limit to Illinois (5.3M --> 200K places)
place_gdf_IL = place_gdf[place_gdf.region == 'IL']
place_gdf_IL.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 200614 entries, 5 to 5284973
Data columns (total 19 columns):
 #   Column                     Non-Null Count   Dtype   
---  ------                     --------------   -----   
 0   safegraph_place_id         200614 non-null  object  
 1   parent_safegraph_place_id  24590 non-null   object  
 2   location_name              200614 non-null  object  
 3   safegraph_brand_ids        47267 non-null   object  
 4   brands                     47267 non-null   object  
 5   top_category               198532 non-null  object  
 6   sub_category               198532 non-null  object  
 7   naics_code                 198532 non-null  float64 
 8   latitude                   200614 non-null  float64 
 9   longitude                  200614 non-null  float64 
 10  street_address             200614 non-null  object  
 11  city                       200614 non-null  object  
 12  region                     200614 non-null  object  
 13  posta

In [15]:
place_gdf_IL.crs

<Projected CRS: EPSG:3528>
Name: NAD83(NSRS2007) / Illinois East
Axis Info [cartesian]:
- X[east]: Easting (metre)
- Y[north]: Northing (metre)
Area of Use:
- name: USA - Illinois - SPCS - E
- bounds: (-89.28, 37.06, -87.02, 42.5)
Coordinate Operation:
- name: SPCS83 Illinois East zone (meters)
- method: Transverse Mercator
Datum: NAD83 (National Spatial Reference System 2007)
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

## Get the census block groups of NE Illinois

FIPS code for the counties
* Cook: 031
* Dupage: 043
* Grundy: 063
* Kane: 089
* Kankakee: 091
* Kendall: 093
* Lake: 097
* McHenry: 111
* Will: 197

In [16]:
illinois_cbg = gpd.read_file('https://raw.githubusercontent.com/loganpowell/census-geojson/master/GeoJSON/500k/2019/17/block-group.json', 
                   crs = from_epsg(2163)).to_crs(epsg = 3528)

In [17]:
# limit to 9 counties mentioned above
target_cbg = illinois_cbg[illinois_cbg.COUNTYFP.isin(['031', '043', '063', '089', '091', '093', '097', '111', '197'])]
target_cbg = target_cbg[['GEOID', 'geometry']]
target_cbg.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 5998 entries, 0 to 9688
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   GEOID     5998 non-null   object  
 1   geometry  5998 non-null   geometry
dtypes: geometry(1), object(1)
memory usage: 140.6+ KB


In [18]:
print(target_cbg.crs)
#target_cbg.head()
print(place_gdf_IL.crs)

epsg:3528
epsg:3528


In [19]:
place_gdf_target = gpd.sjoin(place_gdf_IL, target_cbg, how="inner", op="within").drop("index_right", axis = 1)
place_gdf_target.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 132091 entries, 43 to 5230499
Data columns (total 20 columns):
 #   Column                     Non-Null Count   Dtype   
---  ------                     --------------   -----   
 0   safegraph_place_id         132091 non-null  object  
 1   parent_safegraph_place_id  18968 non-null   object  
 2   location_name              132091 non-null  object  
 3   safegraph_brand_ids        30605 non-null   object  
 4   brands                     30605 non-null   object  
 5   top_category               130284 non-null  object  
 6   sub_category               130284 non-null  object  
 7   naics_code                 130284 non-null  float64 
 8   latitude                   132091 non-null  float64 
 9   longitude                  132091 non-null  float64 
 10  street_address             132091 non-null  object  
 11  city                       132091 non-null  object  
 12  region                     132091 non-null  object  
 13  post

In [45]:
place_gdf_target['naics_2digit'] = place_gdf_target['naics_2digit'].fillna(0.0).astype(int)
place_gdf_target.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 132091 entries, 43 to 5230499
Data columns (total 21 columns):
 #   Column                     Non-Null Count   Dtype   
---  ------                     --------------   -----   
 0   safegraph_place_id         132091 non-null  object  
 1   parent_safegraph_place_id  18968 non-null   object  
 2   location_name              132091 non-null  object  
 3   safegraph_brand_ids        30605 non-null   object  
 4   brands                     30605 non-null   object  
 5   top_category               130284 non-null  object  
 6   sub_category               130284 non-null  object  
 7   naics_code                 130284 non-null  float64 
 8   latitude                   132091 non-null  float64 
 9   longitude                  132091 non-null  float64 
 10  street_address             132091 non-null  object  
 11  city                       132091 non-null  object  
 12  region                     132091 non-null  object  
 13  post

In [47]:
naics_summary = place_gdf_target.naics_code.value_counts()
naics_summary.head(20)

722511.0    13331
621111.0     6470
812112.0     5309
621210.0     4653
813110.0     4193
722513.0     3958
712190.0     3918
524210.0     3847
713940.0     3787
611110.0     3428
722515.0     3415
522110.0     3215
811111.0     2834
624410.0     2694
447110.0     2415
445110.0     2232
621330.0     2016
812199.0     1939
812113.0     1706
621310.0     1557
Name: naics_code, dtype: int64

In [50]:
naics_2digit_summary = place_gdf_target.naics_2digit.value_counts()
naics_2digit_summary

72    21924
44    21850
62    21568
81    20696
52     9647
71     9014
45     8074
61     4885
53     2871
54     2180
51     2122
0      1807
56     1043
92      940
31      853
49      791
48      665
42      535
33      237
23      169
32      153
22       59
11        6
6         2
Name: naics_2digit, dtype: int64

In [21]:
cbg_summary = place_gdf_target.GEOID.value_counts()
cbg_summary.head(20)

170318391001    1237
170318046033     709
170313204001     505
170313201002     393
170313201001     381
170318015001     358
170438446011     322
170938901011     285
170318069001     284
170898526013     272
170310817002     264
170318330001     264
170318422001     260
170318391002     258
171118713064     256
170317707001     253
170310814011     251
170438465042     243
170318016011     237
170438465192     236
Name: GEOID, dtype: int64

## Generate the IL places file

In [None]:
file_name = 'core_places_IL'
place_gdf_target.to_file(file_name + '.geojson', driver='GeoJSON')

In [None]:
# also to a plain csv file
pd.DataFrame(place_gdf_target.drop(columns='geometry')).to_csv(file_name + '.csv')

In [None]:
# copy to google Drive
copyfile(file_name + '.geojson', "drive/My Drive/SafeGraph/places/" + file_name + '.geojson')
copyfile(file_name + '.csv', "drive/My Drive/SafeGraph/places/" + file_name + '.csv')

'drive/My Drive/SafeGraph/places/core_places_IL.csv'