# Get Geography and Adjust for Sampling

The SafeGraph Patterns dataset reveals where the devices, and by proxy the people, visiting White Pass reside. By itself, this does not provide a tremendous amount of information. However, adding geographic context reveals valuable insight. If we know where people reside, we can learn a tremendous amount about who they are by adding demographic data. The demographic data Esri curates includes a count of cell phone usage, which can be used to adjust the SafeGraph Patterns origin Block Group counts so the sample is more representative of the population.

In [1]:
import importlib
import json
import math
import os
from pathlib import Path
import sys

from arcgis.features import GeoAccessor, GeoSeriesAccessor
from dm import Country, utils, proximity
import pandas as pd

# load the "autoreload" extension so that code can change, & always reload modules so that as you change code in src, it gets loaded
%load_ext autoreload
%autoreload 2

In [2]:
# the project root is the parent of the current working directory
dir_prj = Path.cwd().parent

# data directories, all resolved relative to the project root
dir_data = dir_prj.joinpath('data')
dir_raw, dir_int = dir_data.joinpath('raw'), dir_data.joinpath('interim')

# geodatabases living inside the raw and interim data directories
gdb_raw = dir_raw.joinpath('raw.gdb')
gdb_int = dir_int.joinpath('interim.gdb')

The first step is loading the block group counts saved by the previous step into a Pandas DataFrame.

In [13]:
# Load the per-block-group visit counts, using the first CSV column (the block group ID) as the index.
sg_bg_df = pd.read_csv(dir_int/'block_group_raw.csv', index_col=0)

# Block group IDs are 12-character codes, but read_csv infers the all-digit column as an integer,
# which silently drops the leading zero of any ID starting with '0'. Zero-pad back to 12 characters
# so every ID matches the string IDs used in the query string and the join further down.
sg_bg_df.index = sg_bg_df.index.astype(str).str.zfill(12)

# Prefix the count columns so they are recognizable as pattern columns once joined to other data.
sg_bg_df.columns = [f'ptrn_{c}' for c in sg_bg_df.columns]

sg_bg_df.head()

Unnamed: 0,ptrn_2017_2018,ptrn_2018_2019,ptrn_2018_2019_delta,ptrn_2019_2020,ptrn_2019_2020_delta,ptrn_2020_2021,ptrn_2020_2021_delta
530670117102,10.0,4.0,-6.0,7.0,3.0,0.0,-7.0
530770034001,7.0,5.0,-2.0,9.0,4.0,4.0,-5.0
530150009003,6.0,0.0,-6.0,0.0,0.0,0.0,0.0
530770008001,6.0,9.0,3.0,9.0,0.0,4.0,-5.0
530050108033,7.0,5.0,-2.0,6.0,1.0,4.0,-2.0


Next, in preparation for getting the block group geometries, we build a query string so only the needed features are retrieved, which dramatically speeds up the process. While the query string shown in the output only contains 10 records, the query string used for the analysis contains quite a few more.

In [16]:
def create_bg_query_str(bg_df):
    """Build an ``ID IN (...)`` query string from the index values of a DataFrame.

    Args:
        bg_df: DataFrame whose index holds the block group IDs to select.

    Returns:
        String of the form ``ID IN ('id_1','id_2',...)`` with every index value
        wrapped in single quotes and separated by commas.
    """
    quoted_ids = ','.join("'{}'".format(bg_id) for bg_id in bg_df.index)
    return 'ID IN (' + quoted_ids + ')'

# query string covering every block group in the patterns data - used for the retrieval below
query_str = create_bg_query_str(sg_bg_df)

# short preview built from only the first ten block groups, displayed for illustration
create_bg_query_str(sg_bg_df.head(10))

"ID IN ('530670117102','530770034001','530150009003','530770008001','530050108033','530459613002','530530728001','350579632021','530419720003','530530606001')"

Now, we take advantage of the Demographic Modeling Module to retrieve the block group features.

In [17]:
# Country object from the dm module for the USA; it is reused below to retrieve
# block groups, list the available enrichment variables and enrich the geographies.
cntry = Country('USA')

cntry

<dm.Country - USA (local)>

Using the country object, we can now get the block group features with their block group IDs and geometries.

In [6]:
%%time
# Retrieve block groups, passing the query string built from the patterns data so the
# request is limited to the IDs present in sg_bg_df rather than every block group.
bg_df = cntry.block_groups.get(query_string=query_str)

bg_df.head()

Wall time: 11.9 s


Unnamed: 0,ID,NAME,SHAPE
0,150030002005,150030002.005,"{""rings"": [[[-157.72246000033724, 21.291575000..."
1,150030099022,150030099.022,"{""rings"": [[[-158.11245000020728, 21.577039999..."
2,150030101003,150030101.003,"{""rings"": [[[-157.98319000002704, 21.711790000..."
3,150070401041,150070401.041,"{""rings"": [[[-159.55847000041484, 22.226470001..."
4,410030102003,410030102.003,"{""rings"": [[[-123.43794100008853, 44.719628999..."


In [7]:
evars = cntry.enrich_variables

# Optional extra filter, left disabled - useful if wanting to do more detailed modeling with
# the key demographic variables for the current year:
#    ((evars.data_collection.str.startswith('Key')) & (evars.name.str.endswith('CY')))
enrich_var_filter = (
    evars.name.str.contains('TSEGCODE')    # tapestry segment code
    | evars.name.str.contains('TSEGNAME')  # tapestry segment name
    | (evars.name == 'MP19014a_B')         # count of people with a cell phone - used to calculate market penetration and representative sample
    | (evars.name == 'TOTPOP_CY')          # total population - used to calculate market penetration and representative sample
)

# the same variable name can be listed more than once, so keep only the first row per name
enrich_vars = evars[enrich_var_filter].drop_duplicates('name')

enrich_vars

Unnamed: 0,name,alias,type,vintage,data_collection,enrich_str,enrich_field_name
3179,MP19014a_B,2020 Accessed Internet in last 30 days using c...,COUNT,2020,ElectronicsInternet,ElectronicsInternet.MP19014a_B,ElectronicsInternet_MP19014a_B
5004,TOTPOP_CY,2020 Total Population,COUNT,2020,HistoricalPopulation,HistoricalPopulation.TOTPOP_CY,HistoricalPopulation_TOTPOP_CY
7415,TSEGNAME,2020 Dom Tapestry Segment Name,TEXT,2020,Policy,Policy.TSEGNAME,Policy_TSEGNAME
8546,TSEGCODE,2020 Dominant Tapestry Segment,TEXT,2020,tapestryhouseholdsNEW,tapestryhouseholdsNEW.TSEGCODE,tapestryhouseholdsNEW_TSEGCODE


In [8]:
%%time
# Enrich the block group features with the selected variables, then drop the NAME column,
# which is not used in any of the following steps.
bg_enrch_df = bg_df.dm.enrich(enrich_vars, country=cntry).drop(columns='NAME')

bg_enrch_df.head()

Wall time: 14.2 s


Unnamed: 0,ID,ElectronicsInternet_MP19014a_B,HistoricalPopulation_TOTPOP_CY,Policy_TSEGNAME,tapestryhouseholdsNEW_TSEGCODE,SHAPE
0,150030002005,988.0,1503.0,Pacific Heights,2C,"{""rings"": [[[-157.72246000033724, 21.291575000..."
1,150030099022,775.0,1241.0,Pacific Heights,2C,"{""rings"": [[[-158.11245000020728, 21.577039999..."
2,150030101003,591.0,955.0,Pacific Heights,2C,"{""rings"": [[[-157.98319000002704, 21.711790000..."
3,150070401041,540.0,937.0,The Great Outdoors,6C,"{""rings"": [[[-159.55847000041484, 22.226470001..."
4,410030102003,712.0,1169.0,The Great Outdoors,6C,"{""rings"": [[[-123.43794100008853, 44.719628999..."


In [9]:
# Attach the pattern counts to the enriched block groups by matching the ID column against
# the index of sg_bg_df. DataFrame.join defaults to a left join, so every enriched block
# group is kept and any ID missing from sg_bg_df gets NaN pattern values.
bg_ptrn_df = bg_enrch_df.join(sg_bg_df, on='ID')
# NOTE(review): presumably needed so the joined frame still treats SHAPE as its geometry column - confirm
bg_ptrn_df.spatial.set_geometry('SHAPE')

bg_ptrn_df.head()

Unnamed: 0,ID,ElectronicsInternet_MP19014a_B,HistoricalPopulation_TOTPOP_CY,Policy_TSEGNAME,tapestryhouseholdsNEW_TSEGCODE,SHAPE,ptrn_2017_2018,ptrn_2018_2019,ptrn_2018_2019_delta,ptrn_2019_2020,ptrn_2019_2020_delta,ptrn_2020_2021,ptrn_2020_2021_delta
0,150030002005,988.0,1503.0,Pacific Heights,2C,"{""rings"": [[[-157.72246000033724, 21.291575000...",0.0,0.0,0.0,4.0,4.0,0.0,-4.0
1,150030099022,775.0,1241.0,Pacific Heights,2C,"{""rings"": [[[-158.11245000020728, 21.577039999...",0.0,4.0,4.0,0.0,-4.0,0.0,0.0
2,150030101003,591.0,955.0,Pacific Heights,2C,"{""rings"": [[[-157.98319000002704, 21.711790000...",0.0,4.0,4.0,0.0,-4.0,0.0,0.0
3,150070401041,540.0,937.0,The Great Outdoors,6C,"{""rings"": [[[-159.55847000041484, 22.226470001...",4.0,0.0,-4.0,0.0,0.0,0.0,0.0
4,410030102003,712.0,1169.0,The Great Outdoors,6C,"{""rings"": [[[-123.43794100008853, 44.719628999...",0.0,4.0,4.0,0.0,-4.0,0.0,0.0


In [10]:
# column holding the count of people with a cell phone
cell_col = 'ElectronicsInternet_MP19014a_B'

# column holding total population - the first column whose name ends with the variable name
pop_col = list(filter(lambda col: col.endswith('TOTPOP_CY'), bg_ptrn_df.columns))[0]

print(cell_col, pop_col)

ElectronicsInternet_MP19014a_B HistoricalPopulation_TOTPOP_CY


In [11]:
# Scale each observed device count up to an estimated count of people:
#     adjusted = observed / cell_phone_users * total_population
# The cell phone and population columns are the same for every pattern column, so look them up once.
# A block group reporting zero cell phone users has no defined adjustment; masking those zeros
# to NaN gives NaN in the adjusted columns instead of the +/-inf plain division by zero produces.
cell_cnt = bg_ptrn_df[cell_col].where(bg_ptrn_df[cell_col] != 0)
pop_cnt = bg_ptrn_df[pop_col]

# Iterate the original pattern columns (from sg_bg_df), so re-running the cell never
# stacks a second "_adj" suffix onto columns that were already adjusted.
for sg_col in sg_bg_df.columns:
    bg_ptrn_df[f'{sg_col}_adj'] = bg_ptrn_df[sg_col] / cell_cnt * pop_cnt

bg_ptrn_df.head()

Unnamed: 0,ID,ElectronicsInternet_MP19014a_B,HistoricalPopulation_TOTPOP_CY,Policy_TSEGNAME,tapestryhouseholdsNEW_TSEGCODE,SHAPE,ptrn_2017_2018,ptrn_2018_2019,ptrn_2018_2019_delta,ptrn_2019_2020,ptrn_2019_2020_delta,ptrn_2020_2021,ptrn_2020_2021_delta,ptrn_2017_2018_adj,ptrn_2018_2019_adj,ptrn_2018_2019_delta_adj,ptrn_2019_2020_adj,ptrn_2019_2020_delta_adj,ptrn_2020_2021_adj,ptrn_2020_2021_delta_adj
0,150030002005,988.0,1503.0,Pacific Heights,2C,"{""rings"": [[[-157.72246000033724, 21.291575000...",0.0,0.0,0.0,4.0,4.0,0.0,-4.0,0.0,0.0,0.0,6.08502,6.08502,0.0,-6.08502
1,150030099022,775.0,1241.0,Pacific Heights,2C,"{""rings"": [[[-158.11245000020728, 21.577039999...",0.0,4.0,4.0,0.0,-4.0,0.0,0.0,0.0,6.405161,6.405161,0.0,-6.405161,0.0,0.0
2,150030101003,591.0,955.0,Pacific Heights,2C,"{""rings"": [[[-157.98319000002704, 21.711790000...",0.0,4.0,4.0,0.0,-4.0,0.0,0.0,0.0,6.463621,6.463621,0.0,-6.463621,0.0,0.0
3,150070401041,540.0,937.0,The Great Outdoors,6C,"{""rings"": [[[-159.55847000041484, 22.226470001...",4.0,0.0,-4.0,0.0,0.0,0.0,0.0,6.940741,0.0,-6.940741,0.0,0.0,0.0,0.0
4,410030102003,712.0,1169.0,The Great Outdoors,6C,"{""rings"": [[[-123.43794100008853, 44.719628999...",0.0,4.0,4.0,0.0,-4.0,0.0,0.0,0.0,6.567416,6.567416,0.0,-6.567416,0.0,0.0


In [12]:
%%time
# Write the block groups with raw and adjusted pattern counts to the interim geodatabase.
out_fc = bg_ptrn_df.spatial.to_featureclass(gdb_int/'bg_ptrn')
# NOTE(review): presumably adds readable field aliases for the enrichment columns - confirm in dm.utils
utils.add_enrich_aliases(out_fc, cntry)

out_fc

Wall time: 8.37 s


'D:\\projects\\safegraph-data-utilities\\data\\interim\\interim.gdb\\bg_ptrn'