In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import ast
from glob import glob

from fiona.crs import from_epsg
from shapely.geometry import Point

import pytz

### Get the census block groups of Chicago

In [2]:
# Load the block-group GeoJSON (the URL path points at 2019 / state 17) ...
# NOTE(review): GeoJSON is normally lon/lat (EPSG:4326) - confirm whether the
# `crs=` keyword below is honoured or ignored by the installed geopandas/fiona.
cook_cbg = gpd.read_file(
    'https://raw.githubusercontent.com/loganpowell/census-geojson/master/GeoJSON/500k/2019/17/block-group.json',
    crs=from_epsg(2163),
)
# ... and reproject it to EPSG:3528, the CRS used for every layer in this notebook.
cook_cbg = cook_cbg.to_crs(epsg=3528)
# Keep Cook County rows (COUNTYFP '031') and only the id and geometry columns.
cook_cbg = cook_cbg[cook_cbg['COUNTYFP'] == '031'][['GEOID', 'geometry']]
cook_cbg.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 3992 entries, 0 to 9688
Data columns (total 2 columns):
GEOID       3992 non-null object
geometry    3992 non-null object
dtypes: object(2)
memory usage: 93.6+ KB


In [3]:
# Download the city boundary layer from the Chicago data portal.
# NOTE(review): GeoJSON is normally lon/lat (EPSG:4326) - confirm whether the
# `crs=` keyword below is honoured or ignored by the installed geopandas/fiona.
chi_bound = gpd.read_file(
    'https://data.cityofchicago.org/api/geospatial/ewy2-6yfk?method=export&format=GeoJSON',
    crs=from_epsg(2163),
)
# Reproject to EPSG:3528 so it shares a CRS with the block-group layer.
chi_bound = chi_bound.to_crs(epsg=3528)

In [4]:
# Keep the Cook County block groups that intersect the Chicago boundary.
# 'intersects' is sjoin's default spatial predicate, so it is left implicit:
# the keyword is spelled `op` in older geopandas and `predicate` in newer
# releases, and omitting it keeps this cell runnable on both.
chi_cbg = gpd.sjoin(cook_cbg, chi_bound).drop(columns="index_right")
chi_cbg.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 2325 entries, 0 to 9685
Data columns (total 6 columns):
GEOID         2325 non-null object
geometry      2325 non-null object
name          2325 non-null object
objectid      2325 non-null object
shape_area    2325 non-null object
shape_len     2325 non-null object
dtypes: object(6)
memory usage: 127.1+ KB


### Turning device counts into probabilities

I just use one file here, `2020-04-30-social-distancing.csv.gz`, to describe what I did.

For social distancing metrics, I focused on `bucketed_percentage_time_home` and `completely_home_device_count`.

These variables are explained in [the SafeGraph page](https://docs.safegraph.com/docs/social-distancing-metrics).
* `completely_home_device_count`: Out of the device_count, the number of devices which did not leave the geohash-7 in which their home is located during the time period.
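* `bucketed_percentage_time_home`: Key is a range of the percentage of time a device was observed at home and value is the count of devices in that range, e.g. `{"0-25": 0, "26-50": 10, "51-75": 40, "76-100": 50}`; these percentage keys are the ones `get_prob` reads below. The dataset also has a minute-based counterpart, `bucketed_home_dwell_time`: Key is range of minutes and value is device count of devices that dwelled at geohash-7 of home for the given time period, `{"<60": 0, "61-360": 0, "361-720": 10, "721-1080": 40, ">1081": 50}`. For each device, we summed the observed minutes at home across the day (whether or not these were contiguous) to get the total minutes for each device this day. Then we count how many devices are in each bucket. Beginning in v2, we include the portion of any stop within the time range regardless of whether the stop start time was in the time period.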


In [5]:
def to_lit(x):
    """Parse one raw `bucketed_percentage_time_home` cell into a dict.

    Parameters
    ----------
    x : str, dict, or missing value
        The cell content: a dict literal stored as text, an already-parsed
        dict, or a missing value (NaN / None).

    Returns
    -------
    dict
        The parsed mapping; an empty dict when the cell is missing.
    """
    # Already parsed (e.g. the cell was re-run): pass it through unchanged.
    if isinstance(x, dict):
        return x
    # pd.isna catches every missing-value flavour; an identity test against
    # np.nan only matches that one singleton and lets other NaNs/None crash
    # inside ast.literal_eval.
    if pd.isna(x):
        return {}
    return ast.literal_eval(x)


def get_prob(input_df):
    """Turn the device counts of a social-distancing table into probabilities.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain `origin_census_block_group`, `date_range_start`,
        `device_count`, `distance_traveled_from_home`,
        `completely_home_device_count`, `bucketed_percentage_time_home`,
        `part_time_work_behavior_devices`, `full_time_work_behavior_devices`
        and `delivery_behavior_devices`. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per input row with the columns `origin_census_block_group`,
        `date`, `device_count`, `distance_traveled_from_home`,
        `p_completely_home`, `p_lt_6h_out`, `p_6h_12h_out`, `p_ht_12h_out`,
        `p_part_time_work`, `p_full_time_work` and `p_delivery`.
        Bucket-based probabilities are NaN for rows whose bucket counts sum
        to zero (including rows with a missing bucket cell).
    """
    # Work on a copy so the caller's frame keeps its original columns and the
    # assignments below never write into a view of another frame.
    df = input_df.copy()

    # Local (US/Central) calendar date of the start of the observation window.
    # utc=True normalises any UTC offset found in the raw timestamps first.
    df["date"] = (
        pd.to_datetime(df["date_range_start"], utc=True)
        .dt.tz_convert('US/Central')
        .dt.date
    )

    # I focused on `bucketed_percentage_time_home`, along with `completely_home_device_count`
    buckets = df["bucketed_percentage_time_home"].apply(to_lit)
    # Device counts per bucket; a bucket absent from the dict counts as 0.
    cnt_lt_6h_out = buckets.apply(lambda b: b.get("76-100", 0))
    cnt_6h_12h_out = buckets.apply(lambda b: b.get("51-75", 0))
    cnt_12h_18h_out = buckets.apply(lambda b: b.get("26-50", 0))
    cnt_ht_18h_out = buckets.apply(lambda b: b.get("0-25", 0))
    cnt_total = cnt_lt_6h_out + cnt_6h_12h_out + cnt_12h_18h_out + cnt_ht_18h_out

    # device counts were turned into these probabilities
    df["p_completely_home"] = df["completely_home_device_count"] / df["device_count"]
    df["p_lt_6h_out"] = cnt_lt_6h_out / cnt_total
    df["p_6h_12h_out"] = cnt_6h_12h_out / cnt_total
    # The two lowest buckets are merged into the remainder of the distribution.
    df["p_ht_12h_out"] = 1 - (df["p_lt_6h_out"] + df["p_6h_12h_out"])

    df["p_part_time_work"] = df["part_time_work_behavior_devices"] / df["device_count"]
    df["p_full_time_work"] = df["full_time_work_behavior_devices"] / df["device_count"]
    df["p_delivery"] = df["delivery_behavior_devices"] / df["device_count"]

    select_columns = ['origin_census_block_group', 'date', 'device_count', 'distance_traveled_from_home',
                      'p_completely_home', 'p_lt_6h_out', 'p_6h_12h_out', 'p_ht_12h_out',
                      'p_part_time_work', 'p_full_time_work', 'p_delivery']

    output_df = df[select_columns]

    return output_df
    

In [6]:
# Read the single daily file used as the worked example; the gzip
# compression is inferred from the file name.
oneday_df = pd.read_csv(
    './2020-04-30-social-distancing.csv.gz',
    compression='infer',
)
oneday_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219343 entries, 0 to 219342
Data columns (total 20 columns):
origin_census_block_group                     219343 non-null int64
date_range_start                              219343 non-null object
date_range_end                                219343 non-null object
device_count                                  219343 non-null int64
distance_traveled_from_home                   219324 non-null float64
bucketed_distance_traveled                    219343 non-null object
median_dwell_at_bucketed_distance_traveled    219324 non-null object
completely_home_device_count                  219343 non-null int64
median_home_dwell_time                        219343 non-null int64
bucketed_home_dwell_time                      219343 non-null object
at_home_by_each_hour                          219343 non-null object
part_time_work_behavior_devices               219343 non-null int64
full_time_work_behavior_devices               219343 non-null int

In [7]:
# select Cook county: dropping the last 7 digits of the 12-digit block-group
# id leaves the 5-digit state+county code (17031).
# .loc + .assign build a new frame, so the GEOID column is never written onto
# a filtered slice (which is what raises SettingWithCopyWarning).
oneday_df = (
    oneday_df
    .loc[oneday_df.origin_census_block_group // 10000000 == 17031]
    .assign(GEOID=lambda d: d.origin_census_block_group.astype('str'))
)

# filter only Chicago CBGs (inner join keeps ids present in both tables)
oneday_df = pd.merge(chi_cbg, oneday_df, on='GEOID', how='inner')
oneday_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 2315 entries, 0 to 2314
Data columns (total 26 columns):
GEOID                                         2315 non-null object
geometry                                      2315 non-null object
name                                          2315 non-null object
objectid                                      2315 non-null object
shape_area                                    2315 non-null object
shape_len                                     2315 non-null object
origin_census_block_group                     2315 non-null int64
date_range_start                              2315 non-null object
date_range_end                                2315 non-null object
device_count                                  2315 non-null int64
distance_traveled_from_home                   2314 non-null float64
bucketed_distance_traveled                    2315 non-null object
median_dwell_at_bucketed_distance_traveled    2314 non-null object
completely_home

In [8]:
import warnings

# Silence warnings only while get_prob runs; the filter is restored when the
# `with` block exits, so warnings raised by later cells stay visible.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    oneday_df2 = get_prob(oneday_df)
oneday_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2315 entries, 0 to 2314
Data columns (total 11 columns):
origin_census_block_group      2315 non-null int64
date                           2315 non-null object
device_count                   2315 non-null int64
distance_traveled_from_home    2314 non-null float64
p_completely_home              2315 non-null float64
p_lt_6h_out                    2315 non-null float64
p_6h_12h_out                   2315 non-null float64
p_ht_12h_out                   2315 non-null float64
p_part_time_work               2315 non-null float64
p_full_time_work               2315 non-null float64
p_delivery                     2315 non-null float64
dtypes: float64(8), int64(2), object(1)
memory usage: 217.0+ KB


### The same columns, computed for every daily file from 2019-01-01 to 2020-05-26, are saved into `social_distancing_17031.csv`