In [1]:
import numpy as np
import geopandas as gpd
import cartopy.crs as ccrs
from matplotlib import pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

In [2]:
import pandas as pd
from safegraph_py_functions import safegraph_py_functions as sgpy
import os
from dotenv import load_dotenv, find_dotenv

In [3]:
# Locate the project's .env file by walking up the directory tree from here.
dotenv_path = find_dotenv()

# Export the entries found in that file as environment variables.
load_dotenv(dotenv_path)

# ROOT_DIR anchors every data path used below. Fail here with a clear message
# instead of letting os.path.join choke on None further down.
root_dir = os.environ.get("ROOT_DIR")
if root_dir is None:
    raise RuntimeError(
        "ROOT_DIR is not set; define it in the environment or in a .env file"
    )
raw_data_dir = os.path.join(root_dir,'data/raw')

In [4]:
# local directory where we want to put all the data
patterns_path = os.path.join(raw_data_dir,'monthly-patterns')

# Collect the path of every normalization_stats.csv found below patterns_path.
# os.walk yields each directory path already rooted at patterns_path, so the
# directory and file name are joined directly; prepending patterns_path again
# would duplicate the prefix whenever patterns_path is a relative path.
files = []
for dirpath, _dirnames, filenames in os.walk(patterns_path):
    for filename in filenames:
        if 'normalization_stats.csv' in filename:
            files.append(os.path.join(dirpath, filename))

# os.walk visits directories in filesystem order; sort so the list (and the
# row order of anything built from it) is the same on every run.
files.sort()

In [5]:
# Display the collected normalization-stats file paths for a visual check.
files

['/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/normalization_stats_backfill/2020/05/07/12/2019/03/normalization_stats.csv',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/normalization_stats_backfill/2020/05/07/12/2019/04/normalization_stats.csv',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/normalization_stats_backfill/2020/05/07/12/2019/05/normalization_stats.csv',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/normalization_stats_backfill/2020/05/07/12/2019/02/normalization_stats.csv',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/normalization_stats_backfill/2020/05/07/12/2019/11/normalization_stats.csv',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/normalization_stats_backfill/2020/05/07/12/2019/10/normalization_stats.csv',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/normalization_stats_backfil

In [6]:
# Read every normalization-stats CSV and stack them into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# file's own row numbers are kept and the index labels repeat across files.
norm_stats = pd.concat(
    [pd.read_csv(stats_path) for stats_path in files],
    ignore_index=True,
)

In [7]:
# Cast the three calendar-part columns to integers, one column at a time.
for date_part in ('year', 'month', 'day'):
    norm_stats[date_part] = norm_stats[date_part].astype(int)

In [8]:
# Sanity check: rows whose region is missing or equal to 'ALL_STATES' should
# appear at most once per (year, month, day).
is_national_row = norm_stats['region'].isnull() | (norm_stats['region'] == 'ALL_STATES')
counts = (
    norm_stats[is_national_row]
    .groupby(['year','month','day'])
    .size()
    .reset_index(name = 'count')
)
# Any row shown here is a day with more than one such row; empty means the check passed.
counts[counts['count'].gt(1)]

Unnamed: 0,year,month,day,count


In [9]:
# Confirm the filtered table still covers every day: the number of distinct
# (year, month, day) groups in norm_stats must equal the number of rows in counts.
n_orig = norm_stats.groupby(['year','month','day']).size().shape[0]
n_filtered = counts['count'].shape[0]
n_filtered == n_orig

True

In [10]:
# Read philly_patterns.csv from the data/processed directory under root_dir.
processed_data_dir = os.path.join(root_dir,'data/processed')
philly_patterns_path = os.path.join(processed_data_dir,'philly_patterns.csv')
philly_patterns_df = pd.read_csv(philly_patterns_path)

In [11]:
# Convert the 'date' column to timezone-aware timestamps expressed in UTC.
date_utc = pd.to_datetime(philly_patterns_df['date'], utc = True)
philly_patterns_df['date'] = date_utc

In [12]:
# Derive year / month / day columns from the 'date' timestamps. A column that
# already carries one of these names is overwritten.
date_accessor = philly_patterns_df['date'].dt
for date_part in ('year', 'month', 'day'):
    philly_patterns_df[date_part] = getattr(date_accessor, date_part)

In [13]:
# Preview the first 20 rows, including the year/month/day columns derived from 'date'.
philly_patterns_df.head(20)

Unnamed: 0,safegraph_place_id,location_name,street_address,city,region,postal_code,date_range_start,date_range_end,raw_visit_counts,raw_visitor_counts,visits_by_day,poi_cbg,day_visit_counts,day,date,year,month
0,sg:26825b56c5ce4c7d91afdc71a7c34440,Provident Technology,1315 Walnut St Ste 905,Philadelphia,PA,19107,2020-10-01 00:00:00-04:00,2020-11-01T00:00:00-04:00,7,7,"[0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",421010000000.0,0,1,2020-10-01 04:00:00+00:00,2020,10
1,sg:26825b56c5ce4c7d91afdc71a7c34440,Provident Technology,1315 Walnut St Ste 905,Philadelphia,PA,19107,2020-10-01 00:00:00-04:00,2020-11-01T00:00:00-04:00,7,7,"[0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",421010000000.0,0,2,2020-10-02 04:00:00+00:00,2020,10
2,sg:26825b56c5ce4c7d91afdc71a7c34440,Provident Technology,1315 Walnut St Ste 905,Philadelphia,PA,19107,2020-10-01 00:00:00-04:00,2020-11-01T00:00:00-04:00,7,7,"[0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",421010000000.0,0,3,2020-10-03 04:00:00+00:00,2020,10
3,sg:26825b56c5ce4c7d91afdc71a7c34440,Provident Technology,1315 Walnut St Ste 905,Philadelphia,PA,19107,2020-10-01 00:00:00-04:00,2020-11-01T00:00:00-04:00,7,7,"[0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",421010000000.0,0,4,2020-10-04 04:00:00+00:00,2020,10
4,sg:26825b56c5ce4c7d91afdc71a7c34440,Provident Technology,1315 Walnut St Ste 905,Philadelphia,PA,19107,2020-10-01 00:00:00-04:00,2020-11-01T00:00:00-04:00,7,7,"[0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",421010000000.0,1,5,2020-10-05 04:00:00+00:00,2020,10
5,sg:26825b56c5ce4c7d91afdc71a7c34440,Provident Technology,1315 Walnut St Ste 905,Philadelphia,PA,19107,2020-10-01 00:00:00-04:00,2020-11-01T00:00:00-04:00,7,7,"[0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",421010000000.0,1,6,2020-10-06 04:00:00+00:00,2020,10
6,sg:26825b56c5ce4c7d91afdc71a7c34440,Provident Technology,1315 Walnut St Ste 905,Philadelphia,PA,19107,2020-10-01 00:00:00-04:00,2020-11-01T00:00:00-04:00,7,7,"[0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",421010000000.0,0,7,2020-10-07 04:00:00+00:00,2020,10
7,sg:26825b56c5ce4c7d91afdc71a7c34440,Provident Technology,1315 Walnut St Ste 905,Philadelphia,PA,19107,2020-10-01 00:00:00-04:00,2020-11-01T00:00:00-04:00,7,7,"[0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",421010000000.0,1,8,2020-10-08 04:00:00+00:00,2020,10
8,sg:26825b56c5ce4c7d91afdc71a7c34440,Provident Technology,1315 Walnut St Ste 905,Philadelphia,PA,19107,2020-10-01 00:00:00-04:00,2020-11-01T00:00:00-04:00,7,7,"[0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",421010000000.0,1,9,2020-10-09 04:00:00+00:00,2020,10
9,sg:26825b56c5ce4c7d91afdc71a7c34440,Provident Technology,1315 Walnut St Ste 905,Philadelphia,PA,19107,2020-10-01 00:00:00-04:00,2020-11-01T00:00:00-04:00,7,7,"[0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",421010000000.0,0,10,2020-10-10 04:00:00+00:00,2020,10


In [14]:
# Count the distinct (year, month, day) combinations in each table. Only
# norm_stats rows from 2019 onwards are considered on the stats side.
stats_from_2019 = norm_stats[norm_stats['year'] >= 2019]
n_patterns = philly_patterns_df.groupby(['year','month','day']).size().shape[0]
n_stats = stats_from_2019.groupby(['year','month','day']).size().shape[0]

In [15]:
# Both tables should cover the same number of distinct days.
# Note: this compares only the day counts, not the days themselves.
n_patterns == n_stats

True

In [16]:
# Keep only the normalization rows dated 2019 or later.
is_2019_or_later = norm_stats['year'] >= 2019
norm_stats = norm_stats.loc[is_2019_or_later]

In [None]:
# would be better to merge in every time we read a new month of data.
#
# norm_stats can hold several rows for the same day (rows for individual
# regions next to the row whose region is missing or 'ALL_STATES'). Merging it
# unfiltered would repeat every pattern row once per matching stats row and
# attach region-level totals to the copies. Keep only the national rows.
national_stats = norm_stats[
    norm_stats['region'].isnull() | (norm_stats['region'] == 'ALL_STATES')
]
# Drop the stats' own 'region' column so it does not collide with the 'region'
# column of philly_patterns_df (pandas would rename both to region_x/region_y).
national_stats = national_stats.drop(columns = 'region')

# validate='many_to_one' makes pandas raise if any day still has more than one
# stats row, so the merge can never silently inflate the row count.
philly_patterns_df = philly_patterns_df.merge(
    national_stats,
    on = ['year','month','day'],
    validate = 'many_to_one',
)

In [None]:
# Show the dtype of each column of philly_patterns_df after the merge.
philly_patterns_df.dtypes

In [None]:
# Normalize each row's daily visit count by the total_devices_seen value on the same row.
daily_visits = philly_patterns_df['day_visit_counts']
devices_seen = philly_patterns_df['total_devices_seen']
philly_patterns_df['day_visits_normalized'] = daily_visits.div(devices_seen)

In [None]:
# Columns kept for plotting: the date parts, the place identifiers, and the
# normalized and raw daily visit counts.
keep_cols = [
    'year',
    'month',
    'day',
    'safegraph_place_id',
    'postal_code',
    'day_visits_normalized',
    'day_visit_counts',
]

In [None]:
# Restrict philly_patterns_df to the columns listed in keep_cols.
# NOTE(review): if plot_data is modified later, an explicit .copy() here may be
# needed to avoid pandas' SettingWithCopyWarning — confirm against downstream cells.
plot_data = philly_patterns_df[keep_cols]

In [None]:
# Preview the first rows of the plotting subset.
plot_data.head()