In [1]:
import numpy as np
import geopandas as gpd
from matplotlib import pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

In [2]:
import pandas as pd
from safegraph_py_functions import safegraph_py_functions as sgpy
import os
from dotenv import load_dotenv, find_dotenv

In [3]:
# Locate the project's .env file by walking up the directory tree from here.
dotenv_path = find_dotenv()

# Load its entries as environment variables, then resolve the data directories.
load_dotenv(dotenv_path)
root_dir = os.environ.get("ROOT_DIR")
if not root_dir:
    # Fail with an actionable message instead of the opaque TypeError that
    # os.path.join(None, ...) would raise on the next line.
    raise RuntimeError(
        "ROOT_DIR is not set; define it in the environment or in a .env file"
    )
raw_data_dir = os.path.join(root_dir,'data/raw')

In [4]:
# local directory where we want to put all the data
patterns_path = os.path.join(raw_data_dir,'monthly-patterns')

# Collect the path of every normalization_stats.csv found under patterns_path.
# os.walk yields each directory path already prefixed with patterns_path, so
# only that directory and the file name are joined; joining patterns_path on
# again would duplicate the prefix whenever patterns_path is a relative path.
files = [
    os.path.join(dirpath, filename)
    for dirpath, _dirnames, filenames in os.walk(patterns_path)
    for filename in filenames
    if 'normalization_stats.csv' in filename
]

In [5]:
# Inspect the normalization-stats CSV paths that were discovered.
files

['/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/normalization_stats_backfill/2020/05/07/12/2019/03/normalization_stats.csv',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/normalization_stats_backfill/2020/05/07/12/2019/04/normalization_stats.csv',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/normalization_stats_backfill/2020/05/07/12/2019/05/normalization_stats.csv',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/normalization_stats_backfill/2020/05/07/12/2019/02/normalization_stats.csv',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/normalization_stats_backfill/2020/05/07/12/2019/11/normalization_stats.csv',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/normalization_stats_backfill/2020/05/07/12/2019/10/normalization_stats.csv',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/normalization_stats_backfil

In [6]:
# for files with information disaggregated at the state level, keep only the country-wide info
def keep_total_level(norm_stats):
    """Reduce a normalization-stats frame to its 'ALL_STATES' rows.

    A frame without a ``region`` column is returned unchanged. Otherwise only
    the rows whose ``region`` equals ``'ALL_STATES'`` are kept and the
    ``region`` column is dropped.

    Raises:
        ValueError: if a ``region`` column exists but no row is 'ALL_STATES'.
    """
    if 'region' not in norm_stats.columns:
        return norm_stats

    is_total = norm_stats['region'] == 'ALL_STATES'
    if not is_total.any():
        raise ValueError('no region named "ALL_STATES"')
    return norm_stats[is_total].drop(columns=['region'])

In [7]:
# Read every stats file, reduce it to the country-wide rows, stack the results,
# and keep only the rows from 2019 onward.
stats_frames = [keep_total_level(pd.read_csv(path)) for path in files]
norm_stats = pd.concat(stats_frames, sort=True)
norm_stats = norm_stats.loc[norm_stats['year'] >= 2019]

In [8]:
# Cast the date-part columns to int in one pass; they are used as merge keys later.
norm_stats = norm_stats.astype({'year': int, 'month': int, 'day': int})

In [9]:
# Preview the combined normalization stats.
norm_stats.head()

Unnamed: 0,day,month,total_devices_seen,total_home_visitors,total_home_visits,total_visits,year
0,1,3,19864233,15964091,27636202,69304245,2019
1,2,3,19501615,15565360,26199538,62486898,2019
2,3,3,19444962,16285916,27158187,55210974,2019
3,4,3,19781861,16034434,27567027,63672615,2019
4,5,3,19171193,15457780,26656320,63906175,2019


In [10]:
# Load the per-zip patterns file and store postal codes as strings.
processed_data_dir = os.path.join(root_dir,'data/processed')
philly_patterns_df = (
    pd.read_csv(os.path.join(processed_data_dir,'philly_patterns_by_zip.csv'))
    .assign(postal_code=lambda patterns: patterns['postal_code'].astype(str))
)

In [11]:
# Parse the date column into timezone-aware timestamps normalized to UTC.
philly_patterns_df['date'] = pd.to_datetime(philly_patterns_df['date'], utc = True)

In [12]:
# Split the timestamp into year / month / day columns.
philly_patterns_df = philly_patterns_df.assign(
    year=philly_patterns_df['date'].dt.year,
    month=philly_patterns_df['date'].dt.month,
    day=philly_patterns_df['date'].dt.day,
)

In [13]:
# The timestamp is no longer needed once year/month/day have been extracted.
philly_patterns_df = philly_patterns_df.drop(columns = ['date'])

In [14]:
# Collapse to one row per (year, month, day, postal_code), summing the remaining columns.
day_zip_keys = ['year', 'month', 'day', 'postal_code']
philly_patterns_df = philly_patterns_df.groupby(day_zip_keys).sum().reset_index()

In [15]:
# Preview the daily per-zip visit counts.
philly_patterns_df.head(5)

Unnamed: 0,year,month,day,postal_code,day_visit_counts
0,2019,1,1,19102,4491
1,2019,1,1,19103,3821
2,2019,1,1,19104,5450
3,2019,1,1,19106,2403
4,2019,1,1,19107,7758


In [16]:
# Check that every date has the same number of zips: count the rows per date and
# display any date with fewer than the maximum (an empty result means all match).
dates_df = (
    philly_patterns_df
    .groupby(['year', 'month', 'day'])
    .size()
    .reset_index(name='num_zips')
)
dates_df[dates_df['num_zips'] < dates_df['num_zips'].max()]

Unnamed: 0,year,month,day,num_zips


In [None]:
# Check that the patterns and the stats cover the same number of distinct days.
day_keys = ['year', 'month', 'day']
dfs = [philly_patterns_df, norm_stats]
lens = [frame.groupby(day_keys).ngroups for frame in dfs]
lens[0] == lens[1]

In [None]:
# The code below is no longer relevant; it was written for the original Philly patterns data, which had more variables.
#keep_cols = ['year','month','day', 'postal_code','day_visit_counts']
#philly_patterns_df = philly_patterns_df[keep_cols].groupby(['year','month','day', 'postal_code']).agg('sum').reset_index()
#philly_patterns_df['postal_code'] = philly_patterns_df['postal_code'].apply(lambda x: ('00000' + str(x))[-5:])

In [None]:
# Attach the daily normalization stats to every zip/day row.
# NOTE(review): merge defaults to an inner join, so any day missing from
# norm_stats is dropped here, and re-running this cell merges the stats in again.
# would be better to merge in every time we read a new month of data.
philly_patterns_df = philly_patterns_df.merge(norm_stats, on = ['year','month','day'])

In [None]:
# Check the column dtypes after the merge.
philly_patterns_df.dtypes

In [None]:
# Normalize each zip's daily visit count by the number of devices seen that day
# (total_devices_seen comes from the merged normalization stats).
philly_patterns_df['day_visits_normalized'] = philly_patterns_df['day_visit_counts']/philly_patterns_df['total_devices_seen']

In [None]:
# Sum the raw and normalized daily visits into one row per (year, month, postal_code).
zip_data = (
    philly_patterns_df
    .groupby(['year', 'month', 'postal_code'])[['day_visit_counts', 'day_visits_normalized']]
    .sum()
    .reset_index()
)

In [None]:
# Preview the monthly per-zip totals.
zip_data.head()

In [None]:
# Number of distinct postal codes in the monthly table.
len(zip_data.postal_code.unique())

In [None]:
# Attach each zip's January values for the same year to every monthly row; the
# January copies of the columns get the "_jan" suffix.
january_rows = zip_data[zip_data['month'] == 1]
zip_data = zip_data.merge(
    january_rows,
    on=['year', 'postal_code'],
    suffixes=("", "_jan"),
)

In [None]:
# Express each month's raw and normalized visits relative to the same year's January.
zip_data = zip_data.assign(
    visits_relative=zip_data['day_visit_counts'] / zip_data['day_visit_counts_jan'],
    visits_normalized_relative=(
        zip_data['day_visits_normalized'] / zip_data['day_visits_normalized_jan']
    ),
)

In [None]:
# The January baseline columns have served their purpose; remove them.
jan_columns = [name for name in zip_data.columns if name.endswith('_jan')]
zip_data = zip_data.drop(columns=jan_columns)

In [None]:
# Use shorter metric names; after the monthly aggregation these columns hold monthly sums.
zip_data = zip_data.rename(columns = {'day_visit_counts': 'visits', 'day_visits_normalized':'visits_normalized'})

In [None]:
# Put 2020 and 2019 side by side for each (month, postal_code); the overlapping
# columns are suffixed with the year they came from.
rows_2020 = zip_data[zip_data['year'] == 2020]
rows_2019 = zip_data[zip_data['year'] == 2019]
zip_data = rows_2020.merge(
    rows_2019,
    on=['month', 'postal_code'],
    suffixes=("_2020", "_2019"),
)

In [None]:
# For every metric, add a "<metric>_dif" column holding the 2020 value minus the 2019 value.
metrics = ['visits', 'visits_normalized', 'visits_relative', 'visits_normalized_relative']
for metric in metrics:
    col_2020, col_2019 = metric + '_2020', metric + '_2019'
    plot_col = metric + '_dif'
    zip_data[plot_col] = zip_data[col_2020].sub(zip_data[col_2019])

In [None]:
# The year is already encoded in the _2020/_2019 column suffixes, so the year columns are redundant.
zip_data = zip_data.drop(columns = ['year_2020','year_2019'])

In [None]:
# Preview the year-over-year comparison table.
zip_data.head()

In [None]:
# Download the ZIP shapefile directly (this avoids putting the data into GitHub),
# then reduce it to the postal code and geometry of the Pennsylvania rows.
zip_shp = gpd.read_file("http://faculty.baruch.cuny.edu/geoportal/data/esri/usa/census/zip_poly.zip")
zip_shp = zip_shp.rename(columns={'ZIP': 'postal_code'})
in_pa = zip_shp['STATE'] == 'PA'
zip_shp = zip_shp[in_pa][['postal_code', 'geometry']]

In [None]:
# Preview the Pennsylvania zip geometries.
zip_shp.head()

In [None]:
# Sanity check: the distinct string lengths of the shapefile's postal codes.
zip_shp['postal_code'].apply(len).unique()

In [None]:
# Join the zip geometries onto the visit metrics by postal code. The merge is
# started from zip_shp so that the geometry column is carried into the result,
# which is reprojected and plotted below.
# NOTE(review): merge defaults to an inner join, so postal codes missing from either side are dropped.
zip_data = zip_shp.merge(zip_data, on = ['postal_code'])

In [None]:
# Preview the metrics with their geometries attached.
zip_data.head()

In [None]:
# Reproject the geometries to EPSG:3857 before plotting.
zip_data = zip_data.to_crs(epsg=3857)

In [None]:
# Choropleth of the 2020-minus-2019 difference of one metric for one month.
metric = 'visits_normalized_relative'
month = 4

# Select the data first, so that an empty selection fails with a clear message
# instead of producing NaN bounds that break set_xlim/set_ylim below.
plot_df = zip_data[zip_data['month'] == month]
col = metric + '_dif'
if plot_df.empty:
    raise ValueError('no rows in zip_data for month {}'.format(month))

# Create the figure
fig, ax = plt.subplots(facecolor="lightgray", figsize=(8, 8))

# Create a colorbar axes ("cax") lined up with the right edge of the map axes.
# make_axes_locatable comes from mpl_toolkits.axes_grid1 and is imported with
# the other plotting imports at the top of the notebook.
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.2)

# Plot
plot_df.plot(ax=ax, cax=cax, column=col, edgecolor="none", legend=True, cmap="viridis")

# Zoom to the extent of the plotted GeoDataFrame
xmin, ymin, xmax, ymax = plot_df.total_bounds
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)

# Format
ax.set_title('{} (2020 minus 2019), month {}'.format(col, month))
ax.set_axis_off()
ax.set_aspect("equal")

In [None]:
# List the postal codes present in the final mapped dataset.
zip_data.postal_code.unique()