In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd

In [2]:
import cartopy.crs as ccrs
from matplotlib import pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

In [3]:
from safegraph_py_functions import safegraph_py_functions as sgpy

In [4]:
import os
from dotenv import load_dotenv, find_dotenv

In [5]:
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)
root_dir = os.environ.get("ROOT_DIR")
raw_data_dir = os.path.join(root_dir,'data/raw')

In [6]:
# local directory where we want to put all the data
patterns_path = os.path.join(raw_data_dir,'monthly-patterns')
# print(local)
files = []
# r=root, d=directories, f = files
for r, d, f in os.walk(patterns_path):
    for file in f:
        if file.endswith('.csv.gz') and 'patterns-part' in file:
            files.append(os.path.join(patterns_path, r, file))

In [7]:
files

['/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/patterns/2020/11/06/11/patterns-part3.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/patterns/2020/11/06/11/patterns-part1.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/patterns/2020/11/06/11/patterns-part4.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/patterns/2020/11/06/11/patterns-part2.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/patterns/2020/10/07/02/patterns-part3.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/patterns/2020/10/07/02/patterns-part1.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/patterns/2020/10/07/02/patterns-part4.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/patterns/2020/10/07/02/patterns-part2.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVID

In [12]:
keep_cols = ['safegraph_place_id', 'location_name','street_address','city','region','postal_code',
             'date_range_start', 'date_range_end', 'raw_visit_counts','raw_visitor_counts','visits_by_day',
            'poi_cbg']

def filter_and_explode(df):
    df = df[keep_cols]
    df = df[(df['city'] == 'Philadelphia') & (df['region'] == 'PA')]
    df = sgpy.explode_json_array(
        df, array_column ='visits_by_day', value_col_name='day_visit_counts', 
        place_key='safegraph_place_id', file_key='date_range_start', array_sequence='day', 
        keep_index=False, zero_index=False)
    df['date_range_start'] = pd.to_datetime(df['date_range_start'])
    temp = df['day'].apply(lambda x: pd.Timedelta(x-1, unit='D'))
    df['date'] = df['date_range_start'] + temp
    return df

In [14]:
philly_patterns = [filter_and_explode(pd.read_csv(file)) for file in files]

In [18]:
philly_patterns_df = pd.concat(philly_patterns)

In [15]:
processed_data_dir = os.path.join(root_dir,'data/processed')

In [19]:
philly_patterns_df.to_csv(os.path.join(processed_data_dir,'philly_patterns.csv'), index = False)

In [None]:
for file in files[:4]:
    print(file)
    patterns = pd.read_csv(file)
    print('date_range_start' in patterns.columns)
    patterns = filter_and_explode(patterns)
    philly_patterns.append(patterns)

In [6]:
patterns_path = os.path.join(raw_data_dir, 'monthly-patterns/patterns_backfill/2020/05/07/12/2020/01')

In [7]:
norm_path = os.path.join(raw_data_dir, 'monthly-patterns/normalization_stats_backfill/2020/05/07/12/2019/01')

In [8]:
norm = pd.read_csv(os.path.join(norm_path,'normalization_stats.csv'))

In [9]:
norm.head()

Unnamed: 0,year,month,day,total_visits,total_devices_seen,total_home_visits,total_home_visitors
0,2019,1,1,42236746,17581143,23397865,14772615
1,2019,1,2,53039892,18165000,23407187,14693797
2,2019,1,3,54674970,18021181,23070631,14361047
3,2019,1,4,58124764,18086092,23319783,14275180
4,2019,1,5,53145260,17583502,22608363,13941435


In [10]:
patterns = pd.read_csv(os.path.join(patterns_path,'patterns-part1.csv.gz'))

In [16]:
len(patterns.index)

1078206

In [17]:
patterns = patterns[(patterns['city'] == 'Philadelphia') & (patterns['region'] == 'PA')]

In [18]:
len(patterns.index)

4564

In [15]:
patterns.columns

Index(['safegraph_place_id', 'location_name', 'street_address', 'city',
       'region', 'postal_code', 'safegraph_brand_ids', 'brands',
       'date_range_start', 'date_range_end', 'raw_visit_counts',
       'raw_visitor_counts', 'visits_by_day', 'poi_cbg', 'visitor_home_cbgs',
       'visitor_daytime_cbgs', 'visitor_work_cbgs',
       'visitor_country_of_origin', 'distance_from_home', 'median_dwell',
       'bucketed_dwell_times', 'related_same_day_brand',
       'related_same_month_brand', 'popularity_by_hour', 'popularity_by_day',
       'device_type'],
      dtype='object')

In [19]:
keep_cols = ['safegraph_place_id', 'location_name','street_address','city','region','postal_code',
             'date_range_start', 'date_range_end', 'raw_visit_counts','raw_visitor_counts','visits_by_day',
            'poi_cbg']
patterns = patterns[keep_cols]

In [23]:
patterns['date_range_start_dt'] = pd.to_datetime(patterns['date_range_start'])
patterns['year'] = patterns['date_range_start_dt'].apply(lambda x: x.year)
patterns['month'] = patterns['date_range_start_dt'].apply(lambda x: x.month)
patterns['start_day'] = patterns['date_range_start_dt']

In [24]:
patterns.head()

Unnamed: 0,safegraph_place_id,location_name,street_address,city,region,postal_code,date_range_start,date_range_end,raw_visit_counts,raw_visitor_counts,visits_by_day,poi_cbg,date_range_start_dt,year,month
249,sg:1343a6b2f36a4d8bbb4410a361d067f2,Chez Bow Wow,707 N 2nd St,Philadelphia,PA,19123,2020-01-01T00:00:00-05:00,2020-02-01T00:00:00-05:00,177,130,"[6,5,4,12,2,7,10,4,3,10,8,13,3,3,4,4,3,4,5,7,5...",421010400000.0,2020-01-01 00:00:00-05:00,2020,1
430,sg:24305f8978a64542b68df89e08f9b07b,Tommie's Restaurant,465 E Girard Ave,Philadelphia,PA,19125,2020-01-01T00:00:00-05:00,2020-02-01T00:00:00-05:00,155,83,"[2,6,12,1,2,6,8,5,5,10,4,3,2,7,2,6,3,6,5,7,6,8...",421010200000.0,2020-01-01 00:00:00-05:00,2020,1
500,sg:29b4966a726046a89522b619b81393b9,Fine Wine and Good Spirits,730 Adams Ave,Philadelphia,PA,19124,2020-01-01T00:00:00-05:00,2020-02-01T00:00:00-05:00,22,21,"[0,1,1,0,1,0,1,0,1,0,2,0,2,1,0,0,1,0,1,1,0,1,3...",421010300000.0,2020-01-01 00:00:00-05:00,2020,1
546,sg:2dd600dc710f405b89874a96fa32aca4,Great Wall Chinese Food,1801 Moore St,Philadelphia,PA,19145,2020-01-01T00:00:00-05:00,2020-02-01T00:00:00-05:00,264,134,"[7,6,8,10,6,10,8,9,9,4,6,13,9,8,10,6,14,8,11,7...",421010000000.0,2020-01-01 00:00:00-05:00,2020,1
581,sg:2fa584ddcf464a9fa551bd09f035c25c,Ted Martynowicz DO,501 S 54th St Ste 126,Philadelphia,PA,19143,2020-01-01T00:00:00-05:00,2020-02-01T00:00:00-05:00,395,204,"[6,8,13,4,5,16,9,17,15,14,7,9,18,16,19,15,21,1...",421010100000.0,2020-01-01 00:00:00-05:00,2020,1


In [20]:
exploded = sgpy.explode_json_array(patterns, array_column ='visits_by_day', value_col_name='day_visit_counts',place_key='safegraph_place_id', file_key='date_range_start', array_sequence='day', keep_index=False, zero_index=False)

In [26]:
exploded['date_range_start'] = pd.to_datetime(exploded['date_range_start'])
temp = exploded['day'].apply(lambda x: pd.Timedelta(x-1, unit='D'))
exploded['date'] = exploded['date_range_start'] + temp

In [27]:
exploded.head()

Unnamed: 0,safegraph_place_id,location_name,street_address,city,region,postal_code,date_range_start,date_range_end,raw_visit_counts,raw_visitor_counts,visits_by_day,poi_cbg,day_visit_counts,day,date
0,sg:1343a6b2f36a4d8bbb4410a361d067f2,Chez Bow Wow,707 N 2nd St,Philadelphia,PA,19123,2020-01-01 00:00:00-05:00,2020-02-01T00:00:00-05:00,177,130,"[6,5,4,12,2,7,10,4,3,10,8,13,3,3,4,4,3,4,5,7,5...",421010400000.0,6,1,2020-01-01 00:00:00-05:00
1,sg:1343a6b2f36a4d8bbb4410a361d067f2,Chez Bow Wow,707 N 2nd St,Philadelphia,PA,19123,2020-01-01 00:00:00-05:00,2020-02-01T00:00:00-05:00,177,130,"[6,5,4,12,2,7,10,4,3,10,8,13,3,3,4,4,3,4,5,7,5...",421010400000.0,5,2,2020-01-02 00:00:00-05:00
2,sg:1343a6b2f36a4d8bbb4410a361d067f2,Chez Bow Wow,707 N 2nd St,Philadelphia,PA,19123,2020-01-01 00:00:00-05:00,2020-02-01T00:00:00-05:00,177,130,"[6,5,4,12,2,7,10,4,3,10,8,13,3,3,4,4,3,4,5,7,5...",421010400000.0,4,3,2020-01-03 00:00:00-05:00
3,sg:1343a6b2f36a4d8bbb4410a361d067f2,Chez Bow Wow,707 N 2nd St,Philadelphia,PA,19123,2020-01-01 00:00:00-05:00,2020-02-01T00:00:00-05:00,177,130,"[6,5,4,12,2,7,10,4,3,10,8,13,3,3,4,4,3,4,5,7,5...",421010400000.0,12,4,2020-01-04 00:00:00-05:00
4,sg:1343a6b2f36a4d8bbb4410a361d067f2,Chez Bow Wow,707 N 2nd St,Philadelphia,PA,19123,2020-01-01 00:00:00-05:00,2020-02-01T00:00:00-05:00,177,130,"[6,5,4,12,2,7,10,4,3,10,8,13,3,3,4,4,3,4,5,7,5...",421010400000.0,2,5,2020-01-05 00:00:00-05:00
