In [2]:
import pandas as pd
from safegraph_py_functions import safegraph_py_functions as sgpy
import os
from dotenv import load_dotenv, find_dotenv

# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)

# ROOT_DIR anchors every data path used below. Fail fast with a clear message
# when it is missing; otherwise os.path.join(None, ...) raises an opaque
# TypeError that does not mention the environment variable at all.
root_dir = os.environ.get("ROOT_DIR")
if root_dir is None:
    raise RuntimeError(
        "ROOT_DIR is not set; define it in a .env file or in the environment"
    )

# Build the paths component by component so the separator is always the
# platform's own rather than a hard-coded '/'.
raw_data_dir = os.path.join(root_dir, 'data', 'raw')
processed_data_dir = os.path.join(root_dir, 'data', 'processed')

In [3]:
# Load the processed Philadelphia patterns table from the processed-data
# directory, then preview its first rows as the cell output.
philly_patterns = pd.read_csv(os.path.join(processed_data_dir, "philly_patterns.csv.tar.gz"))
philly_patterns.head()

Unnamed: 0,safegraph_place_id,location_name,street_address,city,region,postal_code,safegraph_brand_ids,brands,date_range_start,date_range_end,raw_visit_counts,raw_visitor_counts,visits_by_day,poi_cbg,visitor_home_cbgs,visitor_daytime_cbgs,visitor_work_cbgs,distance_from_home,median_dwell,device_type
0,sg:26825b56c5ce4c7d91afdc71a7c34440,Provident Technology,1315 Walnut St Ste 905,Philadelphia,PA,19107,,,2020-10-01T00:00:00-04:00,2020-11-01T00:00:00-04:00,7,7,"[0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",421010000000.0,{},"{""421010021002"":4,""421010024003"":4}",{},4783.0,8.0,"{""android"":0,""ios"":5}"
1,sg:32b17bf96a93444588131370e52d310d,Los Potrillos Mexican Restaurant,4653 Rising Sun Ave,Philadelphia,PA,19140,,,2020-10-01T00:00:00-04:00,2020-11-01T00:00:00-04:00,168,96,"[9,8,3,3,6,6,5,3,7,5,5,2,6,9,8,6,5,2,8,6,5,2,4...",421010300000.0,"{""421010288001"":11,""421010330006"":4,""421010287...","{""421010288001"":11,""420454107003"":4,""421010305...",{},3335.0,19.5,"{""android"":60,""ios"":35}"
2,sg:3f4865e8bbb249838350eee9a101c8eb,Precious Babies Learning Academy Day Car,1433 W Erie Ave,Philadelphia,PA,19140,,,2020-10-01T00:00:00-04:00,2020-11-01T00:00:00-04:00,75,47,"[2,1,4,1,2,2,4,2,6,1,8,2,3,2,4,2,4,1,3,1,4,1,3...",421010200000.0,"{""421010002001"":4,""421010202005"":4,""4210102750...","{""421010364001"":5,""421010204003"":4,""4210103100...",{},5626.0,28.0,"{""android"":29,""ios"":15}"
3,sg:76dbefabe6eb465fac98e62c2fb15be9,TABU hookah lounge,4535 N 5th St,Philadelphia,PA,19140,,,2020-10-01T00:00:00-04:00,2020-11-01T00:00:00-04:00,322,234,"[3,16,12,15,9,4,13,2,8,18,11,8,10,9,9,11,16,17...",421010300000.0,"{""421010287001"":7,""421010188001"":6,""4210103380...","{""421010197002"":7,""421010098021"":6,""4210103160...",{},6166.0,51.5,"{""android"":122,""ios"":112}"
4,sg:9b2b12ebfa7745b6814873b7c1cef76b,Yu Hsiang Garden,7630 Germantown Ave,Philadelphia,PA,19118,,,2020-10-01T00:00:00-04:00,2020-11-01T00:00:00-04:00,93,46,"[3,3,2,5,6,3,4,1,5,4,1,2,1,6,3,3,3,3,3,4,1,2,2...",421010200000.0,"{""421010306007"":4,""421010318001"":4,""4210101220...","{""420912019025"":5,""421010218002"":4,""4210102560...",{},4905.0,106.0,"{""android"":21,""ios"":24}"


In [4]:
# pull all the normalization_stats files in monthly-patterns
patterns_path = os.path.join(raw_data_dir,'monthly-patterns')
files = []
# os.walk yields (dirpath, dirnames, filenames). dirpath (r) already starts
# with patterns_path, so joining patterns_path in front of it again would
# duplicate the prefix whenever patterns_path is a relative path.
for r, d, f in os.walk(patterns_path):
    for file in f:
        if 'normalization_stats.csv' in file:
            files.append(os.path.join(r, file))

# os.walk visits directories in a filesystem-dependent order; sort so the
# list (and anything concatenated from it) is the same on every run.
files.sort()

# os.walk silently yields nothing for a missing or empty directory; surface
# that here instead of failing later with a less specific error.
if not files:
    raise FileNotFoundError(
        'no normalization_stats.csv files found under ' + patterns_path
    )

In [5]:
# for files with information disaggregated at the state level, keep only the country-wide info
def keep_total_level(norm_stats):
    """Reduce a normalization-stats frame to its 'ALL_STATES' rows.

    Parameters
    ----------
    norm_stats : pandas.DataFrame
        Frame that may or may not carry a 'region' column.

    Returns
    -------
    pandas.DataFrame
        The input object itself when it has no 'region' column; otherwise the
        rows whose 'region' equals 'ALL_STATES', with the 'region' column
        dropped.

    Raises
    ------
    ValueError
        If a 'region' column exists but no row has the value 'ALL_STATES'.
    """
    # Nothing to filter when the frame has no 'region' column.
    if 'region' not in norm_stats.columns:
        return norm_stats

    is_all_states = norm_stats['region'] == 'ALL_STATES'
    if not is_all_states.any():
        raise ValueError('no region named "ALL_STATES"')

    return norm_stats.loc[is_all_states].drop(columns=['region'])

In [6]:
# Read every collected normalization_stats file, reduce each one with
# keep_total_level, stack the results, and cast the date-part columns to int.
norm_stats = pd.concat(
    [keep_total_level(pd.read_csv(stats_path)) for stats_path in files]
).astype({'year': int, 'month': int, 'day': int})
# HK: I only downloaded patterns data from 2019 onwards due to memory constraints
norm_stats = norm_stats.loc[norm_stats['year'] >= 2019]

In [7]:
# Load the processed Philadelphia patterns file under the name philly_traffic.
# NOTE(review): this reads the same "philly_patterns.csv.tar.gz" file that an
# earlier cell loads as philly_patterns — confirm the second read is intended.
philly_traffic = pd.read_csv(
    os.path.join(processed_data_dir, "philly_patterns.csv.tar.gz")
)

In [8]:
# Load the processed Philadelphia places file from the processed-data directory.
philly_places = pd.read_csv(
    os.path.join(processed_data_dir, 'philly_places.csv.tar.gz')
)

In [9]:
# Preview the first rows of norm_stats as the cell output.
norm_stats.head()

Unnamed: 0,year,month,day,total_visits,total_devices_seen,total_home_visits,total_home_visitors
0,2019,3,1,69304245,19864233,27636202,15964091
1,2019,3,2,62486898,19501615,26199538,15565360
2,2019,3,3,55210974,19444962,27158187,16285916
3,2019,3,4,63672615,19781861,27567027,16034434
4,2019,3,5,63906175,19171193,26656320,15457780
