In [None]:
import pandas as pd
from safegraph_py_functions import safegraph_py_functions as sgpy
import os
from dotenv import load_dotenv, find_dotenv

In [None]:
# Locate the .env file by walking up the directory tree until one is found.
dotenv_path = find_dotenv()

# Load the entries of that file as environment variables.
load_dotenv(dotenv_path)

# Every data path in this notebook is built from ROOT_DIR. Fail here with a
# clear message instead of the opaque TypeError that os.path.join(None, ...)
# would raise on the next line.
root_dir = os.environ.get("ROOT_DIR")
if root_dir is None:
    raise RuntimeError(
        "ROOT_DIR is not set; define it in the environment or in a .env file."
    )
raw_data_dir = os.path.join(root_dir, 'data/raw')

In [None]:
# Collect the path of every monthly-patterns part file below the raw data directory.
patterns_path = os.path.join(raw_data_dir, 'monthly-patterns')
files = []
# os.walk yields (directory path, sub-directory names, file names). The
# directory path already starts with patterns_path, so it is joined with the
# file name only; prefixing patterns_path again duplicates it whenever the
# path is relative.
for dirpath, _dirnames, filenames in os.walk(patterns_path):
    for filename in filenames:
        if filename.endswith('.csv.gz') and 'patterns-part' in filename:
            files.append(os.path.join(dirpath, filename))
# Directory listing order is not guaranteed; sort so that files[0] refers to
# the same file on every run.
files.sort()

In [None]:
# Philadelphia zip codes, copied from
# https://www.city-data.com/zipmaps/Philadelphia-Pennsylvania.html
philly_zip_codes = [
    '19102', '19103', '19104', '19106', '19107', '19109', '19111', '19112',
    '19114', '19115', '19116', '19118', '19119', '19120', '19121', '19122',
    '19123', '19124', '19125', '19126', '19127', '19128', '19129', '19130',
    '19131', '19132', '19133', '19134', '19135', '19136', '19137', '19138',
    '19139', '19140', '19141', '19142', '19143', '19144', '19145', '19146',
    '19147', '19148', '19149', '19150', '19151', '19152', '19153', '19154',
]

# Single-column frame (column 'postal_code') used as the left side of merges
# that restrict the patterns data to these zip codes.
philly_zips = pd.DataFrame({'postal_code': philly_zip_codes})

In [None]:
# Columns selected from each patterns file before any processing.
keep_cols = [
    'postal_code',
    'poi_cbg',
    'date_range_start',
    'date_range_end',
    'raw_visit_counts',
    'raw_visitor_counts',
    'visitor_home_cbgs',
    'median_dwell',
]

# Keys that identify one output row of filter_and_explode.
group_by_cols = [
    'poi_cbg',
    'visitor_cbg',
    'date_range_start',
    'date_range_end',
]

# Column -> aggregation applied within each group of group_by_cols.
aggregations = {
    'raw_visit_counts': 'sum',
    'raw_visitor_counts': 'median',
    'median_dwell': 'median',
    'cbg_visitor_count': 'sum',
}

def filter_and_explode(file):
    """Aggregate one patterns file into visitor counts between block groups.

    Reads ``file``, keeps the rows whose ``postal_code`` appears in
    ``philly_zips``, unpacks the ``visitor_home_cbgs`` JSON column into one row
    per (original row, visitor CBG) pair, and aggregates the result by
    ``group_by_cols`` using ``aggregations``.

    Parameters
    ----------
    file : str or path-like
        CSV file (optionally compressed) readable by ``pandas.read_csv`` that
        contains every column listed in ``keep_cols``.

    Returns
    -------
    pandas.DataFrame
        One row per combination of ``group_by_cols``, with the aggregated
        columns named in ``aggregations``.
    """
    # Read the identifier columns as text. Letting pandas infer a numeric dtype
    # drops leading zeros and, when the column contains missing values, yields
    # floats whose string form ends in '.0' -- which corrupts both the zip code
    # padding and the block group ids.
    df = pd.read_csv(
        file,
        usecols=keep_cols,
        dtype={'postal_code': str, 'poi_cbg': str},
    )
    # usecols does not control column order, so restore the order of keep_cols.
    df = df[keep_cols]
    # Rows without a poi_cbg cannot be attributed to a block group. (They were
    # previously grouped under the literal string 'nan'.)
    df = df.dropna(subset=['poi_cbg'])
    # Left-pad zip codes to five characters. Not strictly necessary for
    # Philadelphia, whose zip codes have no leading zeros.
    df['postal_code'] = df['postal_code'].str.zfill(5)
    # Inner merge keeps only the rows located in a Philadelphia zip code.
    df = philly_zips.merge(df, on='postal_code')
    df = df.drop(columns='postal_code')
    # The visitor_home_cbgs column holds JSON; unpack it so each key becomes a
    # row with the key in visitor_cbg and its value in cbg_visitor_count.
    df = sgpy.unpack_json_and_merge(
        df, json_column='visitor_home_cbgs', key_col_name='visitor_cbg',
        value_col_name='cbg_visitor_count', keep_index=False
    )
    df = df.groupby(group_by_cols).agg(aggregations).reset_index()
    return df

In [None]:
# Build the visitors-by-CBG table from the Philadelphia patterns extract and
# write it next to that extract in the processed data directory.
processed_data_dir = os.path.join(root_dir, 'data/processed')
philly_patterns_file = os.path.join(
    processed_data_dir, "philly_patterns.csv.tar.gz"
)

visitors_by_cbg = filter_and_explode(philly_patterns_file)

visitors_by_cbg_file = os.path.join(
    processed_data_dir, 'philly_visitors_by_cbg.csv.tar.gz'
)
visitors_by_cbg.to_csv(visitors_by_cbg_file, index=False)

In [None]:
# Sanity check on a single raw file: for every place, compare the reported
# raw_visitor_counts with the total of the per-CBG visitor counts unpacked
# from visitor_home_cbgs.
file = files[0]
check_cols = ['safegraph_place_id'] + keep_cols
# Read the identifier columns as text. Letting pandas infer a numeric dtype
# drops leading zeros and, when the column contains missing values, yields
# floats whose string form ends in '.0'.
df = pd.read_csv(
    file,
    usecols=check_cols,
    dtype={'postal_code': str, 'poi_cbg': str},
)
# usecols does not control column order, so restore the intended order.
df = df[check_cols]
# Left-pad zip codes to five characters. Not strictly necessary for
# Philadelphia, whose zip codes have no leading zeros.
df['postal_code'] = df['postal_code'].str.zfill(5)
# Inner merge keeps only the rows located in a Philadelphia zip code.
df = philly_zips.merge(df, on='postal_code')
df = df.drop(columns='postal_code')
# The visitor_home_cbgs column holds JSON; unpack it so each key becomes a row
# with the key in visitor_cbg and its value in cbg_visitor_count.
df = sgpy.unpack_json_and_merge(
    df, json_column='visitor_home_cbgs', key_col_name='visitor_cbg',
    value_col_name='cbg_visitor_count', keep_index=False
)
grouped_df = (
    df.groupby('safegraph_place_id')
    .agg({'raw_visitor_counts': 'max', 'cbg_visitor_count': 'sum'})
    .reset_index()
)

In [None]:
# Percentage by which the summed per-CBG visitor count deviates from the
# reported raw visitor count of each place.
visitor_count_gap = grouped_df['cbg_visitor_count'].sub(grouped_df['raw_visitor_counts'])
grouped_df['percent_error'] = visitor_count_gap.mul(100).div(grouped_df['raw_visitor_counts'])

In [None]:
# Show every row of df that belongs to one specific place.
place_id = 'sg:003249a954f84272be664ef43c7ffe07'
df.loc[df['safegraph_place_id'].eq(place_id)]

In [None]:
# Rebuild the Philadelphia subset from the raw file. The df left behind by the
# preceding cells no longer has a postal_code column (it is dropped after the
# zip code merge), so padding it in place raised KeyError on a top-to-bottom run.
# NOTE(review): assumes the intent was to inspect the un-exploded rows of
# files[0] -- confirm.
df = pd.read_csv(
    files[0],
    usecols=['safegraph_place_id'] + keep_cols,
    dtype={'postal_code': str, 'poi_cbg': str},
)
# Left-pad zip codes to five characters before matching them against philly_zips.
df['postal_code'] = df['postal_code'].str.zfill(5)
df = philly_zips.merge(df, on='postal_code')

In [None]:
# Number of distinct poi_cbg values recorded for each safegraph_place_id.
uniques = df.groupby('safegraph_place_id')['poi_cbg'].nunique()

In [None]:
# Show the entries of uniques whose count is greater than one.
# NOTE(review): the original note here read "there are cbgs with more than one
# zipcode", but uniques counts distinct poi_cbg values per safegraph_place_id --
# confirm which relationship was meant.
uniques.loc[uniques.gt(1)]

In [None]:
# Run the full filter/unpack/aggregate pipeline on the first discovered file.
first_patterns_file = files[0]
df = filter_and_explode(file=first_patterns_file)

In [None]:
# Preview the first five rows of the aggregated frame.
df.head(n=5)

In [None]:
# Make sure the block group id of each POI is stored as a string.
poi_cbg_text = df['poi_cbg'].astype(str)
df['poi_cbg'] = poi_cbg_text