In [1]:
import pandas as pd
from safegraph_py_functions import safegraph_py_functions as sgpy
import os
from dotenv import load_dotenv, find_dotenv

In [5]:
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)
root_dir = os.environ.get("ROOT_DIR")
raw_data_dir = os.path.join(root_dir,'data/raw')

In [6]:
# local directory where we want to put all the data
patterns_path = os.path.join(raw_data_dir,'monthly-patterns')
# print(local)
files = []
# r=root, d=directories, f = files
for r, d, f in os.walk(patterns_path):
    for file in f:
        if file.endswith('.csv.gz') and 'patterns-part' in file:
            files.append(os.path.join(patterns_path, r, file))

In [7]:
files

['/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/patterns/2020/11/06/11/patterns-part3.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/patterns/2020/11/06/11/patterns-part1.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/patterns/2020/11/06/11/patterns-part4.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/patterns/2020/11/06/11/patterns-part2.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/patterns/2020/10/07/02/patterns-part3.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/patterns/2020/10/07/02/patterns-part1.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/patterns/2020/10/07/02/patterns-part4.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/monthly-patterns/patterns/2020/10/07/02/patterns-part2.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVID

In [12]:
keep_cols = ['safegraph_place_id', 'location_name','street_address','city','region','postal_code',
             'date_range_start', 'date_range_end', 'raw_visit_counts','raw_visitor_counts','visits_by_day',
            'poi_cbg']

def filter_and_explode(df):
    df = df[keep_cols]
    df = df[(df['city'] == 'Philadelphia') & (df['region'] == 'PA')]
    df = sgpy.explode_json_array(
        df, array_column ='visits_by_day', value_col_name='day_visit_counts', 
        place_key='safegraph_place_id', file_key='date_range_start', array_sequence='day', 
        keep_index=False, zero_index=False)
    df['date_range_start'] = pd.to_datetime(df['date_range_start'])
    temp = df['day'].apply(lambda x: pd.Timedelta(x-1, unit='D'))
    df['date'] = df['date_range_start'] + temp
    return df

In [14]:
philly_patterns = [filter_and_explode(pd.read_csv(file)) for file in files]

In [18]:
philly_patterns_df = pd.concat(philly_patterns)

In [15]:
processed_data_dir = os.path.join(root_dir,'data/processed')

In [19]:
philly_patterns_df.to_csv(os.path.join(processed_data_dir,'philly_patterns.csv'), index = False)