## Expedited patterns processing
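This notebook reads in the patterns data and:

1. Filters each file down to Philadelphia zipcodes (currently hard-coded as those starting with "191").
2. Concatenates the filtered files and writes them to philly_patterns.csv.tar.gz in the processed data folder.
3. Explodes the daily visit counts and aggregates them by zipcode and date.
4. Writes the result to philly_patterns_exploded.csv.tar.gz in the processed data folder.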
    
It runs much faster than the original patterns exploration notebook, which makes it useful
for updating the zipcode map data.

In [None]:
import pandas as pd
from safegraph_py_functions import safegraph_py_functions as sgpy
import os
from dotenv import load_dotenv, find_dotenv
from loguru import logger

In [2]:
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)

# Fail early with a readable message when ROOT_DIR is missing; passing None to
# os.chdir would otherwise surface as an unrelated-looking TypeError.
root_dir = os.environ.get("ROOT_DIR")
if not root_dir:
    raise RuntimeError(
        "ROOT_DIR is not set; define it in the environment or in a .env file."
    )
os.chdir(root_dir)

# Imported only after the chdir above — presumably so that `src` resolves from the
# project root (TODO confirm); do not move this line into the top import cell.
from src import DATA_DIR
raw_data_dir = DATA_DIR / 'raw'

In [3]:
# Collect the paths of all gzipped patterns CSVs under the monthly-patterns folder
# (searched recursively). Nothing is read here; the files are loaded further down.
# Sorting makes the processing order, and therefore the row order of the
# concatenated output, reproducible instead of depending on filesystem order.

patterns_path = raw_data_dir / "monthly-patterns"
files = sorted(patterns_path.glob("**/*.csv.gz"))

In [4]:
# Input columns retained before the daily-visit arrays are exploded.
keep_cols = [
    'postal_code',
    'safegraph_place_id',
    'date_range_start',
    'visits_by_day',
]
# Columns retained once every day has its own row.
keep_cols_2 = [
    'postal_code',
    'date',
    'day_visit_counts',
]
# Keys for the final aggregation.
group_by_cols = [
    'postal_code',
    'date',
]

def filter_to_philly(df):
    """Return the rows of ``df`` whose zip code starts with "191" (Philadelphia).

    Postal codes may arrive as integers (leading zeros lost), as floats (when the
    column contains missing values) or as strings. Each value is normalised to a
    zero-padded 5-character string before the prefix test. The padding matters:
    some Massachusetts zip codes are 0191X and would otherwise be read as 191X
    and wrongly match the Philadelphia prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        Patterns rows with a ``postal_code`` column. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the matching rows, with ``postal_code`` stored
        as a 5-character string.
    """
    def _as_zip(value):
        # Missing postal codes can never match the prefix.
        if pd.isna(value):
            return ''
        # 19104.0 must become "19104", not "19104.0" (whose last five
        # characters are "104.0" and would silently drop the row).
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return ('00000' + str(value))[-5:]

    zip_codes = df['postal_code'].apply(_as_zip)
    in_philly = zip_codes.astype(str).str.startswith("191")

    # Work on a copy of the selected rows so the caller's frame stays untouched.
    philly = df.loc[in_philly].copy()
    philly['postal_code'] = zip_codes.loc[in_philly].to_numpy()
    return philly

def explode(df):
    """Expand ``visits_by_day`` into daily rows and sum visits per zip code and date.

    Parameters
    ----------
    df : pandas.DataFrame
        Patterns rows containing at least the columns listed in ``keep_cols``.

    Returns
    -------
    pandas.DataFrame
        One row per (``postal_code``, ``date``) holding the summed
        ``day_visit_counts``.
    """
    df = df[keep_cols]

    # The visits_by_day column contains a list of integers.
    # This explodes that list so we get one row per day; the position within the
    # list lands in 'day' (presumably 1-based given zero_index=False — see the -1 below).
    df = sgpy.explode_json_array(
        df, array_column ='visits_by_day', value_col_name='day_visit_counts',
        place_key='safegraph_place_id', file_key='date_range_start', array_sequence='day',
        keep_index=False, zero_index=False)
    df['date_range_start'] = pd.to_datetime(df['date_range_start'])
    # Calculate the date for each row: day 1 is date_range_start itself, hence the -1.
    # to_timedelta converts the whole column at once instead of constructing one
    # Timedelta per row in Python.
    day_offset = pd.to_timedelta(df['day'] - 1, unit='D')
    df['date'] = df['date_range_start'] + day_offset
    df = df[keep_cols_2].groupby(group_by_cols).agg('sum').reset_index()
    return df

In [5]:
# Load each patterns file, keep only its Philadelphia rows, and stack the results.
# Only the filtered rows of each file are retained between iterations.
philly_frames = []
for f in files:
    print(f)
    philly_frames.append(filter_to_philly(pd.read_csv(f)))

# ignore_index=True gives the combined frame a unique 0..n-1 index; without it every
# file contributes its own row labels and the concatenated index contains duplicates.
philly_patterns = pd.concat(philly_frames, ignore_index=True)

/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/monthly-patterns/patterns/2020/11/06/11/patterns-part3.csv.gz
/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/monthly-patterns/patterns/2020/11/06/11/patterns-part1.csv.gz
/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/monthly-patterns/patterns/2020/11/06/11/patterns-part4.csv.gz
/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/monthly-patterns/patterns/2020/11/06/11/patterns-part2.csv.gz
/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/monthly-patterns/patterns/2020/10/07/02/patterns-part3.csv.gz
/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/monthly-patterns/patterns/2020/10/07/02/patterns-part1.csv.gz
/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/monthly-patterns/patterns/2020/10/07/02/patterns-part4.csv.gz
/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/monthly-patterns/patterns/2020/10/07/02/patterns-part2.csv.gz
/Users/hannahkro

/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/monthly-patterns/patterns_backfill/2020/05/07/12/2019/01/patterns-part4.csv.gz
/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/monthly-patterns/patterns_backfill/2020/05/07/12/2019/01/patterns-part2.csv.gz
/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/monthly-patterns/patterns_backfill/2020/05/07/12/2019/06/patterns-part3.csv.gz
/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/monthly-patterns/patterns_backfill/2020/05/07/12/2019/06/patterns-part1.csv.gz
/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/monthly-patterns/patterns_backfill/2020/05/07/12/2019/06/patterns-part4.csv.gz
/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/monthly-patterns/patterns_backfill/2020/05/07/12/2019/06/patterns-part2.csv.gz
/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/monthly-patterns/patterns_backfill/2020/05/07/12/2019/12/patterns-part3.csv.gz
/Users/hannahkronenb

In [None]:
# Preview the first rows of the combined Philadelphia data.
philly_patterns.head()

In [None]:
# Number of rows kept across all files after the Philadelphia filter.
len(philly_patterns)

In [6]:
# Persist the filtered (not yet exploded) Philadelphia rows.
processed_dir = DATA_DIR / "processed"
# Create the output folder if it is missing (e.g. on a fresh checkout) instead of
# failing inside to_csv.
processed_dir.mkdir(parents=True, exist_ok=True)
# NOTE(review): compression is inferred from the ".tar.gz" suffix, and the inferred
# format differs between pandas versions — confirm readers open the file the same way.
philly_patterns.to_csv(
    processed_dir / "philly_patterns.csv.tar.gz", index=False
)

In [None]:
# Expand visits_by_day into daily rows and total the visits per zip code and date.
philly_patterns_exploded = explode(philly_patterns)

In [None]:
# Persist the per-zip-code, per-date visit totals.
processed_dir = DATA_DIR / "processed"
# Create the output folder if it is missing (e.g. on a fresh checkout) instead of
# failing inside to_csv.
processed_dir.mkdir(parents=True, exist_ok=True)
# NOTE(review): compression is inferred from the ".tar.gz" suffix, and the inferred
# format differs between pandas versions — confirm readers open the file the same way.
philly_patterns_exploded.to_csv(
    processed_dir / "philly_patterns_exploded.csv.tar.gz", index=False
)