## Expedited patterns processing
This notebook reads in the patterns data and:
    1. Filters down to Philadelphia zipcodes based on a (currently hard-coded) list.
    2. Aggregates each file by zipcode
    3. Concatenates the files
    4. Writes the result to philly_patterns_by_zip.csv in the processed data folder.
    
It runs much faster than the original patterns exploration notebook which makes it useful 
for updating the zipcode map data.

In [2]:
import pandas as pd
from safegraph_py_functions import safegraph_py_functions as sgpy
import os
from dotenv import load_dotenv, find_dotenv

In [3]:
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)
root_dir = os.environ.get("ROOT_DIR")
raw_data_dir = os.path.join(root_dir,'data/raw')

In [4]:
# Read in all patterns files in the monthly-patterns folder

patterns_path = os.path.join(raw_data_dir,'monthly-patterns')
files = []
# r=root, d=directories, f = files
for r, d, f in os.walk(patterns_path):
    for file in f:
        if file.endswith('.csv.gz') and 'patterns-part' in file:
            files.append(os.path.join(patterns_path, r, file))

In [9]:
# columns that we keep and aggregate
keep_cols = ['postal_code', 'safegraph_place_id', 'date_range_start', 'visits_by_day']
# columns we keep after exploding
keep_cols_2 = ['postal_code', 'date','day_visit_counts']
# columns to group by
group_by_cols = ['postal_code', 'date']

def filter_and_explode(file):
    df = pd.read_csv(file)
    df = df[keep_cols]
    # zip codes are read as integers rather than strings so we add leading zeros.
    df['postal_code'] = df['postal_code'].apply(lambda x: ('00000' + str(x))[-5:])
    # then we select the Philadelphia zipcodes
    in_philly = df['postal_code'].str.startswith("191")
    df = df.loc[in_philly]
    # The visits_by_day column contains a list of integers. 
    # This explodes that list so we get one row per day.
    df = sgpy.explode_json_array(
        df, array_column ='visits_by_day', value_col_name='day_visit_counts', 
        place_key='safegraph_place_id', file_key='date_range_start', array_sequence='day', 
        keep_index=False, zero_index=False)
    df['date_range_start'] = pd.to_datetime(df['date_range_start'])
    # Calculate the date for each row.
    temp = df['day'].apply(lambda x: pd.Timedelta(x-1, unit='D'))
    df['date'] = df['date_range_start'] + temp
    df = df[keep_cols_2].groupby(group_by_cols).agg('sum').reset_index()
    return df

In [13]:
philly_patterns = [filter_and_explode(file) for file in files]
philly_patterns_df = pd.concat(philly_patterns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [14]:
processed_data_dir = os.path.join(root_dir,'data/processed')
philly_patterns_df.to_csv(os.path.join(processed_data_dir,'philly_patterns_by_zip.csv'), index = False)