## Patterns processing
This notebook reads in the patterns data and:
1. Filters down to Philadelphia zip codes based on a (currently hard-coded) list.
    - Note: I have not yet run this using this list. Previously it filtered down by looking at the state and city columns in the data.
2. Concatenates the files.
3. Writes the result to philly_patterns.csv in the processed data folder.
    
It takes a long time to run, and the resulting dataset is very large, so it is worth thinking about ways to cut down the data.

In [4]:
import pandas as pd
from safegraph_py_functions import safegraph_py_functions as sgpy
import os
from dotenv import load_dotenv, find_dotenv

In [5]:
# Locate the .env file by walking up the directory tree until one is found.
dotenv_path = find_dotenv()

# Load its entries as environment variables.
load_dotenv(dotenv_path)

# ROOT_DIR anchors every data path below. Fail here with a clear message
# instead of letting os.path.join(None, ...) raise an opaque TypeError.
root_dir = os.environ.get("ROOT_DIR")
if root_dir is None:
    raise RuntimeError(
        "ROOT_DIR is not set; define it in the environment or in a .env file."
    )
raw_data_dir = os.path.join(root_dir, 'data/raw')

In [11]:
# Local directory that holds the downloaded monthly patterns data.
patterns_path = os.path.join(raw_data_dir, 'monthly-patterns')

# Walk the tree once and collect both kinds of file:
#   files      -> gzip-compressed patterns part files
#   norm_files -> normalization stats files
files = []
norm_files = []
# r = root, d = directories, f = files
for r, d, f in os.walk(patterns_path):
    for file in f:
        # r already starts with patterns_path, so it must not be prefixed again:
        # doing so duplicates the prefix whenever patterns_path is relative.
        full_path = os.path.join(r, file)
        if file.endswith('.csv.gz') and 'patterns-part' in file:
            files.append(full_path)
        if 'normalization_stats.csv' in file:
            norm_files.append(full_path)

# os.walk yields directory entries in arbitrary order; sort so that repeated
# runs process (and concatenate) the files in the same order.
files.sort()
norm_files.sort()

In [7]:
# Read the places table from the processed data folder.
processed_data_dir = os.path.join(root_dir, 'data/processed')
philly_places_path = os.path.join(processed_data_dir, 'philly_places.csv')
philly_places = pd.read_csv(philly_places_path)

In [8]:
# Columns retained from each patterns file, in output order.
keep_cols = [
    'safegraph_place_id',
    'location_name',
    'street_address',
    'city',
    'region',
    'postal_code',
    'safegraph_brand_ids',
    'brands',
    'date_range_start',
    'date_range_end',
    'raw_visit_counts',
    'raw_visitor_counts',
    'visits_by_day',
    'poi_cbg',
    'visitor_home_cbgs',
    'visitor_daytime_cbgs',
    'visitor_work_cbgs',
    'distance_from_home',
    'median_dwell',
    'device_type',
]

def keep_total_level(norm_stats):
    """Keep only the country-wide rows of a normalization stats frame.

    Frames without a 'region' column are returned unchanged. For frames that
    are disaggregated by region, only the rows whose region is 'ALL_STATES'
    are kept and the 'region' column is dropped.

    Raises
    ------
    ValueError
        If a 'region' column is present but no row is labelled 'ALL_STATES'.
    """
    if 'region' not in norm_stats.columns:
        return norm_stats

    is_total = norm_stats['region'] == 'ALL_STATES'
    if not is_total.any():
        raise ValueError('no region named "ALL_STATES"')
    return norm_stats[is_total].drop(columns=['region'])

def filter_to_philly(file):
    """Read one patterns file and keep only rows with a Philadelphia postal code.

    Parameters
    ----------
    file : str or path-like
        Path to a patterns CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The rows whose zero-padded postal code starts with "191", restricted
        to the columns in ``keep_cols`` (in that order).
    """
    # Read postal codes as text. Letting pandas infer a numeric dtype loses
    # leading zeros and, when the column comes back as float (e.g. because of
    # missing values), turns 19104 into '19104.0', which breaks the prefix match.
    # usecols avoids loading columns that would be discarded anyway.
    df = pd.read_csv(file, usecols=keep_cols, dtype={'postal_code': str})

    # Pad to five digits so that codes stored without their leading zero
    # (e.g. Massachusetts 0191X stored as 191X) are not mistaken for 191XX.
    df['postal_code'] = df['postal_code'].str.zfill(5)

    # Philadelphia selection; rows with a missing postal code are excluded.
    in_philly = df['postal_code'].str.startswith("191", na=False)
    return df.loc[in_philly, keep_cols]


def get_places(df, file=None):
    """Attach each place's top_category as of the frame's start date.

    Parameters
    ----------
    df : pandas.DataFrame
        Patterns rows with a 'safegraph_place_id' column and a string
        'date_range_start' column whose first ten characters are the date.
    file : optional
        Label for the data source, used only in the warning printed when the
        frame contains more than one start date.

    Returns
    -------
    pandas.DataFrame
        A re-indexed copy of ``df`` with 'date_range_start' parsed to datetime
        and 'top_category' left-joined from the module-level ``philly_places``
        rows that are valid on the frame's first start date.
    """
    df = df.reset_index(drop=True)
    # Only the leading YYYY-MM-DD part of the timestamp string is kept.
    df['date_range_start'] = pd.to_datetime(df['date_range_start'].str[:10])
    if df['date_range_start'].nunique(dropna=False) > 1:
        source = file if file is not None else 'the input frame'
        print('More than one date in {0}!'.format(source))

    # An empty frame has no date to match on; NaT compares False against every
    # validity window, so no places are selected and the merge stays empty.
    file_date = df['date_range_start'].iloc[0] if len(df) else pd.NaT

    # The validity bounds are strings when philly_places comes from a plain
    # read_csv, and comparing strings with a Timestamp raises TypeError.
    # NOTE(review): assumes valid_from / valid_to are timezone-naive dates — confirm.
    valid_from = pd.to_datetime(philly_places['valid_from'])
    valid_to = pd.to_datetime(philly_places['valid_to'])
    current = (valid_from <= file_date) & (valid_to > file_date)
    current_places = philly_places.loc[current, ['safegraph_place_id', 'top_category']]

    # Join explicitly on the place id rather than on whatever columns overlap.
    return df.merge(current_places, how='left', on='safegraph_place_id')

def get_norm_stats(df):
    """Add 'year', 'month' and 'day' columns taken from the 'date' column.

    The frame is modified in place and also returned.
    """
    date_parts = df['date'].dt
    for part in ('year', 'month', 'day'):
        df[part] = getattr(date_parts, part)
    return df
    
def explode(df):
    """Expand visits_by_day so the result has one row per place per day.

    The visits_by_day column contains a list of integers. It is exploded with
    ``sgpy.explode_json_array`` into a 'day_visit_counts' column plus a 'day'
    sequence column, and a 'date' column is then derived from
    'date_range_start' and 'day'.

    Returns the exploded frame.
    """
    df = sgpy.explode_json_array(
        df,
        array_column='visits_by_day',
        value_col_name='day_visit_counts',
        place_key='safegraph_place_id',
        file_key='date_range_start',
        array_sequence='day',
        keep_index=False,
        zero_index=False,
    )
    df['date_range_start'] = pd.to_datetime(df['date_range_start'])
    # 'day' is presumably 1-based here (zero_index=False), so day 1 falls on
    # the start date itself. The offset is built in one vectorized call rather
    # than one pd.Timedelta per row, which matters on a frame this large.
    day_offset = pd.to_timedelta(df['day'] - 1, unit='D')
    df['date'] = df['date_range_start'] + day_offset
    return df

In [12]:
# Stack the country-wide rows of every normalization stats file.
norm_stats = pd.concat(
    [keep_total_level(pd.read_csv(norm_file)) for norm_file in norm_files]
)
# Cast the date parts to integers in one step.
norm_stats = norm_stats.astype({'year': int, 'month': int, 'day': int})
# HK: I only downloaded patterns data from 2019 onwards due to memory constraints
from_2019 = norm_stats['year'] >= 2019
norm_stats = norm_stats[from_2019]

In [11]:
# Filter every patterns file down to Philadelphia and stack the pieces.
philly_patterns = pd.concat(list(map(filter_to_philly, files)))

In [12]:
# Write the combined frame to the processed data folder.
# NOTE(review): pandas infers compression from the file extension, so what
# ".csv.tar.gz" produces may depend on the pandas version — confirm the format.
processed_data_dir = os.path.join(root_dir, 'data/processed')
compressed_output_path = os.path.join(processed_data_dir, "philly_patterns.csv.tar.gz")
philly_patterns.to_csv(compressed_output_path, index=False)

In [19]:
# Write the uncompressed CSV. The frame built above is named philly_patterns;
# philly_patterns_df is not defined anywhere in this notebook and raises
# NameError on a fresh kernel.
philly_patterns.to_csv(os.path.join(processed_data_dir, 'philly_patterns.csv'), index=False)