## Expedited patterns processing
This notebook reads in the patterns data and:
    1. Filters down to Philadelphia zipcodes based on a (currently hard-coded) list.
    2. Aggregates each file by zipcode
    3. Concatenates the files
    4. Writes the result to philly_patterns_by_zip.csv in the processed data folder.
    
It runs much faster than the original patterns exploration notebook which makes it useful 
for updating the zipcode map data.

In [1]:
import pandas as pd
from safegraph_py_functions import safegraph_py_functions as sgpy
import os
from dotenv import load_dotenv, find_dotenv
from loguru import logger

from src import DATA_DIR

ModuleNotFoundError: No module named 'src'

In [3]:
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)
raw_data_dir = DATA_DIR / 'raw'

In [7]:
# Read in all patterns files in the monthly-patterns folder

patterns_path = raw_data_dir / "monthly-patterns"

for f in patterns_path.glob("**/*.csv.gz"):
    files.append(f)

In [8]:
# This list was taken from https://www.city-data.com/zipmaps/Philadelphia-Pennsylvania.html
philly_zips = pd.Series(
    [
        "19102",
        "19103",
        "19104",
        "19106",
        "19107",
        "19109",
        "19111",
        "19112",
        "19114",
        "19115",
        "19116",
        "19118",
        "19119",
        "19120",
        "19121",
        "19122",
        "19123",
        "19124",
        "19125",
        "19126",
        "19127",
        "19128",
        "19129",
        "19130",
        "19131",
        "19132",
        "19133",
        "19134",
        "19135",
        "19136",
        "19137",
        "19138",
        "19139",
        "19140",
        "19141",
        "19142",
        "19143",
        "19144",
        "19145",
        "19146",
        "19147",
        "19148",
        "19149",
        "19150",
        "19151",
        "19152",
        "19153",
        "19154",
    ],
    name="postal_code",
)

philly_zips = philly_zips.to_frame()

In [15]:
# columns that we keep and aggregate
keep_cols = ['postal_code', 'safegraph_place_id', 'date_range_start', 'visits_by_day']
# columns we keep after exploding
keep_cols_2 = ['postal_code', 'date','day_visit_counts']
# columns to group by
group_by_cols = ['postal_code', 'date']

def filter_to_philly(df):
    # zip codes are read as integers rather than strings so we add leading zeros.
    # this is not strictly necessary since Philadelphia zipcodes don't have leading zeros.
    
    # Philadelphia selection
    # HK: adding leading zeros because some zipcodes in MA are 0191X.
    df['postal_code'] = df['postal_code'].apply(lambda x: ('00000'+str(x))[-5:])
    in_philly = df['postal_code'].astype(str).str.startswith("191")
    df = df.loc[in_philly]
    
    return df

def explode(df):
    
    df = df[keep_cols]
    
    # The visits_by_day column contains a list of integers. 
    # This explodes that list so we get one row per day.
    df = sgpy.explode_json_array(
        df, array_column ='visits_by_day', value_col_name='day_visit_counts', 
        place_key='safegraph_place_id', file_key='date_range_start', array_sequence='day', 
        keep_index=False, zero_index=False)
    df['date_range_start'] = pd.to_datetime(df['date_range_start'])
    # Calculate the date for each row.
    temp = df['day'].apply(lambda x: pd.Timedelta(x-1, unit='D'))
    df['date'] = df['date_range_start'] + temp
    df = df[keep_cols_2].groupby(group_by_cols).agg('sum').reset_index()
    return df

In [23]:
philly_patterns = []
for i, f in enumerate(files):
    print(f)
    philly_patterns.append(filter_to_philly(pd.read_csv(f)))
    
philly_patterns = pd.concat(philly_patterns)

/Users/nhand/DataProjects/DATSPracticum/HK_DATS_Practicum/src/../data/raw/monthly-patterns/patterns/2020/11/06/11/patterns-part3.csv.gz
/Users/nhand/DataProjects/DATSPracticum/HK_DATS_Practicum/src/../data/raw/monthly-patterns/patterns/2020/11/06/11/patterns-part1.csv.gz
/Users/nhand/DataProjects/DATSPracticum/HK_DATS_Practicum/src/../data/raw/monthly-patterns/patterns/2020/11/06/11/patterns-part4.csv.gz
/Users/nhand/DataProjects/DATSPracticum/HK_DATS_Practicum/src/../data/raw/monthly-patterns/patterns/2020/11/06/11/patterns-part2.csv.gz
/Users/nhand/DataProjects/DATSPracticum/HK_DATS_Practicum/src/../data/raw/monthly-patterns/patterns/2020/10/07/02/patterns-part3.csv.gz
/Users/nhand/DataProjects/DATSPracticum/HK_DATS_Practicum/src/../data/raw/monthly-patterns/patterns/2020/10/07/02/patterns-part1.csv.gz
/Users/nhand/DataProjects/DATSPracticum/HK_DATS_Practicum/src/../data/raw/monthly-patterns/patterns/2020/10/07/02/patterns-part4.csv.gz
/Users/nhand/DataProjects/DATSPracticum/HK_DATS_

FileNotFoundError: [Errno 2] No such file or directory: '/Users/nhand/DataProjects/DATSPracticum/HK_DATS_Practicum/src/../data/processed/philly_patterns.csv'

In [25]:
philly_patterns.head()

Unnamed: 0,placekey,safegraph_place_id,location_name,street_address,city,region,postal_code,safegraph_brand_ids,brands,date_range_start,...,visitor_work_cbgs,visitor_country_of_origin,distance_from_home,median_dwell,bucketed_dwell_times,related_same_day_brand,related_same_month_brand,popularity_by_hour,popularity_by_day,device_type
408,zzw-228@628-pmb-7t9,sg:26825b56c5ce4c7d91afdc71a7c34440,Provident Technology,1315 Walnut St Ste 905,Philadelphia,PA,19107,,,2020-10-01T00:00:00-04:00,...,{},"{""US"":6}",4783.0,8.0,"{""<5"":2,""5-20"":2,""21-60"":0,""61-240"":0,"">240"":3}",{},"{""Dunkin'"":39,""Starbucks"":35,""McDonald's"":28,""...","[1,1,1,1,1,0,0,0,0,1,2,3,2,2,3,2,2,1,0,0,1,1,2,1]","{""Monday"":1,""Tuesday"":2,""Wednesday"":0,""Thursda...","{""android"":0,""ios"":5}"
435,zzy-223@62j-q72-249,sg:2910df09a7264baea5bae7f876ff0423,Arete Rehab,110 Haverhill Rd Ste 344 Arete Rehabilitation,Amesbury Town,MA,1913,,,2020-10-01T00:00:00-04:00,...,{},,,238.0,"{""<5"":0,""5-20"":0,""21-60"":0,""61-240"":1,"">240"":0}",{},"{""Edward Jones"":100}","[1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1]","{""Monday"":1,""Tuesday"":0,""Wednesday"":0,""Thursda...","{""android"":0,""ios"":0}"
536,22x-222@628-phy-4qf,sg:32b17bf96a93444588131370e52d310d,Los Potrillos Mexican Restaurant,4653 Rising Sun Ave,Philadelphia,PA,19140,,,2020-10-01T00:00:00-04:00,...,{},"{""US"":87}",3335.0,19.5,"{""<5"":8,""5-20"":78,""21-60"":28,""61-240"":37,"">240...","{""Dunkin'"":21,""AAMCO Transmissions"":20,""Family...","{""Dunkin'"":42,""Sunoco"":35,""Wawa"":35,""Dollar Tr...","[7,6,5,6,7,8,11,11,12,18,20,28,28,24,35,30,36,...","{""Monday"":20,""Tuesday"":23,""Wednesday"":22,""Thur...","{""android"":60,""ios"":35}"
683,238-222@628-pj5-3nq,sg:3f4865e8bbb249838350eee9a101c8eb,Precious Babies Learning Academy Day Car,1433 W Erie Ave,Philadelphia,PA,19140,,,2020-10-01T00:00:00-04:00,...,{},"{""US"":32}",5626.0,28.0,"{""<5"":5,""5-20"":30,""21-60"":16,""61-240"":14,"">240...","{""McDonald's"":46,""Chick-fil-A"":14}","{""Dunkin'"":37,""Sunoco"":36,""Rite Aid"":25,""Wawa""...","[6,6,4,4,3,2,3,4,7,9,14,13,11,13,14,12,10,11,1...","{""Monday"":9,""Tuesday"":8,""Wednesday"":12,""Thursd...","{""android"":29,""ios"":15}"
1236,23z-222@628-phy-rc5,sg:76dbefabe6eb465fac98e62c2fb15be9,TABU hookah lounge,4535 N 5th St,Philadelphia,PA,19140,,,2020-10-01T00:00:00-04:00,...,{},"{""US"":211}",6166.0,51.5,"{""<5"":11,""5-20"":91,""21-60"":74,""61-240"":103,"">2...","{""Wawa"":20,""PNC Financial Services"":8,""U-Haul""...","{""Wawa"":48,""Dunkin'"":37,""Sunoco"":29,""McDonald'...","[18,17,11,11,13,18,16,15,41,46,53,62,70,73,71,...","{""Monday"":36,""Tuesday"":35,""Wednesday"":36,""Thur...","{""android"":122,""ios"":112}"


In [26]:
len(philly_patterns)

383658

In [27]:
philly_patterns.to_csv(
    DATA_DIR / "processed" / "philly_patterns.csv.tar.gz", index=False
)

In [28]:
philly_patterns_exploded = explode(philly_patterns)



In [29]:
philly_patterns_exploded.to_csv(
    DATA_DIR / "processed" / "philly_patterns_exploded.csv.tar.gz", index=False
)