## Core places processing
This notebook should read in the coreplaces files from the raw data folder and:
    1. Associate a date to each file
    2. Filter down each file to Philadelphia places
    3. Concatenate the files together
    4. Write out the result to a file in the processed data folder

In [1]:
import pandas as pd
from safegraph_py_functions import safegraph_py_functions as sgpy
import os
from dotenv import load_dotenv, find_dotenv
import zipfile

In [2]:
# Find the .env file automatically by walking up directories until it's found,
# then load its entries as environment variables.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# ROOT_DIR is the project root; the data paths in this notebook are built from it.
root_dir = os.environ.get("ROOT_DIR")
if root_dir is None:
    # Fail here with a clear message rather than with an opaque TypeError
    # from os.path.join(None, ...) on the next line.
    raise RuntimeError("ROOT_DIR is not set; define it in the .env file or the environment")
raw_data_dir = os.path.join(root_dir,'data/raw')

In [37]:
# Directory that holds the raw Core Places archives.
core_path = os.path.join(raw_data_dir,'core')

# Collect every .zip archive under core_path, at any depth.
# os.walk already yields each directory path prefixed with core_path, so it is
# joined with the file name only; joining core_path again would duplicate the
# prefix whenever core_path is a relative path.
archives = [
    os.path.join(dirpath, filename)
    for dirpath, _dirnames, filenames in os.walk(core_path)
    for filename in filenames
    if filename.endswith('.zip')
]
archives

['/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/core/2020/03/CoreRecords-CORE_POI-2019_03-2020-03-25.zip',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/core/2020/04/CoreApr2020Release-CORE_POI-2020_03-2020-04-07.zip',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/core/2020/05/CorePlacesMay2020Release-CORE_POI-2020_04-2020-05-06.zip',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/core/2020/10/Core-USA-Oct-CORE_POI-2020_09-2020-10-19.zip',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/core/2020/07/Core-USA-July2020-Release-CORE_POI-2020_06-2020-07-13.zip',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/core/2020/09/Core-USA-Sep-CORE_POI-2020_08-2020-09-08.zip',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/core/2020/08/Core-USA-August2020-Release-CORE_POI-2020_07-2020-08-07.zip']

In [5]:
# Unpack each archive into a directory whose path is the archive path with
# its last four characters (the ".zip" suffix) removed.
for zip_path in archives:
    target_dir = zip_path[:-4]
    with zipfile.ZipFile(zip_path, "r") as bundle:
        bundle.extractall(target_dir)

In [35]:
# Collect every extracted core_poi part file (.csv.gz) under core_path.
# os.walk already yields each directory path prefixed with core_path, so it is
# joined with the file name only; joining core_path again would duplicate the
# prefix whenever core_path is a relative path.
files = [
    os.path.join(dirpath, filename)
    for dirpath, _dirnames, filenames in os.walk(core_path)
    for filename in filenames
    if filename.endswith('.csv.gz') and 'core_poi-part' in filename
]

In [36]:
files

[]

In [8]:
def extract_date(file):
    """Return the date encoded at the end of the name of the directory containing ``file``.

    The parent directory name is split on "-" and its last three pieces are
    joined back together, so a file inside
    ``CoreRecords-CORE_POI-2019_03-2020-03-25`` yields ``'2020-03-25'``.

    Both forward slashes and backslashes are accepted as path separators, so
    paths built with ``os.path.join`` on Windows are handled as well.

    Parameters
    ----------
    file : str
        Path to a file that sits directly inside a release directory.

    Returns
    -------
    str
        The last (up to) three "-"-separated pieces of the parent directory
        name, joined with "-".

    Raises
    ------
    IndexError
        If ``file`` has no parent directory component.
    """
    # Normalise separators first so the split works for either path flavour.
    folders = file.replace("\\", "/").split("/")
    parent_name = folders[-2]
    return '-'.join(parent_name.split("-")[-3:])
    

In [9]:
# Distinct release dates in ascending order, preceded by a fixed
# '1900-01-01' sentinel that acts as the start date for the earliest release.
release_dates = {extract_date(file) for file in files}
all_dates = ['1900-01-01'] + sorted(release_dates)

In [10]:
# make mapping from file date to previous file date
date_df = pd.DataFrame({'valid_to':all_dates})
date_df['valid_from'] = date_df.shift()['valid_to']
date_df = date_df.set_index('valid_to')

In [11]:
date_df

Unnamed: 0_level_0,valid_from
valid_to,Unnamed: 1_level_1
1900-01-01,
2020-03-25,1900-01-01
2020-04-07,2020-03-25
2020-05-06,2020-04-07
2020-07-13,2020-05-06
2020-08-07,2020-07-13
2020-09-08,2020-08-07
2020-10-19,2020-09-08


In [12]:
date_df.loc['2020-03-25', 'valid_from']

'1900-01-01'

In [24]:
# Philadelphia ZIP codes, taken from
# https://www.city-data.com/zipmaps/Philadelphia-Pennsylvania.html
# and held as a one-column DataFrame whose column is named 'postal_code'.
philly_zips = pd.DataFrame({
    'postal_code': [
        '19102', '19103', '19104', '19106', '19107', '19109', '19111', '19112',
        '19114', '19115', '19116', '19118', '19119', '19120', '19121', '19122',
        '19123', '19124', '19125', '19126', '19127', '19128', '19129', '19130',
        '19131', '19132', '19133', '19134', '19135', '19136', '19137', '19138',
        '19139', '19140', '19141', '19142', '19143', '19144', '19145', '19146',
        '19147', '19148', '19149', '19150', '19151', '19152', '19153', '19154',
    ]
})

In [19]:
# Columns retained from each Core Places file.
keep_cols = ['safegraph_place_id', 'parent_safegraph_place_id', 'safegraph_brand_ids', 'brands', 'top_category', 
             'sub_category', 'naics_code', 'latitude', 'longitude', 'street_address', 'city', 'region', 
             'postal_code', 'open_hours']

def filter_and_format(file):
    """Load one Core Places file and keep only rows with a "191" postal code.

    Parameters
    ----------
    file : str
        Path to a core_poi CSV file. Its parent directory name must end in
        the file's date, because that date is obtained with ``extract_date``
        and must be present in the index of ``date_df``.

    Returns
    -------
    pandas.DataFrame
        The ``keep_cols`` columns for every row whose zero-padded postal code
        starts with "191", plus two datetime columns: ``valid_to`` (the date
        of this file) and ``valid_from`` (the date ``date_df`` maps it to).
    """
    # Read postal codes as text. Numeric inference drops leading zeros and,
    # as soon as one value is missing, turns the column into floats such as
    # 19111.0 that cannot be padded back into a 5-character code.
    df = pd.read_csv(file, dtype={'postal_code': str})
    # .copy() gives an independent frame, so the column assignment below does
    # not write into a slice of the frame returned by read_csv.
    df = df[keep_cols].copy()

    # Philadelphia selection: restore leading zeros, then keep "191xx" codes.
    # Rows with a missing postal code are treated as not matching.
    # NOTE(review): the "191" prefix is broader than the explicit philly_zips
    # list defined in this notebook - confirm which selection is intended.
    df['postal_code'] = df['postal_code'].str.zfill(5)
    in_philly = df['postal_code'].str.startswith("191", na=False)
    df = df.loc[in_philly].copy()

    # Stamp every row with the validity interval of the file it came from.
    file_date = extract_date(file)
    df['valid_to'] = file_date
    df['valid_to'] = pd.to_datetime(df['valid_to'], format = '%Y-%m-%d')
    df['valid_from'] = date_df.loc[file_date,'valid_from']
    df['valid_from'] = pd.to_datetime(df['valid_from'], format = '%Y-%m-%d')
    return df

In [20]:
# Run the filter on the first discovered file only, so the result can be
# inspected before every file is processed.
example_df = filter_and_format(files[0])

In [21]:
example_df.head()

Unnamed: 0,safegraph_place_id,parent_safegraph_place_id,safegraph_brand_ids,brands,top_category,sub_category,naics_code,latitude,longitude,street_address,city,region,postal_code,open_hours,valid_to,valid_from
273,sg:13ff95596a514499a0edbc3d45e7ad98,,,,Religious Organizations,Religious Organizations,813110.0,40.053464,-75.076514,7101 Pennway St,Philadelphia,PA,19111,"{ ""Mon"": [], ""Tue"": [], ""Wed"": [], ""Thu"": [], ...",2020-03-25,1900-01-01
599,sg:2b4441655d4a4e23ae20d327e67e219c,,,,,,,39.937086,-75.162869,1122 Washington Ave,Philadelphia,PA,19147,,2020-03-25,1900-01-01
747,sg:37f738a1643f46dd95fcca5935fbe449,,,,Elementary and Secondary Schools,Elementary and Secondary Schools,611110.0,39.983266,-75.246632,1141 N 63rd St,Philadelphia,PA,19151,,2020-03-25,1900-01-01
892,sg:423a8dc1c6084d579729f06762ea748a,,,,Motion Picture and Video Industries,Motion Picture Theaters (except Drive-Ins),512131.0,39.961404,-75.155315,1026 Spring Garden St,Philadelphia,PA,19123,,2020-03-25,1900-01-01
1033,sg:4c7fb593fdc44f3c939a4741aa7a0c0e,,,,Religious Organizations,Religious Organizations,813110.0,39.927161,-75.192737,28th & Snyder Ave,Philadelphia,PA,19102,,2020-03-25,1900-01-01


In [22]:
example_df.postal_code.unique()

array(['19111', '19147', '19151', '19123', '19102', '19134', '19131',
       '19139', '19148', '19114', '19149', '19152', '19141', '19116',
       '19107', '19153', '19104', '19144', '19121', '19129', '19145',
       '19103', '19124', '19143', '19115', '19106', '19120', '19119',
       '19135', '19109', '19132', '19125', '19146', '19133', '19130',
       '19138', '19142', '19140', '19118', '19136', '19150', '19137',
       '19154', '19128', '19127', '19122', '19126', '19110', '19112',
       '19176', '19113', '19155', '19192'], dtype=object)

In [29]:
# Apply the filter to every discovered file, giving one DataFrame per file.
philly_places = list(map(filter_and_format, files))

In [30]:
# Stack the per-file frames into a single DataFrame. The name is rebound from
# a list to a DataFrame here, so this cell cannot be re-run on its own without
# first rebuilding the list.
philly_places = pd.concat(philly_places)

In [31]:
philly_places

Unnamed: 0,safegraph_place_id,parent_safegraph_place_id,safegraph_brand_ids,brands,top_category,sub_category,naics_code,latitude,longitude,street_address,city,region,postal_code,open_hours,valid_to,valid_from
273,sg:13ff95596a514499a0edbc3d45e7ad98,,,,Religious Organizations,Religious Organizations,813110.0,40.053464,-75.076514,7101 Pennway St,Philadelphia,PA,19111,"{ ""Mon"": [], ""Tue"": [], ""Wed"": [], ""Thu"": [], ...",2020-03-25,1900-01-01
599,sg:2b4441655d4a4e23ae20d327e67e219c,,,,,,,39.937086,-75.162869,1122 Washington Ave,Philadelphia,PA,19147,,2020-03-25,1900-01-01
747,sg:37f738a1643f46dd95fcca5935fbe449,,,,Elementary and Secondary Schools,Elementary and Secondary Schools,611110.0,39.983266,-75.246632,1141 N 63rd St,Philadelphia,PA,19151,,2020-03-25,1900-01-01
892,sg:423a8dc1c6084d579729f06762ea748a,,,,Motion Picture and Video Industries,Motion Picture Theaters (except Drive-Ins),512131.0,39.961404,-75.155315,1026 Spring Garden St,Philadelphia,PA,19123,,2020-03-25,1900-01-01
1033,sg:4c7fb593fdc44f3c939a4741aa7a0c0e,,,,Religious Organizations,Religious Organizations,813110.0,39.927161,-75.192737,28th & Snyder Ave,Philadelphia,PA,19102,,2020-03-25,1900-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1057142,sg:b4922964b199455ea6e51f915055dfc2,,,,Health and Personal Care Stores,Optical Goods Stores,446130.0,39.968948,-75.139515,180 W Girard Ave Ste 5,Philadelphia,PA,19123,"{ ""Mon"": [[""10:00"", ""18:00""]], ""Tue"": [[""10:00...",2020-08-07,2020-07-13
1057211,sg:b99b60aebcad4734bd72c84b0653e015,,,,Elementary and Secondary Schools,Elementary and Secondary Schools,611110.0,39.977912,-75.229408,1621 N 54th St,Philadelphia,PA,19131,,2020-08-07,2020-07-13
1057326,sg:c2b83b09fd4d405281bb6073441c11c6,,,,Restaurants and Other Eating Places,Full-Service Restaurants,722511.0,39.991803,-75.113133,3084 Frankford Ave,Philadelphia,PA,19134,"{ ""Mon"": [[""11:00"", ""23:00""]], ""Tue"": [], ""Wed...",2020-08-07,2020-07-13
1057724,sg:e143059c1fdc43c2b1141906aa801672,,,,Restaurants and Other Eating Places,Full-Service Restaurants,722511.0,40.023183,-75.061512,4221 Benner St,Philadelphia,PA,19135,"{ ""Mon"": [[""15:00"", ""22:00""]], ""Tue"": [[""15:00...",2020-08-07,2020-07-13


In [32]:
# Output folder for processed data, located under the project root.
processed_data_dir = os.path.join(root_dir,'data/processed')

In [33]:
# Create the output folder when it is missing so the write below does not
# fail just because the directory has not been created yet.
os.makedirs(processed_data_dir, exist_ok=True)
# index=False leaves the row index out of the written file.
philly_places.to_csv(os.path.join(processed_data_dir,'philly_places.csv'), index = False)

In [34]:
# Delete every extracted part file listed in `files`.
for extracted_path in files:
    os.remove(extracted_path)