## Core places processing
This notebook should read in the coreplaces files from the raw data folder and:
    1. Associate a date to each file
    2. Filter down each file to Philadelphia places
    3. Concatenate the files together
    4. Write out the result to a file in the proessed data folder

In [1]:
import pandas as pd
from safegraph_py_functions import safegraph_py_functions as sgpy
import os
from dotenv import load_dotenv, find_dotenv

In [2]:
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)
root_dir = os.environ.get("ROOT_DIR")
raw_data_dir = os.path.join(root_dir,'data/raw')

In [3]:
import zipfile

In [4]:
# local directory where we want to put all the data
core_path = os.path.join(raw_data_dir,'core')
# print(local)
archives = []
# r=root, d=directories, f = files
for r, d, f in os.walk(core_path):
    for file in f:
        if file.endswith('.zip'):
            archives.append(os.path.join(core_path, r, file))
archives

[]

In [None]:
# TO-DO: add code to remove files that have been expanded already.
# Match to existing files based on date.

In [6]:
for archive in archives:
    with zipfile.ZipFile(archive,"r") as zip_ref:
        zip_ref.extractall(archive[:-4])
    os.unlink(archive)

In [5]:
files = []
# r=root, d=directories, f = files
for r, d, f in os.walk(core_path):
    for file in f:
        if file.endswith('.csv.gz') and 'core_poi-part' in file:
            files.append(os.path.join(core_path, r, file))

In [6]:
files

['/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/core/2020/03/CoreRecords-CORE_POI-2019_03-2020-03-25/core_poi-part5.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/core/2020/03/CoreRecords-CORE_POI-2019_03-2020-03-25/core_poi-part3.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/core/2020/03/CoreRecords-CORE_POI-2019_03-2020-03-25/core_poi-part1.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/core/2020/03/CoreRecords-CORE_POI-2019_03-2020-03-25/core_poi-part4.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/core/2020/03/CoreRecords-CORE_POI-2019_03-2020-03-25/core_poi-part2.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/core/2020/04/CoreApr2020Release-CORE_POI-2020_03-2020-04-07/core_poi-part5.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw/core/2020/04/CoreApr2020Release-CORE_POI-2020_03-2020-04-07/core_poi-part3.csv.gz',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/data/raw

In [10]:
def extract_date(file):
    folders = file.split("/")
    date = folders[-2]
    date = date.split("-")[-3:]
    return '-'.join(date)
    

In [18]:
all_dates = list(set([extract_date(file) for file in files]))
all_dates.sort()
all_dates = ['1900-01-01'] + all_dates

In [24]:
# make mapping from file date to previous file date
date_df = pd.DataFrame({'valid_to':all_dates})
date_df['valid_from'] = date_df.shift()['valid_to']
date_df = date_df.set_index('valid_to')

In [25]:
date_df

Unnamed: 0_level_0,valid_from
valid_to,Unnamed: 1_level_1
1900-01-01,
2020-03-25,1900-01-01
2020-04-07,2020-03-25
2020-05-06,2020-04-07
2020-06-06,2020-05-06
2020-07-13,2020-06-06
2020-08-07,2020-07-13
2020-09-08,2020-08-07
2020-10-19,2020-09-08


In [29]:
date_df.loc['2020-03-25', 'valid_from']

'1900-01-01'

In [30]:
# This list was taken from https://www.city-data.com/zipmaps/Philadelphia-Pennsylvania.html
philly_zips = pd.Series(['19102', '19103', '19104', '19106', '19107', '19109', '19111', '19112', '19114', '19115', 
               '19116', '19118', '19119', '19120', '19121', '19122', '19123', '19124', '19125', '19126', 
               '19127', '19128', '19129', '19130', '19131', '19132', '19133', '19134', '19135', '19136', 
               '19137', '19138', '19139', '19140', '19141', '19142', '19143', '19144', '19145', '19146', 
               '19147', '19148', '19149', '19150', '19151', '19152', '19153', '19154'], name = 'postal_code')

philly_zips = philly_zips.to_frame()

In [35]:
keep_cols = ['safegraph_place_id', 'parent_safegraph_place_id', 'safegraph_brand_ids', 'brands', 'top_category', 
             'sub_category', 'naics_code', 'latitude', 'longitude', 'street_address', 'city', 'region', 
             'postal_code', 'open_hours']

def filter_and_format(file):
    df = pd.read_csv(file)
    df = df[keep_cols]
    # zip codes are read as integers rather than strings so we add leading zeros.
    # this is not strictly necessary since Philadelphia zipcodes don't have leading zeros.
    df['postal_code'] = df['postal_code'].apply(lambda x: ('00000' + str(x))[-5:])
    df = philly_zips.merge(df)
    # The visits_by_day column contains a list of integers. 
    # This explodes that list so we get one row per day.
    file_date = extract_date(file)
    df['valid_to'] = file_date
    df['valid_to'] = pd.to_datetime(df['valid_to'], format = '%Y-%m-%d')
    df['valid_from'] = date_df.loc[file_date,'valid_from']
    df['valid_from'] = pd.to_datetime(df['valid_from'], format = '%Y-%m-%d')
    return df

In [36]:
example_df = filter_and_format(files[0])

In [37]:
example_df.head()

Unnamed: 0,postal_code,safegraph_place_id,parent_safegraph_place_id,safegraph_brand_ids,brands,top_category,sub_category,naics_code,latitude,longitude,street_address,city,region,open_hours,valid_to,valid_from
0,19102,sg:4c7fb593fdc44f3c939a4741aa7a0c0e,,,,Religious Organizations,Religious Organizations,813110.0,39.927161,-75.192737,28th & Snyder Ave,Philadelphia,PA,,2020-03-25,1900-01-01
1,19102,sg:4d0a58a086ee462d968ba9308aca2346,,,,Restaurants and Other Eating Places,Full-Service Restaurants,722511.0,39.958452,-75.163874,1439 Vine St,Philadelphia,PA,,2020-03-25,1900-01-01
2,19102,sg:e69f2edb37a24b7fba0f8a6635a42c6c,,,,Restaurants and Other Eating Places,Full-Service Restaurants,722511.0,39.952228,-75.166504,1500 Jfk Blvd Two Penn Ctr Ste 6,Philadelphia,PA,,2020-03-25,1900-01-01
3,19102,sg:bacc571627354e18a254bdf1c745ea7e,,,,Offices of Physicians,Offices of Physicians (except Mental Health Sp...,621111.0,39.949958,-75.167695,1601 Walnut St Ste 208,Philadelphia,PA,"{ ""Mon"": [[""8:00"", ""18:00""]], ""Tue"": [[""8:00"",...",2020-03-25,1900-01-01
4,19102,sg:28cfd39dd89c4149b152a3c5942cea6c,,,,Restaurants and Other Eating Places,Full-Service Restaurants,722511.0,39.951522,-75.166746,1533 Chestnut St,Philadelphia,PA,"{ ""Mon"": [[""11:00"", ""23:00""]], ""Tue"": [[""11:00...",2020-03-25,1900-01-01


In [45]:
philly_places = [filter_and_format(file) for file in files]

In [46]:
philly_places = pd.concat(philly_places)

In [47]:
philly_places

Unnamed: 0,postal_code,safegraph_place_id,parent_safegraph_place_id,safegraph_brand_ids,brands,top_category,sub_category,naics_code,latitude,longitude,street_address,city,region,open_hours,valid_to,valid_from
0,19102,sg:4c7fb593fdc44f3c939a4741aa7a0c0e,,,,Religious Organizations,Religious Organizations,813110.0,39.927161,-75.192737,28th & Snyder Ave,Philadelphia,PA,,2020-03-25,1900-01-01
1,19102,sg:4d0a58a086ee462d968ba9308aca2346,,,,Restaurants and Other Eating Places,Full-Service Restaurants,722511.0,39.958452,-75.163874,1439 Vine St,Philadelphia,PA,,2020-03-25,1900-01-01
2,19102,sg:e69f2edb37a24b7fba0f8a6635a42c6c,,,,Restaurants and Other Eating Places,Full-Service Restaurants,722511.0,39.952228,-75.166504,1500 Jfk Blvd Two Penn Ctr Ste 6,Philadelphia,PA,,2020-03-25,1900-01-01
3,19102,sg:bacc571627354e18a254bdf1c745ea7e,,,,Offices of Physicians,Offices of Physicians (except Mental Health Sp...,621111.0,39.949958,-75.167695,1601 Walnut St Ste 208,Philadelphia,PA,"{ ""Mon"": [[""8:00"", ""18:00""]], ""Tue"": [[""8:00"",...",2020-03-25,1900-01-01
4,19102,sg:28cfd39dd89c4149b152a3c5942cea6c,,,,Restaurants and Other Eating Places,Full-Service Restaurants,722511.0,39.951522,-75.166746,1533 Chestnut St,Philadelphia,PA,"{ ""Mon"": [[""11:00"", ""23:00""]], ""Tue"": [[""11:00...",2020-03-25,1900-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4286,19154,sg:64094f57e641496199f2bde0e2e6089c,,,,Other Amusement and Recreation Industries,All Other Amusement and Recreation Industries,713990.0,40.080759,-74.974968,4336 Deerpath La,Philadelphia,PA,,2020-06-06,2020-05-06
4287,19154,sg:b63dffffed124790a1f43bbed2a6fb98,sg:7c7ceeceda65431c9e2ca94dc76dfe66,SG_BRAND_b9753c942e2efa5e5051a57e7d12c9bb,Sbarro,Restaurants and Other Eating Places,Limited-Service Restaurants,722513.0,40.084933,-74.962757,1244 Franklin Mills Cir,Philadelphia,PA,,2020-06-06,2020-05-06
4288,19154,sg:2f563afd93f2410fa4016c4b1a835390,,SG_BRAND_8d819ffb9fa80295260ecf779927ef25,Sunoco,Gasoline Stations,Gasoline Stations with Convenience Stores,447110.0,40.095688,-74.976944,12291 Academy & Byberry Rds,Philadelphia,PA,,2020-06-06,2020-05-06
4289,19154,sg:dddc9907a8d04d0b80735f7811c6c20f,sg:d43fa1a3d06d486f8631e799521aab75,,,Child Day Care Services,Child Day Care Services,624410.0,40.080416,-74.976594,Knights Rd And Chalfont Dr,Philadelphia,PA,,2020-06-06,2020-05-06


In [48]:
processed_data_dir = os.path.join(root_dir,'data/processed')

In [50]:
philly_places.to_csv(os.path.join(processed_data_dir,'philly_places.csv'), index = False)