## Core places processing
This notebook should read in the coreplaces files from the raw data folder and:
    1. Associate a date to each file
    2. Filter down each file to Philadelphia places
    3. Concatenate the files together
    4. Write out the result to a file in the processed data folder

In [5]:
import pandas as pd
from safegraph_py_functions import safegraph_py_functions as sgpy
import os
from dotenv import load_dotenv, find_dotenv
import zipfile

In [6]:
# Locate the .env file by walking up the directory tree, then load its
# entries into the process environment.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# ROOT_DIR is used as the working directory by the next cell. Fail here with a
# clear message rather than letting os.chdir() choke on a missing value.
ROOT_DIR = os.environ.get("ROOT_DIR")
if not ROOT_DIR:
    raise RuntimeError(
        "ROOT_DIR is not set; define it in the environment or in a .env file"
    )

In [7]:
# Switch to the project root before importing; presumably this is what lets
# the `src` package resolve from the working directory -- the order matters.
os.chdir(ROOT_DIR)
from src import DATA_DIR

In [8]:
# Recursively collect every zip archive below the raw core-places folder
core_path = DATA_DIR / 'raw' / 'core'
archives_ = list(core_path.glob("**/*.zip"))

['/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/core/2020/03/CoreRecords-CORE_POI-2019_03-2020-03-25.zip',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/core/2020/04/CoreApr2020Release-CORE_POI-2020_03-2020-04-07.zip',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/core/2020/05/CorePlacesMay2020Release-CORE_POI-2020_04-2020-05-06.zip',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/core/2020/10/Core-USA-Oct-CORE_POI-2020_09-2020-10-19.zip',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/core/2020/07/Core-USA-July2020-Release-CORE_POI-2020_06-2020-07-13.zip',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/core/2020/09/Core-USA-Sep-CORE_POI-2020_08-2020-09-08.zip',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/core/2020/08/Core-USA-August2020-Release-CORE_POI-2020_07-2020-08-07.zip',
 '/Users/hannahkronenberg/SafegraphCOVIDPhilly/src/../data/raw/core/2020/06/Core-USA-June2020-

In [13]:
# Unpack each archive into a folder named after the archive's absolute path
# with the trailing ".zip" (4 characters) removed.
for zip_path in archives_:
    target_dir = str(zip_path.resolve())[:-4]
    with zipfile.ZipFile(zip_path, mode="r") as bundle:
        bundle.extractall(path=target_dir)

In [29]:
# Every extracted core_poi part file, searched recursively under core_path
files_ = list(core_path.glob('**/core_poi-part*.csv.gz'))

In [18]:
def extract_date(file):
    """Return the date encoded at the end of the name of ``file``'s parent folder.

    The parent folder name is split on "-" and its last three pieces are
    re-joined with "-", so a folder named
    ``CoreApr2020Release-CORE_POI-2020_03-2020-04-07`` yields ``'2020-04-07'``.

    Parameters
    ----------
    file : str or os.PathLike
        Path to a file sitting directly inside the dated folder.

    Returns
    -------
    str
        The last three "-"-separated pieces of the parent folder name,
        joined with "-". No validation of the date format is performed.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    # Ask pathlib for the parent folder name instead of splitting the path on
    # "/", which only works on platforms where "/" is the separator.
    folder_name = Path(file).resolve().parent.name
    return '-'.join(folder_name.split("-")[-3:])

In [19]:
# Distinct file dates in ascending order, preceded by a sentinel start date
all_dates = ['1900-01-01'] + sorted({extract_date(file) for file in files_})

In [20]:
# make mapping from file date to previous file date
date_df = pd.DataFrame({'valid_to':all_dates})
date_df['valid_from'] = date_df.shift()['valid_to']
date_df = date_df.set_index('valid_to')

In [21]:
# Display the valid_to -> valid_from mapping
date_df

Unnamed: 0_level_0,valid_from
valid_to,Unnamed: 1_level_1
1900-01-01,
2020-03-25,1900-01-01
2020-04-07,2020-03-25
2020-05-06,2020-04-07
2020-06-06,2020-05-06
2020-07-13,2020-06-06
2020-08-07,2020-07-13
2020-09-08,2020-08-07
2020-10-19,2020-09-08


In [22]:
# Spot check: look up the valid_from value stored for the 2020-03-25 row
date_df.loc['2020-03-25', 'valid_from']

'1900-01-01'

In [23]:
# Candidate subset of core-places columns. filter_and_format does not apply
# this list; it returns every column of the input file.
keep_cols = [
    'safegraph_place_id',
    'parent_safegraph_place_id',
    'safegraph_brand_ids',
    'brands',
    'top_category',
    'sub_category',
    'naics_code',
    'latitude',
    'longitude',
    'street_address',
    'city',
    'region',
    'postal_code',
    'open_hours',
]

def filter_and_format(file):
    """Read one core-places file and return its Philadelphia rows with validity dates.

    Rows are kept when their zero-padded postal code starts with "191".
    Two datetime columns are added: ``valid_to`` (the date returned by
    ``extract_date(file)``) and ``valid_from`` (the value stored for that date
    in the notebook-level ``date_df`` mapping). All other columns of the input
    file are returned unchanged; no column subset is applied.

    Parameters
    ----------
    file : path-like
        CSV file (optionally compressed) readable by ``pandas.read_csv`` and
        accepted by ``extract_date``.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, with ``postal_code`` as zero-padded text plus the
        ``valid_to`` and ``valid_from`` columns.

    Raises
    ------
    KeyError
        If the file's date is not present in the index of ``date_df``.
    """
    # Read postal codes as text: this keeps leading zeros and prevents a
    # missing value from turning the column into floats ("19104.0"), which
    # would corrupt the padding below.
    df = pd.read_csv(file, dtype={'postal_code': str})

    # Philadelphia selection: left-pad to five characters, match the 191 prefix.
    # Rows with a missing postal code are treated as not matching.
    df['postal_code'] = df['postal_code'].str.zfill(5)
    in_philly = df['postal_code'].str.startswith("191", na=False)

    # Copy so the column assignments below write to an independent frame
    # instead of a slice of the full file (avoids SettingWithCopyWarning).
    df = df.loc[in_philly].copy()

    file_date = extract_date(file)
    df['valid_to'] = file_date
    df['valid_to'] = pd.to_datetime(df['valid_to'], format='%Y-%m-%d')
    df['valid_from'] = date_df.loc[file_date, 'valid_from']
    df['valid_from'] = pd.to_datetime(df['valid_from'], format='%Y-%m-%d')
    return df

In [24]:
# One filtered frame per core-places part file, in the order of files_
philly_places = list(map(filter_and_format, files_))

In [25]:
# Stack the per-file frames into one frame. This rebinds philly_places from a
# list to a DataFrame, so the cell cannot be re-run on its own without first
# re-running the cell that builds the list.
philly_places = pd.concat(philly_places)

In [26]:
# Display the combined Philadelphia places frame
philly_places

Unnamed: 0,safegraph_place_id,parent_safegraph_place_id,location_name,safegraph_brand_ids,brands,top_category,sub_category,naics_code,latitude,longitude,...,phone_number,open_hours,category_tags,valid_to,valid_from,placekey,opened_on,closed_on,tracking_opened_since,tracking_closed_since
273,sg:13ff95596a514499a0edbc3d45e7ad98,,Pentecostal Church of Philadelphia,,,Religious Organizations,Religious Organizations,813110.0,40.053464,-75.076514,...,1.215725e+10,"{ ""Mon"": [], ""Tue"": [], ""Wed"": [], ""Thu"": [], ...",,2020-03-25,1900-01-01,,,,,
599,sg:2b4441655d4a4e23ae20d327e67e219c,,Washington Nails & Paper Supply,,,,,,39.937086,-75.162869,...,,,,2020-03-25,1900-01-01,,,,,
747,sg:37f738a1643f46dd95fcca5935fbe449,,63rd St Multicultural Acad Of Academic Excellence,,,Elementary and Secondary Schools,Elementary and Secondary Schools,611110.0,39.983266,-75.246632,...,,,,2020-03-25,1900-01-01,,,,,
892,sg:423a8dc1c6084d579729f06762ea748a,,Union Transfer,,,Motion Picture and Video Industries,Motion Picture Theaters (except Drive-Ins),512131.0,39.961404,-75.155315,...,1.215232e+10,,,2020-03-25,1900-01-01,,,,,
1033,sg:4c7fb593fdc44f3c939a4741aa7a0c0e,,Assumption of the Holy Virgin Orthodox Church,,,Religious Organizations,Religious Organizations,813110.0,39.927161,-75.192737,...,1.215468e+10,,,2020-03-25,1900-01-01,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1074700,sg:cc420ccc9f3e4636b3b24e5bf76e2730,,Ramada Philadelphia Northeast,,,Traveler Accommodation,Hotels (except Casino Hotels) and Motels,721110.0,40.103577,-75.005351,...,,,,2020-06-06,2020-05-06,,,,,
1074763,sg:d190a8b5f5a2483987db81dcd88fa867,,McNally's Tavern,,,Restaurants and Other Eating Places,Full-Service Restaurants,722511.0,40.077121,-75.208852,...,1.215248e+10,"{ ""Mon"": [[""11:00"", ""23:00""]], ""Tue"": [[""11:00...","Bar or Pub,Sandwich Shop,Late Night",2020-06-06,2020-05-06,,,,,
1074801,sg:d59e27fc19264a2e8c855407505ba370,,Lo Chinese Restaurant,,,Restaurants and Other Eating Places,Full-Service Restaurants,722511.0,40.022127,-75.077535,...,1.215288e+10,,Chinese Food,2020-06-06,2020-05-06,,,,,
1075246,sg:f9b54b6cfbc44ddf8d74a501189abbea,sg:d8e253d8737547119e6d436bcf94a690,Tea Do,,,Restaurants and Other Eating Places,Snack and Nonalcoholic Beverage Bars,722515.0,39.954692,-75.200246,...,,"{ ""Mon"": [[""11:00"", ""24:00""]], ""Tue"": [[""11:00...","Smoothie & Juice Bar,Coffee Shop,Bubble Tea Shop",2020-06-06,2020-05-06,,,,,


In [27]:
# Write the combined frame, without the index, to the processed data folder.
# NOTE(review): compression is inferred from the file extension; depending on
# the pandas version, ".tar.gz" may be written as plain gzip or as a tar
# archive -- confirm this matches what downstream readers expect.
philly_places.to_csv(DATA_DIR / 'processed' / 'philly_places.csv.tar.gz', index = False)

In [28]:
# Delete the extracted part files now that the combined output is written
for extracted_part in files_:
    extracted_part.unlink()