In [2]:
import importlib
import os
from pathlib import Path
import sys

from arcgis.features import GeoAccessor, GeoSeriesAccessor
from arcgis.gis import GIS
from dotenv import load_dotenv, find_dotenv
import pandas as pd
import swifter

# import arcpy if available
# NOTE: `import importlib` alone does not guarantee the `importlib.util` submodule is loaded,
# so import it explicitly before calling find_spec.
import importlib.util

if importlib.util.find_spec("arcpy") is not None:
    import arcpy

In [3]:
# load environment variables from .env
load_dotenv(find_dotenv())

# paths to common data locations - NOTE: to convert any path to a raw string, simply use str(path_instance)
project_parent = Path('./').absolute().parent

data_dir = project_parent/'data'

dir_raw = data_dir/'raw'
dir_test = data_dir/'test'
dir_int = data_dir/'interim'

gdb_int = dir_int/'interim.gdb'
gdb_test = dir_test/'test.gdb'

# load the "autoreload" extension so that code can change, & always reload modules so that as you change code in src, it gets loaded
%load_ext autoreload
%autoreload 2

# import the project package from the project package path
sys.path.append(str(project_parent/'src'))
import ba_tools

In [59]:
# raw trips table and the names of its coordinate columns
raw_trips_tbl = dir_raw/'raw_trips.csv'
trips_x = 'coord_x'
trips_y = 'coord_y'

# field-name settings; none of these are referenced by the other cells visible in this notebook
origin_id_field = 'ID'
customer_destination_id_field = 'store_locn'
customer_keep_field_prefix = 'travel_'
customer_keep_field_suffix = None
customer_keep_fields = None

# block group feature class inside the test file geodatabase
block_group_fc = gdb_test/'block_groups'

# arcpy is only imported when it is installed, so guard the check instead of raising NameError;
# the path is handed over as a plain string, and nothing is displayed when arcpy is unavailable
arcpy.Exists(str(block_group_fc)) if 'arcpy' in globals() else None

True

In [4]:
# read the raw trips table, then discard the OBJECTID2 column
raw_trips_df = pd.read_csv(raw_trips_tbl)
raw_trips_df = raw_trips_df.drop('OBJECTID2', axis=1)

# relabel the remaining columns by position
raw_trips_df.columns = [
    'travel_distance_miles',
    'travel_time_minutes',
    'store_id',
    'coord_x',
    'coord_y',
]

# preview the first rows
raw_trips_df.head()

Unnamed: 0,travel_distance_miles,travel_time_minutes,store_id,coord_x,coord_y
0,3.123011,22.833333,724622491,-122.727474,45.502023
1,6.09565,21.133333,417755893,-122.788712,45.491919
2,2.368666,8.0,637497868,-122.802347,45.379363
3,10.198562,15.166667,724624345,-122.743871,45.400648
4,1.986523,12.216667,257782771,-122.703216,45.433386


In [5]:
# cast the store_id column to str
raw_trips_df['store_id'] = raw_trips_df['store_id'].astype(str)

In [6]:
# write raw_trips_df into dir_raw as both a Parquet file and a CSV file
raw_trips_df.to_parquet(dir_raw.joinpath('trips.parquet'))
raw_trips_df.to_csv(dir_raw.joinpath('trips.csv'))

In [4]:
# read the trips table back from the Parquet file in dir_raw
raw_trips_df = pd.read_parquet(dir_raw.joinpath('trips.parquet'))

In [6]:
# keep only the first 200,000 rows (by position) for output
df_out = raw_trips_df.iloc[0:200_000]

In [10]:
# write the subset into dir_raw, replacing trips.parquet and trips.csv
df_out.to_parquet(dir_raw.joinpath('trips.parquet'))
df_out.to_csv(dir_raw.joinpath('trips.csv'))

In [7]:
# write the subset to trips.csv in dir_raw
df_out.to_csv(dir_raw.joinpath('trips.csv'))