In [None]:
#default_exp data.preprocess

### Preprocessing
> This module reads the parquet files generated in the create-sample-data step and saves two datasets, one partitioned by customer_id and one by email, so that the next steps can perform aggregations efficiently.

In [None]:
#export
import dask.dataframe as dd

from hopeit.app.context import EventContext
from hopeit.app.api import event_api
from hopeit.app.logger import app_logger

from fraud_poc.jobs import get_client, MakeSampleDataJob, PreprocessingJob

In [None]:
#export
# Names of the functions in this module exposed as steps of the event
# (hopeit.engine `__steps__` convention); here a single step, `run`.
__steps__ = ['run']

# Module-level logger used by `run`; each logging call receives the
# EventContext as its first argument.
logger = app_logger()

In [None]:
#export
def run(job: MakeSampleDataJob, context: EventContext) -> PreprocessingJob:
    """
    Re-saves the dataset found at `job.path` once per index column.

    A fixed set of columns is read from the source parquet files. The data is
    then indexed by `customer_id` and by `email`, and each version is written
    as parquet under the `data.partitioned` path taken from `context.env`.

    Returns a `PreprocessingJob` holding the source path and the output path
    used for each index column. Any exception raised after the client has been
    obtained is logged and `None` is returned instead. The client returned by
    `get_client` is closed in every case.
    """
    output_root = context.env['data']['partitioned']
    active_client = get_client(context)
    try:
        selected_columns = [
            'order_id', 'order_date', 'customer_id', 'email', 'ip_addr', 'order_amount'
        ]
        orders = dd.read_parquet(job.path, engine='fastparquet', columns=selected_columns)

        # Output locations, keyed by the column used as index
        partitioned = dict(
            customer_id=f'{output_root}/customer_id/',
            email=f'{output_root}/email'
        )
        for index_column in partitioned:
            target_path = partitioned[index_column]
            logger.info(context, f"Partitioning on {index_column} to {target_path}...")
            reindexed = orders.set_index(index_column)
            reindexed.to_parquet(target_path)

        result = PreprocessingJob(source=job.path, partitioned=partitioned)
        return result
    except Exception as error:
        logger.error(context, error)
        return None
    finally:
        active_client.close()

### Test from notebook

In [None]:
from hopeit.testing.apps import config, execute_event

# Load the app configuration used to execute the event from this notebook.
app_config = config('config/training-pipeline.json')
# Payload for the `run` step: `path` is where the parquet files are read from,
# so the data produced by the create-sample-data step must already exist at ./data/raw.
job = MakeSampleDataJob(path='./data/raw', num_batches=2, batch_size=1000, batch_span_days=10, 
                        num_customers=1000, num_emails=1000, num_ips=1000)
# Execute the 'data.preprocess' event with `job` as payload (top-level await, notebook only).
result = await execute_event(app_config, 'data.preprocess', job)
# Show the returned value as the cell output.
result

2020-07-07 07:59:15,064 | INFO | fraud-poc 0.0.1 data.preprocess leo-legion 9733 | Partitioning on customer_id to ./data/partitioned/customer_id/... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-07T07:59:14.127882+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=
2020-07-07 07:59:16,563 | INFO | fraud-poc 0.0.1 data.preprocess leo-legion 9733 | Partitioning on email to ./data/partitioned/email... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-07T07:59:14.127882+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=


PreprocessingJob(source='./data/raw', partitioned={'customer_id': './data/partitioned/customer_id/', 'email': './data/partitioned/email'})

In [None]:
# Peek at the first rows of the dataset written with customer_id as index.
dd.read_parquet(result.partitioned['customer_id']).head()

Unnamed: 0_level_0,order_id,order_date,email,ip_addr,order_amount
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
005c052f-5148-4acc-884d-7b06f604308f,6a5ddc18-0dee-485d-b2ee-cdb8ccfa0485,2020-06-29 10:45:56+00:00,1d53f0c2cb70ca0922dc0051a8e2cd300b720cce,7ff85a4d3ee8bf7773d867c187fc7599bd1672a3,555.943568
005c052f-5148-4acc-884d-7b06f604308f,e7ce02eb-08da-4894-be79-c293ace4693e,2020-06-25 07:41:16+00:00,386b2fa6a5640b6eeb47b557a634c5e324810d09,5c86c74948a216aa3ec6492f4949c8f62ac3251d,1.147234
005c052f-5148-4acc-884d-7b06f604308f,28b5ad13-55ce-40cd-bcc5-5c9204325edc,2020-06-19 00:42:49+00:00,386b2fa6a5640b6eeb47b557a634c5e324810d09,5c86c74948a216aa3ec6492f4949c8f62ac3251d,574.395839
005c052f-5148-4acc-884d-7b06f604308f,8d1368bd-b076-45a7-8457-78f44f4a2c7e,2020-07-04 01:31:48+00:00,04baa6de56ac27af1fae31ba56307fb814b32f68,84a1c9bd726d8a35d9335240ce68c056f772ca2d,204.861045
005c052f-5148-4acc-884d-7b06f604308f,58f0682a-1fd4-430c-ac11-98893c06991e,2020-06-27 03:59:10+00:00,04baa6de56ac27af1fae31ba56307fb814b32f68,84a1c9bd726d8a35d9335240ce68c056f772ca2d,526.19557


In [None]:
# Peek at the first rows of the dataset written with email as index.
dd.read_parquet(result.partitioned['email']).head()

Unnamed: 0_level_0,order_id,order_date,customer_id,ip_addr,order_amount
email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00691f9cf2bd3e1977c4d0408557f0c4931b294a,f39ecc27-5583-4e07-8d71-fcbecc55ea89,2020-06-23 22:19:09+00:00,0ed1768b-659c-465b-bd1e-b9e1571b6155,b55a2c7e553f4e59ec0582c673f9408fc5bb1d67,595.156716
00691f9cf2bd3e1977c4d0408557f0c4931b294a,faeda91b-a709-44ad-8c06-7d5667b5f039,2020-06-22 09:04:43+00:00,0ed1768b-659c-465b-bd1e-b9e1571b6155,b55a2c7e553f4e59ec0582c673f9408fc5bb1d67,96.706966
00691f9cf2bd3e1977c4d0408557f0c4931b294a,9164ae9e-5cdf-4af1-9175-73cd73fe6366,2020-06-26 23:46:13+00:00,0ed1768b-659c-465b-bd1e-b9e1571b6155,3b47716663011295fb13dd5ad4c242845c396484,641.732567
00691f9cf2bd3e1977c4d0408557f0c4931b294a,f64cf4cd-6d92-4e9c-9a04-c6a420b4f22d,2020-07-03 16:35:16+00:00,0ed1768b-659c-465b-bd1e-b9e1571b6155,b55a2c7e553f4e59ec0582c673f9408fc5bb1d67,536.578923
009157d2735994c1451447051676dcb001779c9e,60789921-58c5-47aa-8510-4878a9d3ebfa,2020-07-02 10:47:48+00:00,239ead93-a93a-4a4f-8673-1f47058ac8c1,a1dd69f29d89599598c3cdc93e7998add8c2b759,959.487961
