In [None]:
#default_exp data.preprocess

# Preprocessing

> This module reads the parquet files generated in the create-sample-data step and saves two datasets, one partitioned by customer_id and one by email, so that the next steps can perform aggregations efficiently.

In [None]:
#export
import dask.dataframe as dd

from hopeit.app.context import EventContext
from hopeit.app.api import event_api
from hopeit.app.logger import app_logger

from fraud_poc.jobs import get_client, MakeSampleDataJob, PreprocessingJob

In [None]:
#export
# Names of the step functions in this module that make up the event.
# NOTE(review): presumably hopeit runs them in the listed order — confirm against hopeit docs.
__steps__ = ['run']

# Module-level logger used by `run` for info/error messages.
logger = app_logger()

In [None]:
#export
def run(job: MakeSampleDataJob, context: EventContext) -> PreprocessingJob:
    """
    Re-index the parquet dataset at `job.path` by `customer_id` and by `email`,
    writing one parquet dataset per key under `context.env['data']['partitioned']`.

    Returns a `PreprocessingJob` holding the source path and the output path used
    for each key. If anything inside the read/write section raises, the exception
    is logged and `None` is returned instead. The client obtained from
    `get_client` is closed in every case.
    """
    base_path = context.env['data']['partitioned']
    client = get_client(context)
    try:
        # Output location per index key (note: only the first path ends with '/').
        partitioned = dict(
            customer_id=f'{base_path}/customer_id/',
            email=f'{base_path}/email',
        )
        selected_columns = [
            'order_id', 'order_date', 'customer_id',
            'email', 'ip_addr', 'order_amount',
        ]
        orders = dd.read_parquet(job.path, engine='fastparquet', columns=selected_columns)
        for key, path in partitioned.items():
            logger.info(context, f"Partitioning on {key} to {path}...")
            indexed_by_key = orders.set_index(key)
            indexed_by_key.to_parquet(path)
        # Built inside the try block so a failure here is also logged and mapped to None.
        return PreprocessingJob(source=job.path, partitioned=partitioned)
    except Exception as exc:
        logger.error(context, exc)
        return None
    finally:
        client.close()

### Test from notebook

In [None]:
# Test-only helpers, imported in this non-exported cell (no #export marker here).
from hopeit.testing.apps import config, execute_event

# NOTE(review): presumably loads the app configuration from this JSON file — confirm.
app_config = config('config/training-pipeline.json')
# Input payload for the event; `run` reads its parquet data from `job.path`.
job = MakeSampleDataJob(path='./data/raw', num_batches=2, batch_size=1000, batch_span_days=10, 
                        num_customers=1000, num_emails=1000, num_ips=1000)
# Top-level `await` is valid in a notebook cell but not in a plain Python script.
result = await execute_event(app_config, 'data.preprocess', job)
# Display the returned job as the cell output.
result

2021-05-18 20:25:55,943 | INFO | fraud-poc training data.preprocess ALT00617 75563 | Partitioning on customer_id to ./data/partitioned/customer_id/... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2021-05-18T20:25:54.672544+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=
2021-05-18 20:25:57,151 | INFO | fraud-poc training data.preprocess ALT00617 75563 | Partitioning on email to ./data/partitioned/email... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2021-05-18T20:25:54.672544+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=


PreprocessingJob(source='./data/raw', partitioned={'customer_id': './data/partitioned/customer_id/', 'email': './data/partitioned/email'})

In [None]:
# Preview the first rows of the dataset written for the customer_id key.
dd.read_parquet(result.partitioned['customer_id']).head()

Unnamed: 0_level_0,order_id,order_date,email,ip_addr,order_amount
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00c00d42-6923-4297-87c4-107bc70035be,7beaa6ea-27ea-4082-a0ca-e216b93521a1,2021-05-07 22:49:55+00:00,af9ec8893d8c212276b9916b2599efe88510207d,e18142d2fcd6626e86d40573fd268c2f5f449a60,301.888156
048e9104-8369-407b-a0a1-fb06c06760c1,4069005c-02d0-4f41-b065-ad4b82412f20,2021-05-07 05:29:05+00:00,30650ad8775163a27cc73b642d3de1f46a337aaa,110fef07ad03b67a2f30fcebd6ab190bc2d11252,86.057737
09da86be-63c3-4265-a799-8fe5efaf270c,3e5bb9cc-bdef-4693-8870-49967d510ae3,2021-05-03 10:08:39+00:00,79cdeec4056ed0714127c4c255d9dafa334ee2a7,ee714f6b18d2b01e2d632f1299d8cb7618099070,945.67836
09da86be-63c3-4265-a799-8fe5efaf270c,9cc8c9aa-e342-4e7a-8269-4ebb7bc0d208,2021-05-15 04:11:54+00:00,79cdeec4056ed0714127c4c255d9dafa334ee2a7,ee714f6b18d2b01e2d632f1299d8cb7618099070,955.929156
0c5e0c2c-fb79-4a60-94c4-501bf2d07278,5b3900c1-e2ca-4226-81ee-0aa47d825005,2021-05-07 09:52:48+00:00,86665ef2c644c5f6a50e7702a7d5b0b88ee3c68b,8a799f9e07fa26a47e84fd8f64ea20ae5601a489,525.625661


In [None]:
# Preview the first rows of the dataset written for the email key.
dd.read_parquet(result.partitioned['email']).head()

Unnamed: 0_level_0,order_id,order_date,customer_id,ip_addr,order_amount
email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
06b27c6a29714eead08e96789b9ed37c6b33c67b,564ca53c-92ae-44e9-b550-857bc3f32299,2021-05-07 16:47:55+00:00,1161afda-4b7f-498b-b07c-a19c8ff25383,2061d686466fd7bdd0c4ff12c0c0431adfb4b584,296.66598
06b27c6a29714eead08e96789b9ed37c6b33c67b,78210c4e-1997-477f-92f7-c78da8f6e2b0,2021-05-12 00:37:48+00:00,1161afda-4b7f-498b-b07c-a19c8ff25383,e1bf51a3b72ef02c554f338b28ea67a61f2effa9,675.094589
06b27c6a29714eead08e96789b9ed37c6b33c67b,bab6d5be-14fe-4705-9765-ad5fc5a14b82,2021-05-15 01:52:11+00:00,1161afda-4b7f-498b-b07c-a19c8ff25383,e1bf51a3b72ef02c554f338b28ea67a61f2effa9,860.661434
06b27c6a29714eead08e96789b9ed37c6b33c67b,d12ec21f-c115-45c4-b5c9-904578faa733,2021-05-01 23:42:34+00:00,1161afda-4b7f-498b-b07c-a19c8ff25383,e1bf51a3b72ef02c554f338b28ea67a61f2effa9,944.537976
06b27c6a29714eead08e96789b9ed37c6b33c67b,c1adb330-64c2-40c1-931a-86ba8dde545d,2021-05-14 06:56:31+00:00,1161afda-4b7f-498b-b07c-a19c8ff25383,e1bf51a3b72ef02c554f338b28ea67a61f2effa9,135.732576
