In [None]:
#default_exp data.preprocess

In [None]:
# Run nbdev's library build; the output below lists each notebook it converted.
!nbdev_build_lib

Converted 00_jobs.ipynb.
Converted 01-create-sample-data.ipynb.
Converted 02-preprocess.ipynb.
Converted 03-feature-calc.ipynb.
Converted 04-label-folds.ipynb.
Converted 05-training.ipynb.
Converted index.ipynb.


In [None]:
#export
import dask.dataframe as dd

from hopeit.app.context import EventContext
from hopeit.app.events import Spawn, SHUFFLE
from hopeit.app.api import event_api
from hopeit.app.logger import app_logger

from fraud_poc.jobs import get_client, MakeSampleDataJob, PreprocessingJob

In [None]:
#export
# Names of the step functions this event module exposes; only `run` is defined here.
# NOTE(review): presumably consumed by the hopeit engine to decide what to execute — confirm.
__steps__ = ['run']

# Module-level logger used by `run` for progress and error messages.
logger = app_logger()

In [None]:
#export
def run(job: MakeSampleDataJob, context: EventContext) -> PreprocessingJob:
    """
    Re-partition the raw orders dataset once per partition key.

    Reads the parquet dataset located at `job.path` (selected order columns
    only) and, for each of `customer_id` and `email`, writes a copy indexed by
    that column under the folder configured at `context.env['data']['partitioned']`.

    Returns a `PreprocessingJob` carrying the source path and the mapping of
    partition key to output path. If reading or writing fails, the exception
    is logged and `None` is returned instead. The client obtained from
    `get_client` is closed in every case.
    """
    selected_columns = [
        'order_id', 'order_date', 'customer_id', 'email', 'ip_addr', 'order_amount'
    ]
    output_root = context.env['data']['partitioned']
    session = get_client(context)
    try:
        orders = dd.read_parquet(job.path, engine='fastparquet', columns=selected_columns)
        # Partition key -> destination folder (paths kept exactly as consumers receive them).
        targets = {
            'customer_id': f'{output_root}/customer_id/',
            'email': f'{output_root}/email'
        }
        for index_column, target_path in targets.items():
            logger.info(context, f"Partitioning on {index_column} to {target_path}...")
            indexed = orders.set_index(index_column)
            indexed.to_parquet(target_path)
        return PreprocessingJob(source=job.path, partitioned=targets)
    except Exception as exc:
        # Failures are reported through the log; the caller receives None.
        logger.error(context, exc)
        return None
    finally:
        session.close()

### Test from notebook

In [None]:
from hopeit.testing.apps import config, execute_event

app_config = config('config/training-pipeline.json')
job = MakeSampleDataJob(path='./data/raw', num_batches=2, batch_size=1000, batch_span_days=10, 
                        num_customers=1000, num_emails=1000, num_ips=1000)
result = await execute_event(app_config, 'data.preprocess', job)
result

2020-07-06 17:32:26,913 | INFO | fraud-poc 0.0.1 data.preprocess leo-legion 23174 | Partitioning on customer_id to ./data/partitioned/customer_id/... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-06T17:32:25.996107+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=
2020-07-06 17:32:28,496 | INFO | fraud-poc 0.0.1 data.preprocess leo-legion 23174 | Partitioning on email to ./data/partitioned/email... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-06T17:32:25.996107+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=


PreprocessingJob(source='./data/raw', partitioned={'customer_id': './data/partitioned/customer_id/', 'email': './data/partitioned/email'})

In [None]:
# Read back the dataset written for the `customer_id` key and preview its first rows.
customer_partition = dd.read_parquet(result.partitioned['customer_id'])
customer_partition.head()

Unnamed: 0_level_0,order_id,order_date,email,ip_addr,order_amount
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
002b6bff-0bec-4b19-82be-d428f41cc30c,b5e1cf50-1996-44f2-8d82-fe18fbe61752,2020-06-28 19:11:16+00:00,d9c3af743102923e41d1ad4281ddd96742f6e50a,180fb8a54dce9e83394400969edb0dc6a7b13088,68.270268
002b6bff-0bec-4b19-82be-d428f41cc30c,7740c611-2504-4374-9a40-0d80c7d6a723,2020-06-11 17:01:42+00:00,d9c3af743102923e41d1ad4281ddd96742f6e50a,180fb8a54dce9e83394400969edb0dc6a7b13088,337.110821
0047b81f-8a96-473a-b0f2-8c8c10f304eb,6e4b3e31-288f-472e-94b9-03678e60c3a3,2020-06-24 10:32:22+00:00,58d77079c714b7796cac1d3df211cb3147525bfb,6d7c0b798489375381e3838b4fbad6f933c348ac,601.101297
004c9254-4e65-47f6-88e4-d262dce434a9,a7ad0661-f86d-4388-8b5f-5c3051c39c07,2020-06-05 02:43:32+00:00,09da81d40cb2939e7a48331825837d039d2b0784,02bf0acfcbc290d72b7765fb1de00fa796ca0d88,539.827743
0055a45d-ad90-48f3-a461-ac4d6716ab48,7f37211b-d1e4-4e43-acf9-1a17abbd3072,2020-06-16 14:17:27+00:00,f599018e431a101f6ed8ef365add95a1bcce726d,eecda0b16e78628a4e5368c437a634b5ef9d6f6e,124.542277


In [None]:
# Read back the dataset written for the `email` key and preview its first rows.
email_partition = dd.read_parquet(result.partitioned['email'])
email_partition.head()

Unnamed: 0_level_0,order_id,order_date,customer_id,ip_addr,order_amount
email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00563c174e1904e1333f6140b4f5af62e9af31f4,de347ad8-881d-4db2-8f77-8cd7c3bc8409,2020-05-28 12:51:48+00:00,a77c93a2-2b07-4f36-93f4-2cdb5de40fac,5e523d96f0b3a52caeb612543bab45f409c98396,172.943654
005fa0d91e609fede563ab1386609dbab7250655,64f7fbab-074a-4b43-9b1c-a6c7e6b615bf,2020-06-10 18:11:59+00:00,097fdf7b-41e4-4fec-b52c-8ba88be8f4c1,7080f1f31d238381c88a105258b7e891728a5c97,90.264341
005fa0d91e609fede563ab1386609dbab7250655,c3ca9003-cb66-43fc-95a3-1e0ac62b0daa,2020-06-05 08:50:44+00:00,097fdf7b-41e4-4fec-b52c-8ba88be8f4c1,7080f1f31d238381c88a105258b7e891728a5c97,243.128592
00715c8f81403a0dae2329f7217ae5c9506a1691,cdb2c545-d430-4b52-be94-c6d8cbb6a1f1,2020-05-29 07:34:49+00:00,cea238d3-d72f-4f86-b446-14164943c395,5dc2e15a296b2910a219f2ad6c6a0b996575002e,939.699533
00ba3fdeba3e968aeb68310a546098d0efe4bb3f,0cb052b6-414e-4e19-80b9-e44458df7e5a,2020-06-08 03:58:36+00:00,efd65b15-a16b-4ffe-a8a0-67c205c778b2,a14b9cfb63a6ba4a570f7088f350340096605de0,386.084229
