In [None]:
#default_exp data.training_data

In [None]:
# Shell escape: runs the nbdev_build_lib command; the cell output lists each notebook it converted.
!nbdev_build_lib

Converted 00_jobs.ipynb.
Converted 01-create-sample-data.ipynb.
Converted 02-preprocess.ipynb.
Converted 03-feature-calc.ipynb.
Converted 04-training-data.ipynb.
Converted 05-train-model.ipynb.
Converted 06-submit-training-pipeline.ipynb.
Converted 07-prepare-db.ipynb.
Converted 99-tools.ipynb.
Converted index.ipynb.


In [None]:
#export
from typing import Dict
from datetime import datetime, timezone, timedelta
import random
import math
import dask.dataframe as dd
import numpy as np

from hopeit.app.context import EventContext
from hopeit.app.events import Spawn, SHUFFLE
from hopeit.app.api import event_api
from hopeit.app.logger import app_logger

from fraud_poc.jobs import get_client, FeatureCalcJob, TrainingDataJob

In [None]:
#export
# Step names for this event module; the single entry names the `run` function defined below.
# NOTE(review): presumably read by hopeit to discover the steps to execute -- confirm.
__steps__ = ['run']

# Logger obtained from hopeit's app_logger(); `run` uses it for its info/error messages.
logger = app_logger()

In [None]:
#export
def _merge_feature_datasets(datasets: Dict[str, str]):
    df = None
    for key, path in datasets.items():
        df_key = dd.read_parquet(path, engine='fastparquet')
        if df is not None:
            df = df.merge(df_key, left_on='order_id', right_on='order_id', suffixes=('', '_DROP'))
            keep_cols = [c for c in df.columns if c[-5:] != '_DROP']
            df = df[keep_cols]
        else:
            df = df_key
    return df


def _add_labels(df):
    df['is_fraud'] = (df['known_ip_addr_by_customer_id'] == 0) & (df['num_ip_addr_by_customer_id'] > 3)
    df['is_fraud'] = df['is_fraud'] | ((df['known_email_by_customer_id'] == 0) & (df['num_email_by_customer_id'] > 3))
    df['is_fraud'] = df['is_fraud'] | (df['order_amount'] > (1. + 0.5 * random.random() * df['order_amount_mean_by_customer_id']))
    df['is_fraud'] = df['is_fraud'].apply(lambda x: int(x & (random.random() > 0.1)), meta=('is_fraud', int))
    return df

def _add_sample_flag(df, subsample_not_fraud: float):
    df['sample'] = df['is_fraud'].apply(lambda x: int((x > 0) | (random.random() > (1.-subsample_not_fraud))), meta=('sample', int))
    return df

def _add_validation_flag(df):
    now = datetime.now(tz=timezone.utc)
    now_epoch = now.timestamp()
    df['now'] = now
    df['elapsed_wgt'] = df['order_date'].apply(lambda x: math.log(max(0.001, 1. - (now_epoch - x.timestamp())/now_epoch)) + 1., meta=('elapsed_wgt', float))
    df['validation'] = df['elapsed_wgt'].apply(lambda x: int((max(0, x)  * random.random()) > 0.8), meta=('validation', int))
    return df

def _add_fold_number(df, num_folds):
    df['fold'] = df['is_fraud'].apply(lambda x: random.randint(0, num_folds), meta=('fold', int)) 
    return df

In [None]:
#export
def run(job: FeatureCalcJob, context: EventContext) -> TrainingDataJob:
    """
    Build the training datasets from the feature datasets listed in the job.

    Merges the datasets in `job.features`, adds the `is_fraud`, `sample`,
    validation and `fold` columns, then writes two parquet datasets under the
    path configured in `context.env['data']['training']`:
      * `<base>/sampled/`: rows with `sample` > 0, indexed by `fold`
      * `<base>/validation/`: rows with `validation` > 0

    NOTE(review): the two outputs are written by separate `to_parquet` calls
    on the same lazy frame; if the graph is recomputed for each write, the
    random-based columns may differ between the two datasets -- confirm.

    :param job: job whose `features` maps dataset keys to parquet paths
    :param context: event context providing `env` settings; also passed to
        `get_client` and to the logger
    :return: a TrainingDataJob describing the written datasets, or None when
        any exception was raised (the exception is logged)
    """
    training_root = context.env['data']['training']
    settings = context.env['training_data']
    num_folds = settings['num_folds']
    subsample_not_fraud = settings['subsample_not_fraud']

    client = get_client(context)
    try:
        dataset = _merge_feature_datasets(job.features)
        # Enrichment steps, applied in this exact order.
        for enrich in (
            _add_labels,
            lambda frame: _add_sample_flag(frame, subsample_not_fraud),
            _add_validation_flag,
            lambda frame: _add_fold_number(frame, num_folds),
        ):
            dataset = enrich(dataset)

        sampled_path = f"{training_root}/sampled/"
        logger.info(context, f"Saving sampled training dataset to {sampled_path}...")
        dataset[dataset['sample'] > 0].set_index('fold').to_parquet(sampled_path)

        validation_path = f"{training_root}/validation/"
        logger.info(context, f"Saving weighted validation dataset to {validation_path}...")
        dataset[dataset['validation'] > 0].to_parquet(validation_path)

        return TrainingDataJob(
            sources=job.features,
            sampled=sampled_path,
            validation=validation_path,
        )
    except Exception as err:
        # Failures are logged and reported to the caller as None.
        logger.error(context, err)
        return None
    finally:
        # Close the client whether or not the run succeeded.
        client.close()

### Test from notebook

In [None]:
from hopeit.testing.apps import config, execute_event

# Load the app configuration used to execute the event from this notebook.
app_config = config('config/training-pipeline.json')
# Input payload: `features` holds the per-key feature dataset paths that `run` merges.
job = FeatureCalcJob(sources={'customer_id': './data/partitioned/customer_id/', 'email': './data/partitioned/email'}, 
                     features={'customer_id': './data/features/customer_id/', 'email': './data/features/email/'})
# Execute the 'data.training-data' event with the payload and display what it returns.
result = await execute_event(app_config, 'data.training-data', job)
result

2020-07-07 22:36:58,442 | INFO | fraud-poc 0.0.1-training data.training-data leo-legion 18704 | Saving sampled training dataset to ./data/training/sampled/... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-07T22:36:57.211612+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=
2020-07-07 22:37:43,979 | INFO | fraud-poc 0.0.1-training data.training-data leo-legion 18704 | Saving weighted validation dataset to ./data/training/validation/... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-07T22:36:57.211612+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=


TrainingDataJob(sources={'customer_id': './data/features/customer_id/', 'email': './data/features/email/'}, sampled='./data/training/sampled/', validation='./data/training/validation/')

In [None]:
# Read back the sampled dataset written by the event and show its summary statistics.
dd.read_parquet(result.sampled).describe().compute()

Task was destroyed but it is pending!
task: <Task pending name='Task-625' coro=<HTTP1ServerConnection._server_request_loop() running at /opt/dev/anaconda3/envs/dask/lib/python3.8/site-packages/tornado/http1connection.py:817> wait_for=<Future pending cb=[<TaskWakeupMethWrapper object at 0x7fb0c8c25af0>()]> cb=[IOLoop.add_future.<locals>.<lambda>() at /opt/dev/anaconda3/envs/dask/lib/python3.8/site-packages/tornado/ioloop.py:690]>
Task was destroyed but it is pending!
task: <Task pending name='Task-630' coro=<HTTP1ServerConnection._server_request_loop() running at /opt/dev/anaconda3/envs/dask/lib/python3.8/site-packages/tornado/http1connection.py:817> wait_for=<Future pending cb=[<TaskWakeupMethWrapper object at 0x7fb0a3fa0a60>()]> cb=[IOLoop.add_future.<locals>.<lambda>() at /opt/dev/anaconda3/envs/dask/lib/python3.8/site-packages/tornado/ioloop.py:690]>
Task was destroyed but it is pending!
task: <Task pending name='Task-631' coro=<HTTP1ServerConnection._server_request_loop() running a

Unnamed: 0,order_amount,num_email_by_customer_id,num_ip_addr_by_customer_id,same_email_by_customer_id,same_ip_addr_by_customer_id,known_email_by_customer_id,known_ip_addr_by_customer_id,order_amount_mean_by_customer_id,order_amount_std_by_customer_id,order_amount_min_by_customer_id,...,known_customer_id_by_email,order_amount_mean_by_email,order_amount_std_by_email,order_amount_min_by_email,order_amount_max_by_email,order_amount_sum_by_email,is_fraud,sample,elapsed_wgt,validation
count,1072089.0,1072089.0,1072089.0,1072089.0,1072089.0,1072089.0,1072089.0,1072089.0,1072089.0,1072089.0,...,1072089.0,1072089.0,1072089.0,1072089.0,1072089.0,1072089.0,1072089.0,1072089.0,1072089.0,1072089.0
mean,526.708,2.160165,3.119257,0.5705758,0.4602976,0.970759,0.8614052,501.2814,265.6489,100.1119,...,0.9895568,501.8918,263.4956,105.2752,897.9406,4789.087,0.8804055,1.0,0.9901491,0.1927834
std,275.9189,1.01354,1.36977,0.4949942,0.4984215,0.1684815,0.3455234,96.65022,52.15631,99.80408,...,0.1016568,99.83118,56.5171,109.634,107.4631,1212.657,0.324487,0.0,0.005692438,0.3944846
min,0.001560899,1.0,1.0,0.0,0.0,0.0,0.0,1.001237,0.0,0.0002271392,...,0.0,0.03408705,0.0,0.0002271392,0.03408705,0.03408705,0.0,1.0,0.9802699,0.0
25%,295.9992,1.0,2.0,0.0,0.0,1.0,1.0,438.42,237.5031,30.50219,...,1.0,438.3146,236.2462,31.25854,861.8774,4216.201,1.0,1.0,0.9852614,0.0
50%,531.9706,2.0,3.0,1.0,0.0,1.0,1.0,502.1227,270.1232,72.07299,...,1.0,502.8616,269.5697,74.2512,929.6629,4922.503,1.0,1.0,0.9901895,0.0
75%,766.5151,3.0,4.0,1.0,1.0,1.0,1.0,566.1961,300.0172,140.7146,...,1.0,567.5166,299.8108,145.7638,970.3124,5584.035,1.0,1.0,0.9951112,0.0
max,999.9995,7.0,10.0,1.0,1.0,1.0,1.0,999.9918,498.2714,999.9918,...,1.0,999.9918,498.2714,999.9918,999.9995,8915.618,1.0,1.0,0.9999744,1.0


In [None]:
# Read back the validation dataset written by the event and show its summary statistics.
dd.read_parquet(result.validation).describe().compute()

Unnamed: 0,order_amount,num_email_by_customer_id,num_ip_addr_by_customer_id,same_email_by_customer_id,same_ip_addr_by_customer_id,known_email_by_customer_id,known_ip_addr_by_customer_id,order_amount_mean_by_customer_id,order_amount_std_by_customer_id,order_amount_min_by_customer_id,...,order_amount_mean_by_email,order_amount_std_by_email,order_amount_min_by_email,order_amount_max_by_email,order_amount_sum_by_email,is_fraud,sample,elapsed_wgt,validation,fold
count,229899.0,229899.0,229899.0,229899.0,229899.0,229899.0,229899.0,229899.0,229899.0,229899.0,...,229899.0,229899.0,229899.0,229899.0,229899.0,229899.0,229899.0,229899.0,229899.0,229899.0
mean,499.575716,2.157073,3.111566,0.572934,0.464047,0.972679,0.867242,499.597151,267.026925,97.229817,...,499.416451,264.900245,101.84635,897.7651,4777.521732,0.785497,0.892153,0.990291,1.0,5.000183
std,288.783726,1.012125,1.365872,0.494653,0.498707,0.163017,0.339314,96.555412,51.819866,97.441246,...,99.781962,55.895181,106.629882,107.388765,1202.93156,0.410478,0.310188,0.005683,0.0,3.162667
min,0.004208,1.0,1.0,0.0,0.0,0.0,0.0,0.457344,0.0,0.000227,...,0.29538,0.0,0.000227,0.29538,0.29538,0.0,0.0,0.98027,1.0,0.0
25%,254.207715,1.0,2.0,0.0,0.0,1.0,1.0,437.617167,239.067854,29.753422,...,435.394575,238.28583,30.483997,862.305111,4199.80649,1.0,1.0,0.98552,1.0,2.0
50%,505.215247,2.0,3.0,1.0,0.0,1.0,1.0,501.715618,271.648605,70.648778,...,500.704623,271.126161,71.581809,930.681119,4909.946848,1.0,1.0,0.990526,1.0,5.0
75%,755.731631,3.0,4.0,1.0,1.0,1.0,1.0,565.428195,301.65286,137.977865,...,565.241917,301.407646,141.064595,970.713671,5579.990289,1.0,1.0,0.995357,1.0,8.0
max,999.996485,7.0,10.0,1.0,1.0,1.0,1.0,997.396879,498.27144,997.396879,...,997.812768,498.27144,997.812768,999.999489,8880.975814,1.0,1.0,0.999974,1.0,10.0
