In [None]:
#default_exp data.training_data

In [None]:
# Shell call to nbdev: regenerates the library modules from the project notebooks
# (the cell output lists every notebook that was converted).
!nbdev_build_lib

Converted 00_jobs.ipynb.
Converted 01-create-sample-data.ipynb.
Converted 02-preprocess.ipynb.
Converted 03-feature-calc.ipynb.
Converted 04-training-data.ipynb.
Converted 05-train-model.ipynb.
Converted 06-submit-training-pipeline.ipynb.
Converted 99-tools.ipynb.
Converted index.ipynb.


In [None]:
#export
from typing import Dict
from datetime import datetime, timezone, timedelta
import random
import math
import dask.dataframe as dd
import numpy as np

from hopeit.app.context import EventContext
from hopeit.app.events import Spawn, SHUFFLE
from hopeit.app.api import event_api
from hopeit.app.logger import app_logger

from fraud_poc.jobs import get_client, FeatureCalcJob, TrainingDataJob

In [None]:
#export
# Names of the functions in this module that make up the event; only `run` is listed.
# NOTE(review): presumably hopeit reads `__steps__` to discover and order event steps - confirm against hopeit docs.
__steps__ = ['run']

# Module-level logger obtained from hopeit's app_logger(); used by `run` for info/error messages.
logger = app_logger()

In [None]:
#export
def _merge_feature_datasets(datasets: Dict[str, str]):
    df = None
    for key, path in datasets.items():
        df_key = dd.read_parquet(path, engine='fastparquet')
        if df is not None:
            df = df.merge(df_key, left_on='order_id', right_on='order_id', suffixes=('', '_DROP'))
            keep_cols = [c for c in df.columns if c[-5:] != '_DROP']
            df = df[keep_cols]
        else:
            df = df_key
    return df


def _add_labels(df):
    df['is_fraud'] = (df['known_ip_addr_by_customer_id'] == 0) & (df['num_ip_addr_by_customer_id'] > 3)
    df['is_fraud'] = df['is_fraud'] | ((df['known_email_by_customer_id'] == 0) & (df['num_email_by_customer_id'] > 3))
    df['is_fraud'] = df['is_fraud'] | (df['order_amount'] > 2. * df['order_amount_mean_by_customer_id'])
    df['is_fraud'] = df['is_fraud'].apply(lambda x: int(x & (random.random() > 0.1)), meta=('is_fraud', int))
    return df

def _add_sample_flag(df, subsample_not_fraud: float):
    df['sample'] = df['is_fraud'].apply(lambda x: int((x > 0) | (random.random() > (1.-subsample_not_fraud))), meta=('sample', int))
    return df

def _add_validation_flag(df):
    now = datetime.now(tz=timezone.utc)
    now_epoch = now.timestamp()
    df['now'] = now
    df['elapsed_wgt'] = df['order_date'].apply(lambda x: math.log(max(0.001, 1. - (now_epoch - x.timestamp())/now_epoch)) + 1., meta=('elapsed_wgt', float))
    df['validation'] = df['elapsed_wgt'].apply(lambda x: int((max(0, x)  * random.random()) > 0.8), meta=('validation', int))
    return df

def _add_fold_number(df, num_folds):
    df['fold'] = df['is_fraud'].apply(lambda x: random.randint(0, num_folds), meta=('fold', int)) 
    return df

In [None]:
#export
def run(job: FeatureCalcJob, context: EventContext) -> TrainingDataJob:
    """
    Build the training datasets from the feature datasets referenced by `job`.

    Merges `job.features`, adds labels plus sample / validation / fold columns,
    then saves two parquet datasets under the configured training path:
    rows flagged for sampling (indexed by `fold`) and rows flagged for validation.

    :param job: FeatureCalcJob, its `features` mapping points to the feature datasets
    :param context: EventContext, provides `env` settings and is passed to the logger
    :return: TrainingDataJob with the saved locations; None if any step raised
        (the exception is logged as an error).
    """
    base_path = context.env['data']['training']
    training_cfg = context.env['training_data']
    num_folds = training_cfg['num_folds']
    subsample_not_fraud = training_cfg['subsample_not_fraud']

    client = get_client(context)
    try:
        dataset = _merge_feature_datasets(job.features)
        dataset = _add_labels(dataset)
        dataset = _add_sample_flag(dataset, subsample_not_fraud)
        dataset = _add_validation_flag(dataset)
        dataset = _add_fold_number(dataset, num_folds)

        sampled_save_path = f"{base_path}/sampled/"
        logger.info(context, f"Saving sampled training dataset to {sampled_save_path}...")
        sampled_rows = dataset[dataset['sample'] > 0]
        sampled_rows.set_index('fold').to_parquet(sampled_save_path)

        valid_save_path = f"{base_path}/validation/"
        logger.info(context, f"Saving weighted validation dataset to {valid_save_path}...")
        validation_rows = dataset[dataset['validation'] > 0]
        validation_rows.to_parquet(valid_save_path)

        return TrainingDataJob(
            sources=job.features,
            sampled=sampled_save_path,
            validation=valid_save_path,
        )
    except Exception as err:
        # Failures are logged and reported to the caller as a None result
        logger.error(context, err)
        return None
    finally:
        # Always release the client, on success and on failure
        client.close()

### Test from notebook

In [None]:
from hopeit.testing.apps import config, execute_event

# Load the app configuration used for this in-notebook test run.
app_config = config('config/training-pipeline.json')
# Input payload: locations of the partitioned sources and of the computed features, one entry per key.
job = FeatureCalcJob(sources={'customer_id': './data/partitioned/customer_id/', 'email': './data/partitioned/email'}, 
                     features={'customer_id': './data/features/customer_id/', 'email': './data/features/email/'})
# Top-level `await` (notebook cell): executes the 'data.training-data' event with the payload above.
result = await execute_event(app_config, 'data.training-data', job)
# Last expression of the cell: displays the returned object.
result

2020-07-07 08:58:13,322 | INFO | fraud-poc 0.0.1 data.training-data leo-legion 17007 | Saving sampled training dataset to ./data/training/sampled/... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-07T08:58:12.092046+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=
2020-07-07 08:58:15,172 | INFO | fraud-poc 0.0.1 data.training-data leo-legion 17007 | Saving weighted validation dataset to ./data/training/validation/... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-07T08:58:12.092046+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=


TrainingDataJob(sources={'customer_id': './data/features/customer_id/', 'email': './data/features/email/'}, sampled='./data/training/sampled/', validation='./data/training/validation/')

In [None]:
# Summary statistics of the saved sampled dataset.
# `fold` is not listed among the columns: the sampled data is saved with `fold` set as index.
dd.read_parquet(result.sampled).describe().compute()

Unnamed: 0,order_amount,num_email_by_customer_id,num_ip_addr_by_customer_id,same_email_by_customer_id,same_ip_addr_by_customer_id,known_email_by_customer_id,known_ip_addr_by_customer_id,order_amount_mean_by_customer_id,order_amount_std_by_customer_id,order_amount_min_by_customer_id,...,known_customer_id_by_email,order_amount_mean_by_email,order_amount_std_by_email,order_amount_min_by_email,order_amount_max_by_email,order_amount_sum_by_email,is_fraud,sample,elapsed_wgt,validation
count,1024.0,1024.0,1024.0,1024.0,1024.0,1024.0,1024.0,1024.0,1024.0,1024.0,...,1024.0,1024.0,1024.0,1024.0,1024.0,1024.0,1024.0,1024.0,1024.0,1024.0
mean,508.332956,1.55957,1.794922,0.425781,0.34668,0.573242,0.488281,497.043536,156.724514,302.735927,...,0.573242,495.378141,110.539975,369.861545,620.044341,1009.469685,0.068359,1.0,0.999472,0.201172
std,289.402377,0.749302,0.998011,0.494703,0.476145,0.494848,0.500107,203.712029,128.243913,259.466287,...,0.494848,232.971164,124.532367,271.993116,272.546974,736.884342,0.252485,0.0,0.000313,0.401072
min,3.37126,1.0,1.0,0.0,0.0,0.0,0.0,4.060345,0.0,0.593414,...,0.0,3.37126,0.0,3.37126,3.37126,3.37126,0.0,1.0,0.998915,0.0
25%,280.488471,1.0,1.0,0.0,0.0,0.0,0.0,392.586343,9.111429,88.439412,...,0.0,331.945793,0.0,151.294213,462.043733,538.812515,0.0,1.0,0.999231,0.0
50%,526.388393,1.0,2.0,0.0,0.0,1.0,0.0,497.098576,187.180525,226.475192,...,1.0,512.604606,71.157398,334.067447,693.946036,895.660164,0.0,1.0,0.999506,0.0
75%,782.820371,2.0,2.0,1.0,1.0,1.0,1.0,633.410801,270.046273,476.907941,...,1.0,683.35149,221.508438,585.404707,868.905491,1415.090439,0.0,1.0,0.999756,0.0
max,998.519851,4.0,6.0,1.0,1.0,1.0,1.0,994.185613,453.44959,994.185613,...,1.0,995.623624,453.44959,995.623624,998.519851,4150.689094,1.0,1.0,0.999998,1.0


In [None]:
# Summary statistics of the saved validation dataset (here `fold` is a regular column).
dd.read_parquet(result.validation).describe().compute()

Unnamed: 0,order_amount,num_email_by_customer_id,num_ip_addr_by_customer_id,same_email_by_customer_id,same_ip_addr_by_customer_id,known_email_by_customer_id,known_ip_addr_by_customer_id,order_amount_mean_by_customer_id,order_amount_std_by_customer_id,order_amount_min_by_customer_id,...,order_amount_mean_by_email,order_amount_std_by_email,order_amount_min_by_email,order_amount_max_by_email,order_amount_sum_by_email,is_fraud,sample,elapsed_wgt,validation,fold
count,419.0,419.0,419.0,419.0,419.0,419.0,419.0,419.0,419.0,419.0,...,419.0,419.0,419.0,419.0,419.0,419.0,419.0,419.0,419.0,419.0
mean,504.377774,1.501193,1.713604,0.422434,0.338902,0.579952,0.48926,496.210284,148.037534,316.967046,...,497.68756,109.235244,371.711486,620.821339,1041.556649,0.033413,0.515513,0.999462,1.0,4.880668
std,285.825313,0.696453,0.882624,0.494537,0.473902,0.494156,0.500482,212.708419,125.814889,259.408736,...,232.302292,119.655983,266.004112,271.991267,767.503092,0.179927,0.500357,0.000317,0.0,3.146363
min,5.630585,1.0,1.0,0.0,0.0,0.0,0.0,7.141438,0.0,0.593414,...,7.141438,0.0,5.630585,7.141438,7.141438,0.0,0.0,0.998915,1.0,0.0
25%,291.658136,1.0,1.0,0.0,0.0,0.0,0.0,409.251553,0.0,113.357415,...,372.395179,0.0,160.243644,481.99105,546.632623,0.0,0.0,0.999197,1.0,3.0
50%,549.622148,1.0,1.0,0.0,0.0,1.0,0.0,529.178049,167.057018,288.360878,...,528.037552,55.12823,357.099032,700.383423,891.140691,0.0,1.0,0.999498,1.0,5.0
75%,775.498693,2.0,2.0,1.0,1.0,1.0,1.0,652.922003,252.351156,552.254215,...,696.982149,215.18185,600.386062,869.144709,1444.1364,0.0,1.0,0.999745,1.0,8.0
max,997.628389,4.0,6.0,1.0,1.0,1.0,1.0,990.551144,422.262965,990.551144,...,994.336621,440.261912,994.336621,997.661044,3840.306279,1.0,1.0,0.999994,1.0,10.0
