In [None]:
#default_exp data.training_data

### Training Data Preparation
> This module reads the features computed on customer_id and email, merges both datasets, obtains the `is_fraud` label for orders, and creates two datasets that are suitable for training the model:

>1) Sampled: a dataset that contains all transactions marked as fraud, plus a subsample of the non-fraud orders, in order to obtain a less imbalanced dataset and train faster.

>2) Validation: a dataset containing only the most recent orders, including both fraud and non-fraud transactions, to be used as a final validation for the trained model.

In [None]:
#export
from typing import Dict
from datetime import datetime, timezone, timedelta
import random
import math
import dask.dataframe as dd
import numpy as np

from hopeit.app.context import EventContext
from hopeit.app.events import Spawn, SHUFFLE
from hopeit.app.api import event_api
from hopeit.app.logger import app_logger

from fraud_poc.jobs import get_client, FeatureCalcJob, TrainingDataJob

In [None]:
#export
# Lists the step functions of this module by name; only `run` is declared.
# NOTE(review): presumably read by the hopeit engine to build the event's step pipeline — confirm.
__steps__ = ['run']

# Module-level logger created via hopeit's `app_logger()`; `run` uses it for info and error messages.
logger = app_logger()

In [None]:
#export
def _merge_feature_datasets(datasets: Dict[str, str]):
    df = None
    for key, path in datasets.items():
        df_key = dd.read_parquet(path, engine='fastparquet')
        if df is not None:
            df = df.merge(df_key, left_on='order_id', right_on='order_id', suffixes=('', '_DROP'))
            keep_cols = [c for c in df.columns if c[-5:] != '_DROP']
            df = df[keep_cols]
        else:
            df = df_key
    return df


def _add_labels(df):
    df['is_fraud'] = (df['known_ip_addr_by_customer_id'] == 0) & (df['num_ip_addr_by_customer_id'] > 3)
    df['is_fraud'] = df['is_fraud'] | ((df['known_email_by_customer_id'] == 0) & (df['num_email_by_customer_id'] > 3))
    df['is_fraud'] = df['is_fraud'] | (df['order_amount'] > (1. + 0.5 * random.random() * df['order_amount_mean_by_customer_id']))
    df['is_fraud'] = df['is_fraud'].apply(lambda x: int(x & (random.random() > 0.1)), meta=('is_fraud', int))
    return df

def _add_sample_flag(df, subsample_not_fraud: float):
    df['sample'] = df['is_fraud'].apply(lambda x: int((x > 0) | (random.random() > (1.-subsample_not_fraud))), meta=('sample', int))
    return df

def _add_validation_flag(df):
    now = datetime.now(tz=timezone.utc)
    now_epoch = now.timestamp()
    df['now'] = now
    df['elapsed_wgt'] = df['order_date'].apply(lambda x: math.log(max(0.001, 1. - (now_epoch - x.timestamp())/now_epoch)) + 1., meta=('elapsed_wgt', float))
    df['validation'] = df['elapsed_wgt'].apply(lambda x: int((max(0, x)  * random.random()) > 0.8), meta=('validation', int))
    return df

def _add_fold_number(df, num_folds):
    df['fold'] = df['is_fraud'].apply(lambda x: random.randint(0, num_folds), meta=('fold', int)) 
    return df

In [None]:
#export
def run(job: FeatureCalcJob, context: EventContext) -> TrainingDataJob:
    """
    Build the sampled and validation training datasets from the computed features.

    Merges the feature datasets listed in `job.features`, adds the `is_fraud` label and
    the `sample`, `validation` and `fold` columns, then writes two parquet datasets
    under the configured training path:

    * `<base>/sampled/`: rows with `sample > 0`, indexed by `fold`
    * `<base>/validation/`: rows with `validation > 0`

    NOTE(review): rows flagged for validation are not removed from the sampled set
    here; presumably the training step filters on the `validation` column — confirm.

    :param job: FeatureCalcJob whose `features` maps dataset keys to feature paths
    :param context: EventContext; `env['data']['training']` and
        `env['training_data']` (`num_folds`, `subsample_not_fraud`) are read from it
    :return: TrainingDataJob with the source and output paths, or None when any
        exception is raised while processing (the exception is logged)
    """
    base_path = context.env['data']['training']
    settings = context.env['training_data']
    num_folds = settings['num_folds']
    subsample_not_fraud = settings['subsample_not_fraud']

    client = get_client(context)
    try:
        dataset = _merge_feature_datasets(job.features)
        dataset = _add_labels(dataset)
        dataset = _add_sample_flag(dataset, subsample_not_fraud)
        dataset = _add_validation_flag(dataset)
        dataset = _add_fold_number(dataset, num_folds)

        # Sampled dataset: indexed by fold before it is written
        sampled_save_path = f"{base_path}/sampled/"
        logger.info(context, f"Saving sampled training dataset to {sampled_save_path}...")
        in_sample = dataset['sample'] > 0
        dataset[in_sample].set_index('fold').to_parquet(sampled_save_path)

        # Validation dataset: written with its original index
        valid_save_path = f"{base_path}/validation/"
        logger.info(context, f"Saving weighted validation dataset to {valid_save_path}...")
        in_validation = dataset['validation'] > 0
        dataset[in_validation].to_parquet(valid_save_path)

        return TrainingDataJob(
            sources=job.features,
            sampled=sampled_save_path,
            validation=valid_save_path
        )
    except Exception as exc:
        # Failures are logged and reported to the caller as a None result
        logger.error(context, exc)
        return None
    finally:
        # Always close the client obtained from get_client, on success and on failure
        client.close()

### Test from notebook

In [None]:
# Notebook-only check (this cell has no `#export` marker): runs the event once
# through hopeit's testing helpers using the training pipeline configuration.
from hopeit.testing.apps import config, execute_event

app_config = config('config/training-pipeline.json')
# Input job pointing at the partitioned source data and the previously computed features
job = FeatureCalcJob(sources={'customer_id': './data/partitioned/customer_id/', 'email': './data/partitioned/email'}, 
                     features={'customer_id': './data/features/customer_id/', 'email': './data/features/email/'})
# Top-level await: relies on the notebook's running event loop
result = await execute_event(app_config, 'data.training-data', job)
# Display the returned TrainingDataJob
result

2021-05-18 20:28:12,444 | INFO | fraud-poc training data.training-data ALT00617 75610 | Saving sampled training dataset to ./data/training/sampled/... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2021-05-18T20:28:11.484283+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=
2021-05-18 20:28:14,091 | INFO | fraud-poc training data.training-data ALT00617 75610 | Saving weighted validation dataset to ./data/training/validation/... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2021-05-18T20:28:11.484283+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=


TrainingDataJob(sources={'customer_id': './data/features/customer_id/', 'email': './data/features/email/'}, sampled='./data/training/sampled/', validation='./data/training/validation/')

In [None]:
# Summary statistics of the sampled dataset written by the event above
dd.read_parquet(result.sampled).describe().compute()

Unnamed: 0,order_amount,num_email_by_customer_id,num_ip_addr_by_customer_id,same_email_by_customer_id,same_ip_addr_by_customer_id,known_email_by_customer_id,known_ip_addr_by_customer_id,order_amount_mean_by_customer_id,order_amount_std_by_customer_id,order_amount_min_by_customer_id,...,known_customer_id_by_email,order_amount_mean_by_email,order_amount_std_by_email,order_amount_min_by_email,order_amount_max_by_email,order_amount_sum_by_email,is_fraud,sample,elapsed_wgt,validation
count,188.0,188.0,188.0,188.0,188.0,188.0,188.0,188.0,188.0,188.0,...,188.0,188.0,188.0,188.0,188.0,188.0,188.0,188.0,188.0,188.0
mean,524.606828,1.755319,2.138298,0.361702,0.25,0.569149,0.446809,511.480117,159.316375,319.896259,...,0.569149,524.794925,113.340613,396.771221,657.255252,1047.595948,0.941489,1.0,0.99947,0.234043
std,281.567721,0.955549,1.193575,0.481776,0.434169,0.496518,0.49849,207.981474,123.908393,245.276464,...,0.496518,231.650609,123.329541,276.938429,272.226871,692.482507,0.235333,0.0,0.000314,0.424529
min,0.402291,1.0,1.0,0.0,0.0,0.0,0.0,0.402291,0.0,0.402291,...,0.0,0.402291,0.0,0.402291,0.402291,0.402291,0.0,1.0,0.998941,0.0
25%,346.147602,1.0,1.0,0.0,0.0,0.0,0.0,398.643346,36.08789,134.578503,...,0.0,353.471786,0.0,167.206097,438.680258,520.862094,1.0,1.0,0.999246,0.0
50%,491.724897,2.0,2.0,0.0,0.0,1.0,0.0,522.281127,190.593451,281.71717,...,1.0,511.262158,136.818371,350.677562,728.436786,938.82395,1.0,1.0,0.999501,0.0
75%,793.608032,2.0,3.0,1.0,1.0,1.0,1.0,653.541265,281.174204,462.674445,...,1.0,701.53079,249.284494,624.516786,925.82998,1454.151908,1.0,1.0,0.999753,1.0
max,997.119448,5.0,6.0,1.0,1.0,1.0,1.0,989.953833,413.698024,989.953833,...,1.0,995.144205,413.698024,995.144205,999.276104,3007.250422,1.0,1.0,0.999994,1.0


In [None]:
# Summary statistics of the validation dataset written by the event above
dd.read_parquet(result.validation).describe().compute()

Unnamed: 0,order_amount,num_email_by_customer_id,num_ip_addr_by_customer_id,same_email_by_customer_id,same_ip_addr_by_customer_id,known_email_by_customer_id,known_ip_addr_by_customer_id,order_amount_mean_by_customer_id,order_amount_std_by_customer_id,order_amount_min_by_customer_id,...,order_amount_mean_by_email,order_amount_std_by_email,order_amount_min_by_email,order_amount_max_by_email,order_amount_sum_by_email,is_fraud,sample,elapsed_wgt,validation,fold
count,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,...,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0
mean,531.34059,1.690476,2.02381,0.357143,0.238095,0.547619,0.357143,515.845594,185.447372,311.285556,...,560.438991,124.731174,430.621161,702.37035,997.584484,0.880952,0.97619,0.999446,1.0,4.785714
std,264.341622,0.840676,0.99971,0.484966,0.431081,0.503761,0.484966,171.878141,129.856514,215.511969,...,217.59957,139.248552,264.615976,263.05034,558.30563,0.32777,0.154303,0.000333,0.0,2.841639
min,80.334187,1.0,1.0,0.0,0.0,0.0,0.0,80.334187,0.0,57.086638,...,80.334187,0.0,80.334187,80.334187,80.334187,0.0,0.0,0.998948,1.0,0.0
25%,420.356323,1.0,1.0,0.0,0.0,0.0,0.0,466.393298,102.186373,164.20045,...,433.396839,0.0,246.722547,462.674445,570.021621,1.0,1.0,0.999238,1.0,3.5
50%,607.866511,2.0,2.0,0.0,0.0,1.0,0.0,581.472604,226.378341,281.71717,...,581.472604,119.807098,413.87471,813.828513,1030.247967,1.0,1.0,0.999419,1.0,6.0
75%,662.987751,2.0,3.0,1.0,0.5,1.0,1.0,596.497147,306.184266,468.37399,...,766.321292,257.718198,717.698285,935.718323,1546.594138,1.0,1.0,0.999798,1.0,7.0
max,958.691116,4.0,4.0,1.0,1.0,1.0,1.0,891.351667,413.698024,891.351667,...,958.691116,413.698024,958.691116,999.276104,2582.030538,1.0,1.0,0.999994,1.0,10.0


In [None]:
# Peek at the non-fraud rows of the validation dataset, reading from the first two partitions
df = dd.read_parquet('./data/training/validation/')
df[df.is_fraud == 0].head(npartitions=2)

Unnamed: 0,order_id,order_date,email,ip_addr,order_amount,customer_id,email_by_customer_id,ip_addr_by_customer_id,num_email_by_customer_id,num_ip_addr_by_customer_id,...,order_amount_min_by_email,order_amount_max_by_email,order_amount_sum_by_email,order_amount_by_email,is_fraud,sample,now,elapsed_wgt,validation,fold
66,9c2b6d93-32a7-45fa-ac40-a47059e4b7c4,2021-05-04 12:29:59+00:00,20d3a0d5ee5fce50d1cc660ea94e8fd6dc3e0d5d,804833f0387c5c3a0aed1218687be20d19e4aa15,222.106098,c6eb3cd2-93bc-413f-a0e9-61c71ab0d224,"[""5402e6915b106c264bd2bd7d7194e45d6de83091"", ""...","[""29854c437e114323911562c081daa967fc2418a4"", ""...",2,2,...,222.106098,222.106098,222.106098,[222.1060979147914],0,1,2021-05-18 20:28:12.411255+00:00,0.999236,1,4
18,a489c647-bd28-456e-ab10-f4dde1151a20,2021-05-18 14:09:46+00:00,7d95a0edb3399a16e29f353350793a55489af201,3de6a5ab79cda5e3a73eff8a2bbb506dab451293,121.437929,2355ad27-ea68-4439-a085-1d294cc772ec,"[""7d95a0edb3399a16e29f353350793a55489af201"", ""...","[""f898fbab3f673df0b0049098814b2ffabd277471"", ""...",2,4,...,101.425959,928.822007,1151.685896,"[928.8220074790162, 101.42595902612995, 121.43...",0,1,2021-05-18 20:28:12.411255+00:00,0.999986,1,6
20,b9df0d87-c37f-4837-a81e-27e23bedc4a8,2021-05-04 18:02:34+00:00,95d1b98298ff8558bebbcd5b68b81426d951dc9d,8d90d3f40f9b4c1974f42ba9cfb3f3513e55018a,801.040739,23925e34-a97b-4a39-9af7-556994f4de50,"[""c5a4b2c53a0f1060909717f27a26b6b611c6abac"", ""...","[""e09327b09677fd88e72c6007d218ce042cba0f60"", ""...",3,3,...,801.040739,801.040739,801.040739,[801.040738517752],0,1,2021-05-18 20:28:12.411255+00:00,0.999248,1,6
48,564739cb-f238-487f-8893-3f8a5c42afe9,2021-05-05 21:21:08+00:00,164e9cd15154ecf244a56d1afa9f7182e12eb038,dadd450850a73d8c87ffef6f6c183a7fd03105e0,80.334187,78378996-89c5-4667-bf77-50a7bcf3899a,"[""164e9cd15154ecf244a56d1afa9f7182e12eb038""]","[""dadd450850a73d8c87ffef6f6c183a7fd03105e0""]",1,1,...,80.334187,80.334187,80.334187,[80.33418734448439],0,1,2021-05-18 20:28:12.411255+00:00,0.999309,1,6
55,39560977-f66c-4be8-8ec7-00e6364fb253,2021-05-10 09:00:25+00:00,60fd02c90b468b96daf5d3244bdc62631c60e8e2,dcbc5797682d1a4aea5f17ef88627c2ab83501a7,350.677562,85fcce17-2788-481c-83e5-9be65a2219d9,"[""3c87bfd70bf46bfdda0e3324f3a03008a5ba7047"", ""...","[""ab2348a4cf38fd70b3b39de17b2ae4d4c88d7ed0"", ""...",2,3,...,350.677562,999.276104,1349.953666,"[999.2761039800785, 350.6775618423433]",0,0,2021-05-18 20:28:12.411255+00:00,0.999548,1,8
