In [None]:
#default_exp data.feature_calc

In [None]:
!nbdev_build_lib

Converted 00_jobs.ipynb.
Converted 01-create-sample-data.ipynb.
Converted 02-preprocess.ipynb.
Converted 03-feature-calc.ipynb.
Converted 04-label-folds.ipynb.
Converted 05-training.ipynb.
Converted index.ipynb.


In [None]:
#export
import dask.dataframe as dd
import numpy as np

from hopeit.app.context import EventContext
from hopeit.app.events import Spawn, SHUFFLE
from hopeit.app.api import event_api
from hopeit.app.logger import app_logger

from fraud_poc.jobs import get_client, FeatureCalcJob, PreprocessingJob

In [None]:
#export
# Names of the step functions defined in this module for the event.
# NOTE(review): presumably hopeit runs these in order — confirm against hopeit docs.
__steps__ = ['run']

# Module-level logger obtained from hopeit's app_logger().
logger = app_logger()

In [None]:
#export
def calculate(df, count_cols, stat_cols, by):
    """Attach per-order count and statistics features to `df`.

    Builds the distinct-value counts (`count_cols`) and the numeric stats
    (`stat_cols`) with the sibling helpers, joins the two feature frames on
    their shared columns, and merges the result back onto `df`.

    Args:
        df: frame whose index is matched against the `by` column of the
            feature frames, and which has an `order_id` column.
        count_cols: columns passed to `count_distinct_values`.
        stat_cols: columns passed to `num_stats`.
        by: name of the key column in the feature frames.

    Returns:
        The merged frame. Feature columns whose name collides with a column
        of `df` are renamed with the suffix `s_by_{by}`.
    """
    distinct_counts = count_distinct_values(df, count_cols, by)
    running_stats = num_stats(df, stat_cols, by)
    # No explicit keys: the two feature frames are joined on every column
    # name they have in common.
    feature_frame = distinct_counts.merge(running_stats)
    merge_options = {
        'left_on': [df.index, 'order_id'],
        'right_on': [by, 'order_id'],
        'suffixes': ('', f's_by_{by}'),
    }
    return df.merge(feature_frame, **merge_options)
        

def count_distinct_values(df, cols, by):
    """Count, for every order, the distinct values seen so far per index key.

    Rows are grouped by (index, order_date, order_id); within each index key
    the groups are visited in sorted (order_date, order_id) order and the
    values of each column are accumulated, so every order gets the number of
    distinct values observed for its key up to and including that order.

    Args:
        df: frame with `order_date` and `order_id` columns whose index is
            named `by`.
        cols: non-empty list of columns to count; values must be hashable.
        by: name of the index of `df`, used as a key column of the result.

    Returns:
        A frame with columns `[by, 'order_id', *cols]`, each of `cols` holding
        the running distinct count.

    Raises:
        IndexError: if `cols` is empty.
    """
    order_keys = [df.index, df.order_date, df.order_id]
    results = []
    for col in cols:
        running_counts = (
            df.groupby(order_keys)[col]
            .apply(list)
            .sort_index()
            # group_keys=False keeps the (by, order_date, order_id) index as is.
            # pandas >= 2.0 would otherwise prepend the group key, producing a
            # second index level named `by` and making reset_index() fail.
            .groupby(level=0, group_keys=False)
            # cumsum over lists concatenates them: each order gets every value
            # seen so far for its key.
            .apply(np.cumsum)
            .apply(lambda seen: len(set(seen)))
        )
        results.append(running_counts)

    counts = results[0].to_frame()
    for col, result in zip(cols[1:], results[1:]):
        counts[col] = result

    return counts.reset_index()[[by, 'order_id', *cols]]

def num_stats(df, cols, by):
    """Compute running mean/std/min/max/sum per index key for every order.

    Rows are grouped by (index, order_date, order_id); within each index key
    the groups are visited in sorted (order_date, order_id) order and the
    values of each column are accumulated, so the statistics of an order cover
    all values observed for its key up to and including that order.

    Args:
        df: frame with `order_date` and `order_id` columns whose index is
            named `by`.
        cols: non-empty list of numeric columns to summarize.
        by: name of the index of `df`, used as a key column of the result.

    Returns:
        A frame with columns `[by, 'order_id', *cols]` followed, for each
        column, by `{col}_mean_by_{by}`, `{col}_std_by_{by}`,
        `{col}_min_by_{by}`, `{col}_max_by_{by}` and `{col}_sum_by_{by}`.
        The `col` column itself holds the five statistics as a stringified
        tuple.

    Raises:
        IndexError: if `cols` is empty.
    """
    stat_names = ('mean', 'std', 'min', 'max', 'sum')

    def as_builtin(value):
        # NumPy >= 2 renders scalars as "np.float64(...)" inside a tuple's str();
        # builtin scalars keep the stringified column identical across versions.
        return value.item() if isinstance(value, np.generic) else value

    def summarize(values):
        # Order must match stat_names.
        raw = (np.mean(values), np.std(values), np.min(values), np.max(values), np.sum(values))
        return tuple(as_builtin(v) for v in raw)

    order_keys = [df.index, df.order_date, df.order_id]
    results = []
    for col in cols:
        running_stats = (
            df.groupby(order_keys)[col]
            .apply(list)
            .sort_index()
            # group_keys=False keeps the (by, order_date, order_id) index as is.
            # pandas >= 2.0 would otherwise prepend the group key, producing a
            # second index level named `by` and making reset_index() fail.
            .groupby(level=0, group_keys=False)
            # cumsum over lists concatenates them: each order gets every value
            # seen so far for its key.
            .apply(np.cumsum)
            .apply(summarize)
        )
        results.append(running_stats)

    stats = results[0].to_frame()
    for col, result in zip(cols[1:], results[1:]):
        stats[col] = result

    stats = stats.reset_index()[[by, 'order_id', *cols]]
    for col in cols:
        for position, stat_name in enumerate(stat_names):
            # Bind `position` as a default so each lambda keeps its own index.
            stats[f'{col}_{stat_name}_by_{by}'] = stats[col].apply(
                lambda summary, i=position: summary[i])
        stats[col] = stats[col].apply(str)

    return stats


In [None]:
#export
def _save_features(context, base_path, path, by, columns, count_cols, stat_cols):
    """Compute the features of one partitioned dataset and write them to parquet.

    Reads `columns` from the parquet dataset at `path`, applies `calculate`
    to every partition and saves the result under `{base_path}/{by}/`.

    Returns:
        The path the features were saved to.
    """
    logger.info(context, f"Calculating features on {by}...")
    df = dd.read_parquet(path, engine='fastparquet', columns=columns)
    df = df.map_partitions(calculate, count_cols=count_cols, stat_cols=stat_cols, by=by)
    save_path = f'{base_path}/{by}/'
    df.to_parquet(save_path)
    logger.info(context, f"Saved {save_path}.")
    return save_path


def run(job: PreprocessingJob, context: EventContext) -> FeatureCalcJob:
    """Calculate features for each partitioned dataset referenced by `job`.

    For every supported partition key present in `job.partitioned`
    (`customer_id`, `email`) the features are computed and saved below the
    folder configured in `context.env['data']['features']`.

    Args:
        job: preprocessing result; `job.partitioned` maps a partition key to
            the path of its parquet dataset.
        context: event context, used for configuration, logging and to
            obtain the client.

    Returns:
        A `FeatureCalcJob` with the source paths and the saved feature paths,
        or `None` when any error occurs (the error is logged).
    """
    base_path = context.env['data']['features']
    client = get_client(context)
    features = {}
    # One entry per partition key: (key, columns to read, columns to count).
    feature_specs = (
        ('customer_id',
         ['order_id', 'order_date', 'email', 'ip_addr', 'order_amount'],
         ['email', 'ip_addr']),
        ('email',
         ['order_id', 'order_date', 'customer_id', 'ip_addr', 'order_amount'],
         ['customer_id']),
    )
    try:
        for by, columns, count_cols in feature_specs:
            path = job.partitioned.get(by)
            if path:
                features[by] = _save_features(
                    context, base_path, path, by,
                    columns=columns,
                    count_cols=count_cols,
                    stat_cols=['order_amount'])

        return FeatureCalcJob(
            sources=job.partitioned,
            features=features
        )
    except Exception as e:
        # Best effort: log the failure and return None instead of raising.
        logger.error(context, e)
        return None
    finally:
        # Always release the client, whether or not the calculation succeeded.
        client.close()

### Test from notebook

In [None]:
# Testing helpers are imported in this cell, which has no #export marker.
from hopeit.testing.apps import config, execute_event

# Load the app configuration used for the test run.
app_config = config('config/training-pipeline.json')
# Input job: one partitioned dataset path per partition key.
job = PreprocessingJob(source='./data/raw', partitioned={
    'customer_id': './data/partitioned/customer_id/', 
    'email': './data/partitioned/email'})
# Top-level await: valid in a notebook cell, not in a plain Python script.
result = await execute_event(app_config, 'data.feature-calc', job)
result

2020-07-06 17:36:10,972 | INFO | fraud-poc 0.0.1 data.feature-calc leo-legion 26039 | Calculating features on customer_id... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-06T17:36:10.273728+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=
2020-07-06 17:36:13,848 | INFO | fraud-poc 0.0.1 data.feature-calc leo-legion 26039 | Saved ./data/features/customer_id/. | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-06T17:36:10.273728+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=
2020-07-06 17:36:13,849 | INFO | fraud-poc 0.0.1 data.feature-calc leo-legion 26039 | Calculating features on email... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-06T17:36:10.273728+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=
2020-07-06 17:36:14,865 | INFO | fraud-poc 0.0.1 data.feature-calc leo-legion 26039 |

FeatureCalcJob(sources={'customer_id': './data/partitioned/customer_id/', 'email': './data/partitioned/email'}, features={'customer_id': './data/features/customer_id/', 'email': './data/features/email/'})

In [None]:
# Preview the first rows of the features saved for the customer_id key.
dd.read_parquet(result.features['customer_id']).head()

Unnamed: 0,order_id,order_date,email,ip_addr,order_amount,customer_id,emails_by_customer_id,ip_addrs_by_customer_id,order_amounts_by_customer_id,order_amount_mean_by_customer_id,order_amount_std_by_customer_id,order_amount_min_by_customer_id,order_amount_max_by_customer_id,order_amount_sum_by_customer_id
0,b5e1cf50-1996-44f2-8d82-fe18fbe61752,2020-06-28 19:11:16+00:00,d9c3af743102923e41d1ad4281ddd96742f6e50a,180fb8a54dce9e83394400969edb0dc6a7b13088,68.270268,002b6bff-0bec-4b19-82be-d428f41cc30c,1,1,"(202.69054417237436, 134.42027646106766, 68.27...",202.690544,134.420276,68.270268,337.110821,405.381088
1,7740c611-2504-4374-9a40-0d80c7d6a723,2020-06-11 17:01:42+00:00,d9c3af743102923e41d1ad4281ddd96742f6e50a,180fb8a54dce9e83394400969edb0dc6a7b13088,337.110821,002b6bff-0bec-4b19-82be-d428f41cc30c,1,1,"(337.110820633442, 0.0, 337.110820633442, 337....",337.110821,0.0,337.110821,337.110821,337.110821
2,6e4b3e31-288f-472e-94b9-03678e60c3a3,2020-06-24 10:32:22+00:00,58d77079c714b7796cac1d3df211cb3147525bfb,6d7c0b798489375381e3838b4fbad6f933c348ac,601.101297,0047b81f-8a96-473a-b0f2-8c8c10f304eb,1,1,"(601.1012968630803, 0.0, 601.1012968630803, 60...",601.101297,0.0,601.101297,601.101297,601.101297
3,a7ad0661-f86d-4388-8b5f-5c3051c39c07,2020-06-05 02:43:32+00:00,09da81d40cb2939e7a48331825837d039d2b0784,02bf0acfcbc290d72b7765fb1de00fa796ca0d88,539.827743,004c9254-4e65-47f6-88e4-d262dce434a9,1,1,"(539.8277429844641, 0.0, 539.8277429844641, 53...",539.827743,0.0,539.827743,539.827743,539.827743
4,7f37211b-d1e4-4e43-acf9-1a17abbd3072,2020-06-16 14:17:27+00:00,f599018e431a101f6ed8ef365add95a1bcce726d,eecda0b16e78628a4e5368c437a634b5ef9d6f6e,124.542277,0055a45d-ad90-48f3-a461-ac4d6716ab48,2,2,"(547.7368392614269, 423.19456234254795, 124.54...",547.736839,423.194562,124.542277,970.931402,1095.473679


In [None]:
# Preview the first rows of the features saved for the email key.
dd.read_parquet(result.features['email']).head()

Unnamed: 0,order_id,order_date,customer_id,ip_addr,order_amount,email,customer_ids_by_email,order_amounts_by_email,order_amount_mean_by_email,order_amount_std_by_email,order_amount_min_by_email,order_amount_max_by_email,order_amount_sum_by_email
0,de347ad8-881d-4db2-8f77-8cd7c3bc8409,2020-05-28 12:51:48+00:00,a77c93a2-2b07-4f36-93f4-2cdb5de40fac,5e523d96f0b3a52caeb612543bab45f409c98396,172.943654,00563c174e1904e1333f6140b4f5af62e9af31f4,1,"(172.94365353360763, 0.0, 172.94365353360763, ...",172.943654,0.0,172.943654,172.943654,172.943654
1,64f7fbab-074a-4b43-9b1c-a6c7e6b615bf,2020-06-10 18:11:59+00:00,097fdf7b-41e4-4fec-b52c-8ba88be8f4c1,7080f1f31d238381c88a105258b7e891728a5c97,90.264341,005fa0d91e609fede563ab1386609dbab7250655,1,"(166.6964662813392, 76.43212529548465, 90.2643...",166.696466,76.432125,90.264341,243.128592,333.392933
2,c3ca9003-cb66-43fc-95a3-1e0ac62b0daa,2020-06-05 08:50:44+00:00,097fdf7b-41e4-4fec-b52c-8ba88be8f4c1,7080f1f31d238381c88a105258b7e891728a5c97,243.128592,005fa0d91e609fede563ab1386609dbab7250655,1,"(243.12859157682388, 0.0, 243.12859157682388, ...",243.128592,0.0,243.128592,243.128592,243.128592
3,cdb2c545-d430-4b52-be94-c6d8cbb6a1f1,2020-05-29 07:34:49+00:00,cea238d3-d72f-4f86-b446-14164943c395,5dc2e15a296b2910a219f2ad6c6a0b996575002e,939.699533,00715c8f81403a0dae2329f7217ae5c9506a1691,1,"(939.6995334039897, 0.0, 939.6995334039897, 93...",939.699533,0.0,939.699533,939.699533,939.699533
4,0cb052b6-414e-4e19-80b9-e44458df7e5a,2020-06-08 03:58:36+00:00,efd65b15-a16b-4ffe-a8a0-67c205c778b2,a14b9cfb63a6ba4a570f7088f350340096605de0,386.084229,00ba3fdeba3e968aeb68310a546098d0efe4bb3f,1,"(386.0842287445715, 0.0, 386.0842287445715, 38...",386.084229,0.0,386.084229,386.084229,386.084229
