In [None]:
#default_exp data.feature_calc

In [None]:
# Shell out to nbdev to regenerate the library modules from the notebooks.
!nbdev_build_lib

Converted 00_jobs.ipynb.
Converted 01-create-sample-data.ipynb.
Converted 02-preprocess.ipynb.
Converted 03-feature-calc.ipynb.
Converted 04-training-data.ipynb.
Converted 05-train-model.ipynb.
Converted 06-submit-training-pipeline.ipynb.
Converted 07-prepare-db.ipynb.
Converted 99-tools.ipynb.
Converted index.ipynb.


In [None]:
#export
import dask.dataframe as dd
import numpy as np

from hopeit.app.context import EventContext
from hopeit.app.events import Spawn, SHUFFLE
from hopeit.app.api import event_api
from hopeit.app.logger import app_logger

from fraud_poc.jobs import get_client, FeatureCalcJob, PreprocessingJob

In [None]:
#export
# Names of the functions in this module exposed as steps.
# NOTE(review): presumably read by hopeit to wire the event's steps - confirm.
__steps__ = ['run']

# Module-level logger shared by the steps below.
logger = app_logger()

In [None]:
#export
def calculate(df, count_cols, stat_cols, by):
    """Attach history-based features to every row of ``df``.

    The distinct-value features (``count_distinct_values``) and the numeric
    statistics (``num_stats``) are computed separately, merged with each
    other, and then joined back onto ``df`` by matching ``df``'s index values
    and ``order_id`` against the ``by`` and ``order_id`` columns of the
    feature frame.

    Args:
        df: frame whose index holds the ``by`` entity and which has an
            ``order_id`` column (plus whatever the two helpers read).
        count_cols: columns handed to ``count_distinct_values``.
        stat_cols: columns handed to ``num_stats``.
        by: name of the entity column produced by the helpers.

    Returns:
        The joined frame without the feature-side duplicates of columns that
        already exist in ``df``.
    """
    # Feature-side columns that clash with a column of `df` get this suffix
    # during the join and are filtered out afterwards.
    drop_suffix = '_DROP'

    features = count_distinct_values(df, count_cols, by).merge(
        num_stats(df, stat_cols, by))

    joined = df.merge(
        features,
        left_on=[df.index, 'order_id'],
        right_on=[by, 'order_id'],
        suffixes=('', drop_suffix),
    )

    kept = [name for name in joined.columns if not name.endswith(drop_suffix)]
    return joined[kept]
        

def count_distinct_values(df, cols, by, window=10):
    """Build per-order "recent history" features for categorical columns.

    For each column in ``cols`` the values are collected per
    (index, ``order_date``, ``order_id``) group, sorted by that key, and
    accumulated within each index value so that every order carries the list
    of values seen up to and including itself. Only the last ``window``
    entries of that list are kept.

    Args:
        df: frame whose index is named ``by`` and which has ``order_date``,
            ``order_id`` and every column in ``cols``.
        cols: non-empty list of column names to track.
        by: name of ``df``'s index; used to label the generated columns and
            to select the key column after ``reset_index``.
        window: maximum number of most recent values kept per order
            (default 10, the previously hard-coded value). Must be >= 1.

    Returns:
        A frame with columns ``by``, ``order_id`` and, for each ``col``:
            ``{col}_by_{by}``        string form of the history window
            ``num_{col}_by_{by}``    number of distinct values in the window
            ``last_{col}_by_{by}``   previous value, or ``""`` if none
            ``same_{col}_by_{by}``   1 if previous value equals current
            ``known_{col}_by_{by}``  1 if current value occurs earlier in
                                     the window

    Raises:
        ValueError: if ``window`` is smaller than 1.
    """
    if window < 1:
        # x[-0:] would silently return the whole history instead of nothing.
        raise ValueError(f"window must be >= 1, got {window}")

    sets = []
    for col in cols:
        per_order = df.groupby([df.index, df.order_date, df.order_id])[col] \
            .apply(list) \
            .sort_index()
        # Cumulative sum over lists concatenates them, giving the running
        # history. group_keys=False keeps the original three-level index;
        # with the pandas >= 2.0 default (True) the group key is prepended
        # again and the reset_index() below fails on the duplicate name.
        sets.append(
            per_order
            .groupby(level=0, group_keys=False)
            .apply(np.cumsum)
            .apply(lambda x: list(x)[-window:]))

    counts = sets[0].to_frame()
    for col, result in zip(cols, sets):
        counts[col] = result
        counts[f'num_{col}_by_{by}'] = counts[col].apply(lambda x: len(set(x)))
        counts[f'last_{col}_by_{by}'] = counts[col].apply(lambda x: x[-2] if len(x) > 1 else "")
        counts[f'same_{col}_by_{by}'] = counts[col].apply(lambda x: int(x[-2] == x[-1]) if len(x) > 1 else 0)
        counts[f'known_{col}_by_{by}'] = counts[col].apply(lambda x: int(x[-1] in x[:-1]) if len(x) > 1 else 0)
        counts[f'{col}_by_{by}'] = counts[col].apply(str)

    count_cols = [f'{p}{col}_by_{by}' for p in ('', 'num_', 'last_', 'same_', 'known_') for col in cols]
    counts = counts.reset_index()[[by, 'order_id', *count_cols]]
    return counts

def num_stats(df, cols, by, window=10):
    """Build per-order rolling statistics for numeric columns.

    For each column in ``cols`` the values are collected per
    (index, ``order_date``, ``order_id``) group, sorted by that key, and
    accumulated within each index value so that every order carries the list
    of values seen up to and including itself. Only the last ``window``
    entries are kept, and the statistics are computed over that list.

    Args:
        df: frame whose index is named ``by`` and which has ``order_date``,
            ``order_id`` and every column in ``cols``.
        cols: non-empty list of numeric column names.
        by: name of ``df``'s index; used to label the generated columns and
            to select the key column after ``reset_index``.
        window: maximum number of most recent values kept per order
            (default 10, the previously hard-coded value). Must be >= 1.

    Returns:
        A frame with columns ``by``, ``order_id``, each ``col`` (holding the
        history window as a list) and, for each ``col``,
        ``{col}_mean_by_{by}``, ``{col}_std_by_{by}``, ``{col}_min_by_{by}``,
        ``{col}_max_by_{by}``, ``{col}_sum_by_{by}`` plus ``{col}_by_{by}``
        with the string form of the window.

    Raises:
        ValueError: if ``window`` is smaller than 1.
    """
    if window < 1:
        # x[-0:] would silently return the whole history instead of nothing.
        raise ValueError(f"window must be >= 1, got {window}")

    results = []
    for col in cols:
        per_order = df.groupby([df.index, df.order_date, df.order_id])[col] \
            .apply(list) \
            .sort_index()
        # Cumulative sum over lists concatenates them, giving the running
        # history. group_keys=False keeps the original three-level index;
        # with the pandas >= 2.0 default (True) the group key is prepended
        # again and the reset_index() below fails on the duplicate name.
        results.append(
            per_order
            .groupby(level=0, group_keys=False)
            .apply(np.cumsum)
            .apply(lambda x: list(x)[-window:]))

    stats = results[0].to_frame()
    for col, result in zip(cols[1:], results[1:]):
        stats[col] = result

    stats = stats.reset_index()[[by, 'order_id', *cols]]
    for col in cols:
        stats[f'{col}_mean_by_{by}'] = stats[col].apply(lambda x: np.mean(x))
        stats[f'{col}_std_by_{by}'] = stats[col].apply(lambda x: np.std(x))
        stats[f'{col}_min_by_{by}'] = stats[col].apply(lambda x: np.min(x))
        stats[f'{col}_max_by_{by}'] = stats[col].apply(lambda x: np.max(x))
        stats[f'{col}_sum_by_{by}'] = stats[col].apply(lambda x: np.sum(x))
        stats[f'{col}_by_{by}'] = stats[col].apply(str)

    return stats


In [None]:
#export
def run(job: PreprocessingJob, context: EventContext) -> FeatureCalcJob:
    """Compute and save features for each partitioned dataset listed in ``job``.

    For every supported partition key (``customer_id``, then ``email``) that
    has a path in ``job.partitioned``, the parquet data is loaded with dask,
    ``calculate`` is applied to each partition, and the result is written
    under ``<features base path>/<key>/``.

    Returns:
        A ``FeatureCalcJob`` with the input partitions as ``sources`` and the
        written locations as ``features``. If anything raises while
        processing, the error is logged and ``None`` is returned instead.
        The client obtained from ``get_client`` is closed in every case.
    """
    base_path = context.env['data']['features']
    client = get_client(context)

    # (partition key, columns to load, count_cols, stat_cols), in processing order.
    specs = (
        ('customer_id',
         ['order_id', 'order_date', 'email', 'ip_addr', 'order_amount'],
         ['email', 'ip_addr'],
         ['order_amount']),
        ('email',
         ['order_id', 'order_date', 'customer_id', 'ip_addr', 'order_amount'],
         ['customer_id'],
         ['order_amount']),
    )

    features = {}
    try:
        for by, columns, count_cols, stat_cols in specs:
            path = job.partitioned.get(by)
            if not path:
                # No partitioned data for this key; nothing to compute.
                continue

            logger.info(context, f"Calculating features on {by}...")
            loaded = dd.read_parquet(path, engine='fastparquet', columns=columns)
            computed = loaded.map_partitions(
                calculate, count_cols=count_cols, stat_cols=stat_cols, by=by)

            save_path = f'{base_path}/{by}/'
            computed.to_parquet(save_path)
            features[by] = save_path
            logger.info(context, f"Saved {save_path}.")

        return FeatureCalcJob(sources=job.partitioned, features=features)
    except Exception as e:
        logger.error(context, e)
        return None
    finally:
        client.close()

### Test from notebook

In [None]:
# Run the step end to end from the notebook through hopeit's testing helpers.
from hopeit.testing.apps import config, execute_event

# Load the app configuration used for this test run.
app_config = config('config/training-pipeline.json')
# Input payload: one partitioned dataset per supported key.
job = PreprocessingJob(source='./data/raw', partitioned={
    'customer_id': './data/partitioned/customer_id/', 
    'email': './data/partitioned/email'
})
# Top-level await: only valid inside a notebook/IPython session.
result = await execute_event(app_config, 'data.feature-calc', job)
result

2020-07-07 22:24:23,918 | INFO | fraud-poc 0.0.1-training data.feature-calc leo-legion 15772 | Calculating features on customer_id... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-07T22:24:23.217526+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=
2020-07-07 22:26:23,085 | INFO | fraud-poc 0.0.1-training data.feature-calc leo-legion 15772 | Saved ./data/features/customer_id/. | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-07T22:24:23.217526+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=
2020-07-07 22:26:23,086 | INFO | fraud-poc 0.0.1-training data.feature-calc leo-legion 15772 | Calculating features on email... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-07T22:24:23.217526+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=
2020-07-07 22:27:49,861 | INFO | fraud-poc 0.0.1-training 

FeatureCalcJob(sources={'customer_id': './data/partitioned/customer_id/', 'email': './data/partitioned/email'}, features={'customer_id': './data/features/customer_id/', 'email': './data/features/email/'})

In [None]:
# Peek at the features written for the customer_id partitioning.
df = dd.read_parquet(result.features['customer_id'])
df.head()

Unnamed: 0,order_id,order_date,email,ip_addr,order_amount,customer_id,email_by_customer_id,ip_addr_by_customer_id,num_email_by_customer_id,num_ip_addr_by_customer_id,...,same_email_by_customer_id,same_ip_addr_by_customer_id,known_email_by_customer_id,known_ip_addr_by_customer_id,order_amount_mean_by_customer_id,order_amount_std_by_customer_id,order_amount_min_by_customer_id,order_amount_max_by_customer_id,order_amount_sum_by_customer_id,order_amount_by_customer_id
0,6a5ddc18-0dee-485d-b2ee-cdb8ccfa0485,2020-06-29 10:45:56+00:00,1d53f0c2cb70ca0922dc0051a8e2cd300b720cce,7ff85a4d3ee8bf7773d867c187fc7599bd1672a3,555.943568,005c052f-5148-4acc-884d-7b06f604308f,"['386b2fa6a5640b6eeb47b557a634c5e324810d09', '...","['5c86c74948a216aa3ec6492f4949c8f62ac3251d', '...",3,3,...,1,1,1,1,335.058021,266.412303,1.147234,574.395839,1675.290105,"[574.3958388601079, 1.1472343694837184, 526.19..."
1,e7ce02eb-08da-4894-be79-c293ace4693e,2020-06-25 07:41:16+00:00,386b2fa6a5640b6eeb47b557a634c5e324810d09,5c86c74948a216aa3ec6492f4949c8f62ac3251d,1.147234,005c052f-5148-4acc-884d-7b06f604308f,"['386b2fa6a5640b6eeb47b557a634c5e324810d09', '...","['5c86c74948a216aa3ec6492f4949c8f62ac3251d', '...",1,1,...,1,1,1,1,287.771537,286.624302,1.147234,574.395839,575.543073,"[574.3958388601079, 1.1472343694837184]"
2,28b5ad13-55ce-40cd-bcc5-5c9204325edc,2020-06-19 00:42:49+00:00,386b2fa6a5640b6eeb47b557a634c5e324810d09,5c86c74948a216aa3ec6492f4949c8f62ac3251d,574.395839,005c052f-5148-4acc-884d-7b06f604308f,['386b2fa6a5640b6eeb47b557a634c5e324810d09'],['5c86c74948a216aa3ec6492f4949c8f62ac3251d'],1,1,...,0,0,0,0,574.395839,0.0,574.395839,574.395839,574.395839,[574.3958388601079]
3,8d1368bd-b076-45a7-8457-78f44f4a2c7e,2020-07-04 01:31:48+00:00,04baa6de56ac27af1fae31ba56307fb814b32f68,84a1c9bd726d8a35d9335240ce68c056f772ca2d,204.861045,005c052f-5148-4acc-884d-7b06f604308f,"['386b2fa6a5640b6eeb47b557a634c5e324810d09', '...","['5c86c74948a216aa3ec6492f4949c8f62ac3251d', '...",3,3,...,0,0,1,1,313.358525,247.993151,1.147234,574.395839,1880.15115,"[574.3958388601079, 1.1472343694837184, 526.19..."
4,58f0682a-1fd4-430c-ac11-98893c06991e,2020-06-27 03:59:10+00:00,04baa6de56ac27af1fae31ba56307fb814b32f68,84a1c9bd726d8a35d9335240ce68c056f772ca2d,526.19557,005c052f-5148-4acc-884d-7b06f604308f,"['386b2fa6a5640b6eeb47b557a634c5e324810d09', '...","['5c86c74948a216aa3ec6492f4949c8f62ac3251d', '...",2,2,...,0,0,0,0,367.246214,259.617878,1.147234,574.395839,1101.738643,"[574.3958388601079, 1.1472343694837184, 526.19..."


In [None]:
# Peek at the features written for the email partitioning.
dd.read_parquet(result.features['email']).head()

Unnamed: 0,order_id,order_date,customer_id,ip_addr,order_amount,email,customer_id_by_email,num_customer_id_by_email,last_customer_id_by_email,same_customer_id_by_email,known_customer_id_by_email,order_amount_mean_by_email,order_amount_std_by_email,order_amount_min_by_email,order_amount_max_by_email,order_amount_sum_by_email,order_amount_by_email
0,f39ecc27-5583-4e07-8d71-fcbecc55ea89,2020-06-23 22:19:09+00:00,0ed1768b-659c-465b-bd1e-b9e1571b6155,b55a2c7e553f4e59ec0582c673f9408fc5bb1d67,595.156716,00691f9cf2bd3e1977c4d0408557f0c4931b294a,"['0ed1768b-659c-465b-bd1e-b9e1571b6155', '0ed1...",1,0ed1768b-659c-465b-bd1e-b9e1571b6155,1,1,345.931841,249.224875,96.706966,595.156716,691.863682,"[96.70696566722414, 595.1567162934887]"
1,faeda91b-a709-44ad-8c06-7d5667b5f039,2020-06-22 09:04:43+00:00,0ed1768b-659c-465b-bd1e-b9e1571b6155,b55a2c7e553f4e59ec0582c673f9408fc5bb1d67,96.706966,00691f9cf2bd3e1977c4d0408557f0c4931b294a,['0ed1768b-659c-465b-bd1e-b9e1571b6155'],1,,0,0,96.706966,0.0,96.706966,96.706966,96.706966,[96.70696566722414]
2,9164ae9e-5cdf-4af1-9175-73cd73fe6366,2020-06-26 23:46:13+00:00,0ed1768b-659c-465b-bd1e-b9e1571b6155,3b47716663011295fb13dd5ad4c242845c396484,641.732567,00691f9cf2bd3e1977c4d0408557f0c4931b294a,"['0ed1768b-659c-465b-bd1e-b9e1571b6155', '0ed1...",1,0ed1768b-659c-465b-bd1e-b9e1571b6155,1,1,444.532083,246.683416,96.706966,641.732567,1333.596249,"[96.70696566722414, 595.1567162934887, 641.732..."
3,f64cf4cd-6d92-4e9c-9a04-c6a420b4f22d,2020-07-03 16:35:16+00:00,0ed1768b-659c-465b-bd1e-b9e1571b6155,b55a2c7e553f4e59ec0582c673f9408fc5bb1d67,536.578923,00691f9cf2bd3e1977c4d0408557f0c4931b294a,"['0ed1768b-659c-465b-bd1e-b9e1571b6155', '0ed1...",1,0ed1768b-659c-465b-bd1e-b9e1571b6155,1,1,467.543793,217.320379,96.706966,641.732567,1870.175173,"[96.70696566722414, 595.1567162934887, 641.732..."
4,60789921-58c5-47aa-8510-4878a9d3ebfa,2020-07-02 10:47:48+00:00,239ead93-a93a-4a4f-8673-1f47058ac8c1,a1dd69f29d89599598c3cdc93e7998add8c2b759,959.487961,009157d2735994c1451447051676dcb001779c9e,"['239ead93-a93a-4a4f-8673-1f47058ac8c1', '239e...",1,239ead93-a93a-4a4f-8673-1f47058ac8c1,1,1,543.918074,311.396589,151.294213,959.487961,2175.672295,"[151.29421266507381, 358.15183900418737, 706.7..."


In [None]:
# Scratch check: the last five characters of a suffixed column name are '_DROP',
# which is the comparison `calculate` uses to filter merged columns.
col = 'xx_DROP'
col[-5:]

'_DROP'