In [None]:
#default_exp data.feature_calc

# Feature calculation

> This module calculates features/aggregations that could be useful to detect fraudulent transactions. Results are saved into two parquet datasets: one with features aggregated by email and one with features aggregated by customer_id. The following features are calculated:

Email aggregations:
 * "num_customer_id_by_email": number of previously known different customer_ids for a given email, up to 10
 * "same_customer_id_by_email": 1 if order contains the same customer_id as the last order from this email
 * "known_customer_id_by_email": 1 if the customer_id has been seen before with the email
 * "order_amount_mean_by_email": mean of the last 10 order amounts on this email
 * "order_amount_std_by_email": std
 * "order_amount_min_by_email": min
 * "order_amount_max_by_email": max
 * "order_amount_sum_by_email": sum
  
Customer_id aggregations:
 * "num_email_by_customer_id": number of previously seen different emails on the order customer_id, up to 10
 * "num_ip_addr_by_customer_id": number of previously seen different IP addresses for the order customer_id, up to 10
 * "same_email_by_customer_id": 1 if the email is the same as in the latest order for this customer_id
 * "same_ip_addr_by_customer_id": 1 if the IP address is the same as in the latest order for this customer_id
 * "known_email_by_customer_id": 1 if the email was seen before with this customer_id
 * "known_ip_addr_by_customer_id": 1 if the IP address was seen before with this customer_id
 * "order_amount_mean_by_customer_id": mean of the last 10 order amounts on this customer_id
 * "order_amount_std_by_customer_id": std
 * "order_amount_min_by_customer_id": min
 * "order_amount_max_by_customer_id": max
 * "order_amount_sum_by_customer_id": sum

In [None]:
#export
import dask.dataframe as dd
import numpy as np
import json

from hopeit.app.context import EventContext
from hopeit.app.events import Spawn, SHUFFLE
from hopeit.app.api import event_api
from hopeit.app.logger import app_logger

from fraud_poc.jobs import get_client, FeatureCalcJob, PreprocessingJob

In [None]:
#export
# Steps declared by this event module — presumably read by the hopeit engine to
# decide which functions to execute for the event; TODO confirm.
__steps__ = ['run']

# Module-level logger obtained from hopeit's app_logger().
logger = app_logger()

In [None]:
#export
def calculate(df, count_cols, stat_cols, by):
    """Enrich every order row of `df` with count-based and statistical features.

    Called once per partition via `map_partitions`, so `df` is a single
    partition whose index holds the `by` key of each order.

    Args:
        df: frame indexed by the `by` key, containing `order_id`,
            `order_date` and the columns listed in `count_cols` / `stat_cols`.
        count_cols: columns passed to `count_distinct_values`.
        stat_cols: columns passed to `num_stats`.
        by: name of the grouping key; used to name the generated features and
            to join them back onto `df`.

    Returns:
        `df` joined with the generated features on (`by`, `order_id`), without
        the feature-side columns that clash with columns already in `df`.
    """
    drop_suffix = '_DROP'
    features = count_distinct_values(df, count_cols, by).merge(
        num_stats(df, stat_cols, by))
    # Feature-side columns that collide with columns of `df` receive the
    # suffix and are discarded below, so the original order values win.
    enriched = df.merge(
        features,
        left_on=[df.index, 'order_id'],
        right_on=[by, 'order_id'],
        suffixes=('', drop_suffix),
    )
    kept = [name for name in enriched.columns
            if name[-len(drop_suffix):] != drop_suffix]
    return enriched[kept]
        

def count_distinct_values(df, cols, by):
    """Derive per-order history features for each column in `cols`.

    For every order, the values of `col` observed on the same index key are
    accumulated in (order_date, order_id) order up to and including that
    order, and only the last 10 are kept. From that window these columns are
    derived for each `col`:

      * `{col}_by_{by}`: the window serialized as a JSON list
      * `num_{col}_by_{by}`: number of distinct values in the window
      * `last_{col}_by_{by}`: value of the previous order, or "" if none
      * `same_{col}_by_{by}`: 1 if the current value equals the previous one
      * `known_{col}_by_{by}`: 1 if the current value appears earlier in the window

    Args:
        df: frame whose index is named `by` and which contains `order_id`,
            `order_date` and every column in `cols`.
        cols: non-empty list of columns to build histories for.
        by: name of the index level; becomes a column of the result.

    Returns:
        DataFrame with columns `[by, 'order_id', ...features]`, one row per
        (`by`, `order_date`, `order_id`) combination.
    """
    sets = []
    for col in cols:
        sets.append(
            df.groupby([df.index, df.order_date, df.order_id])[col]
            .apply(list)
            .sort_index()
            # group_keys=False keeps the original 3-level index. With the
            # default, pandas >= 2.0 prepends the group key as an extra index
            # level, which makes reset_index() below fail on a duplicated name.
            .groupby(level=0, group_keys=False)
            # cumsum over lists concatenates them: a running history per key
            .apply(np.cumsum)
            .apply(lambda x: list(x)[-10:])
        )

    # Name the seed column explicitly instead of relying on the Series name
    # surviving the apply chain.
    counts = sets[0].to_frame(name=cols[0])
    for col, result in zip(cols, sets):
        counts[col] = result
        counts[f'num_{col}_by_{by}'] = counts[col].apply(lambda x: len(set(x)))
        counts[f'last_{col}_by_{by}'] = counts[col].apply(lambda x: x[-2] if len(x) > 1 else "")
        counts[f'same_{col}_by_{by}'] = counts[col].apply(lambda x: int(x[-2] == x[-1]) if len(x) > 1 else 0)
        counts[f'known_{col}_by_{by}'] = counts[col].apply(lambda x: int(x[-1] in x[:-1]) if len(x) > 1 else 0)
        counts[f'{col}_by_{by}'] = counts[col].apply(lambda x: json.dumps(x))

    count_cols = [f'{p}{col}_by_{by}' for p in ('', 'num_', 'last_', 'same_', 'known_') for col in cols]
    counts = counts.reset_index()[[by, 'order_id', *count_cols]]
    return counts

def num_stats(df, cols, by):
    """Compute rolling statistics over the last 10 values of each column in `cols`.

    For every order, the values of `col` observed on the same index key are
    accumulated in (order_date, order_id) order up to and including that
    order, and only the last 10 are kept. For each `col` the result holds the
    window itself (column `col`, a list), its mean, std, min, max and sum
    (`{col}_<stat>_by_{by}`) and the window serialized as JSON
    (`{col}_by_{by}`).

    Args:
        df: frame whose index is named `by` and which contains `order_id`,
            `order_date` and every column in `cols`.
        cols: non-empty list of numeric columns to aggregate.
        by: name of the index level; becomes a column of the result.

    Returns:
        DataFrame with columns `[by, 'order_id', *cols, ...stats]`, one row
        per (`by`, `order_date`, `order_id`) combination.
    """
    results = []
    for col in cols:
        results.append(
            df.groupby([df.index, df.order_date, df.order_id])[col]
            .apply(list)
            .sort_index()
            # group_keys=False keeps the original 3-level index. With the
            # default, pandas >= 2.0 prepends the group key as an extra index
            # level, which makes reset_index() below fail on a duplicated name.
            .groupby(level=0, group_keys=False)
            # cumsum over lists concatenates them: a running history per key
            .apply(np.cumsum)
            .apply(lambda x: list(x)[-10:])
        )

    # Name the first column explicitly instead of relying on the Series name
    # surviving the apply chain.
    stats = results[0].to_frame(name=cols[0])
    for col, result in zip(cols[1:], results[1:]):
        stats[col] = result

    stats = stats.reset_index()[[by, 'order_id', *cols]]
    for col in cols:
        stats[f'{col}_mean_by_{by}'] = stats[col].apply(lambda x: np.mean(x))
        stats[f'{col}_std_by_{by}'] = stats[col].apply(lambda x: np.std(x))
        stats[f'{col}_min_by_{by}'] = stats[col].apply(lambda x: np.min(x))
        stats[f'{col}_max_by_{by}'] = stats[col].apply(lambda x: np.max(x))
        stats[f'{col}_sum_by_{by}'] = stats[col].apply(lambda x: np.sum(x))
        stats[f'{col}_by_{by}'] = stats[col].apply(lambda x: json.dumps(x))

    return stats


In [None]:
#export
def run(job: PreprocessingJob, context: EventContext) -> FeatureCalcJob:
    """Compute and store the feature datasets for each partitioned source of `job`.

    For the `customer_id` and `email` entries of `job.partitioned` (in that
    order, skipping missing or empty ones) the partitioned parquet data is
    read, `calculate` is applied to every partition and the result is written
    under `<features base path>/<key>/`.

    Args:
        job: preprocessing result; `job.partitioned` maps a key name to the
            path of the dataset partitioned by that key.
        context: event context; `context.env['data']['features']` provides the
            base output path.

    Returns:
        A `FeatureCalcJob` listing the sources and the saved feature paths.
        If any exception is raised it is logged and `None` is returned.
        The client obtained from `get_client` is closed in every case.
    """
    base_path = context.env['data']['features']
    client = get_client(context)
    # (key, columns to load, columns to build value histories for), in run order
    plans = (
        ('customer_id',
         ['order_id', 'order_date', 'email', 'ip_addr', 'order_amount'],
         ['email', 'ip_addr']),
        ('email',
         ['order_id', 'order_date', 'customer_id', 'ip_addr', 'order_amount'],
         ['customer_id']),
    )
    features = {}
    try:
        for by, read_cols, count_cols in plans:
            path = job.partitioned.get(by)
            if not path:
                continue
            logger.info(context, f"Calculating features on {by}...")
            partitions = dd.read_parquet(path,
                                         engine='fastparquet',
                                         columns=read_cols)
            enriched = partitions.map_partitions(
                calculate, count_cols=count_cols, stat_cols=['order_amount'], by=by)
            save_path = f'{base_path}/{by}/'
            enriched.to_parquet(save_path)
            features[by] = save_path
            logger.info(context, f"Saved {save_path}.")

        return FeatureCalcJob(
            sources=job.partitioned,
            features=features
        )
    except Exception as e:
        logger.error(context, e)
        return None
    finally:
        client.close()

### Test from notebook

In [None]:
from hopeit.testing.apps import config, execute_event

# Load the app configuration and build a job pointing at the partitioned datasets.
app_config = config('config/training-pipeline.json')
job = PreprocessingJob(source='./data/raw', partitioned={
    'customer_id': './data/partitioned/customer_id/', 
    'email': './data/partitioned/email'
})
# Execute the 'data.feature-calc' event with that job and display its result.
result = await execute_event(app_config, 'data.feature-calc', job)
result

2021-05-18 20:27:03,391 | INFO | fraud-poc training data.feature-calc ALT00617 75592 | Calculating features on customer_id... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2021-05-18T20:27:02.159375+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=
2021-05-18 20:27:04,473 | INFO | fraud-poc training data.feature-calc ALT00617 75592 | Saved ./data/features/customer_id/. | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2021-05-18T20:27:02.159375+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=
2021-05-18 20:27:04,473 | INFO | fraud-poc training data.feature-calc ALT00617 75592 | Calculating features on email... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2021-05-18T20:27:02.159375+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=
2021-05-18 20:27:05,344 | INFO | fraud-poc training data.feature-calc ALT00617 755

FeatureCalcJob(sources={'customer_id': './data/partitioned/customer_id/', 'email': './data/partitioned/email'}, features={'customer_id': './data/features/customer_id/', 'email': './data/features/email/'})

In [None]:
# Show the first rows of the feature dataset saved for customer_id.
df = dd.read_parquet(result.features['customer_id'])
df.head()

Unnamed: 0,order_id,order_date,email,ip_addr,order_amount,customer_id,email_by_customer_id,ip_addr_by_customer_id,num_email_by_customer_id,num_ip_addr_by_customer_id,...,same_email_by_customer_id,same_ip_addr_by_customer_id,known_email_by_customer_id,known_ip_addr_by_customer_id,order_amount_mean_by_customer_id,order_amount_std_by_customer_id,order_amount_min_by_customer_id,order_amount_max_by_customer_id,order_amount_sum_by_customer_id,order_amount_by_customer_id
0,7beaa6ea-27ea-4082-a0ca-e216b93521a1,2021-05-07 22:49:55+00:00,af9ec8893d8c212276b9916b2599efe88510207d,e18142d2fcd6626e86d40573fd268c2f5f449a60,301.888156,00c00d42-6923-4297-87c4-107bc70035be,"[""af9ec8893d8c212276b9916b2599efe88510207d""]","[""e18142d2fcd6626e86d40573fd268c2f5f449a60""]",1,1,...,0,0,0,0,301.888156,0.0,301.888156,301.888156,301.888156,[301.8881562196214]
1,4069005c-02d0-4f41-b065-ad4b82412f20,2021-05-07 05:29:05+00:00,30650ad8775163a27cc73b642d3de1f46a337aaa,110fef07ad03b67a2f30fcebd6ab190bc2d11252,86.057737,048e9104-8369-407b-a0a1-fb06c06760c1,"[""30650ad8775163a27cc73b642d3de1f46a337aaa""]","[""110fef07ad03b67a2f30fcebd6ab190bc2d11252""]",1,1,...,0,0,0,0,86.057737,0.0,86.057737,86.057737,86.057737,[86.05773660199678]
2,3e5bb9cc-bdef-4693-8870-49967d510ae3,2021-05-03 10:08:39+00:00,79cdeec4056ed0714127c4c255d9dafa334ee2a7,ee714f6b18d2b01e2d632f1299d8cb7618099070,945.67836,09da86be-63c3-4265-a799-8fe5efaf270c,"[""79cdeec4056ed0714127c4c255d9dafa334ee2a7""]","[""ee714f6b18d2b01e2d632f1299d8cb7618099070""]",1,1,...,0,0,0,0,945.67836,0.0,945.67836,945.67836,945.67836,[945.678359760513]
3,9cc8c9aa-e342-4e7a-8269-4ebb7bc0d208,2021-05-15 04:11:54+00:00,79cdeec4056ed0714127c4c255d9dafa334ee2a7,ee714f6b18d2b01e2d632f1299d8cb7618099070,955.929156,09da86be-63c3-4265-a799-8fe5efaf270c,"[""79cdeec4056ed0714127c4c255d9dafa334ee2a7"", ""...","[""ee714f6b18d2b01e2d632f1299d8cb7618099070"", ""...",1,1,...,1,1,1,1,950.803758,5.125398,945.67836,955.929156,1901.607516,"[945.678359760513, 955.9291560497809]"
4,5b3900c1-e2ca-4226-81ee-0aa47d825005,2021-05-07 09:52:48+00:00,86665ef2c644c5f6a50e7702a7d5b0b88ee3c68b,8a799f9e07fa26a47e84fd8f64ea20ae5601a489,525.625661,0c5e0c2c-fb79-4a60-94c4-501bf2d07278,"[""86665ef2c644c5f6a50e7702a7d5b0b88ee3c68b"", ""...","[""8a799f9e07fa26a47e84fd8f64ea20ae5601a489"", ""...",5,5,...,0,0,1,1,680.036988,236.531382,336.752082,997.119448,4760.258919,"[336.7520815122097, 821.1451274507584, 922.876..."


In [None]:
# Show the first rows of the feature dataset saved for email.
dd.read_parquet(result.features['email']).head()

Unnamed: 0,order_id,order_date,customer_id,ip_addr,order_amount,email,customer_id_by_email,num_customer_id_by_email,last_customer_id_by_email,same_customer_id_by_email,known_customer_id_by_email,order_amount_mean_by_email,order_amount_std_by_email,order_amount_min_by_email,order_amount_max_by_email,order_amount_sum_by_email,order_amount_by_email
0,564ca53c-92ae-44e9-b550-857bc3f32299,2021-05-07 16:47:55+00:00,1161afda-4b7f-498b-b07c-a19c8ff25383,2061d686466fd7bdd0c4ff12c0c0431adfb4b584,296.66598,06b27c6a29714eead08e96789b9ed37c6b33c67b,"[""1161afda-4b7f-498b-b07c-a19c8ff25383"", ""1161...",1,1161afda-4b7f-498b-b07c-a19c8ff25383,1,1,620.601978,323.935998,296.66598,944.537976,1241.203956,"[944.5379757024328, 296.6659799710525]"
1,78210c4e-1997-477f-92f7-c78da8f6e2b0,2021-05-12 00:37:48+00:00,1161afda-4b7f-498b-b07c-a19c8ff25383,e1bf51a3b72ef02c554f338b28ea67a61f2effa9,675.094589,06b27c6a29714eead08e96789b9ed37c6b33c67b,"[""1161afda-4b7f-498b-b07c-a19c8ff25383"", ""1161...",1,1161afda-4b7f-498b-b07c-a19c8ff25383,1,1,638.766181,265.737145,296.66598,944.537976,1916.298544,"[944.5379757024328, 296.6659799710525, 675.094..."
2,bab6d5be-14fe-4705-9765-ad5fc5a14b82,2021-05-15 01:52:11+00:00,1161afda-4b7f-498b-b07c-a19c8ff25383,e1bf51a3b72ef02c554f338b28ea67a61f2effa9,860.661434,06b27c6a29714eead08e96789b9ed37c6b33c67b,"[""1161afda-4b7f-498b-b07c-a19c8ff25383"", ""1161...",1,1161afda-4b7f-498b-b07c-a19c8ff25383,1,1,582.538511,315.696452,135.732576,944.537976,2912.692555,"[944.5379757024328, 296.6659799710525, 675.094..."
3,d12ec21f-c115-45c4-b5c9-904578faa733,2021-05-01 23:42:34+00:00,1161afda-4b7f-498b-b07c-a19c8ff25383,e1bf51a3b72ef02c554f338b28ea67a61f2effa9,944.537976,06b27c6a29714eead08e96789b9ed37c6b33c67b,"[""1161afda-4b7f-498b-b07c-a19c8ff25383""]",1,,0,0,944.537976,0.0,944.537976,944.537976,944.537976,[944.5379757024328]
4,c1adb330-64c2-40c1-931a-86ba8dde545d,2021-05-14 06:56:31+00:00,1161afda-4b7f-498b-b07c-a19c8ff25383,e1bf51a3b72ef02c554f338b28ea67a61f2effa9,135.732576,06b27c6a29714eead08e96789b9ed37c6b33c67b,"[""1161afda-4b7f-498b-b07c-a19c8ff25383"", ""1161...",1,1161afda-4b7f-498b-b07c-a19c8ff25383,1,1,513.00778,316.871739,135.732576,944.537976,2052.031121,"[944.5379757024328, 296.6659799710525, 675.094..."
