In [None]:
#default_exp data.prepare_db

In [None]:
!nbdev_build_lib

Converted 00_jobs.ipynb.
Converted 01-create-sample-data.ipynb.
Converted 02-preprocess.ipynb.
Converted 03-feature-calc.ipynb.
Converted 04-training-data.ipynb.
Converted 05-train-model.ipynb.
Converted 06-submit-training-pipeline.ipynb.
Converted 07-prepare-db.ipynb.
Converted 99-tools.ipynb.
Converted index.ipynb.


In [None]:
#export
from typing import Dict
from datetime import datetime, timezone, timedelta
import random
import math
import dask.dataframe as dd
import numpy as np
import redis

from hopeit.server.serialization import serialize, Serialization, deserialize
from hopeit.server.compression import Compression
from hopeit.app.context import EventContext
from hopeit.app.events import Spawn, SHUFFLE
from hopeit.app.api import event_api
from hopeit.app.logger import app_logger
from hopeit.toolkit.storage.redis import RedisStorage

from fraud_poc.jobs import get_client, FeatureCalcJob, PrepareDbJob

In [None]:
#export
# Names of the functions that make up this event; only `update_database` is listed.
# NOTE(review): presumably read by the hopeit engine to discover the event steps — confirm.
__steps__ = ['update_database']

# Module-level logger obtained from hopeit's `app_logger`; used by `update_database`.
logger = app_logger()


In [None]:
#export
def _save_values_by_key(key, path, db_host, db_port):
    """Store the latest feature row per `key` value found under `path`.

    Reads the parquet dataset at `path` with the fastparquet engine, copies the
    `key` column into a column literally named 'key', and runs
    `_foreach_partition` over every partition so the rows get written to the
    database at `db_host`:`db_port`.

    Returns:
        A `(key, count)` tuple, where `count` is the computed `.count()` of the
        values produced by all partitions.
    """
    frame = dd.read_parquet(path, engine='fastparquet')
    frame['key'] = frame[key]
    # meta describes the per-partition output: an object Series named 'value'
    persisted = frame.map_partitions(
        _foreach_partition, db_host, db_port, meta=('value', object)
    )
    saved_count = persisted.count().compute().item()
    return (key, saved_count)
            
def _foreach_partition(df, db_host, db_port):
    """Persist the last row per key of one partition to Redis.

    Opens a Redis connection (database 0) for this partition, groups the rows
    by the 'key' column, keeps the row `_last_item` selects for each group and
    stores each of those rows through `_persist`.

    Args:
        df: pandas DataFrame partition; must contain a 'key' column and the
            'order_date' column that `_last_item` sorts by.
        db_host: Redis host name.
        db_port: Redis port.

    Returns:
        The result of applying `_persist` row-wise to the selected rows
        (one entry per persisted row).
    """
    db = redis.Redis(host=db_host, port=db_port, db=0)
    try:
        items = df.groupby(['key'])[df.columns].apply(_last_item)
        return items.apply(lambda x: _persist(x, db), axis=1)
    finally:
        # Always release the connection, even when grouping or a write fails;
        # previously an exception skipped close() and leaked the connection.
        db.close()

def _last_item(group):
    group = group.sort_values('order_date')
    return group.tail(1)     

def _persist(item, db):
    """Write one row to the database under its 'key' value.

    The row is converted to a dict, serialized with PICKLE4 and compressed with
    LZ4, then stored via `db.set` using the dict's 'key' entry as the key.

    Returns:
        The dict form of `item` that was stored.
    """
    record = item.to_dict()
    db.set(record['key'], serialize(record, Serialization.PICKLE4, Compression.LZ4))
    return record

In [None]:
#export
async def update_database(job: FeatureCalcJob, context: EventContext):
    """Save the latest state of every feature set in `job` to the database.

    The database host and port are read from `context.env['db']`. For each
    `(key, path)` entry in `job.features`, a `_save_values_by_key` task is
    submitted to the client returned by `get_client`, and all results are
    gathered afterwards.

    Returns:
        A `PrepareDbJob` carrying the feature paths, the `host:port` address
        and a dict of saved counts per key; `None` if any exception is raised
        (the exception is logged). The client is closed in every case.
    """
    client = get_client(context)
    db_settings = context.env['db']
    db_host = db_settings['host']
    db_port = db_settings['port']
    logger.info(context, f"Preparing to save to database {db_host}:{db_port}...")
    try:
        futures = []
        for key, path in job.features.items():
            logger.info(context, f"Saving latest state for {key} features...")
            future = client.submit(_save_values_by_key, key, path, db_host, db_port)
            futures.append(future)
        # Each task returns a (key, count) pair, so the gathered list maps directly to a dict
        saved_counts = dict(client.gather(futures))
        return PrepareDbJob(
            features=job.features,
            db=f'{db_host}:{db_port}',
            saved=saved_counts,
        )
    except Exception as err:
        logger.error(context, err)
        return None
    finally:
        client.close()
    

### Test from notebook

In [None]:
from hopeit.testing.apps import config, execute_event

# Load the app configuration used for this notebook test run
app_config = config('config/training-pipeline.json')
# Input job: for each key (customer_id, email) the partitioned source path and the features path
job = FeatureCalcJob(sources={'customer_id': './data/partitioned/customer_id/', 'email': './data/partitioned/email'}, 
                     features={'customer_id': './data/features/customer_id/', 'email': './data/features/email/'})
# Execute the 'data.prepare-db' event with the job as payload; the last line displays the result
result = await execute_event(app_config, 'data.prepare-db', job)
result

2020-07-08 08:40:04,055 | INFO | fraud-poc 0.0.1-training data.prepare-db leo-legion 13835 | Preparing to save to database localhost:6379... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-08T08:40:03.311409+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=
2020-07-08 08:40:04,056 | INFO | fraud-poc 0.0.1-training data.prepare-db leo-legion 13835 | Saving latest state for customer_id features... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-08T08:40:03.311409+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=
2020-07-08 08:40:04,056 | INFO | fraud-poc 0.0.1-training data.prepare-db leo-legion 13835 | Saving latest state for email features... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-08T08:40:03.311409+00:00 | stream.name= | stream.msg_id= | stream.consumer_group=


PrepareDbJob(features={'customer_id': './data/features/customer_id/', 'email': './data/features/email/'}, db='localhost:6379', saved={'customer_id': 6910, 'email': 12000})

In [None]:
# Customer id aggregation: connect to the local Redis instance (db 0), fetch the payload
# stored under one customer_id key and decode it (PICKLE4 + LZ4) back into a dict
db = redis.Redis(host='localhost', port=6379, db=0)
item = db.get('d555b585-5511-4a16-9f22-819834110239')
deserialize(item, Serialization.PICKLE4, Compression.LZ4, dict)

{'order_id': '758bc444-e538-41a2-81a2-1c19f269cd85',
 'order_date': Timestamp('2020-07-06 09:05:20+0000', tz='UTC'),
 'email': '1f5d34b02ef1975d5a82dcfe2e53fad6182e118c',
 'ip_addr': '4efa50345ed594d6f817f909e1b60473cb9d6c39',
 'order_amount': 879.7319538388477,
 'customer_id': 'd555b585-5511-4a16-9f22-819834110239',
 'email_by_customer_id': "['239b8d32e104be4b7c0cc65e2cbc1ead42522578', '239b8d32e104be4b7c0cc65e2cbc1ead42522578', 'd33cfd28bb631654b8e59ac97a958555ad39ea78', '1f5d34b02ef1975d5a82dcfe2e53fad6182e118c', '63552fd2e3d7e9e71e761c50f8b7ff1c07f39abb', '239b8d32e104be4b7c0cc65e2cbc1ead42522578', '63552fd2e3d7e9e71e761c50f8b7ff1c07f39abb', '63552fd2e3d7e9e71e761c50f8b7ff1c07f39abb', '63552fd2e3d7e9e71e761c50f8b7ff1c07f39abb', '1f5d34b02ef1975d5a82dcfe2e53fad6182e118c']",
 'ip_addr_by_customer_id': "['6f63bf7b774f8594744b5d1fc17a0e1c43cfd809', '6f63bf7b774f8594744b5d1fc17a0e1c43cfd809', '1404f14e3e04cdbbfadcd3ea70f014226bb9ac26', '4efa50345ed594d6f817f909e1b60473cb9d6c39', '270e3d

In [None]:
# Email aggregations: fetch the payload stored under one email key and decode it
# (PICKLE4 + LZ4) into a dict; relies on the `db` connection opened in the previous cell
item = db.get('1f5d34b02ef1975d5a82dcfe2e53fad6182e118c')
deserialize(item, Serialization.PICKLE4, Compression.LZ4, dict)

{'order_id': '758bc444-e538-41a2-81a2-1c19f269cd85',
 'order_date': Timestamp('2020-07-06 09:05:20+0000', tz='UTC'),
 'customer_id': 'd555b585-5511-4a16-9f22-819834110239',
 'ip_addr': '4efa50345ed594d6f817f909e1b60473cb9d6c39',
 'order_amount': 879.7319538388477,
 'email': '1f5d34b02ef1975d5a82dcfe2e53fad6182e118c',
 'customer_id_by_email': "['d555b585-5511-4a16-9f22-819834110239', 'd555b585-5511-4a16-9f22-819834110239', 'd555b585-5511-4a16-9f22-819834110239', 'd555b585-5511-4a16-9f22-819834110239', 'd555b585-5511-4a16-9f22-819834110239', 'd555b585-5511-4a16-9f22-819834110239', 'd555b585-5511-4a16-9f22-819834110239', 'd555b585-5511-4a16-9f22-819834110239', 'd555b585-5511-4a16-9f22-819834110239', 'd555b585-5511-4a16-9f22-819834110239']",
 'num_customer_id_by_email': 1,
 'last_customer_id_by_email': 'd555b585-5511-4a16-9f22-819834110239',
 'same_customer_id_by_email': 1,
 'known_customer_id_by_email': 1,
 'order_amount_mean_by_email': 516.7076306505403,
 'order_amount_std_by_email': 278