In [None]:
#default_exp live.predict

### Live Predict

> This module implements the POST endpoint for the Fraud Prediction Service. It is invoked with an Order payload;
the steps then concurrently retrieve the features calculated on `customer_id` and `email`, update those features on the fly using the received order information, and predict the `is_fraud` label using the trained XGBoost model.

In [None]:
#export
from typing import Dict, Optional
from datetime import datetime, timezone, timedelta
import os
import json
import pickle
import aioredis
import asyncio

import pandas as pd
import numpy as np
import xgboost as xgb
from dataclasses import dataclass
from hopeit.dataobjects import dataobject
from hopeit.server.serialization import serialize, Serialization, deserialize
from hopeit.server.compression import Compression
from hopeit.app.context import EventContext, PostprocessHook
from hopeit.app.events import Spawn, SHUFFLE
from hopeit.app.api import event_api
from hopeit.app.logger import app_logger

In [None]:
#export
# Request payload of this event ("Order Information" in the API spec below): one order to score.
# NOTE(review): documented with `#` comments instead of a class docstring on purpose -- a docstring
# may end up in the schema generated from this dataclass; confirm before adding one.
@dataobject
@dataclass
class OrderInfo:
    order_id: str
    customer_id: str  # database lookup key for the per-customer features (see lookup_features)
    order_date: datetime
    email: str  # database lookup key for the per-email features (see lookup_features)
    ip_addr: str  # appended to the stored `ip_addr_by_*` history (see _update_features)
    order_amount: float  # appended to the stored `order_amount_by_*` history (see _update_features)
    location_lat: float
    location_long: float


In [None]:
#export
# Names of the step functions (defined below) that make up this event.
__steps__ = ['lookup_features', 'predict']

# API specification of the endpoint: request payload type and documented responses.
__api__ = event_api(
    title="Live: Predict Fraud",
    payload=(OrderInfo, "Order Information"),
    responses={
        200: (dict, "features used for prediction contatining `is_fraud` field as result of prediction"),
        404: (str, "customer or email not found (this example only works for known customer_id and email)"),
    },
)

logger = app_logger()

# Module-level singletons, populated on first use by `__init_event__`.
model = None
db = None

# Columns (and column order) selected from the feature dict to build the model input in `predict`.
features = [
    'order_amount',
    # email history per customer_id
    'num_email_by_customer_id',
    'same_email_by_customer_id',
    'known_email_by_customer_id',
    # ip address history per customer_id
    'num_ip_addr_by_customer_id',
    'same_ip_addr_by_customer_id',
    'known_ip_addr_by_customer_id',
    # customer_id history per email
    'num_customer_id_by_email',
    'same_customer_id_by_email',
    'known_customer_id_by_email',
    # order amount statistics per customer_id
    'order_amount_mean_by_customer_id',
    'order_amount_std_by_customer_id',
    'order_amount_min_by_customer_id',
    'order_amount_max_by_customer_id',
    'order_amount_sum_by_customer_id',
    # order amount statistics per email
    'order_amount_mean_by_email',
    'order_amount_std_by_email',
    'order_amount_min_by_email',
    'order_amount_max_by_email',
    'order_amount_sum_by_email',
]

In [None]:
#export
async def __init_event__(context: EventContext):
    """
    Lazily initialise the module-level `model` and `db` singletons.

    Each resource is created only while its global is still `None`, so calling
    this again after a successful initialisation does nothing.

    :param context: EventContext; `context.env['model']['path']`, `context.env['model']['name']`
        and `context.env['db']['url']` are read from it.
    """
    global model, db

    if model is None:
        model_env = context.env['model']
        file_name = os.path.join(model_env['path'], model_env['name'])
        logger.info(context, f"Loading model for prediction from {file_name}...")
        # NOTE: unpickling can execute arbitrary code; the model file must come from a trusted location.
        with open(file_name, 'rb') as fb:
            model = pickle.load(fb)

    if db is None:
        address = context.env['db']['url']
        logger.info(context, f"Connecting to database {address}...")
        db = await aioredis.create_redis_pool(address)
        

In [None]:
#export
async def _lookup_db(key: str):
    """
    Fetch the entry stored under `key` in the module-level `db`.

    :param key: database key to read
    :return: the stored value deserialized (PICKLE4 serialization, LZ4 compression) as a dict,
        or None when the database has no entry for `key`
    """
    raw = await db.get(key)
    return None if raw is None else deserialize(raw, Serialization.PICKLE4, Compression.LZ4, dict)

In [None]:
#export
async def lookup_features(order: OrderInfo, context: EventContext) -> Optional[dict]:
    """
    Look up the stored features for the order's customer_id and email and refresh them with this order.

    Both database lookups run concurrently.

    :param order: OrderInfo, the order to be scored
    :param context: EventContext, used for logging
    :return: None when either lookup finds nothing; otherwise a single dict merging the updated
        email features, the updated customer_id features and the order fields. On key clashes
        the order fields win over customer_id features, which win over email features.
    """
    logger.info(context, "Looking up features in database...")
    assert db, "Connection to database missing."

    by_customer_id, by_email = await asyncio.gather(
        _lookup_db(order.customer_id),
        _lookup_db(order.email)
    )
    if any(found is None for found in (by_customer_id, by_email)):
        return None

    # Later updates override earlier ones, matching the precedence described above.
    merged = dict(_update_features(order, by_email, 'email'))
    merged.update(_update_features(order, by_customer_id, 'customer_id'))
    merged.update(order.to_dict())
    return merged

def _append(data: dict, k: str, new_item: str):
    x = data.get(k)
    if isinstance(x, str):
        x = json.loads(x)
        x.append(new_item)
        data[k] = list(set(x[-10:]))

def _update_features(order: OrderInfo, data: dict, by: str):
    """
    Fold the incoming order into the stored feature dict `data` and recompute derived features.

    The order's amount, ip address, email and customer id are appended to the matching
    `<column>_by_<by>` history lists (when present as JSON strings, see `_append`); counts and
    order amount statistics are then recalculated for both the 'customer_id' and 'email'
    groupings, each of which only touches keys that exist in `data`.

    :param order: OrderInfo, the order being scored
    :param data: stored features for one grouping key; mutated in place
    :param by: grouping the features were stored under ('email' or 'customer_id' in this module)
    :return: the same `data` dict, updated
    """
    new_values = (
        ('order_amount', order.order_amount),
        ('ip_addr', order.ip_addr),
        ('email', order.email),
        ('customer_id', order.customer_id),
    )
    for col, value in new_values:
        _append(data, f'{col}_by_{by}', value)

    # Counts first, then amount statistics; each for customer_id and then email.
    for recalc in (_calc_counts, _calc_amount_stats):
        for grouping in ('customer_id', 'email'):
            recalc(data, grouping)
    return data

def _calc_counts(data: dict, by: str):
    for col in ['ip_addr', 'customer_id', 'email']:
        x = data.get(f'{col}_by_{by}')
        if x is not None:
            data[f'num_{col}_by_{by}'] = len(x)

def _calc_amount_stats(data: dict, by: str):
    col = 'order_amount'
    x = data.get(f'{col}_by_{by}')
    if x is not None:
        x = np.array(x)
        data[f'{col}_max_by_{by}'] = np.max(x) 
        data[f'{col}_min_by_{by}'] = np.min(x) 
        data[f'{col}_mean_by_{by}'] = np.mean(x) 
        data[f'{col}_std_by_{by}'] = np.std(x) 
        data[f'{col}_sum_by_{by}'] = np.sum(x) 


In [None]:
#export
async def predict(data: dict, context: EventContext) -> dict:
    """
    Score the looked-up features with the module-level model.

    A one-row DataFrame restricted to the `features` columns is built from `data`
    and passed to the model as an `xgb.DMatrix`.

    :param data: feature dict produced by `lookup_features`; mutated in place
    :param context: EventContext (not used by this step)
    :return: the same `data` dict with the prediction stored under 'is_fraud'
    """
    frame = pd.DataFrame([data], columns=features)
    prediction = model.predict(xgb.DMatrix(frame))
    # `.item()` turns the first prediction into a plain Python scalar.
    data['is_fraud'] = prediction[0].item()
    return data
    

In [None]:
#export
async def __postprocess__(payload: Optional[dict], context: EventContext, response: PostprocessHook) -> dict:
    if payload is None:
        response.status = 404
        return "customer or email not found (this example only works for known customer_id and email)"
    return payload
        

### Test from notebook

In [None]:
from hopeit.testing.apps import config, execute_event
from fraud_poc.live.predict import OrderInfo
from datetime import datetime, timezone
import uuid
 
def new_key(): return str(uuid.uuid4())

app_config = config('config/fraud-service.json')
payload = OrderInfo(
    order_id=new_key(),
    customer_id='d555b585-5511-4a16-9f22-819834110239',
    order_date=datetime.now(tz=timezone.utc),
    email='1f5d34b02ef1975d5a82dcfe2e53fad6182e118c',
    ip_addr='test',
    order_amount=100.0,
    location_lat=0.0,
    location_long=0.0
) 

result = await execute_event(app_config, 'live.predict', payload)
result

2020-07-08 11:37:25,377 | INFO | fraud-poc 0.0.1-service live.predict leo-legion 38272 | __init_event__ module=fraud_poc.live.predict... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-08T11:37:25.377008+00:00
2020-07-08 11:37:25,377 | INFO | fraud-poc 0.0.1-service live.predict leo-legion 38272 | Loading model for prediction from ./data/model/xgb/latest-ok.pkl... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-08T11:37:25.377008+00:00
2020-07-08 11:37:25,382 | INFO | fraud-poc 0.0.1-service live.predict leo-legion 38272 | Connecting to database redis://localhost:6379... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-08T11:37:25.377008+00:00
2020-07-08 11:37:25,384 | INFO | fraud-poc 0.0.1-service live.predict leo-legion 38272 | Looking up features in database... | track.operation_id=test_operation_id | track.request_id=test_req

{'order_id': 'bb2de6c6-d655-4e75-8949-4ed792c024e5',
 'order_date': '2020-07-08T11:37:25.376310+00:00',
 'customer_id': 'd555b585-5511-4a16-9f22-819834110239',
 'ip_addr': 'test',
 'order_amount': 100.0,
 'email': '1f5d34b02ef1975d5a82dcfe2e53fad6182e118c',
 'customer_id_by_email': ['d555b585-5511-4a16-9f22-819834110239'],
 'num_customer_id_by_email': 1,
 'last_customer_id_by_email': 'd555b585-5511-4a16-9f22-819834110239',
 'same_customer_id_by_email': 1,
 'known_customer_id_by_email': 1,
 'order_amount_mean_by_email': 438.30120590151927,
 'order_amount_std_by_email': 274.2623429763596,
 'order_amount_min_by_email': 13.426760485169664,
 'order_amount_max_by_email': 879.7319538388477,
 'order_amount_sum_by_email': 4383.012059015193,
 'order_amount_by_email': [387.7647884248259,
  100.0,
  202.94798232836487,
  13.426760485169664,
  591.5459546297528,
  879.7319538388477,
  722.9025680336106,
  627.8375917324881,
  249.78620396645113,
  607.0682555756823],
 'key': 'd555b585-5511-4a16-9f2

In [None]:
# Second scenario: same email as the previous test cell, but a different customer_id,
# ip_addr and order_amount.
# Relies on `new_key` and `app_config` defined in the previous test cell.
payload = OrderInfo(
    order_id=new_key(),
    customer_id='002d6c90-2680-494a-b65e-c969c48277c8',
    order_date=datetime.now(tz=timezone.utc),
    email='1f5d34b02ef1975d5a82dcfe2e53fad6182e118c',
    ip_addr='c7ee6b54b39b1b5edefa3aa85d24665ed8feadd5',
    order_amount=10.0,
    location_lat=0.0,
    location_long=0.0
) 

# Run the 'live.predict' event and display its result as the cell output.
result = await execute_event(app_config, 'live.predict', payload)
result

2020-07-08 11:38:45,990 | INFO | fraud-poc 0.0.1-service live.predict leo-legion 38272 | __init_event__ module=fraud_poc.live.predict... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-08T11:38:45.990505+00:00
2020-07-08 11:38:45,991 | INFO | fraud-poc 0.0.1-service live.predict leo-legion 38272 | Looking up features in database... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-08T11:38:45.990505+00:00


{'order_id': '29b1eb9a-17ed-4ea5-96b7-70b5c274dae5',
 'order_date': '2020-07-08T11:38:45.990197+00:00',
 'customer_id': '002d6c90-2680-494a-b65e-c969c48277c8',
 'ip_addr': 'c7ee6b54b39b1b5edefa3aa85d24665ed8feadd5',
 'order_amount': 10.0,
 'email': '1f5d34b02ef1975d5a82dcfe2e53fad6182e118c',
 'customer_id_by_email': ['002d6c90-2680-494a-b65e-c969c48277c8',
  'd555b585-5511-4a16-9f22-819834110239'],
 'num_customer_id_by_email': 2,
 'last_customer_id_by_email': 'd555b585-5511-4a16-9f22-819834110239',
 'same_customer_id_by_email': 1,
 'known_customer_id_by_email': 1,
 'order_amount_mean_by_email': 429.30120590151927,
 'order_amount_std_by_email': 286.4232086984392,
 'order_amount_min_by_email': 10.0,
 'order_amount_max_by_email': 879.7319538388477,
 'order_amount_sum_by_email': 4293.012059015193,
 'order_amount_by_email': [387.7647884248259,
  202.94798232836487,
  10.0,
  13.426760485169664,
  591.5459546297528,
  879.7319538388477,
  722.9025680336106,
  627.8375917324881,
  249.7862039