In [None]:
#default_exp data.make_sample_data

In [None]:
!nbdev_build_lib

Converted 00_jobs.ipynb.
Converted 01-create-sample-data.ipynb.
Converted 02-preprocess.ipynb.
Converted 03-feature-calc.ipynb.
Converted 04-training-data.ipynb.
Converted 05-train-model.ipynb.
Converted 99-tools.ipynb.
Converted index.ipynb.


In [None]:
#export
from datetime import datetime, timezone, timedelta
import random
import uuid
import hashlib
import os

import numpy as np
import pandas as pd

from hopeit.app.context import EventContext
from hopeit.app.events import Spawn, SHUFFLE
from hopeit.app.api import event_api
from hopeit.app.logger import app_logger

from fraud_poc.jobs import get_client_async, MakeSampleDataJob

In [None]:
#export
# Step names processed for this event, in order. 'submit_job' and 'make_batches'
# are the two coroutines defined further down in this module.
# NOTE(review): SHUFFLE comes from hopeit.app.events; presumably it splits the
# event into two stages so 'make_batches' runs separately from the request
# handled by 'submit_job' -- confirm against the hopeit documentation.
__steps__ = ['submit_job', SHUFFLE, 'make_batches']

# API specification for the event. The query argument names mirror the keyword
# parameters of `submit_job`; a 200 response carries the submitted
# MakeSampleDataJob.
__api__ = event_api(
    title="Data: Make sample data",
    query_args=[
        ("num_batches", int, "Number of batches/files to make"),
        ("batch_size", int, "Number of rows per batch"),
        ("batch_span_days", int, "Number of time span in days for order_date in batch"),
        ("num_customers", int, "Number of customers to generate"),
        ("num_emails", int, "Number of emails to generate"),
        ("num_ips", int, "Number of IP addresses to generate")
    ],
    responses={
        200: (MakeSampleDataJob, "Job submitted")
    }
)

# Module-level logger used by the steps below; every call passes the event
# context as its first argument.
logger = app_logger()

In [None]:
#export
class OrderRandomSource:
    """Random generator of synthetic orders drawn from fixed pools of identities.

    On construction the source creates `num_customers` customer ids, `num_emails`
    email tokens and `num_ips` IP tokens. Every email is bound to one randomly
    chosen customer and one randomly chosen "usual" IP for the lifetime of the
    instance. Orders are dated uniformly (to the second) inside the window
    ``[to_date - days_span, to_date]`` where ``to_date = now(UTC) - days_ago``.

    Calling the instance with ``n`` returns a DataFrame of ``n`` random orders
    whose columns follow `meta()`.
    """

    @staticmethod
    def new_uuid():
        """Return a fresh random UUID4 as a string."""
        return str(uuid.uuid4())

    @staticmethod
    def new_hash():
        """Return the SHA-1 hex digest of a fresh random UUID4 (opaque 40-char token)."""
        return hashlib.sha1(str(uuid.uuid4()).encode()).hexdigest()

    def __init__(self, num_customers:int, num_ips:int, num_emails:int,
                 days_ago:int = 0, days_span:int = 30):
        """Build the identity pools and the order date window.

        Args:
            num_customers: number of distinct customer ids to generate (>= 1).
            num_ips: number of distinct IP tokens to generate (>= 1).
            num_emails: number of distinct email tokens to generate (>= 1).
            days_ago: how many days before now the date window ends.
            days_span: length of the date window in days.

        Raises:
            ValueError: if any of the pool sizes is smaller than 1.
        """
        # Fail fast with a readable message; otherwise an empty pool only surfaces
        # later as "empty range for randrange()" from inside the random module.
        for label, count in (('num_customers', num_customers),
                             ('num_ips', num_ips),
                             ('num_emails', num_emails)):
            if count < 1:
                raise ValueError(f"{label} must be at least 1, got {count}")

        self.num_customers = num_customers
        self.num_ips = num_ips
        self.num_emails = num_emails
        # Bounding box from which order coordinates are sampled uniformly.
        self.lat_min, self.lat_max = -20., -10.
        self.long_min, self.long_max = -20., -10.
        # Date window, kept both as aware datetimes and as whole epoch seconds.
        self.to_date = datetime.now(tz=timezone.utc) - timedelta(days=days_ago)
        self.to_date_epoch = int(self.to_date.timestamp())
        self.from_date = self.to_date - timedelta(days=days_span)
        self.from_date_epoch = int(self.from_date.timestamp())
        # Identity pools; the email_to_* lists are index-aligned with self.emails.
        self.customer_ids = [self.new_uuid() for _ in range(self.num_customers)]
        self.emails = [self.new_hash() for _ in range(self.num_emails)]
        self.email_to_customer = [random.choice(self.customer_ids) for _ in range(self.num_emails)]
        self.ips = [self.new_hash() for _ in range(self.num_ips)]
        self.email_to_ip = [random.choice(self.ips) for _ in range(self.num_emails)]

    def random_order(self):
        """Return one random order as a dict keyed like `meta()`."""
        email_idx = random.randint(0, self.num_emails-1)
        email = self.emails[email_idx]
        customer_id = self.email_to_customer[email_idx]
        # About 10% of orders come from a brand-new IP token instead of the
        # one bound to the email.
        ip_addr = self.email_to_ip[email_idx] if random.random() > 0.1 else self.new_hash()
        date_epoch = random.randint(self.from_date_epoch, self.to_date_epoch)
        date = datetime.fromtimestamp(date_epoch, tz=timezone.utc)
        lat = self.lat_min + random.random() * (self.lat_max - self.lat_min)
        long = self.long_min + random.random() * (self.long_max - self.long_min)
        amount = random.random() * 1000.
        return {
            'order_id': self.new_uuid(),
            'order_date': date,
            'customer_id': customer_id,
            'email': email,
            'ip_addr': ip_addr,
            'location_lat': lat,
            'location_long': long,
            'order_amount': amount
        }

    def _generate_orders(self, n: int):
        """Lazily yield `n` random orders."""
        for _ in range(n):
            yield self.random_order()

    def __call__(self, n: int):
        """Return a DataFrame with `n` random orders.

        The column list is passed explicitly so that a zero-row result still
        carries the order columns instead of being a column-less frame.
        """
        return pd.DataFrame(list(self._generate_orders(n)), columns=list(self.meta()))

    @staticmethod
    def meta():
        """Return the column name -> dtype mapping describing generated orders."""
        return {'order_id': object,
               'order_date': 'datetime64[ns, UTC]',
               'customer_id': object,
               'email': object,
               'ip_addr': object,
               'location_lat': float,
               'location_long': float,
               'order_amount': float}

In [None]:
#export
def _make_batch(path: str, i: int, size: int, span_days: int, num_customers: int, num_emails: int, num_ips: int):
    """Generate one batch of random orders and store it as a parquet file.

    Batch `i` covers the `span_days` days ending `i * span_days` days before now,
    so consecutive batches cover adjacent date windows.

    Args:
        path: output directory; created if it does not exist.
        i: zero-based batch number, used for the date offset and the file name.
        size: number of orders (rows) to generate.
        span_days: length in days of the date window covered by the batch.
        num_customers: number of customer ids in the random pool.
        num_emails: number of email tokens in the random pool.
        num_ips: number of IP tokens in the random pool.

    Returns:
        The name of the parquet file that was written.
    """
    os.makedirs(path, exist_ok=True)
    days_ago = i * span_days
    # Pass the span through as well: previously only the offset used span_days
    # while the window length stayed at the source's 30-day default, so batches
    # overlapped whenever span_days was not 30.
    random_orders = OrderRandomSource(days_ago=days_ago, days_span=span_days,
                                      num_customers=num_customers, num_emails=num_emails, num_ips=num_ips)
    df = random_orders(size)
    file_name = f'{path}/batch{i:02}.parquet'
    df.to_parquet(file_name, engine='fastparquet', compression='LZ4')
    return file_name

In [None]:
#export
async def submit_job(payload: None, context: EventContext,
                     num_batches: int = 12, batch_size: int = 100000, batch_span_days: int = 30,
                     num_customers: int = 10000, num_emails: int = 10000, num_ips: int = 10000) -> MakeSampleDataJob:
    """Build the job description for a sample-data generation run.

    The output location is read from the app environment (`data` -> `raw`).
    Every numeric argument is passed through `int()` before being stored in
    the job -- presumably because query arguments can arrive as text; confirm
    against the framework.

    Returns:
        A MakeSampleDataJob holding the output path and the six numeric settings.
    """
    raw_path = context.env['data']['raw']
    settings = (num_batches, batch_size, batch_span_days,
                num_customers, num_emails, num_ips)
    return MakeSampleDataJob(raw_path, *(int(value) for value in settings))

async def make_batches(job: MakeSampleDataJob, context: EventContext) -> MakeSampleDataJob:
    """Run `_make_batch` once per batch of the job through the Dask client.

    All batches are submitted first and then awaited in submission order. If
    anything raises while submitting or awaiting, the error is logged and None
    is returned instead of the job. The client is closed in every case.
    """
    logger.info(context, f"Executing: {job}...")
    client = await get_client_async(context)
    logger.info(context, f"Dask: {client}")
    try:
        pending = []
        for batch_idx in range(job.num_batches):
            logger.info(context, f"Submitting batch {batch_idx}...")
            future = client.submit(
                _make_batch, job.path, batch_idx, job.batch_size, job.batch_span_days,
                job.num_customers, job.num_emails, job.num_ips,
            )
            pending.append(future)

        # Each awaited future resolves to the file name returned by _make_batch.
        for future in pending:
            written_file = await future
            logger.info(context, f"Done batch: {written_file}.")
        return job
    except Exception as exc:
        logger.error(context, exc)
        return None
    finally:
        await client.close()

### Test from notebook

In [None]:
from hopeit.testing.apps import config, execute_event

app_config = config('config/training-pipeline.json')
result = await execute_event(app_config, 'data.make-sample-data', None,
                            num_batches=2, batch_size=1000, batch_span_days=10,
                            num_customers=1000, num_emails=1000, num_ips=1000)
result

2020-07-06 17:30:25,495 | INFO | fraud-poc 0.0.1 data.make-sample-data leo-legion 23143 | Executing: MakeSampleDataJob(path='./data/raw', num_batches=2, batch_size=1000, batch_span_days=10, num_customers=1000, num_emails=1000, num_ips=1000)... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-06T17:30:25.495271+00:00
2020-07-06 17:30:26,358 | INFO | fraud-poc 0.0.1 data.make-sample-data leo-legion 23143 | Dask: <Client: 'tcp://127.0.0.1:42995' processes=4 threads=12, memory=33.54 GB> | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-06T17:30:25.495271+00:00
2020-07-06 17:30:26,359 | INFO | fraud-poc 0.0.1 data.make-sample-data leo-legion 23143 | Submitting batch 0... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-06T17:30:25.495271+00:00
2020-07-06 17:30:26,360 | INFO | fraud-poc 0.0.1 data.make-sample-data leo-legion 23143 | Submit

MakeSampleDataJob(path='./data/raw', num_batches=2, batch_size=1000, batch_span_days=10, num_customers=1000, num_emails=1000, num_ips=1000)

In [None]:
import dask.dataframe as dd
# Load the parquet output found under the job's path and preview the first rows.
sample_orders = dd.read_parquet(result.path)
sample_orders.head()

Unnamed: 0,order_id,order_date,customer_id,email,ip_addr,location_lat,location_long,order_amount
0,553b040c-8ef2-42a5-821f-3fdbbd4ec541,2020-06-19 13:06:14+00:00,8aec2e63-dee3-4c33-b6aa-2c328b2f865d,57ac73a74893620f3a66b49e6e1ed5964295323d,7794d10e34f9622a6b401b017017889a11a8008e,-12.242685,-19.596976,294.018549
1,97772a55-65cd-4fa7-9613-8153bd50cda2,2020-07-02 01:01:41+00:00,e07a9bc9-b3f5-48dc-a2b7-58596eeb6e42,0152ff1e2ca30af702efa7f73dafbedb40dce96d,79beb49e0b0697b2de9fff990f424e57f05cbc9f,-16.011192,-11.467686,585.058626
2,11e08d12-5bb7-4f65-a221-1a6c5706710d,2020-06-19 19:13:22+00:00,3b5e3f8e-306a-4d0d-8e2b-b7515caf6335,901aa07dd26964f5713cbb7dca66edf3399ef402,1a3f8c2b0ceace2c6b43f1d0ba88c2afa1a02a0c,-11.978958,-10.821121,989.673632
3,5016ac35-6cf9-4085-a996-831a41a1fec2,2020-06-20 06:26:18+00:00,1284d7ee-4010-4817-b058-c02d3db45a0c,e117f113921af6299b94f845fb22df8537fa48c3,15f040965da391a4e775ccb5204f6bc3544a11b8,-10.705637,-17.642378,69.708105
4,cc617408-985a-44b0-a088-af7229250eb4,2020-06-30 14:07:22+00:00,7c25b15a-e951-41f4-917b-32c01ca82480,ba8a31f7519786bd9a1149597fe8641118748acb,615f8f39702408a154cdc11607a3e72ee4358bd0,-13.502691,-11.868797,857.236904
