In [None]:
#default_exp data.make_sample_data

### Create Sample Data
> This module creates batches of randomly generated orders and saves them to Parquet files using Dask

In [None]:
#export
from datetime import datetime, timezone, timedelta
import random
import uuid
import hashlib
import os
from copy import copy

import numpy as np
import pandas as pd

from hopeit.app.context import EventContext
from hopeit.app.events import Spawn, SHUFFLE
from hopeit.app.api import event_api
from hopeit.app.logger import app_logger

from fraud_poc.jobs import get_client_async, MakeSampleDataJob

In [None]:
#export
# Processing steps for this event, in order. `submit_job` and `make_batches` are the
# functions defined further down in this module; SHUFFLE is imported from hopeit.app.events.
# NOTE(review): SHUFFLE presumably marks a stage boundary so that `make_batches` runs
# decoupled from the request handled by `submit_job` -- confirm against hopeit docs.
__steps__ = ['submit_job', SHUFFLE, 'make_batches']

# API specification for the event: every query argument is declared as an int and mirrors
# a keyword parameter of `submit_job`; a 200 response carries a MakeSampleDataJob.
__api__ = event_api(
    summary="Data: Make sample data",
    query_args=[
        ("num_batches", int, "Number of batches/files to make"),
        ("batch_size", int, "Number of rows per batch"),
        ("batch_span_days", int, "Number of time span in days for order_date in batch"),
        ("num_customers", int, "Number of customers to generate"),
        ("num_emails", int, "Number of emails to generate"),
        ("num_ips", int, "Number of IP addresses to generate")
    ],
    responses={
        200: (MakeSampleDataJob, "Job submitted")
    }
)

# Module-level logger obtained from hopeit's app_logger(); used by the step functions below.
logger = app_logger()

In [None]:
#export
class OrderRandomSource:
    """Source of randomly generated order records.

    An instance is configured with a date window and pre-generated pools of
    customer ids, emails and IP identifiers. Calling the instance with `n`
    returns a pandas DataFrame with `n` random orders whose columns are the
    keys of `meta()`.
    """

    @staticmethod
    def new_uuid():
        """Return a new random UUID (version 4) rendered as a string."""
        return str(uuid.uuid4())

    @staticmethod
    def new_hash():
        """Return the SHA-1 hex digest of a new random UUID string."""
        return hashlib.sha1(str(uuid.uuid4()).encode()).hexdigest()

    def __init__(self, days_ago: int, days_span: int,
                 customer_ids: list, emails: list, email_to_customer: list, ips, email_to_ip: list):
        """Configure the date window and the value pools orders are drawn from.

        Args:
            days_ago: the window ends this many days before the current UTC time.
            days_span: length of the window in days (it starts `days_span` days before its end).
            customer_ids: pool of customer identifiers (only its size is recorded for generation).
            emails: pool of emails; every order picks one of these uniformly at random.
            email_to_customer: customer id for each email, indexed like `emails`.
            ips: pool of IP identifiers (only its size is recorded for generation).
            email_to_ip: usual IP identifier for each email, indexed like `emails`.
        """
        self.num_customers = len(customer_ids)
        self.num_ips = len(ips)
        self.num_emails = len(emails)
        # Bounding box used for the random order location.
        self.lat_min, self.lat_max = -20., -10.
        self.long_min, self.long_max = -20., -10.
        # Date window [from_date, to_date], also kept as integer epoch seconds for randint().
        self.to_date = datetime.now(tz=timezone.utc) - timedelta(days=days_ago)
        self.to_date_epoch = int(self.to_date.timestamp())
        self.from_date = self.to_date - timedelta(days=days_span)
        self.from_date_epoch = int(self.from_date.timestamp())
        self.customer_ids = customer_ids
        self.emails = emails
        self.email_to_customer = email_to_customer
        self.ips = ips
        self.email_to_ip = email_to_ip

    def random_order(self):
        """Build one random order as a dict keyed like `meta()`.

        Raises:
            ValueError: if the email pool is empty (raised by `random.randint`).
        """
        email_idx = random.randint(0, self.num_emails-1)
        email = self.emails[email_idx]
        customer_id = self.email_to_customer[email_idx]
        # Roughly 1 in 10 orders gets a brand new IP token instead of the one mapped to the email.
        ip_addr = self.email_to_ip[email_idx] if random.random() > 0.1 else self.new_hash()
        date_epoch = random.randint(self.from_date_epoch, self.to_date_epoch)
        date = datetime.fromtimestamp(date_epoch, tz=timezone.utc)
        lat = self.lat_min + random.random() * (self.lat_max - self.lat_min)
        long = self.long_min + random.random() * (self.long_max - self.long_min)
        amount = random.random() * 1000.
        return {
            'order_id': self.new_uuid(),
            'order_date': date,
            'customer_id': customer_id,
            'email': email,
            'ip_addr': ip_addr,
            'location_lat': lat,
            'location_long': long,
            'order_amount': amount
        }

    def _generate_orders(self, n: int):
        """Lazily yield `n` random orders (nothing when `n` is zero or negative)."""
        for _ in range(n):
            yield self.random_order()

    def __call__(self, n: int):
        """Return a DataFrame with `n` random orders.

        The frame always carries the columns listed by `meta()`. When no rows
        are generated (`n <= 0`) the empty frame is also cast to the `meta()`
        dtypes, so it has the same schema as a populated batch instead of
        being a column-less frame.
        """
        rows = list(self._generate_orders(n))
        frame = pd.DataFrame(rows, columns=list(self.meta()))
        if not rows:
            # Nothing to infer dtypes from: apply the declared schema explicitly.
            frame = frame.astype(self.meta())
        return frame

    @staticmethod
    def meta():
        """Return the column name -> dtype mapping of the frames produced by this source."""
        return {'order_id': object,
                'order_date': 'datetime64[ns, UTC]',
                'customer_id': object,
                'email': object,
                'ip_addr': object,
                'location_lat': float,
                'location_long': float,
                'order_amount': float}

In [None]:
#export
def _make_batch(path: str, i: int, size: int, days_span: int,
               customer_ids: list, emails: list, email_to_customer: list, ips, email_to_ip: list):
    """Generate one batch of random orders and store it as a parquet file.

    Batch number `i` uses a date window that ends `i * days_span` days before now
    and is `days_span` days long. The file is written to
    `{path}/batch{i:02}.parquet`; `path` is created when missing.

    Returns:
        The name of the parquet file that was written.
    """
    os.makedirs(path, exist_ok=True)
    order_source = OrderRandomSource(
        days_ago=i * days_span,
        days_span=days_span,
        customer_ids=customer_ids,
        emails=emails,
        email_to_customer=email_to_customer,
        ips=ips,
        email_to_ip=email_to_ip,
    )
    orders = order_source(size)
    file_name = f'{path}/batch{i:02}.parquet'
    orders.to_parquet(file_name, engine='fastparquet', compression='LZ4')
    return file_name

In [None]:
#export
async def submit_job(payload: None, context: EventContext,
                     num_batches: int = 12, batch_size: int = 100000, batch_span_days: int = 30,
                     num_customers: int = 10000, num_emails: int = 10000, num_ips: int = 10000) -> MakeSampleDataJob:
    """Build the job description for a sample-data run.

    The output path is read from `context.env['data']['raw']`; every numeric
    argument is coerced with `int()` before being stored in the job
    (presumably because query arguments may arrive as text -- confirm).
    `payload` is not used.

    Returns:
        A MakeSampleDataJob holding the path followed by the six numeric settings.
    """
    raw_path = context.env['data']['raw']
    settings = tuple(map(int, (num_batches, batch_size, batch_span_days,
                               num_customers, num_emails, num_ips)))
    return MakeSampleDataJob(raw_path, *settings)

async def make_batches(job: MakeSampleDataJob, context: EventContext) -> MakeSampleDataJob:
    """Generate all batches of a job on the Dask cluster.

    Random pools of customer ids, emails and IP tokens are created once,
    together with a random email -> customer and email -> IP assignment, and
    shared by every batch. One `_make_batch` task is submitted per batch and
    the results are awaited in submission order.

    Returns:
        The received `job` when every batch finished; None when any exception
        was raised (the exception is logged). The Dask client is closed in
        both cases.
    """
    logger.info(context, f"Executing: {job}...")
    client = await get_client_async(context)
    logger.info(context, f"Dask: {client}")
    try:
        # Value pools shared by all batches.
        customer_ids = [OrderRandomSource.new_uuid() for _ in range(job.num_customers)]
        emails = [OrderRandomSource.new_hash() for _ in range(job.num_emails)]
        email_to_customer = [
            customer_ids[random.randint(0, job.num_customers-1)]
            for _ in range(job.num_emails)
        ]
        ips = [OrderRandomSource.new_hash() for _ in range(job.num_ips)]
        email_to_ip = [
            ips[random.randint(0, job.num_ips-1)]
            for _ in range(job.num_emails)
        ]
        shared_pools = (customer_ids, emails, email_to_customer, ips, email_to_ip)

        # Submit everything first so the batches can run concurrently...
        futures = []
        for i in range(job.num_batches):
            logger.info(context, f"Submitting batch {i}...")
            future = client.submit(_make_batch, job.path, i, job.batch_size, job.batch_span_days,
                                   *shared_pools)
            futures.append(future)

        # ...then wait for each result in submission order.
        for future in futures:
            res = await future
            logger.info(context, f"Done batch: {res}.")
        return job
    except Exception as err:
        logger.error(context, err)
        return None
    finally:
        await client.close()

In [None]:
! nbdev_build_lib

zsh:1: command not found: nbdev_build_lib


### Test from notebook

In [None]:
from hopeit.testing.apps import config, execute_event

app_config = config('config/training-pipeline.json')
result = await execute_event(app_config, 'data.make-sample-data', None,
                            num_batches=2, batch_size=100, batch_span_days=10,
                            num_customers=100, num_emails=100, num_ips=100)
result

2021-05-18 20:24:07,412 | INFO | fraud-poc training data.make-sample-data ALT00617 74682 | Executing: MakeSampleDataJob(path='./data/raw', num_batches=2, batch_size=100, batch_span_days=10, num_customers=100, num_emails=100, num_ips=100)... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2021-05-18T20:24:07.412401+00:00
2021-05-18 20:24:08,219 | INFO | fraud-poc training data.make-sample-data ALT00617 74682 | Dask: <Client: 'tcp://127.0.0.1:58174' processes=4 threads=8, memory=16.00 GiB> | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2021-05-18T20:24:07.412401+00:00
2021-05-18 20:24:08,224 | INFO | fraud-poc training data.make-sample-data ALT00617 74682 | Submitting batch 0... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2021-05-18T20:24:07.412401+00:00
2021-05-18 20:24:08,226 | INFO | fraud-poc training data.make-sample-data ALT00617 74682 | Submit

MakeSampleDataJob(path='./data/raw', num_batches=2, batch_size=100, batch_span_days=10, num_customers=100, num_emails=100, num_ips=100)

In [None]:
import dask.dataframe as dd

# Read back the parquet files found under the job's output path and preview the first rows.
orders_ddf = dd.read_parquet(result.path)
orders_ddf.head()

Unnamed: 0,order_id,order_date,customer_id,email,ip_addr,location_lat,location_long,order_amount
0,5a699fab-b8b3-4e19-bbc6-0e970ffba031,2021-05-11 17:35:50+00:00,5132ccfc-ce47-4f4d-bdb2-f8fface2ab5f,bad573204742b126622e6a5c090b3769a33a3308,52d44eb8e3243a1adf9a43b74e0b7629be671277,-10.835709,-19.554605,877.709842
1,1452029f-f659-4611-9f8c-5a23fd1626cf,2021-05-17 13:24:05+00:00,25817e58-f0f5-4d0a-bc74-6fc00ce9c70d,1ebd4eae5ff22fff30e8dc152d8571a9ba5774bd,449e07ab35d65c5a874740bf8e22d2a4d0fbce7a,-13.531581,-10.881747,235.479339
2,31510480-2267-430a-a638-2cd42cfc60f4,2021-05-11 23:12:56+00:00,c6eb3cd2-93bc-413f-a0e9-61c71ab0d224,5402e6915b106c264bd2bd7d7194e45d6de83091,29854c437e114323911562c081daa967fc2418a4,-11.012069,-10.141314,68.890731
3,9f461fd9-2ad1-4c61-b600-24af355d8e63,2021-05-14 00:25:21+00:00,8a6a7275-c331-48aa-baa6-970014d04963,da80f9f0a66ba22fc910f9fb6fc7bbbd3d79fab3,14e11fef153a52f328d803fc07b48905c8a8c31f,-18.711148,-12.039022,444.247712
4,9cc8c9aa-e342-4e7a-8269-4ebb7bc0d208,2021-05-15 04:11:54+00:00,09da86be-63c3-4265-a799-8fe5efaf270c,79cdeec4056ed0714127c4c255d9dafa334ee2a7,ee714f6b18d2b01e2d632f1299d8cb7618099070,-16.11013,-13.836294,955.929156
