In [None]:
#default_exp data.make_sample_data

In [None]:
# Run nbdev's library build over the project's notebooks (prints one "Converted ..." line per notebook).
!nbdev_build_lib

Converted 00_jobs.ipynb.
Converted 01-create-sample-data.ipynb.
Converted 02-preprocess.ipynb.
Converted 03-feature-calc.ipynb.
Converted 04-training-data.ipynb.
Converted 05-train-model.ipynb.
Converted 99-tools.ipynb.
Converted index.ipynb.


In [None]:
#export
from datetime import datetime, timezone, timedelta
import random
import uuid
import hashlib
import os
from copy import copy

import numpy as np
import pandas as pd

from hopeit.app.context import EventContext
from hopeit.app.events import Spawn, SHUFFLE
from hopeit.app.api import event_api
from hopeit.app.logger import app_logger

from fraud_poc.jobs import get_client_async, MakeSampleDataJob

In [None]:
#export
# Ordered steps of this event: 'submit_job' first, then 'make_batches'.
# NOTE(review): SHUFFLE presumably decouples the two steps so batches are made
# after the job has been submitted — confirm against the hopeit documentation.
__steps__ = ['submit_job', SHUFFLE, 'make_batches']

# API description of the event: the query arguments mirror the keyword
# parameters of `submit_job`, and a 200 response carries a MakeSampleDataJob.
__api__ = event_api(
    title="Data: Make sample data",
    query_args=[
        ("num_batches", int, "Number of batches/files to make"),
        ("batch_size", int, "Number of rows per batch"),
        ("batch_span_days", int, "Number of time span in days for order_date in batch"),
        ("num_customers", int, "Number of customers to generate"),
        ("num_emails", int, "Number of emails to generate"),
        ("num_ips", int, "Number of IP addresses to generate")
    ],
    responses={
        200: (MakeSampleDataJob, "Job submitted")
    }
)

# Module-level logger used by the event steps below.
logger = app_logger()

In [None]:
#export
class OrderRandomSource:
    """Random generator of order records drawn from fixed pools of emails, customers and IPs.

    Each generated order picks a random email; the customer and (usually) the
    IP address are looked up by the same index in `email_to_customer` and
    `email_to_ip`, so an email is consistently tied to one customer.
    Calling the instance with a row count returns the orders as a DataFrame.
    """

    @staticmethod
    def new_uuid():
        """Return a new random UUID4 as a string."""
        return str(uuid.uuid4())

    @staticmethod
    def new_hash():
        """Return the SHA-1 hex digest of a freshly generated UUID4 string."""
        return hashlib.sha1(str(uuid.uuid4()).encode()).hexdigest()

    def __init__(self, days_ago: int, days_span: int,
                 customer_ids: list, emails: list, email_to_customer: list, ips, email_to_ip: list):
        """Set up the date window and the lookup pools.

        Args:
            days_ago: the window ends this many days before now (UTC).
            days_span: length of the window in days; it starts `days_span`
                days before its end.
            customer_ids: pool of customer ids (only its length is recorded here).
            emails: pool of emails that orders are drawn from.
            email_to_customer: customer id for each email, indexed like `emails`.
            ips: pool of IP identifiers (only its length is recorded here).
            email_to_ip: usual IP for each email, indexed like `emails`.
        """
        self.num_customers = len(customer_ids)
        self.num_ips = len(ips)
        self.num_emails = len(emails)
        # Fixed bounding box used for the random order location.
        self.lat_min, self.lat_max = -20., -10.
        self.long_min, self.long_max = -20., -10.
        self.to_date = datetime.now(tz=timezone.utc) - timedelta(days=days_ago)
        self.to_date_epoch = int(self.to_date.timestamp())
        self.from_date = self.to_date - timedelta(days=days_span)
        self.from_date_epoch = int(self.from_date.timestamp())
        self.customer_ids = customer_ids
        self.emails = emails
        self.email_to_customer = email_to_customer
        self.ips = ips
        self.email_to_ip = email_to_ip

    def random_order(self):
        """Build one random order as a dict keyed like `meta()`.

        Raises:
            ValueError: if the email pool is empty, since no order can be drawn.
        """
        if self.num_emails < 1:
            # Explicit message instead of randint's opaque "empty range" error.
            raise ValueError("cannot generate an order: the email pool is empty")
        email_idx = random.randint(0, self.num_emails-1)
        email = self.emails[email_idx]
        customer_id = self.email_to_customer[email_idx]
        # Roughly 1 in 10 orders gets a brand new hash instead of the email's usual IP.
        ip_addr = self.email_to_ip[email_idx] if random.random() > 0.1 else self.new_hash()
        date_epoch = random.randint(self.from_date_epoch, self.to_date_epoch)
        date = datetime.fromtimestamp(date_epoch, tz=timezone.utc)
        lat = self.lat_min + random.random() * (self.lat_max - self.lat_min)
        long = self.long_min + random.random() * (self.long_max - self.long_min)
        amount = random.random() * 1000.
        return {
            'order_id': self.new_uuid(),
            'order_date': date,
            'customer_id': customer_id,
            'email': email,
            'ip_addr': ip_addr,
            'location_lat': lat,
            'location_long': long,
            'order_amount': amount
        }

    def _generate_orders(self, n: int):
        """Yield `n` random orders, one dict at a time."""
        for _ in range(n):
            yield self.random_order()

    def __call__(self, n: int):
        """Return a DataFrame with `n` random orders.

        When no rows are produced (`n <= 0`) the result is an empty frame that
        still carries the columns and dtypes declared by `meta()`, rather than
        a frame with no columns at all.
        """
        rows = list(self._generate_orders(n))
        if not rows:
            return pd.DataFrame({
                column: pd.Series(dtype=dtype)
                for column, dtype in self.meta().items()
            })
        return pd.DataFrame(rows)

    @staticmethod
    def meta():
        """Return the column name -> dtype mapping of the generated orders."""
        return {'order_id': object,
                'order_date': 'datetime64[ns, UTC]',
                'customer_id': object,
                'email': object,
                'ip_addr': object,
                'location_lat': float,
                'location_long': float,
                'order_amount': float}

In [None]:
#export
def _make_batch(path: str, i: int, size: int, days_span: int,
               customer_ids: list, emails: list, email_to_customer: list, ips, email_to_ip: list):
    """Generate batch number `i` of random orders and store it as a parquet file.

    The batch covers a window of `days_span` days ending `i * days_span` days
    ago. The target folder `path` is created when missing.

    Returns:
        The name of the parquet file that was written.
    """
    os.makedirs(path, exist_ok=True)
    source = OrderRandomSource(
        days_ago=i * days_span,
        days_span=days_span,
        customer_ids=customer_ids,
        emails=emails,
        email_to_customer=email_to_customer,
        ips=ips,
        email_to_ip=email_to_ip,
    )
    orders = source(size)
    file_name = f'{path}/batch{i:02}.parquet'
    orders.to_parquet(file_name, engine='fastparquet', compression='LZ4')
    return file_name

In [None]:
#export
async def submit_job(payload: None, context: EventContext, 
                     num_batches: int = 12, batch_size: int = 100000, batch_span_days: int = 30,
                     num_customers: int = 10000, num_emails: int = 10000, num_ips: int = 10000) -> MakeSampleDataJob:
    """Describe the sample-data job to run, without generating anything yet.

    The output folder is read from `context.env['data']['raw']`. Every sizing
    argument is passed through `int()` before it is stored in the job
    (presumably because query arguments can arrive as strings — confirm).
    """
    sizing = (num_batches, batch_size, batch_span_days,
              num_customers, num_emails, num_ips)
    return MakeSampleDataJob(context.env['data']['raw'], *(int(value) for value in sizing))

async def make_batches(job: MakeSampleDataJob, context: EventContext) -> MakeSampleDataJob:
    """Generate every batch of the job by submitting `_make_batch` tasks to a Dask client.

    The customer, email and IP pools (and the email -> customer / email -> IP
    assignments) are created once and handed to every batch task.

    Returns:
        The same `job` once all batches are done. If anything raises, the
        error is logged and `None` is returned instead. The client is closed
        in both cases.
    """
    logger.info(context, f"Executing: {job}...")
    client = await get_client_async(context)
    logger.info(context, f"Dask: {client}")

    def _draw(pool: list, pool_size: int, count: int) -> list:
        # `count` independent random picks (with repetition) from `pool`.
        return [pool[random.randint(0, pool_size - 1)] for _ in range(count)]

    try:
        customer_ids = [OrderRandomSource.new_uuid() for _ in range(job.num_customers)]
        emails = [OrderRandomSource.new_hash() for _ in range(job.num_emails)]
        email_to_customer = _draw(customer_ids, job.num_customers, job.num_emails)
        ips = [OrderRandomSource.new_hash() for _ in range(job.num_ips)]
        email_to_ip = _draw(ips, job.num_ips, job.num_emails)
        shared_pools = (customer_ids, emails, email_to_customer, ips, email_to_ip)

        # Submit everything first so the batches can be worked on concurrently...
        pending = []
        for i in range(job.num_batches):
            logger.info(context, f"Submitting batch {i}...")
            future = client.submit(_make_batch, job.path, i, job.batch_size,
                                   job.batch_span_days, *shared_pools)
            pending.append(future)

        # ...then wait for each one in submission order.
        for future in pending:
            res = await future
            logger.info(context, f"Done batch: {res}.")
        return job
    except Exception as e:
        logger.error(context, e)
        return None
    finally:
        await client.close()

### Test from notebook

In [None]:
from hopeit.testing.apps import config, execute_event

# Load the app configuration and run the event with no payload.
# The sizes are far below submit_job's defaults: only 2 batches of 1000 rows are requested.
app_config = config('config/training-pipeline.json')
result = await execute_event(app_config, 'data.make-sample-data', None,
                            num_batches=2, batch_size=1000, batch_span_days=10,
                            num_customers=1000, num_emails=1000, num_ips=1000)
# Display the job returned by the event.
result

2020-07-06 18:14:26,827 | INFO | fraud-poc 0.0.1 data.make-sample-data leo-legion 42739 | Executing: MakeSampleDataJob(path='./data/raw', num_batches=2, batch_size=1000, batch_span_days=10, num_customers=1000, num_emails=1000, num_ips=1000)... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-06T18:14:26.827136+00:00
2020-07-06 18:14:27,693 | INFO | fraud-poc 0.0.1 data.make-sample-data leo-legion 42739 | Dask: <Client: 'tcp://127.0.0.1:35709' processes=4 threads=12, memory=33.54 GB> | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-06T18:14:26.827136+00:00
2020-07-06 18:14:27,716 | INFO | fraud-poc 0.0.1 data.make-sample-data leo-legion 42739 | Submitting batch 0... | track.operation_id=test_operation_id | track.request_id=test_request_id | track.request_ts=2020-07-06T18:14:26.827136+00:00
2020-07-06 18:14:27,727 | INFO | fraud-poc 0.0.1 data.make-sample-data leo-legion 42739 | Submit

MakeSampleDataJob(path='./data/raw', num_batches=2, batch_size=1000, batch_span_days=10, num_customers=1000, num_emails=1000, num_ips=1000)

In [None]:
import dask.dataframe as dd
# Read the parquet data found under the job's output path and preview the first rows.
dd.read_parquet(result.path).head()

Unnamed: 0,order_id,order_date,customer_id,email,ip_addr,location_lat,location_long,order_amount
0,bdd4a869-631a-4443-bcb7-4302b2151e80,2020-06-28 22:19:39+00:00,6e61ba72-4b34-402f-80e2-19ed1ded54a5,d1e2ead71798781709cd32ade395a0b61f7f0392,587024c74e62c8ab9a62f611e7782ab07de67d39,-16.149277,-14.255701,970.798558
1,ad366a5b-7666-4c73-8e5f-73c2399dc556,2020-06-30 05:41:15+00:00,3b25d5a3-f2bc-4b66-9d06-8e1fd0e10787,dafd90a39542b8e872bb148b35b41f94c571628c,9590280423a685f0936e51f7493e904a951bbb49,-18.079708,-14.677639,448.996911
2,d5b899a2-9454-4b37-8bb6-5781c9abad67,2020-06-29 21:21:25+00:00,de59522b-a0e3-4719-8b43-56530fdfcd6c,881a22883f1d92c65cded9eb6ed2b1e0c86ba145,f85db66b0e1304b4ff47bbe111d532747e78ba38,-17.80577,-15.440424,514.819719
3,0a39eec4-24d9-4638-b16a-6fca097d1863,2020-07-05 12:24:47+00:00,0ee7ba78-7138-4ae3-88aa-c8275594d076,7426788b0942d1dd7fba43de8f3e24ed66703754,394f9cd7e4b625afc947dd45df879479e1fdad1d,-14.901503,-15.15774,894.314263
4,7a4ecd3d-eb22-468a-8e1c-b04993cd93da,2020-07-05 09:25:45+00:00,8e71c1f1-4933-48d3-a8aa-ba1e344d1600,04ca3869707201b23f1edabd4135069c773ec601,a792a5f6e44b9950d9b416e71fcf8d6cf9184d1e,-12.224165,-14.701431,147.923932
