In [1]:
from typing import List, Optional, Generator
from datetime import datetime, timezone, timedelta
import random
import uuid
import hashlib
import os

import numpy as np
import pandas as pd
from dask.distributed import Client
from pathlib import Path
import dask.dataframe as dd
from dask import delayed

# Output directory for the generated parquet batches (relative to the
# notebook's working directory). Kept as a plain string because later cells
# interpolate it into f-string paths.
data_path = './data/raw'
# Create the directory, including missing parents; a no-op if it already exists.
Path(data_path).mkdir(parents=True, exist_ok=True)

In [2]:
# Create a dask.distributed client with default arguments; the rendered output
# below shows it attached to a scheduler on 127.0.0.1 with local workers.
client = Client()
# Bare expression so the notebook renders the client/cluster summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:42863  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 12  Memory: 33.54 GB


In [3]:
class OrderRandomSource:
    """Source of synthetic order records drawn from fixed random pools.

    On construction the instance builds:

    * ``customer_ids`` -- ``num_customers`` random UUID strings,
    * ``emails``       -- ``num_emails`` random SHA-1 hex digests,
    * ``ips``          -- ``num_ips`` random SHA-1 hex digests,

    and assigns every email one customer and one "usual" ip, both picked
    uniformly at random from the pools. Orders are then sampled by picking an
    email and deriving the remaining fields from it.

    Order dates are drawn uniformly (at whole-second resolution) from the
    window ``[to_date - days_span days, to_date]`` where
    ``to_date = now(UTC) - days_ago days``.

    Note: ``email`` and ``ip_addr`` values are opaque hashes, not real
    addresses. Ids come from ``uuid.uuid4`` so seeding ``random`` alone does
    not make the output reproducible.
    """

    @staticmethod
    def new_uuid():
        """Return a fresh random UUID4 rendered as a string."""
        return str(uuid.uuid4())

    @staticmethod
    def new_hash():
        """Return the SHA-1 hex digest of a fresh random UUID4 string."""
        return hashlib.sha1(str(uuid.uuid4()).encode()).hexdigest()

    def __init__(self, num_customers: int = 10000, num_ips: int = 10000, num_emails: int = 50000,
                 days_ago: int = 0, days_span: int = 30):
        """Build the customer / email / ip pools and the sampling date window.

        Args:
            num_customers: size of the customer-id pool.
            num_ips: size of the ip-hash pool.
            num_emails: size of the email-hash pool.
            days_ago: how many days before "now" (UTC) the window ends.
            days_span: length of the window in days.

        Raises:
            ValueError: from ``random.randint`` if ``num_emails`` is positive
                while ``num_customers`` or ``num_ips`` is zero.
        """
        self.num_customers = num_customers
        self.num_ips = num_ips
        self.num_emails = num_emails

        # Bounding box that order coordinates are sampled from.
        self.lat_min, self.lat_max = -20., -10.
        self.long_min, self.long_max = -20., -10.

        # Sampling window, kept both as aware datetimes and as integer epochs
        # (the epochs are what random_order actually samples between).
        self.to_date = datetime.now(tz=timezone.utc) - timedelta(days=days_ago)
        self.to_date_epoch = int(self.to_date.timestamp())
        self.from_date = self.to_date - timedelta(days=days_span)
        self.from_date_epoch = int(self.from_date.timestamp())

        # Pools and the per-email assignments. The two mapping lists are
        # indexed in parallel with self.emails.
        self.customer_ids = [self.new_uuid() for _ in range(self.num_customers)]
        self.emails = [self.new_hash() for _ in range(self.num_emails)]
        self.email_to_customer = [
            self.customer_ids[random.randint(0, self.num_customers - 1)]
            for _ in range(self.num_emails)
        ]
        self.ips = [self.new_hash() for _ in range(self.num_ips)]
        self.email_to_ip = [
            self.ips[random.randint(0, self.num_ips - 1)]
            for _ in range(self.num_emails)
        ]

    def random_order(self):
        """Sample one order and return it as a dict keyed like ``meta()``.

        The email is picked uniformly; the customer is the one assigned to
        that email. The ip is the email's assigned ip unless the random draw
        is <= 0.1, in which case a brand-new hash is used instead. Latitude,
        longitude and amount are uniform in their respective ranges
        (amount in [0, 1000)).
        """
        email_idx = random.randint(0, self.num_emails - 1)
        email = self.emails[email_idx]
        customer_id = self.email_to_customer[email_idx]
        # Roughly one order in ten comes from a never-before-seen ip hash.
        ip_addr = self.email_to_ip[email_idx] if random.random() > 0.1 else self.new_hash()
        date_epoch = random.randint(self.from_date_epoch, self.to_date_epoch)
        date = datetime.fromtimestamp(date_epoch, tz=timezone.utc)
        lat = self.lat_min + random.random() * (self.lat_max - self.lat_min)
        long = self.long_min + random.random() * (self.long_max - self.long_min)
        amount = random.random() * 1000.
        return {
            'order_id': self.new_uuid(),
            'order_date': date,
            'customer_id': customer_id,
            'email': email,
            'ip_addr': ip_addr,
            'location_lat': lat,
            'location_long': long,
            'order_amount': amount
        }

    def _generate_orders(self, n: int):
        """Lazily yield ``n`` random orders (nothing when ``n <= 0``)."""
        for _ in range(n):
            yield self.random_order()

    def __call__(self, n: int):
        """Return a DataFrame of ``n`` random orders, one row per order.

        When ``n <= 0`` there are no records to infer columns from, so the
        frame is built from ``meta()`` instead: zero rows, but with the full
        set of columns and their declared dtypes.
        """
        records = list(self._generate_orders(n))
        if not records:
            return pd.DataFrame(
                {column: pd.Series(dtype=dtype) for column, dtype in self.meta().items()}
            )
        return pd.DataFrame(records)

    @staticmethod
    def meta():
        """Return the column -> dtype mapping of the frames this class produces."""
        return {'order_id': object,
                'order_date': 'datetime64[ns, UTC]',
                'customer_id': object,
                'email': object,
                'ip_addr': object,
                'location_lat': float,
                'location_long': float,
                'order_amount': float}

In [4]:
def make_batch(i: int, size: int):
    """Generate one batch of random orders and write it to a parquet file.

    Batch ``i`` uses a fresh ``OrderRandomSource`` whose date window ends
    ``i * 7`` days ago. The source's default window length is used, so with
    the default of 30 days the windows of consecutive batches overlap.

    Args:
        i: batch index; also determines the output file name
            (``batch{i:02}.parquet`` under ``data_path``).
        size: number of orders to generate.

    Returns:
        The path of the parquet file that was written, so callers collecting
        results (e.g. via ``dask.compute``) get something more useful than
        ``None``.
    """
    # When run on a dask worker this message goes to the worker's stdout,
    # not necessarily to the notebook cell output.
    print("Making batch", i)
    days_ago = i * 7
    random_orders = OrderRandomSource(days_ago=days_ago)
    df = random_orders(size)
    out_path = f'{data_path}/batch{i:02}.parquet'
    df.to_parquet(out_path, engine='fastparquet', compression='LZ4')
    return out_path

In [6]:
# Number of batches to generate and the number of orders in each one.
num_batches = 52
bs = 100000

# One lazy make_batch task per batch index; nothing runs until compute().
batches = [delayed(make_batch)(batch_idx, bs) for batch_idx in range(num_batches)]

# Execute every task and collect the per-task return values as a tuple.
dfs = dd.compute(*batches)