In [1]:
import dask

# Dask configuration has to be in place before anything imports
# `dask.dataframe`: the `dataframe.query-planning` switch is read when that
# module is first imported, so setting it afterwards may have no effect.
# lsdb and tape are Dask-based and presumably import `dask.dataframe`
# themselves, so they are imported after the config as well.
dask.config.set({'temporary_directory': '/data/epyc/users/brantd/tmp'})
dask.config.set({'dataframe.query-planning': False})

import ast
from collections.abc import Iterable

import dask.dataframe as dd
import lsdb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dask.distributed import Client, performance_report
from tape import Ensemble, ColumnMapper

# Local cluster: 10 single-threaded workers, each capped at 60G of memory,
# with the dashboard served on port 38764.
client = Client(n_workers=10, threads_per_worker=1,
                memory_limit="60G",
                dashboard_address=':38764')

# Dask Investigation: Effect of Partition Sizes

## Generate some toy data

In [26]:
# Toy-data factory (a plain function, not a Python generator)
def generate_data(num_points):
    """Build a deterministic toy table as a Dask DataFrame.

    Columns, with ``i`` the row number (``0 <= i < num_points``):

    * ``id`` (becomes the index): ``8000 + i % num_ids`` where
      ``num_ids = max(num_points // 5, 1)``.
    * ``time`` and ``count``: ``i``.
    * ``flux``: ``i % 4``.
    * ``band``: "r", "g", "b", "i" in contiguous runs of
      ``ceil(num_points / 4)`` rows, truncated to ``num_points`` rows.
    * ``err``: ``0.1 * (i % 10)``.
    * ``something_else``: ``None`` in every row.

    Parameters
    ----------
    num_points : int
        Number of rows to generate. It does not need to be a multiple of 4
        or at least 5; the band column is truncated and ``num_ids`` is
        clamped to 1 so every column always has exactly ``num_points`` rows.

    Returns
    -------
    Dask DataFrame
        Created with ``npartitions=1`` and then indexed on ``id`` with
        ``sort=True``.
    """
    all_bands = np.array(["r", "g", "b", "i"])
    row_number = np.arange(num_points)

    # Clamp to 1 so tiny inputs do not take a modulo by zero.
    num_ids = max(num_points // 5, 1)

    # Integer ceiling division: np.repeat needs an integer count, and rounding
    # up then slicing keeps the column length equal to num_points even when
    # num_points is not divisible by the number of bands.
    repeats_per_band = -(-num_points // len(all_bands))

    rows = {
        "id": 8000 + (row_number % num_ids),
        "time": row_number,
        "flux": row_number % len(all_bands),
        "band": np.repeat(all_bands, repeats_per_band)[:num_points],
        "err": 0.1 * (row_number % 10),
        "count": row_number,
        "something_else": np.full(num_points, None),
        }

    ddf = dd.from_dict(rows, npartitions=1).set_index("id", sort=True)

    return ddf

# Persist the 100000-row toy dataset once, then for every target partition
# size repartition it and write the result to its own parquet directory.
ddf = generate_data(100000).persist()
# Only the three smallest sizes are active; the larger ones are commented out.
partition_sizes = ("100KB", "1MB", "10MB") #, "50MB", "100MB", "200MB", "500MB")
for size in partition_sizes:
    # The size label is used both as the repartition target and as the name
    # of the output directory (e.g. .../dask_ps/100KB).
    size_ddf = ddf.repartition(partition_size=size)
    size_ddf.to_parquet(f"/data/epyc/users/brantd/data/dask_ps/{size}")


## Define a Rough Task Graph Memory Estimator

In [28]:
import sys

def get_size(obj, seen=None):
    """Recursively estimate the size of ``obj`` in bytes.

    The estimate is ``sys.getsizeof(obj)`` plus the sizes of the objects it
    references:

    * for a ``dict``: all values and all keys;
    * otherwise, for an object with a ``__dict__``: that attribute dict;
    * otherwise, for an object exposing ``__iter__`` (except ``str``,
      ``bytes`` and ``bytearray``): every element it yields.

    Parameters
    ----------
    obj : object
        Object to measure. If it is a one-shot iterator it will be consumed.
    seen : set of int, optional
        ``id()`` values already counted. It is updated in place, so objects
        shared between several calls are only counted once. A fresh set is
        used when omitted.

    Returns
    -------
    int
        Estimated size in bytes, or 0 if ``obj`` has already been counted.
    """
    if seen is None:
        seen = set()

    obj_id = id(obj)
    if obj_id in seen:
        return 0

    # Important: mark as seen *before* entering recursion to gracefully handle
    # self-referential objects
    seen.add(obj_id)

    size = sys.getsizeof(obj)

    if isinstance(obj, dict):
        size += sum(get_size(v, seen) for v in obj.values())
        size += sum(get_size(k, seen) for k in obj.keys())
    elif hasattr(obj, '__dict__'):
        size += get_size(obj.__dict__, seen)
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        try:
            children = iter(obj)
        except TypeError:
            # Some objects advertise __iter__ but refuse to iterate (for
            # example 0-d NumPy arrays); count only their own size.
            children = ()
        size += sum(get_size(child, seen) for child in children)

    return size

def approx_memusage(df):
    """Approximate the memory used by a Dask collection's task graph, in MiB.

    Parameters
    ----------
    df : Dask collection
        Any object implementing ``__dask_graph__()``.

    Returns
    -------
    float
        ``get_size`` of the flattened task graph divided by 1024**2.
    """
    bytes_per_mib = 1024 ** 2

    graph = df.__dask_graph__()

    # The collection protocol only promises a Mapping. Graph objects that
    # offer to_dict() are flattened through it; anything else (for example a
    # plain dict) is copied into a dict directly instead of raising
    # AttributeError.
    to_dict = getattr(graph, "to_dict", None)
    graph = to_dict() if callable(to_dict) else dict(graph)

    return get_size(graph) / bytes_per_mib

## Test a Basic Workflow

In [4]:
# Baseline run: read the whole 100KB-partition dataset into memory while
# recording a Dask performance report for later inspection.
data_path = "/data/epyc/users/brantd/data/dask_ps/100KB/"
report_path = "/data/epyc/users/brantd/data/dask_ps/100KB_report.html"

with performance_report(filename=report_path):
    # The result is discarded on purpose; only the profiling output matters.
    dd.read_parquet(data_path).compute()
