In [1]:
import otf2
from otf2.events import *
#import darshan
from glob import glob
import pandas as pd
print(f"pd {pd.__version__}")
import dask
import dask.dataframe as dd
print(f"dask {dask.__version__}")
import pyarrow as pa
print(f"pa {pa.__version__}")
import numpy as np
print(f"np {np.__version__}")
from itertools import chain

from dask.distributed import Client, LocalCluster, progress, wait
from dask.distributed import Future, get_client
from typing import Tuple, Union
import os

pd 2.1.2


In a future release, Dask DataFrame will use new implementation that
contains several improvements including a logical query planning.
The user-facing DataFrame API will remain unchanged.

The new implementation is already available and can be enabled by
installing the dask-expr library:

    $ pip install dask-expr

and turning the query planning option on:

    >>> import dask
    >>> dask.config.set({'dataframe.query-planning': True})
    >>> import dask.dataframe as dd

API documentation for the new implementation is available at
https://docs.dask.org/en/stable/dask-expr-api.html

Any feedback can be reported on the Dask issue tracker
https://github.com/dask/dask/issues 

  import dask.dataframe as dd


dask 2024.2.0
pa 15.0.0
np 1.26.1


In [2]:
# Trace location, combined in the next cell as <folder>/<app>/<file>.
folder = "/usr/workspace/haridev/dftracer/logs/scorep"  # root directory holding the per-app runs
app = "dlio_40_10"                                      # sub-directory of the run to load
file = "traces.otf2"                                    # trace file name inside the run directory

In [3]:
# Full path of the trace file: <folder>/<app>/<file>.
logfile = "/".join((folder, app, file))

In [4]:
# List the trace path through the shell to confirm it resolves before parsing.
!ls $logfile

/usr/workspace/haridev/dftracer/logs/scorep/dlio_40_10/traces.otf2


In [5]:
def get_json(location, start, end):
    """Build one record for a region that was both entered and left.

    Args:
        location: the location the events were recorded on; its ``name`` and
            ``group.name`` are stored as ``tid`` and ``pid``.
        start: the Enter event (provides ``region`` and ``time``).
        end: the matching Leave event (provides ``time``).

    Returns:
        dict with keys ``name``, ``cat``, ``ts``, ``dur``, ``tid``, ``pid``.
        ``dur`` is ``end.time - start.time`` in the trace's own time unit.
    """
    region = start.region
    return {
        "name": region.name,
        "cat": region.region_role,
        "ts": start.time,
        "dur": end.time - start.time,
        # Same identification keys get_json_one emits, so records for
        # completed regions are not missing tid/pid in the resulting table.
        "tid": location.name,
        "pid": location.group.name,
    }
def get_json_one(location, start):
    """Build a zero-duration record from a single event.

    Used for events that have no Enter/Leave partner and for event types
    other than Enter/Leave.

    Args:
        location: the location the event was recorded on; its ``name`` and
            ``group.name`` are stored as ``tid`` and ``pid``.
        start: the event; must expose ``time`` and may expose ``region``.

    Returns:
        dict with keys ``name``, ``cat``, ``ts``, ``dur`` (always 0),
        ``tid``, ``pid``.
    """
    if hasattr(start, 'region'):
        name = start.region.name
        cat = start.region.region_role
    else:
        # Events without a region are labelled by their event type. Store the
        # type's name (a str) rather than the class object itself, so the
        # "name" field holds a string on every path.
        name = type(start).__name__
        cat = "Program"
    return {
        "name": name,
        "cat": cat,
        "ts": start.time,
        "dur": 0,
        "tid": location.name,
        "pid": location.group.name,
    }
import time

# Module-level timestamp, retained for any other cell that reads `start`.
# read_trace keeps its own timer and does not depend on it.
start = time.time()


def read_trace(trace_name):
    """Stream the events of one OTF2 trace as flat record dicts.

    Enter/Leave events are paired per ``(location, region)``:

    * a Leave with a pending Enter yields ``get_json(...)`` (with a duration);
    * a Leave with no pending Enter yields ``get_json_one(...)`` for the Leave;
    * any other event type yields ``get_json_one(...)`` for that event.

    Pending Enters are kept on a stack per key, so a region that is entered
    again before it is left (e.g. recursion) pairs each Leave with the most
    recent unmatched Enter instead of losing the outer one. Enter events that
    are still unmatched when the trace ends are not emitted.

    Args:
        trace_name: path passed to ``otf2.reader.open``.

    Yields:
        dict: one record per completed region or stand-alone event.
    """
    open_regions = {}  # (location, region) -> stack of unmatched Enter events
    count = 0
    # Local timer: elapsed time is measured from the start of this read,
    # not from whenever the defining cell happened to be executed.
    began = time.time()
    with otf2.reader.open(trace_name) as trace:
        for location, event in trace.events:
            if isinstance(event, Enter):
                open_regions.setdefault((location, event.region), []).append(event)
            elif isinstance(event, Leave):
                unique_id = (location, event.region)
                pending = open_regions.get(unique_id)
                if pending:
                    enter = pending.pop()
                    if not pending:
                        # Drop empty stacks so the map does not grow with
                        # every region ever seen.
                        del open_regions[unique_id]
                    yield get_json(location=location, start=enter, end=event)
                else:
                    yield get_json_one(location=location, start=event)
            else:
                yield get_json_one(location=location, start=event)
            count += 1
            if count % 1000 == 0:
                print(f"Done {count} in {time.time() - began}", end="\r")

In [6]:
# Guard flag read by the cluster-setup cell: while it is False that cell
# creates the Dask cluster and then flips it to True. Re-running this cell
# resets the guard.
initialized = False

In [7]:
# Create the local Dask cluster and client once; the `initialized` flag makes
# re-running this cell a no-op instead of spawning a second cluster.
if not initialized:
    workers = 1  # number of workers requested from LocalCluster
    cluster = LocalCluster(n_workers=workers)  # Launches a scheduler and workers locally
    client = Client(cluster)  # Connect to distributed cluster and override default
    print(client)
    initialized = True  # mark setup as done for subsequent re-runs of this cell

<Client: 'tcp://127.0.0.1:34791' processes=1 threads=48, memory=251.40 GiB>


In [8]:
# Bare expression: renders the client summary (dashboard URL, workers, threads, memory).
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 1
Total threads: 48,Total memory: 251.40 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:34791,Workers: 1
Dashboard: http://127.0.0.1:8787/status,Total threads: 48
Started: Just now,Total memory: 251.40 GiB

0,1
Comm: tcp://127.0.0.1:38989,Total threads: 48
Dashboard: http://127.0.0.1:37651/status,Memory: 251.40 GiB
Nanny: tcp://127.0.0.1:45845,
Local directory: /var/tmp/haridev/dask-scratch-space/worker-su089x8b,Local directory: /var/tmp/haridev/dask-scratch-space/worker-su089x8b


In [9]:
# NOTE(review): `progress` is already imported from dask.distributed in the
# first cell, so the second import below is redundant; imports are easier to
# audit when they all live in that first cell.
from dask.diagnostics import ProgressBar
from dask.distributed import progress
# Register dask's ProgressBar globally.
# NOTE(review): dask.diagnostics.ProgressBar is documented for the
# single-machine schedulers; with the distributed Client created above,
# `progress(...)` is what reports task status - confirm this is still needed.
ProgressBar().register()

In [10]:
%%timeit -n 1 -r 5 a = 2
start = time.time()
file_pattern = glob(logfile)
create_bag = dask.bag.from_delayed([dask.delayed(read_trace)(file) 
                                                for file in file_pattern])
columns = {'name':"string", 'cat': "string",
           'pid': "string",'tid': "string",
           'dur': "uint64", 'ts': "uint64"}
events = create_bag.to_dataframe(meta=columns)
#events.head()
n_partition = 1
events = events.repartition(npartitions=n_partition).persist()
progress(events)
_ = wait(events)

38.6 s ± 210 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)


In [11]:
#events.query("cat == 'RegionRole.FILE_IO'").groupby("name").count().compute()

In [12]:
#len(events)