In [1]:
%load_ext jupyter_black

In [2]:
from __future__ import annotations

import time
import itertools
import random
import numpy as np

from mesoscaler.generic import DataWorker, DataConsumer
import enum

In [3]:
Index = tuple[int, int, int, int]


DUMMY_DATA: dict[Index, np.ndarray] = {
    # some dummy data either on disk or in
    k: np.random.rand(100, 100)
    for k in itertools.product([1, 2, 3], [4, 5, 6], [4, 5, 6], [4, 5, 6])
}


def get_data_from_disk(key: Index) -> np.ndarray:
    time.sleep(0.1)  # io latency
    return DUMMY_DATA[key]

In [4]:
indices = random.choices(list(DUMMY_DATA.keys()), k=20)

print(len(DUMMY_DATA), len(indices))

81 20


The worker class is `Mapping` that is instantiated with a Iterable sequence of indices.

In [5]:
class MyWorker(DataWorker[Index, np.ndarray]):
    def __getitem__(self, idx: Index) -> np.ndarray:
        return get_data_from_disk(idx)


worker = MyWorker(indices=indices)
print(worker[indices[0]])
worker

[[0.33573819 0.33037508 0.22273583 ... 0.38941492 0.3495945  0.3102927 ]
 [0.04058014 0.72918072 0.25039443 ... 0.782737   0.70239587 0.16724658]
 [0.48909019 0.39635605 0.5576222  ... 0.06440972 0.38304069 0.09661434]
 ...
 [0.70350836 0.86729886 0.67319113 ... 0.4088989  0.56947363 0.7441669 ]
 [0.22012821 0.1231793  0.9786876  ... 0.59367837 0.60208423 0.30414234]
 [0.41842666 0.15066768 0.10772211 ... 0.45023783 0.33957441 0.57182329]]


MyWorker(size=20):
- (3, 4, 6, 5): [(<class 'types.Undefined'>,)]
- (3, 6, 5, 5): [(<class 'types.Undefined'>,)]
- (2, 4, 6, 5): [(<class 'types.Undefined'>,)]
- (1, 5, 4, 4): [(<class 'types.Undefined'>,)]
- (3, 5, 4, 5): [(<class 'types.Undefined'>,)]
...
- (2, 6, 6, 4): [(<class 'types.Undefined'>,)]

In [6]:
from typing import Any, Generic, TypeVar, get_args
import types
import typing

K = TypeVar("K")
T = TypeVar("T")
import enum

Undefined = enum.Enum("", "Undefined")  # .Undefined
TypingGenericAlias = getattr(typing, "_GenericAlias")
print(type(TypingGenericAlias))


class FirstOrderGeneric(Generic[K, T]):
    __generic__: tuple[Any, ...]

    def __init_subclass__(cls) -> None:
        for arg in getattr(cls, "__orig_bases__", []):
            if isinstance(arg, TypingGenericAlias):
                cls.__generic__ = getattr(arg, "__args__", (Undefined,))
                break


class SomeBaseClass(FirstOrderGeneric[str, int], Generic[T]):
    ...


SomeBaseClass.__generic__  # prints "<class 'int'>"

<class 'type'>


(str, int)

In [7]:
worker.to_dict()

AttributeError: 'MyWorker' object has no attribute 'to_dict'

In [None]:
train, test = worker.split(0.8)
test

In [None]:
from typing import get_type_hints

# get_type_hints(worker.__class__)

# DataConsumer

Assuming there is some IO bottle neck involved with loading data from disk the
`DataConsumer` can be used as a DataLoader that will queue up the data to be
loaded in the background while the model is training.

In [None]:
start = time.time()
for idx in worker:
    data = worker[idx]
    time.sleep(0.1)
print("worker:", time.time() - start)

start = time.time()
for x in DataConsumer(worker):
    time.sleep(0.1)
print("consumer:", time.time() - start)