In [1]:
%load_ext autoreload
%autoreload 2
import contex

In [2]:
from typing import Callable
import pandas as pd
import numpy as np

from collections import defaultdict

from torchdata.datapipes import functional_datapipe
import torchdata.datapipes.iter as pipes
# import torchdata.datapipes as dp


In [3]:
@functional_datapipe("parse_pandas_dataframe")
class PandasParserIterDataPipe(pipes.IterDataPipe):
    """Iterate over a pandas DataFrame, yielding every row as a plain list.

    Each yielded list holds the row's cell values in column order; the
    DataFrame index is not included.

    Args:
        df: the DataFrame whose rows are emitted.

    NOTE(review): the class is also registered under the functional name
    ``parse_pandas_dataframe``. Called that way, the first constructor
    argument would be the upstream datapipe rather than a DataFrame --
    confirm that the functional form is actually intended to be used.
    """

    def __init__(self, df) -> None:
        self.source_df = df

    def __iter__(self):
        # itertuples keeps each column's own dtype; iterrows would first build
        # a Series per row, which upcasts mixed numeric rows (int -> float)
        # and is considerably slower.
        for row in self.source_df.itertuples(index=False, name=None):
            yield list(row)

    def __len__(self):
        # One item is produced per DataFrame row.
        return len(self.source_df)

In [4]:
@functional_datapipe('rolling_groupby')
class RollingGrouperIterDataPipe(pipes.IterDataPipe):
    r"""
    Group incoming items by key and emit fixed-size rolling windows per group.

    Items are buffered per key (``group_key_fn(item)``). As soon as a key has
    collected ``window_size`` items, that window is emitted as a new list and
    the key's buffer is advanced by ``step_size`` items:

    * ``step_size < window_size``: consecutive windows overlap by
      ``window_size - step_size`` items.
    * ``step_size == window_size``: windows are back to back.
    * ``step_size > window_size``: the ``step_size - window_size`` items that
      follow a window are skipped before the next window starts.

    Incomplete windows still buffered when the source is exhausted are dropped.

    Args:
        datapipe: iterable source of items.
        group_key_fn: maps an item to a hashable group key.
        window_size: number of items per emitted window.
        step_size: number of items a group advances between two windows;
            must be at least 1.
        buffer_size: maximum number of items buffered across all groups. When
            the buffer holds exactly this many items, the largest group is
            evicted; because ``drop_remaining`` is fixed to ``True`` its
            (incomplete) items are discarded, so that group's next window
            starts from scratch.

    Raises:
        ValueError: if ``step_size`` is smaller than 1.
        AssertionError: if ``window_size`` is not in ``(0, buffer_size]``.

    NOTE(review): the structure appears to be adapted from torchdata's
    ``Grouper`` datapipe -- confirm before relying on parity with it.
    """
    def __init__(self,
                 datapipe: pipes.IterDataPipe,
                 group_key_fn: Callable,
                 *,
                 window_size=1,
                 step_size=1,
                 buffer_size: int = 10000,
                ):
        # A step of zero would never advance a group (no further windows are
        # emitted once the buffer outgrows the window); a negative step would
        # trim the buffer from the wrong end.
        if step_size < 1:
            raise ValueError(f"step_size must be a positive integer, got {step_size!r}")

        self.datapipe = datapipe
        self.group_key_fn = group_key_fn

        self.window_size = window_size
        self.step_size = step_size

        self.group_size = window_size
        self.buffer_size = buffer_size
        self.guaranteed_group_size = None
        if self.group_size is not None and buffer_size is not None:
            assert 0 < self.group_size <= buffer_size
            self.guaranteed_group_size = self.group_size
        # Incomplete groups are always discarded rather than raising.
        self.drop_remaining = True

    def _remove_biggest_key(self, buffer_elements, buffer_size):
        """Evict the largest buffered group to free buffer space.

        Args:
            buffer_elements: mapping of group key to its buffered items;
                the evicted key is removed from it in place.
            buffer_size: current total number of buffered items.

        Returns:
            ``(result_to_yield, new_buffer_size)``. ``result_to_yield`` is the
            evicted group's items when no guaranteed size is configured or the
            group reaches it, otherwise ``None`` (the items are dropped).

        Raises:
            RuntimeError: if the evicted group is smaller than the guaranteed
                size and ``drop_remaining`` is false.
        """
        if not buffer_elements:
            # Nothing to evict.
            return None, buffer_size

        # max() returns the first key among equally large groups, i.e. the
        # oldest one in insertion order.
        biggest_key = max(buffer_elements, key=lambda k: len(buffer_elements[k]))
        biggest_size = len(buffer_elements[biggest_key])
        result_to_yield = None

        if self.guaranteed_group_size is not None and biggest_size < self.guaranteed_group_size and not self.drop_remaining:
            raise RuntimeError('Failed to group items', str(buffer_elements[biggest_key]))

        if self.guaranteed_group_size is None or biggest_size >= self.guaranteed_group_size:
            result_to_yield = buffer_elements[biggest_key]

        new_buffer_size = buffer_size - biggest_size
        del buffer_elements[biggest_key]

        return result_to_yield, new_buffer_size

    def __iter__(self):
        buffer_elements = defaultdict(list)
        # key -> number of upcoming items of that key to discard before its
        # next window may start (only used when step_size > window_size).
        pending_skips = {}
        buffer_size = 0
        for x in self.datapipe:
            key = self.group_key_fn(x)

            skips_left = pending_skips.get(key, 0)
            if skips_left:
                if skips_left == 1:
                    del pending_skips[key]
                else:
                    pending_skips[key] = skips_left - 1
                continue

            window = buffer_elements[key]
            window.append(x)
            buffer_size += 1

            if self.group_size is not None and self.group_size == len(window):
                # Emit a copy: the buffer is trimmed right after the yield, and
                # consumers that keep a reference to the emitted window (for
                # example ``.batch``) must not see it shrink afterwards.
                yield list(window)
                if self.step_size < self.window_size:
                    del window[:self.step_size]
                    buffer_size -= self.step_size
                else:
                    del buffer_elements[key]
                    buffer_size -= self.window_size
                    gap = self.step_size - self.window_size
                    if gap:
                        pending_skips[key] = gap

            if buffer_size == self.buffer_size:
                (result_to_yield, buffer_size) = self._remove_biggest_key(buffer_elements, buffer_size)
                if result_to_yield is not None:
                    yield result_to_yield

In [43]:
# Column roles used further down in the notebook.
time_idx = "time_idx"
target = "volume"
group_ids = ["agency", "sku"]

df = pd.read_parquet("../data/stallion.parquet")

# Time index: year * 12 + month, shifted so the earliest month becomes 0.
df = df.assign(time_idx=lambda d: d["date"].dt.year * 12 + d["date"].dt.month)
df = df.assign(time_idx=lambda d: d["time_idx"] - d["time_idx"].min())

# Additional features. They are computed here but not part of the columns
# selected below, so they do not appear in the final frame.
df = df.assign(
    month=lambda d: d["date"].dt.month.astype(str).astype("category"),  # categories have to be strings
    log_volume=lambda d: np.log(d["volume"] + 1e-8),
    avg_volume_by_sku=lambda d: d.groupby(["time_idx", "sku"], observed=True)["volume"].transform("mean"),
    avg_volume_by_agency=lambda d: d.groupby(["time_idx", "agency"], observed=True)["volume"].transform("mean"),
)

# Order rows by time index, keep only time index, target and group ids,
# then renumber the rows from 0.
df = (
    df.sort_values(by="time_idx")
    .loc[:, [time_idx, target] + group_ids]
    .reset_index(drop=True)
)
df.head(10)

Unnamed: 0,time_idx,volume,agency,sku
0,0,52.272,Agency_22,SKU_01
1,0,3324.2697,Agency_32,SKU_04
2,0,110.7,Agency_22,SKU_02
3,0,0.0,Agency_58,SKU_23
4,0,28.32,Agency_48,SKU_07
5,0,238.5387,Agency_22,SKU_05
6,0,0.0,Agency_58,SKU_17
7,0,126.36,Agency_31,SKU_01
8,0,475.790396,Agency_48,SKU_02
9,0,1.1502,Agency_40,SKU_04


In [48]:
# df.loc[(df.agency == 'Agency_04') & (df.sku == 'SKU_02')]

In [45]:
# Smoke test: wrap the prepared DataFrame in the row datapipe and print only
# the first item to check the emitted row format.
datapipe = PandasParserIterDataPipe(df)
for x in datapipe:
    print(x)
    break  # one row is enough; stop iterating

[0, 52.272, 'Agency_22', 'SKU_01']


In [46]:
# Roll a window over each (agency, sku) series and collect two windows per batch.
# The key is a tuple rather than a concatenated string: concatenation is
# ambiguous ("A" + "BC" == "AB" + "C") and fails for non-string identifiers.
ds = datapipe.rolling_groupby(
    group_key_fn=lambda x: (x[2], x[3]),
    window_size=2,
    step_size=2,
).batch(2)
#ds  = (pipes.FileOpener(datapipe, mode='rt').parse_csv(delimiter=',', skip_lines=1)
#            .map(parse_price)
#            .rolling(window_size=5, step=1)
#            .batch(4)
#      )
            



In [47]:
# Show the first four batches, each preceded by an empty line.
for i, x in enumerate(ds):
    print("", x, sep="\n")
    if i >= 3:
        break


[[[2, 15.228, 'Agency_04', 'SKU_02'], [3, 5.6160000000000005, 'Agency_04', 'SKU_02']], [[0, 8.6904, 'Agency_28', 'SKU_08'], [1, 7.3485, 'Agency_28', 'SKU_08'], [2, 6.5178, 'Agency_28', 'SKU_08'], [3, 7.667999999999999, 'Agency_28', 'SKU_08']]]

[[[2, 3760.02, 'Agency_51', 'SKU_02'], [3, 4886.568, 'Agency_51', 'SKU_02']], [[0, 2653.1279999999997, 'Agency_49', 'SKU_05'], [1, 2614.149, 'Agency_49', 'SKU_05'], [2, 2758.94625, 'Agency_49', 'SKU_05'], [3, 3387.8505, 'Agency_49', 'SKU_05']]]

[[[2, 0.0846, 'Agency_29', 'SKU_03'], [3, 0.0, 'Agency_29', 'SKU_03']], [[0, 2211.3873, 'Agency_51', 'SKU_05'], [1, 2227.6815, 'Agency_51', 'SKU_05'], [2, 2267.74725, 'Agency_51', 'SKU_05'], [3, 2991.9255, 'Agency_51', 'SKU_05']]]

[[[2, 0.1692, 'Agency_28', 'SKU_03'], [3, 0.6768, 'Agency_28', 'SKU_03']], [[0, 3384.9359999999997, 'Agency_51', 'SKU_01'], [1, 3665.0879999999997, 'Agency_51', 'SKU_01'], [2, 3868.452, 'Agency_51', 'SKU_01'], [3, 5005.8, 'Agency_51', 'SKU_01']]]


In [16]:
from torchdata.datapipes.iter import FileLister
# import torcharrow.dtypes as dt
# DTYPE = dt.Struct([dt.Field("Values", dt.int32)])
#source_dp = FileLister(".", masks="df*.parquet")
#parquet_df_dp = source_dp.load_parquet_as_df(dtype=DTYPE)
#arquet_df_dp = source_dp.load_parquet_as_df()
# list(parquet_df_dp)[0]

ImportError: The library 'torcharrow' is necessary for this DataPipe but it is not available.Please visit https://github.com/facebookresearch/torcharrow/ to install it.

In [None]:
# CSV demo: wrap a single file name, open it in text mode and parse it as
# comma-separated rows with skip_lines=1 (presumably to drop a header line --
# confirm against the file).
# NOTE(review): this rebinds the global `datapipe` used by the earlier
# DataFrame cells, so re-running those cells afterwards picks up this pipe.
# NOTE(review): the name `csv` would shadow the standard-library module of the
# same name if it were imported in this notebook.
datapipe = pipes.IterableWrapper(["HistoricalQuotes.csv"])
csv = pipes.FileOpener(datapipe, mode='rt').parse_csv(delimiter=',', skip_lines=1)


# Pull the first parsed row to verify the pipeline end to end.
next(iter(csv))


