In [1]:
#|hide
#|eval: false
# Environment bootstrap: only when `/content` exists (presumably the marker directory of a
# Colab runtime — TODO confirm), pip-install fastrl's `dev` extras and pyvirtualdisplay, then
# apt-get install xvfb and python-opengl with the apt-get output discarded.
! [ -e /content ] && pip install -Uqq fastrl['dev'] pyvirtualdisplay && \
                     apt-get install -y xvfb python-opengl > /dev/null 2>&1 
# NOTE: IF YOU SEE VERSION ERRORS, IT IS SAFE TO IGNORE THEM. COLAB IS BEHIND IN SOME OF THE PACKAGE VERSIONS

In [2]:
#|hide
#|eval: false
# Notebook environment setup: outside Colab, load the nbdev helpers and sanity-check the
# runtime flags; inside Colab, start a virtual display instead.
from fastcore.imports import in_colab
# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    from nbdev.showdoc import *
    from nbdev.imports import *
    # The assertions are skipped when the IN_TEST environment variable holds a non-empty value.
    # NOTE(review): `os`, IN_NOTEBOOK, IN_COLAB and IN_IPYTHON are never imported by name;
    # presumably they are provided by the star imports above — confirm.
    if not os.environ.get("IN_TEST", None):
        assert IN_NOTEBOOK
        assert not IN_COLAB
        assert IN_IPYTHON
else:
    # Virtual display is needed for colab
    from pyvirtualdisplay import Display
    # NOTE(review): this rebinds the name `display` in the notebook namespace, which would
    # shadow IPython's `display` helper in later cells — confirm that is acceptable.
    display = Display(visible=0, size=(400, 300))
    display.start()

In [11]:
#|default_exp pipes.iter.transforms

In [25]:
#|export
# Python native modules
from typing import Callable,Union,TypeVar
# Third party libs
from fastcore.all import *
import torchdata.datapipes as dp
from torchdata.datapipes.iter import IterDataPipe
from torchdata.dataloader2.graph import find_dps,DataPipeGraph,Type,DataPipe
# Local modules

# Item and Batch Transforms
> DataPipes for calling functions over iterations

In [16]:
#|export
T_co = TypeVar("T_co", covariant=True)

class ItemTransformer(IterDataPipe[T_co]):
    def __init__(
            self,
            # Upstream iterable; every element it yields is passed through `item_tfms`
            source_datapipe:IterDataPipe[T_co],
            # A list of Callables, each accepting one item and returning one item
            item_tfms:List[Callable]
    ) -> None:
        self.source_datapipe:IterDataPipe[T_co] = source_datapipe
        # Wrap the callables in a single `Pipeline` so `__iter__` makes one call per item
        self.item_tfms:Pipeline[Callable] = Pipeline(item_tfms)

    def __iter__(self) -> Iterator[T_co]:
        # Lazily pull one element at a time from the source and emit its transformed value
        for item in self.source_datapipe:
            transformed = self.item_tfms(item)
            yield transformed

ItemTransformer.__doc__ = """Converts `item_tfms` into a `Pipeline` that is run over for every iter in `source_datapipe`"""

`ItemTransformer` can be used to make quick augmentations to an existing pipeline by passing simple
functions into `item_tfms`. For example, below, given the inputs `0` through `9` (`range(10)`), we add one
to each element and then multiply the result by 2...

In [21]:
# Single-argument functions that `ItemTransformer` runs on every element
def add_one(o): return o+1
def multiple_by_two(o): return o*2

# `range(10)` stands in for the source datapipe
pipe = ItemTransformer(range(10), item_tfms=[add_one,multiple_by_two])
test_eq(list(pipe),[2, 4, 6, 8, 10, 12, 14, 16, 18, 20])
list(pipe)

[2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

In [26]:
#|export
class BatchTransformer(IterDataPipe[T_co]):
    def __init__(
            self,
            # Upstream iterable; every element it yields is handed to `batch_tfms` as one batch
            source_datapipe:IterDataPipe[T_co],
            # A list of Callables, each accepting a whole batch and returning a whole batch
            batch_tfms:List[Callable]
    ) -> None:
        self.source_datapipe:IterDataPipe[T_co] = source_datapipe
        # Wrap the callables in a single `Pipeline` so `__iter__` makes one call per batch
        self.batch_tfms:Pipeline[Callable] = Pipeline(batch_tfms)

    def __iter__(self) -> Iterator[T_co]:
        # Lazily pull one batch at a time from the source and emit its transformed value
        for batch in self.source_datapipe:
            transformed_batch = self.batch_tfms(batch)
            yield transformed_batch

BatchTransformer.__doc__ = """Converts `batch_tfms` into a `Pipeline` that is run over for every iter in `source_datapipe`"""

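`BatchTransformer` is identical to `ItemTransformer` in how it works, but semantically the functions it runs should
operate on an entire batch of elements at once. Below, the inputs are first grouped into batches of 2, and each function receives a whole batch...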

In [23]:
# Batch-level functions: each receives a whole batch and returns a new list
def add_one(o): return [element+1 for element in o]
def multiple_by_two(o): return [element*2 for element in o]

# Group `range(10)` into batches of 2, then transform every batch as a unit
pipe = BatchTransformer(
    dp.iter.IterableWrapper(range(10)).batch(2),
    batch_tfms=[add_one,multiple_by_two]
)
test_eq(list(pipe),[[2, 4], [6, 8], [10, 12], [14, 16], [18, 20]])
list(pipe)

[[2, 4], [6, 8], [10, 12], [14, 16], [18, 20]]

In [24]:
#|hide
#|eval: false
# Export step: outside Colab, import and run nbdev's `nbdev_export`.
# NOTE(review): presumably this writes the `#|export` cells to the module named by the
# `#|default_exp` directive — confirm against the nbdev documentation.
from fastcore.imports import in_colab

# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    from nbdev import nbdev_export
    nbdev_export()