# Basic Workflow

The most simple workflow that you can write is a sequential pipeline of steps,
where the outputs of a component are fed as input to the following component,
employing a scikit-learn-like Pipeline.

In itwinai, a step is also called "component" and is implemented by extending
the ``itwinai.components.BaseComponent`` class. Each component implements
the `execute(...)` method, which provides a unified interface to interact with
each component.

The aim of itwinai components is to provide reusable machine learning best
practices, and some common operations are already encoded in some abstract
components. Some examples are:
- ``DataGetter``: has no input and returns a dataset, collected from somewhere
(e.g., downloaded).
- ``DataSplitter``: splits an input dataset into train, validation and test.
- ``DataPreproc``: perform preprocessing on train, validation, and test
datasets.
- ``Trainer``: trains an ML model and returns the trained model.
- ``Saver``: saved an ML artifact (e.g., dataset, model) to disk.

In this tutorial you will see how to create new components and how they
are assembled into sequential pipelines. Newly created components are
in a separate file called 'basic_components.py'.

In [49]:
import sys
# sys.path.enter('../')
# import os
# os.getcwd()
# %cd ..
%ls

/root/itwinai/itwinai
AUTHORS.md       COPYRIGHT  [0m[01;34mUNKNOWN.egg-info[0m/  pyproject.toml  [01;34muse-cases[0m/
CHANGELOG        LICENSE    [01;34mbuild[0m/             [01;34msrc[0m/
CODEOWNERS       Makefile   [01;34mdocs[0m/              [01;34mtests[0m/
CONTRIBUTING.md  README.md  [01;34menv-files[0m/         [01;34mtutorials[0m/


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [55]:
import importlib  
foobar = importlib.import_module("tutorials.ml-workflows")

In [46]:
import importlib.util spec
import_file = importlib.util.spec_from_file_location("tutorials/ml_workflows", "basic_components.py")    
# arguments shared are 
file2=importlib.util.module_from_spec(import_file)        
spec.loader.exec_module(file2)  

SyntaxError: invalid syntax (4188913185.py, line 1)

In [36]:
from itwinai.pipeline import Pipeline

# Import the custom components from file
from tubasic_components import MyDataGetter, MyDatasetSplitter, MyTrainer

SyntaxError: invalid syntax (771939628.py, line 4)

In [27]:
# %load tutorials/ml-workflows/basic_components.py
"""
Here we show how to implement component interfaces in a simple way.
"""
from typing import List, Optional, Tuple, Any
from itwinai.components import (
    DataGetter, DataSplitter, Trainer, Saver, monitor_exec
)


class MyDataGetter(DataGetter):
    def __init__(self, data_size: int, name: Optional[str] = None) -> None:
        super().__init__(name)
        self.data_size = data_size
        self.save_parameters(data_size=data_size)

    @monitor_exec
    def execute(self) -> List[int]:
        """Return a list dataset.

        Returns:
            List[int]: dataset
        """
        return list(range(self.data_size))


class MyDatasetSplitter(DataSplitter):
    @monitor_exec
    def execute(
        self,
        dataset: List[int]
    ) -> Tuple[List[int], List[int], List[int]]:
        """Splits a list dataset into train, validation and test datasets.

        Args:
            dataset (List[int]): input list dataset.

        Returns:
            Tuple[List[int], List[int], List[int]]: train, validation, and
            test datasets.
        """
        train_n = int(len(dataset)*self.train_proportion)
        valid_n = int(len(dataset)*self.validation_proportion)
        train_set = dataset[:train_n]
        vaild_set = dataset[train_n:train_n+valid_n]
        test_set = dataset[train_n+valid_n:]
        return train_set, vaild_set, test_set


class MyTrainer(Trainer):
    def __init__(self, lr: float = 1e-3, name: Optional[str] = None) -> None:
        super().__init__(name)
        self.save_parameters(name=name, lr=lr)

    @monitor_exec
    def execute(
        self,
        train_set: List[int],
        vaild_set: List[int],
        test_set: List[int]
    ) -> Tuple[List[int], List[int], List[int], str]:
        """Dummy ML trainer mocking a ML training algorithm.

        Args:
            train_set (List[int]): training dataset.
            vaild_set (List[int]): validation dataset.
            test_set (List[int]): test dataset.

        Returns:
            Tuple[List[int], List[int], List[int], str]: train, validation,
            test datasets, and trained model.
        """
        return train_set, vaild_set, test_set, "my_trained_model"


class MySaver(Saver):
    @monitor_exec
    def execute(self, artifact: Any) -> Any:
        """Saves an artifact to disk.

        Args:
            artifact (Any): artifact to save (e.g., dataset, model).

        Returns:
            Any: input artifact.
        """
        return artifact


In [28]:

from itwinai.pipeline import Pipeline

# Import the custom components from file
from basic_components import MyDataGetter, MyDatasetSplitter, MyTrainer

ModuleNotFoundError: No module named 'basic_components'

In [26]:
# %load tutorials/ml-workflows/basic_components.py
"""
Here we show how to implement component interfaces in a simple way.
"""
from typing import List, Optional, Tuple, Any
from itwinai.components import (
    DataGetter, DataSplitter, Trainer, Saver, monitor_exec
)


class MyDataGetter(DataGetter):
    def __init__(self, data_size: int, name: Optional[str] = None) -> None:
        super().__init__(name)
        self.data_size = data_size
        self.save_parameters(data_size=data_size)

    @monitor_exec
    def execute(self) -> List[int]:
        """Return a list dataset.

        Returns:
            List[int]: dataset
        """
        return list(range(self.data_size))


class MyDatasetSplitter(DataSplitter):
    @monitor_exec
    def execute(
        self,
        dataset: List[int]
    ) -> Tuple[List[int], List[int], List[int]]:
        """Splits a list dataset into train, validation and test datasets.

        Args:
            dataset (List[int]): input list dataset.

        Returns:
            Tuple[List[int], List[int], List[int]]: train, validation, and
            test datasets.
        """
        train_n = int(len(dataset)*self.train_proportion)
        valid_n = int(len(dataset)*self.validation_proportion)
        train_set = dataset[:train_n]
        vaild_set = dataset[train_n:train_n+valid_n]
        test_set = dataset[train_n+valid_n:]
        return train_set, vaild_set, test_set


class MyTrainer(Trainer):
    def __init__(self, lr: float = 1e-3, name: Optional[str] = None) -> None:
        super().__init__(name)
        self.save_parameters(name=name, lr=lr)

    @monitor_exec
    def execute(
        self,
        train_set: List[int],
        vaild_set: List[int],
        test_set: List[int]
    ) -> Tuple[List[int], List[int], List[int], str]:
        """Dummy ML trainer mocking a ML training algorithm.

        Args:
            train_set (List[int]): training dataset.
            vaild_set (List[int]): validation dataset.
            test_set (List[int]): test dataset.

        Returns:
            Tuple[List[int], List[int], List[int], str]: train, validation,
            test datasets, and trained model.
        """
        return train_set, vaild_set, test_set, "my_trained_model"


class MySaver(Saver):
    @monitor_exec
    def execute(self, artifact: Any) -> Any:
        """Saves an artifact to disk.

        Args:
            artifact (Any): artifact to save (e.g., dataset, model).

        Returns:
            Any: input artifact.
        """
        return artifact


ModuleNotFoundError: No module named 'basic_components'

In [25]:
# %load tutorials/ml-workflows/basic_components.py
"""
Here we show how to implement component interfaces in a simple way.
"""
from typing import List, Optional, Tuple, Any
from itwinai.components import (
    DataGetter, DataSplitter, Trainer, Saver, monitor_exec
)


class MyDataGetter(DataGetter):
    def __init__(self, data_size: int, name: Optional[str] = None) -> None:
        super().__init__(name)
        self.data_size = data_size
        self.save_parameters(data_size=data_size)

    @monitor_exec
    def execute(self) -> List[int]:
        """Return a list dataset.

        Returns:
            List[int]: dataset
        """
        return list(range(self.data_size))


class MyDatasetSplitter(DataSplitter):
    @monitor_exec
    def execute(
        self,
        dataset: List[int]
    ) -> Tuple[List[int], List[int], List[int]]:
        """Splits a list dataset into train, validation and test datasets.

        Args:
            dataset (List[int]): input list dataset.

        Returns:
            Tuple[List[int], List[int], List[int]]: train, validation, and
            test datasets.
        """
        train_n = int(len(dataset)*self.train_proportion)
        valid_n = int(len(dataset)*self.validation_proportion)
        train_set = dataset[:train_n]
        vaild_set = dataset[train_n:train_n+valid_n]
        test_set = dataset[train_n+valid_n:]
        return train_set, vaild_set, test_set


class MyTrainer(Trainer):
    def __init__(self, lr: float = 1e-3, name: Optional[str] = None) -> None:
        super().__init__(name)
        self.save_parameters(name=name, lr=lr)

    @monitor_exec
    def execute(
        self,
        train_set: List[int],
        vaild_set: List[int],
        test_set: List[int]
    ) -> Tuple[List[int], List[int], List[int], str]:
        """Dummy ML trainer mocking a ML training algorithm.

        Args:
            train_set (List[int]): training dataset.
            vaild_set (List[int]): validation dataset.
            test_set (List[int]): test dataset.

        Returns:
            Tuple[List[int], List[int], List[int], str]: train, validation,
            test datasets, and trained model.
        """
        return train_set, vaild_set, test_set, "my_trained_model"


class MySaver(Saver):
    @monitor_exec
    def execute(self, artifact: Any) -> Any:
        """Saves an artifact to disk.

        Args:
            artifact (Any): artifact to save (e.g., dataset, model).

        Returns:
            Any: input artifact.
        """
        return artifact


ModuleNotFoundError: No module named 'basic_components'

In [21]:

"""
Here we show how to implement component interfaces in a simple way.
"""
from typing import List, Optional, Tuple, Any
from itwinai.components import (
    DataGetter, DataSplitter, Trainer, Saver, monitor_exec
)


class MyDataGetter(DataGetter):
    def __init__(self, data_size: int, name: Optional[str] = None) -> None:
        super().__init__(name)
        self.data_size = data_size
        self.save_parameters(data_size=data_size)

    @monitor_exec
    def execute(self) -> List[int]:
        """Return a list dataset.

        Returns:
            List[int]: dataset
        """
        return list(range(self.data_size))


class MyDatasetSplitter(DataSplitter):
    @monitor_exec
    def execute(
        self,
        dataset: List[int]
    ) -> Tuple[List[int], List[int], List[int]]:
        """Splits a list dataset into train, validation and test datasets.

        Args:
            dataset (List[int]): input list dataset.

        Returns:
            Tuple[List[int], List[int], List[int]]: train, validation, and
            test datasets.
        """
        train_n = int(len(dataset)*self.train_proportion)
        valid_n = int(len(dataset)*self.validation_proportion)
        train_set = dataset[:train_n]
        vaild_set = dataset[train_n:train_n+valid_n]
        test_set = dataset[train_n+valid_n:]
        return train_set, vaild_set, test_set


class MyTrainer(Trainer):
    def __init__(self, lr: float = 1e-3, name: Optional[str] = None) -> None:
        super().__init__(name)
        self.save_parameters(name=name, lr=lr)

    @monitor_exec
    def execute(
        self,
        train_set: List[int],
        vaild_set: List[int],
        test_set: List[int]
    ) -> Tuple[List[int], List[int], List[int], str]:
        """Dummy ML trainer mocking a ML training algorithm.

        Args:
            train_set (List[int]): training dataset.
            vaild_set (List[int]): validation dataset.
            test_set (List[int]): test dataset.

        Returns:
            Tuple[List[int], List[int], List[int], str]: train, validation,
            test datasets, and trained model.
        """
        return train_set, vaild_set, test_set, "my_trained_model"


class MySaver(Saver):
    @monitor_exec
    def execute(self, artifact: Any) -> Any:
        """Saves an artifact to disk.

        Args:
            artifact (Any): artifact to save (e.g., dataset, model).

        Returns:
            Any: input artifact.
        """
        return artifact


ModuleNotFoundError: No module named 'basic_components'

In [None]:
if __name__ == "__main__":
    # Assemble them in a scikit-learn like pipeline
    pipeline = Pipeline([
        MyDataGetter(data_size=100),
        MyDatasetSplitter(
            train_proportion=.5,
            validation_proportion=.25,
            test_proportion=0.25
        ),
        MyTrainer()
    ])

    # Inspect steps
    print(pipeline[0])
    print(pipeline[2].name)
    print(pipeline[1].train_proportion)

    # Run pipeline
    _, _, _, trained_model = pipeline.execute()
    print("Trained model: ", trained_model)

    # You can also create a Pipeline from a dict of components, which
    # simplifies their retrieval by name
    pipeline = Pipeline({
        "datagetter": MyDataGetter(data_size=100),
        "splitter": MyDatasetSplitter(
            train_proportion=.5,
            validation_proportion=.25,
            test_proportion=0.25
        ),
        "trainer": MyTrainer()
    })

    # Inspect steps
    print(pipeline["datagetter"])
    print(pipeline["trainer"].name)
    print(pipeline["splitter"].train_proportion)

    # Run pipeline
    _, _, _, trained_model = pipeline.execute()
    print("Trained model: ", trained_model)
