In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and local edits are picked up without a restart.
%load_ext autoreload
%autoreload 2

# Debug pipeline

A pipeline that has a log statement in between each step, useful for debugging a pipeline.

In [None]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion

from sklego.pipeline import DebugPipeline, example_log_message_callback

In [None]:
# DebugPipeline set-up: a small all-zeros feature matrix and integer targets
n_samples = 3
n_features = 5
X = np.zeros(shape=(n_samples, n_features))
y = np.arange(0, n_samples)


class Adder(TransformerMixin, BaseEstimator):
    """Stateless transformer that adds a constant to its input.

    Parameters
    ----------
    value : number
        The constant that `transform` adds to ``X``.
    """

    def __init__(self, value):
        # Keep the attribute name identical to the constructor argument so the
        # inherited `get_params` / `set_params` (and `sklearn.base.clone`) work.
        self.value = value

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with ``value`` added to it."""
        return X + self.value

    def __repr__(self):
        return f'Adder(value={self.value})'
    
    
# One (name, transformer) pair per constant, in increasing order of magnitude
steps = [
    (f'add_{amount}', Adder(value=amount))
    for amount in (1, 10, 100, 1000)
]

## Same as sklearn pipeline.

This pipeline behaves exactly the same as a normal pipeline.

In [None]:
# No logging configured: fit on the toy data, then transform the same data
pipe = DebugPipeline(steps)
X_out = pipe.fit(X, y).transform(X)

print('Transformed X:\n', X_out)

## Sklearn pipeline has a verbose option:

In [None]:
# Same run as before, with the `verbose` flag switched on
pipe = DebugPipeline(
    steps,
    verbose=True,
)
X_out = pipe.fit(X, y).transform(X)

print('Transformed X:\n', X_out)

## Log message callback

In the `DebugPipeline` it is possible to set a `log_message_callback` variable that logs in between each step.

_Note: there are __three__ log statements while there are __four__ steps, because there are __three__ moments __in between__ the steps. The final output can be checked outside of the pipeline._

In [None]:
# Hand the example callback to the pipeline at construction time
pipe = DebugPipeline(
    steps,
    log_message_callback=example_log_message_callback,
)
X_out = pipe.fit(X, y).transform(X)

print('Transformed X:\n', X_out)

## Set the `log_message_callback` function later

It is also possible to set the `log_message_callback` after initialisation.

In [None]:
# Build the pipeline without a callback first ...
pipe = DebugPipeline(steps)
# ... and attach the example callback afterwards through the attribute
pipe.log_message_callback = example_log_message_callback

X_out = pipe.fit(X, y).transform(X)

print('Transformed X:\n', X_out)

## Custom `log_message_callback`

The custom log callback function expects the output of each step, which is a tuple containing the output of the step and the step itself, and the execution time of the step.

In [None]:
def custom_log_message_callback(transformer, X=None, y=None):
    """
    My custom `log_message_callback` function.

    Builds a one-line message that names the transformer and, when data is
    supplied, describes that data.

    Parameters
    ----------
    transformer : object
        The step to report on; its string form is placed between square
        brackets at the start of the message.
    X : numpy.ndarray or other object with a ``shape``, optional
        Data belonging to the step. When given, its ``shape`` is appended to
        the message, followed by its ``nbytes`` if it has that attribute.
    y : object, optional
        Accepted but not used in the message.

    Returns
    -------
    str
        The formatted log message.
    """
    # NOTE(review): the arguments DebugPipeline passes to this callback depend
    # on the installed sklego version -- confirm this signature matches it.
    msg = f'[{transformer}]'
    if X is not None:
        msg += f' shape={X.shape}'
        # Not every container with a shape also reports its size in bytes,
        # so only add that part when the attribute is available.
        nbytes = getattr(X, 'nbytes', None)
        if nbytes is not None:
            msg += f' nbytes={nbytes}'
    return msg

    
# Swap the callback on the existing pipeline for the custom one defined above
pipe.log_message_callback = custom_log_message_callback

X_out = pipe.fit(X, y).transform(X)

print('Transformed X:\n', X_out)

# Feature union

Feature union also works with the debug pipeline.

In [None]:
# Two debug pipelines over the same steps, differing only in their log callback
pipe_w_default_log_callback = DebugPipeline(
    steps,
    log_message_callback=example_log_message_callback,
)
pipe_w_custom_log_callback = DebugPipeline(
    steps,
    log_message_callback=custom_log_message_callback,
)

# Combine both pipelines as the named members of a single FeatureUnion
pipe_union = FeatureUnion(transformer_list=[
    ('pipe_w_default_log_callback', pipe_w_default_log_callback),
    ('pipe_w_custom_log_callback', pipe_w_custom_log_callback),
])

X_out = pipe_union.fit(X, y).transform(X)

print('Transformed X:\n', X_out)

## Enough logging

Remove the `log_message_callback` function by setting it to `None` when it is no longer needed.

In [None]:
# Detach the callback again by resetting the attribute to None
pipe.log_message_callback = None

X_out = pipe.fit(X, y).transform(X)

print('Transformed X:\n', X_out)