# Debug pipeline

A pipeline that emits a log statement in between each step, which is useful for debugging.

In [1]:
import logging

import numpy as np
import pandas as pd

from sklego.transformers import RandomAdder
from sklego.pipeline import Pipeline

# Show INFO-and-above records in the notebook output, prefixing every
# message with the name and line number of the function that emitted it.
logging.basicConfig(
    level=logging.INFO,
    format='[%(funcName)s:%(lineno)d] - %(message)s',
)

In [2]:
# Pipeline set-up: a small random feature matrix and a dummy target.
n_samples, n_features = 3, 5
X = np.random.randn(n_samples, n_features)
y = np.arange(n_samples)

# Four RandomAdder steps named random_add1 ... random_add4, each configured
# with a noise value ten times larger than the previous one.
steps = [
    (f'random_add{index}', RandomAdder(noise=noise))
    for index, noise in enumerate((1, 10, 100, 1000), start=1)
]

print('Initial X:\n', X)

Initial X:
 [[-0.16773481  0.64902899  0.16391525  0.20100636 -0.98979189]
 [-0.48282472 -0.12298425  0.38843326 -0.19128699  1.29914286]
 [-0.20224853  0.73222224  1.7676616   0.33681068  0.4259045 ]]


## Same as sklearn pipeline.

This pipeline behaves exactly the same as a normal pipeline.

In [3]:
# No log_callback is given here, so the pipeline is built from the steps only.
pipe = Pipeline(steps=steps)

# Fit on the sample data, then push the same data through every step.
pipe.fit(X, y)
X_out = pipe.transform(X)

print('Transformed X:\n', X_out)

Transformed X:
 [[-2.27369773e+03 -1.28385616e+03  7.54308994e+02  8.22750482e+02
  -6.87728464e+01]
 [ 3.92726607e+02 -1.45382252e+03  5.17096026e+02  1.56225333e+03
  -9.56081438e+02]
 [ 3.45906676e+02  9.82796448e+02  1.17405280e+02 -1.37267547e-02
  -1.38418879e+03]]


## Log statements

It is possible to set a `log_callback` argument that logs in between each step.

_Note: there are __three__ log statements while there are __four__ steps, because there are __three__ moments __in between__ the steps. The output can be checked outside of the pipeline._

In [4]:
# Passing 'default' enables the pipeline's own logging callback
# (it shows up as `_default_log_callback` in the log records).
pipe = Pipeline(steps=steps, log_callback='default')

# Log records are emitted while the pipeline is being fitted/applied.
pipe.fit(X, y)
X_out = pipe.transform(X)

print('Transformed X:\n', X_out)

[_default_log_callback:34] - [RandomAdder(noise=1, random_state=None)] shape=(3, 5) time=0:00:00.000152
[_default_log_callback:34] - [RandomAdder(noise=10, random_state=None)] shape=(3, 5) time=0:00:00.000152
[_default_log_callback:34] - [RandomAdder(noise=100, random_state=None)] shape=(3, 5) time=0:00:00.000112


Transformed X:
 [[ 1438.49835932   709.83155395  -493.03390806  1135.32602983
   2060.47247893]
 [-1297.46102742  -449.823489   -1320.05596955   771.60161285
   2014.15161223]
 [ -436.68625594   129.65636163   387.20585108  -528.67432801
    344.6264161 ]]


## Set the `log_callback` function later

It is possible to set the `log_callback` later than initialisation, i.e. after the pipeline has been created.

In [5]:
# Start from a pipeline without logging ...
pipe = Pipeline(steps=steps)
# ... and switch the callback on afterwards through the attribute.
pipe.log_callback = 'default'

pipe.fit(X, y)
X_out = pipe.transform(X)

print('Transformed X:\n', X_out)

[_default_log_callback:34] - [RandomAdder(noise=1, random_state=None)] shape=(3, 5) time=0:00:00.000179
[_default_log_callback:34] - [RandomAdder(noise=10, random_state=None)] shape=(3, 5) time=0:00:00.000147
[_default_log_callback:34] - [RandomAdder(noise=100, random_state=None)] shape=(3, 5) time=0:00:00.000106


Transformed X:
 [[-1977.6463104   1379.71525581   157.89964811  -990.57296446
    479.00164998]
 [  867.50924642  -561.88443294  1217.09638916  -916.91035666
   -377.95658548]
 [ -583.80583492   409.07921835   130.5695047    736.60527176
    879.80655116]]


## Custom `log_callback`

The custom log callback function expects two arguments: the output of each step, which is a tuple containing the output of the step and the step itself, and the execution time of the step.

In [6]:
def log_callback(output, execution_time):
    '''My custom `log_callback` function

    Logs the step, the shape and memory footprint of its result, and the
    execution time at INFO level on the logger named after this module.

    Parameters
    ----------
    output : tuple(
            numpy.ndarray or pandas.DataFrame
            :class:estimator or :class:transformer
        )
        The output of the step and a step in the pipeline.
    execution_time : object
        The execution time of the step. It is only rendered with ``str()``
        in the log message; the output of this notebook shows values such
        as ``0:00:00.000147``, so it looks like a ``datetime.timedelta``
        rather than a plain float -- confirm against the installed version.
    '''
    logger = logging.getLogger(__name__)
    step_result, step = output

    if isinstance(step_result, pd.DataFrame):
        # A DataFrame has no `.nbytes` attribute; sum the per-column
        # memory usage instead (index excluded, to mirror ndarray.nbytes).
        nbytes = int(step_result.memory_usage(index=False).sum())
    else:
        nbytes = step_result.nbytes

    # Lazy %-style arguments: the message is only built when the record
    # is actually emitted.
    logger.info('[%s] shape=%s nbytes=%s time=%s',
                step, step_result.shape, nbytes, execution_time)

    
# Replace the callback on the existing pipeline with the custom function.
pipe.log_callback = log_callback

pipe.fit(X, y)
X_out = pipe.transform(X)

print('Transformed X:\n', X_out)

[log_callback:16] - [RandomAdder(noise=1, random_state=None)] shape=(3, 5) nbytes=120 time=0:00:00.000147
[log_callback:16] - [RandomAdder(noise=10, random_state=None)] shape=(3, 5) nbytes=120 time=0:00:00.000124
[log_callback:16] - [RandomAdder(noise=100, random_state=None)] shape=(3, 5) nbytes=120 time=0:00:00.000116


Transformed X:
 [[-810.89092124 -602.61828357 -384.89682475 -378.56788659  321.30563172]
 [ 587.65339826  172.10799413 -197.44491653   80.98512601 -818.76491327]
 [1357.06448433 -413.0853857   947.83964204 -261.30476274  -69.51833145]]


## Enough logging

Remove the `log_callback` function when it is no longer needed.

In [7]:
# Setting the callback to None switches the logging off again.
pipe.log_callback = None

pipe.fit(X, y)
X_out = pipe.transform(X)

print('Transformed X:\n', X_out)

Transformed X:
 [[ -797.78866489   821.4417173   -342.52097735   251.97853627
  -1899.29137337]
 [ -641.45239757  -343.39244121  -448.69207308  -977.23496373
   1425.05702118]
 [   -1.89991498  -373.66866774 -1859.99260412  1053.19243903
   -277.31909253]]
