In [10]:
import datetime as dt
from functools import partial, partialmethod, wraps
import inspect
from itertools import islice
import logging

import numpy as np
import pandas as pd

from sklearn.externals.joblib.memory import Memory
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import check_memory

from sklego.transformers import RandomAdder

# Show INFO-level records and prefix every message with the name and line
# number of the function that emitted it.
logging.basicConfig(
    level=logging.INFO,
    format='[%(funcName)s:%(lineno)d] - %(message)s',
)

In [22]:
def _default_log_step(func):                                                                                       
    @wraps(func)
    def wrapper(*args, **kwargs):
        logger = logging.getLogger(inspect.stack()[1].function)                                           
        
        tic = dt.datetime.now()
        out, step = func(*args, **kwargs)
        toc = str(dt.datetime.now() - tic)                                                                
        
        logger.info(f'[{step}] shape={out.shape} time={toc}')                                 
        return out, step                                                                                
    return wrapper
    

def _log_step_cache(self, func=None, *args, log_step=_default_log_step, **kwargs):
    """Stand-in for a memory object's ``cache`` that adds step logging.

    Parameters
    ----------
    self : object
        Memory-like object whose original ``cache`` method is available on
        it as ``_cache``.
    func : callable, optional
        Function to cache. When it is callable it is decorated with
        ``log_step`` first; any other value (such as ``None``) is handed to
        ``_cache`` untouched.
    *args, **kwargs
        Forwarded to ``self._cache``.
    log_step : callable, keyword-only
        Decorator applied to ``func`` before it is cached.

    Returns
    -------
    object
        Whatever ``self._cache`` returns.
    """
    if callable(func):
        func = log_step(func)
    # ``func`` is passed positionally. The previous ``func=func, *args`` form
    # let the first extra positional argument occupy the ``func`` slot and
    # failed with "got multiple values for argument 'func'".
    return self._cache(func, *args, **kwargs)


class DebugPipeline(Pipeline):
    """``Pipeline`` whose memory object logs every function it caches.

    On construction the memory object's ``cache`` method is replaced by a
    logging variant, so each callable handed to ``memory.cache`` is first
    decorated with ``log_step``.

    Parameters
    ----------
    steps : list
        Passed to ``Pipeline`` unchanged.
    memory : optional
        Run through ``check_memory``; the resulting memory object is patched
        in place and then passed to ``Pipeline``.
    log_step : callable, default ``_default_log_step``
        Decorator applied to every callable given to ``memory.cache``.
    """

    def __init__(self, steps, memory=None, log_step=_default_log_step):
        memory = check_memory(memory)

        # Remember the untouched ``cache`` only the first time this memory
        # object is patched. Saving it again would store the logging variant
        # under ``_cache`` and make ``_log_step_cache`` call itself forever.
        if '_cache' not in vars(memory):
            memory._cache = memory.cache

        # ``log_step`` used to be accepted and silently dropped; binding it
        # here makes a custom decorator actually take effect.
        memory.cache = partial(_log_step_cache, memory, log_step=log_step)

        # Keep the constructor argument readable from the instance
        # (NOTE(review): scikit-learn's ``get_params`` looks up every
        # ``__init__`` parameter as an attribute -- confirm for the version in use).
        self.log_step = log_step

        super().__init__(steps, memory=memory)

In [23]:
# Baseline run: a plain Pipeline of four RandomAdder(noise=100) steps, fitted
# on a small random frame and then applied to that same frame.
X = pd.DataFrame(np.random.randn(10, 5))
y = np.arange(10)

pipe_orig = Pipeline([
    (f'random_add{idx}', RandomAdder(noise=100)) for idx in range(1, 5)
])

print(f'X before: {X}')
# ``fit`` returns the pipeline itself, so fitting and transforming can be chained.
X_out = pipe_orig.fit(X, y=y).transform(X)
print(f'X after: \n{X_out}')

X before:           0         1         2         3         4
0  0.068480  0.517247 -0.122100 -0.624834  0.765233
1  2.643982  0.662962 -0.380489 -0.279044 -1.604832
2  1.204842 -0.480278 -0.111312 -0.764096  0.309626
3 -1.073669  0.523912 -0.723988 -0.259512  1.129653
4 -0.786540 -0.626178  1.189319 -1.811660 -0.857588
5  2.316485  1.323452  1.669453  0.636582  0.923132
6  0.601322  0.948684  0.314898  1.073819  0.168835
7  0.267268 -0.241968  0.840464  0.507223  2.934331
8  1.319100  0.806463 -1.041216 -0.073117  0.145911
9  1.376432 -0.279611 -0.560682  0.990296 -0.358120
X after: 
[[ -49.68871661 -177.29822815 -163.97254004 -161.76807657  160.60946758]
 [-113.36141529  399.2543983   -13.04329446 -184.16556957 -218.07789254]
 [ 273.4764539   244.96439522  214.85633084   46.44443537 -254.06816057]
 [ 105.17324678   57.65085706    4.46803883  122.26636933  277.88786634]
 [ 286.1639509   109.16647305  167.80442436 -244.99012522   27.62922914]
 [ -91.38494973   19.19929135 -298.33501819

In [24]:
# Same experiment through DebugPipeline: the baseline pipeline is nested as
# the first step, followed by four adders with noise 1, 10, 100 and 1000.
X = pd.DataFrame(np.random.randn(10, 5))
y = np.arange(10)

pipe = DebugPipeline(
    [('pipeline_massive', pipe_orig)]
    + [(f'random_add{k + 1}', RandomAdder(noise=10 ** k)) for k in range(4)]
)

# print(f'X before: {X}')
# ``fit`` returns the pipeline itself, so fitting and transforming can be chained.
X_out = pipe.fit(X, y=y).transform(X)
# print(f'X after: \n{X_out}')

[wrapper:10] - [Pipeline(memory=None,
     steps=[('random_add1', RandomAdder(noise=100, random_state=None)), ('random_add2', RandomAdder(noise=100, random_state=None)), ('random_add3', RandomAdder(noise=100, random_state=None)), ('random_add4', RandomAdder(noise=100, random_state=None))])] shape=(10, 5) time=0:00:00.004066
[wrapper:10] - [RandomAdder(noise=1, random_state=None)] shape=(10, 5) time=0:00:00.000244
[wrapper:10] - [RandomAdder(noise=10, random_state=None)] shape=(10, 5) time=0:00:00.000217
[wrapper:10] - [RandomAdder(noise=100, random_state=None)] shape=(10, 5) time=0:00:00.000293


In [25]:
# Call the patched ``cache`` directly with a stand-in step to show that any
# function returning an ``(out, step)`` pair gets logged.
def _constant_step(df):
    """Ignore ``df`` and return a fixed ``(array, label)`` pair."""
    return np.array([0, 1, 2]), 'foo'


f = pipe.memory.cache(_constant_step)
f(X)

[wrapper:10] - [foo] shape=(3,) time=0:00:00.000034


(array([0, 1, 2]), 'foo')

In [None]:
# List the pipeline's steps through the public ``steps`` attribute instead of
# the private ``Pipeline._iter`` helper, which is internal API and may differ
# between scikit-learn releases.
for i, (name, step) in enumerate(pipe.steps):
    print(i, name, step)