In [30]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline

In [21]:
class DataframeFunctionTransformer:
    """Stateless pipeline step that applies a user-supplied callable.

    The callable is stored on ``func`` and invoked with whatever is handed
    to ``transform``; ``fit`` estimates nothing.
    """

    def __init__(self, func):
        # callable later invoked as func(X) by transform()
        self.func = func

    def fit(self, X, y=None, **fit_params):
        """No-op fit: all arguments are ignored and the instance is returned."""
        return self

    def transform(self, X, **transform_params):
        """Return ``self.func(X)``; extra keyword arguments are ignored."""
        transformed = self.func(X)
        return transformed

In [22]:
def case(X, upper=True):
    """Return the dataframe with its ``"text"`` column upper-cased.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it must contain a ``"text"`` column holding strings.
    upper : bool, default True
        When false, ``X`` is returned untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame when ``upper`` is true (the caller's ``X`` is left
        unmodified), otherwise ``X`` itself.
    """
    if not upper:
        return X
    # assign() builds a new frame instead of writing into the caller's one,
    # so re-running the cell (or the pipeline) never changes its own input.
    # The vectorized .str.upper() also passes missing values through as NaN.
    return X.assign(text=X["text"].str.upper())

In [23]:
# example frame with an id column, a mixed-case text column and a numeric column
df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4],
        "text": ["foo", "Bar", "BAz", "quux"],
        "number": [111, 121, 13, 114],
    }
)
df

Unnamed: 0,id,text,number
0,1,foo,111
1,2,Bar,121
2,3,BAz,13
3,4,quux,114


In [27]:
# build a one-step pipeline whose only stage, "uppercase", wraps `case`
pipeline = Pipeline(steps=[
    ("uppercase", DataframeFunctionTransformer(case)),
])

# fit and transform the input dataframe in one call; the result is displayed
pipeline.fit_transform(df)

Unnamed: 0,id,text,number
0,1,FOO,111
1,2,BAR,121
2,3,BAZ,13
3,4,QUUX,114


### Select Columns

In [31]:
class SelectColumnsTransformer():
    """Pipeline step that keeps only the requested columns of a dataframe.

    Parameters
    ----------
    columns : list of column labels, optional
        Columns to keep. ``None`` (the default) keeps every column.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def transform(self, X, **transform_params):
        """Return a copy of ``X`` restricted to ``self.columns``.

        Extra keyword arguments are accepted but ignored.
        """
        if self.columns is None:
            # X[None] would raise KeyError, so the default configuration
            # falls back to a plain copy of the whole frame.
            return X.copy()
        cpy_df = X[self.columns].copy()
        return cpy_df

    def fit(self, X, y=None, **fit_params):
        """No-op fit: nothing is estimated, the instance itself is returned."""
        return self

In [35]:
# example frame: names, ages (one missing) and an integer column
df = pd.DataFrame(
    {
        'name': ['alice', 'bob', 'charlie', 'david', 'edward'],
        'age': [24, 32, np.nan, 38, 20],
        "number": [111, 121, 13, 114, 123],
    }
)

In [38]:
# one-step pipeline: the 'selector' stage is given the 'name' and 'age' columns
pipe = Pipeline(steps=[
    ('selector', SelectColumnsTransformer(['name', 'age'])),
])

In [39]:
# fit the pipeline on df and display the transformed frame
pipe.fit_transform(df)

Unnamed: 0,name,age
0,alice,24.0
1,bob,32.0
2,charlie,
3,david,38.0
4,edward,20.0


### Drop Columns

In [41]:
class DropColumnsTransformer():
    """Pipeline step that removes the given columns from a dataframe.

    This class is named ``DropColumnsTransformer`` because that is the name
    the pipeline in this section instantiates; defining it under the
    selector's name would shadow ``SelectColumnsTransformer`` and leave this
    name undefined on a fresh kernel.

    Parameters
    ----------
    columns : list of column labels, optional
        Columns to remove. ``None`` (the default) removes nothing.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def transform(self, X, **transform_params):
        """Return a copy of ``X`` without ``self.columns``.

        Extra keyword arguments are accepted but ignored.
        """
        if self.columns is None:
            # DataFrame.drop(columns=None) raises because no labels were
            # given, so the default configuration returns a plain copy.
            return X.copy()
        cpy_df = X.drop(columns=self.columns).copy()
        return cpy_df

    def fit(self, X, y=None, **fit_params):
        """No-op fit: nothing is estimated, the instance itself is returned."""
        return self

In [42]:
# example frame: names, ages (one missing) and an integer column
df = pd.DataFrame(
    {
        'name': ['alice', 'bob', 'charlie', 'david', 'edward'],
        'age': [24, 32, np.nan, 38, 20],
        "number": [111, 121, 13, 114, 123],
    }
)

In [43]:
# one-step pipeline: the 'selector' stage is a DropColumnsTransformer
# configured with the 'name' and 'age' columns
pipe = Pipeline(steps=[
    ('selector', DropColumnsTransformer(['name', 'age'])),
])

In [44]:
# fit the pipeline on df and display the transformed frame
pipe.fit_transform(df)

Unnamed: 0,number
0,111
1,121
2,13
3,114
4,123
