In [2]:
import pandas as pd
from sklearn.pipeline import Pipeline

class DataframeFunctionTransformer():
    """Adapter exposing a plain callable through a ``fit``/``transform`` API.

    The wrapped callable receives whatever is handed to ``transform`` and its
    return value is passed straight back, which lets an ordinary function be
    used as a pipeline step.
    """

    def __init__(self, func):
        """Remember ``func``, the callable applied on every ``transform``."""
        self.func = func

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, input_df, **transform_params):
        """Apply the stored callable to ``input_df`` and return its result.

        Extra keyword arguments are accepted but not forwarded.
        """
        transformed = self.func(input_df)
        return transformed

def process_dataframe(input_df):
    """Return a new dataframe whose "text" column is upper-cased.

    The input frame is left untouched: mutating it in place would make the
    cell non-idempotent (re-running would operate on already-modified data)
    and would silently change the caller's data.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing a string column named "text".

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, "text" converted to upper case.
        Missing values in "text" stay missing instead of raising.

    Raises
    ------
    KeyError
        If ``input_df`` has no "text" column.
    """
    # ``assign`` builds a new frame; the vectorised ``.str`` accessor skips
    # missing values, unlike calling ``.upper()`` on every element.
    return input_df.assign(text=input_df["text"].str.upper())

# Small demo frame with an id column and two string columns.
df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4],
        "text": ["foo", "Bar", "BAz", "unknown"],
        "blah": ["that", "is", "not", "true"],
    }
)

# Single-step pipeline wrapping the upper-casing function.
uppercase_step = ("uppercase", DataframeFunctionTransformer(process_dataframe))
pipeline = Pipeline(steps=[uppercase_step])

# Run fit and transform together; the resulting frame is the cell's output.
pipeline.fit_transform(df)

Unnamed: 0,id,text,blah
0,1,FOO,that
1,2,BAR,is
2,3,BAZ,not
3,4,UNKNOWN,true


In [5]:
import re

class UnknownFeatureGenerator():
    """Add a column counting "unknown"-style markers found in a text column.

    For every row, the new column holds the number of case-insensitive regex
    matches of ``unknown_pattern`` in the source column.

    Parameters
    ----------
    feature_name : str
        Name of the existing string column to scan.
    new_feature_name : str
        Name of the column that receives the match counts.
    unknown_pattern : str, optional
        Regular expression describing the markers to count. Defaults to
        ``DEFAULT_UNKNOWN_PATTERN``.
    verbose : bool, optional
        When true (the default, kept for backward compatibility), print the
        type and shape of the incoming frame on every ``transform`` call.
        Pass ``False`` to silence this diagnostic output.
    """

    # Markers counted when no custom pattern is supplied.
    DEFAULT_UNKNOWN_PATTERN = "Not Available|NULL|Not Mapped|unknown"

    def __init__(self, feature_name, new_feature_name,
                 unknown_pattern=DEFAULT_UNKNOWN_PATTERN, verbose=True):
        self.feature_name = feature_name
        self.new_feature_name = new_feature_name
        self.unknown_pattern = unknown_pattern
        self.verbose = verbose

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; the transformer is stateless."""
        return self

    def transform(self, input_df, **transform_params):
        """Return a copy of ``input_df`` with the match-count column added.

        The input frame itself is not modified. Extra keyword arguments are
        accepted but ignored.
        """
        if self.verbose:
            print( type(input_df) )
            print( input_df.shape )
        # Work on a copy so the caller's frame is never changed.
        input_df_ = input_df.copy()
        input_df_[self.new_feature_name] = input_df_[self.feature_name].str.count(
            self.unknown_pattern, flags=re.IGNORECASE
        )
        return input_df_

# Demo frame; the last row's "text" value is the literal string "unknown".
df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4],
        "text": ["foo", "Bar", "BAz", "unknown"],
        "blah": ["that", "is", "not", "true"],
    }
)

# One step: count unknown-style markers in "text" and store them in "unknown".
unknown_step = ("unknown", UnknownFeatureGenerator("text", "unknown"))
pipeline = Pipeline(steps=[unknown_step])

# Run fit and transform together; the resulting frame is the cell's output.
pipeline.fit_transform(df)

<class 'pandas.core.frame.DataFrame'>
(4, 3)


Unnamed: 0,id,text,blah,unknown
0,1,foo,that,0
1,2,Bar,is,0
2,3,BAz,not,0
3,4,unknown,true,1
