In [1]:
%matplotlib inline
import os
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/jmquintana79/Datasets/master/housing.csv", header = None)
cols_x = [f"x{i}"for i in range(df.shape[1]-1)]
df.columns =  cols_x + ["y"]
df.shape

(506, 14)

In [3]:
df

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,y
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


In [32]:
class ColumnSelector(object):
    """
    A feature selector for scikit-learn's Pipeline class that returns specified columns from a Pandas dataframe.
    cols -- list of columns to be selected.
    return_array -- return array or df with selected columns (default, array).
    """

    # constructor
    def __init__(self, cols:list, return_array:bool = True):
        self.cols = cols
        self.return_array = return_array

    def transform(self, X, y=None):
        # validation input format
        assert isinstance(X, pd.DataFrame), "It is required a dataframe as input."
        # validation selected columns are in df
        for col in self.cols:
            assert col in X.columns.tolist(), f"Column '{col}' is not in the input dataframe."
        # select and return
        if self.return_array:
            return X[self.cols].values
        else:
            return X[self.cols]

    def fit(self, X, y=None):
        return self

In [33]:
colsel = ColumnSelector(["x0", "x1"])
colsel.transform(df)

array([[6.3200e-03, 1.8000e+01],
       [2.7310e-02, 0.0000e+00],
       [2.7290e-02, 0.0000e+00],
       ...,
       [6.0760e-02, 0.0000e+00],
       [1.0959e-01, 0.0000e+00],
       [4.7410e-02, 0.0000e+00]])

In [34]:
from sklearn.preprocessing import StandardScaler

In [35]:
# only for categorical data
ppl = Pipeline([
    ('selector', ColumnSelector(["x0", "x1"])),
    ('scaler', StandardScaler()),
])

In [36]:
ppl.fit_transform(df)

array([[-0.41978194,  0.28482986],
       [-0.41733926, -0.48772236],
       [-0.41734159, -0.48772236],
       ...,
       [-0.41344658, -0.48772236],
       [-0.40776407, -0.48772236],
       [-0.41500016, -0.48772236]])

In [72]:
from sklearn.pipeline import Pipeline, FeatureUnion

class PipelineTools():
    def __init__(self):
        pass
    
    @staticmethod
    def create(list_of_steps:list):
        # validate input
        assert list(set([len(i) for i in list_of_steps]))[0] == 2, "Any step don't have a correct dimession."
        assert list(set([type(i[0]) for i in list_of_steps]))[0] == str, "Any step don't have a correct name description."
        # return pipeline
        return Pipeline(list_of_steps)
        
    @staticmethod
    def merge(list_of_pipelines:list):
        # validate input
        assert list(set([len(i) for i in list_of_pipelines]))[0] == 2, "Any pipeline don't have a correct dimession."
        assert list(set([type(i[0]) for i in list_of_pipelines]))[0] == str, "Any pipeline don't have a correct name description."
        # return pipeline
        return FeatureUnion(transformer_list=list_of_pipelines)        

In [68]:
pltools = PipelineTools()

lpipeline1 = [
    ['selector', ColumnSelector(["x0", "x1"])],
    ['scaler', StandardScaler()],
]
ppl1 = pltools.create(lpipeline1)

lpipeline2 = [
    ['selector', ColumnSelector(["x2", "x3"])],
    ['scaler', StandardScaler()],
]
ppl2 = pltools.create(lpipeline2)

ppl_preprocessing =[
    ("pipeline1", ppl1),
    ("pipeline2", ppl2),
]
ppl_preprocessing = pltools.merge(full_pipeline)

In [69]:
ppl_preprocessing.fit_transform(df)

array([[-0.41978194,  0.28482986, -1.2879095 , -0.27259857, 24.        ],
       [-0.41733926, -0.48772236, -0.59338101, -0.27259857, 21.6       ],
       [-0.41734159, -0.48772236, -0.59338101, -0.27259857, 34.7       ],
       ...,
       [-0.41344658, -0.48772236,  0.11573841, -0.27259857, 23.9       ],
       [-0.40776407, -0.48772236,  0.11573841, -0.27259857, 22.        ],
       [-0.41500016, -0.48772236,  0.11573841, -0.27259857, 11.9       ]])

In [73]:
from sklearn.compose import ColumnTransformer
ppl_preprocessing2 = ColumnTransformer([('pipeline1', ppl1, ["x0", "x1"]),
                                   ('pipeline2', ppl2, ["x2", "x3"]),
                                  ])
ppl_preprocessing2.fit_transform(df)

array([[-0.41978194,  0.28482986, -1.2879095 , -0.27259857],
       [-0.41733926, -0.48772236, -0.59338101, -0.27259857],
       [-0.41734159, -0.48772236, -0.59338101, -0.27259857],
       ...,
       [-0.41344658, -0.48772236,  0.11573841, -0.27259857],
       [-0.40776407, -0.48772236,  0.11573841, -0.27259857],
       [-0.41500016, -0.48772236,  0.11573841, -0.27259857]])

In [None]:
# https://www.kaggle.com/travelcodesleep/end-to-end-regression-pipeline-using-scikitlearn
class AttributeDeleter(BaseEstimator, TransformerMixin):
    def __init__(self, delete=True):
        self.delete = delete
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return np.delete(X,[fire.columns.get_loc(i) for i in['X','Y','area']],axis=1)