# Creating an example of a pipeline

We can create custom transformers that perform data cleaning (or other preprocessing steps) as part of a pipeline.

In [1]:
# Show the version of the Python interpreter running this notebook.
import sys
sys.version

'2.7.16 |Anaconda, Inc.| (default, Mar 14 2019, 21:00:58) \n[GCC 7.3.0]'

In [2]:
from sklearn.base import BaseEstimator,  TransformerMixin
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline

In [3]:
# Only the last expression of a cell is displayed: the pandas version on the
# next line is evaluated but never shown; the output is the scikit-learn version.
pd.__version__
import sklearn
sklearn.__version__

'0.20.0'

In [11]:
class FilterNumericos(BaseEstimator, TransformerMixin):
    """
    Transformer that keeps only the rows whose numeric columns are at or
    below a threshold.

    Parameters
    ----------
    campos_numericos : list of str, optional
        Names of the columns to filter on. ``None`` (the default) or an
        empty list means that no column is filtered.
    umbral : float, default 2
        Numerical threshold. A row is dropped as soon as one of the listed
        columns holds a value above it.
    """

    def __init__(self, campos_numericos=None, umbral=2):
        self.umbral = umbral
        self.campos_numericos = campos_numericos

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """
        Return a filtered copy of ``X``; ``X`` itself is left untouched.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing every column named in ``campos_numericos``.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            New frame holding the rows where each listed column is
            ``<= umbral``. The shape after each column filter is printed.
        """
        # None means "no columns configured": iterate over nothing instead
        # of raising TypeError.
        campos = [] if self.campos_numericos is None else self.campos_numericos

        temporal = X
        for campo in campos:
            # Compare against the threshold directly rather than splicing it
            # into a query string: the value is not round-tripped through
            # text and column names need not be valid identifiers.
            temporal = temporal[temporal[campo] <= self.umbral]
            # Parenthesised form works as a statement on Python 2 and as a
            # function call on Python 3.
            print(temporal.shape)
        # Always hand back an independent copy, even when nothing was filtered.
        return temporal.copy()
		
class FilterCategorical(TransformerMixin, BaseEstimator):
    """
    Transformer that keeps only the rows whose categorical columns hold an
    allowed value.

    Parameters
    ----------
    campos_categoricos : list of str, optional
        Names of the columns to filter on. ``None`` (the default) or an
        empty list means that no column is filtered.
    valores : list-like, default ("a", "c", "g")
        Values every listed column is allowed to hold. The default is an
        immutable tuple so that it cannot be shared and mutated across
        instances.
    """

    def __init__(self, campos_categoricos=None, valores=("a", "c", "g")):
        self.campos_categoricos = campos_categoricos
        self.valores = valores

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """
        Return a filtered copy of ``X``; ``X`` itself is left untouched.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing every column named in ``campos_categoricos``.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            New frame holding the rows where each listed column takes one of
            ``valores``. The shape after each column filter is printed.
        """
        # None means "no columns configured": iterate over nothing instead
        # of raising TypeError.
        campos = [] if self.campos_categoricos is None else self.campos_categoricos
        permitidos = list(self.valores)

        temporal = X
        for campo in campos:
            # isin() is the direct form of the ``in`` query operator and
            # does not require the column name to be a valid identifier.
            temporal = temporal[temporal[campo].isin(permitidos)]
            # Parenthesised form works as a statement on Python 2 and as a
            # function call on Python 3.
            print(temporal.shape)
        # Always hand back an independent copy, even when nothing was filtered.
        return temporal.copy()

In [5]:
# Small demo frame: two integer columns and two single-letter columns.
data = pd.DataFrame({
    "campo1": [1, 2, 3, 4, 5],
    "campo2": list("abcde"),
    "campo3": [6, 7, 8, 9, 0],
    "campo4": list("fghij"),
})

In [6]:
# Preview the first rows of the demo frame.
data.head()


Unnamed: 0,campo1,campo2,campo3,campo4
0,1,a,6,f
1,2,b,7,g
2,3,c,8,h
3,4,d,9,i
4,5,e,0,j


A pipeline accepts different types of data (at least pandas DataFrames and NumPy arrays).

In [7]:
# Show the identity of `data` so it can be compared with other objects' ids.
id(data)

140704998787536

In [8]:
# Build the two-step pipeline with default transformers first ...
pipeline = Pipeline([
    ("numerico", FilterNumericos()),
    ("categorical", FilterCategorical()),
])

# ... then configure each step through its "<step>__<param>" name ...
pipeline.set_params(
    numerico__campos_numericos=["campo1", "campo3"],
    numerico__umbral=8,
    categorical__campos_categoricos=["campo2", "campo4"],
    categorical__valores=["a", "c", "f"],
)

# ... and run it on the demo frame.
resultado = pipeline.fit_transform(data)

(5, 4)
(4, 4)
(2, 4)
(1, 4)


In [12]:
# Only the last expression of a cell is displayed, so the id() on the next
# line is evaluated but not shown; the cell output is the type.
id(resultado)
type(resultado)

pandas.core.frame.DataFrame

### We can create a pipeline that modifies the DataFrame in place

In [14]:

class FilterNumericos_v2(BaseEstimator, TransformerMixin):
    """
    In-place variant of the numeric threshold filter.

    ``transform`` removes rows from the frame it receives and returns that
    same object, so the caller's DataFrame is modified.

    Parameters
    ----------
    campos_numericos : list of str, optional
        Names of the columns to filter on. ``None`` (the default) or an
        empty list means that no column is filtered.
    umbral : float, default 2
        Numerical threshold. A row is dropped as soon as one of the listed
        columns holds a value above it.
    """

    def __init__(self, campos_numericos=None, umbral=2):
        self.umbral = umbral
        self.campos_numericos = campos_numericos

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """
        Drop from ``X``, in place, the rows above the threshold.

        Prints ``id(X)`` and returns ``X`` itself (not a copy).
        """
        # Parenthesised form works as a statement on Python 2 and as a
        # function call on Python 3.
        print(id(X))
        # None means "no columns configured": iterate over nothing instead
        # of raising TypeError.
        campos = [] if self.campos_numericos is None else self.campos_numericos
        # Referenced from the query as @umbral so the threshold is passed as
        # a value instead of being formatted into the expression text.
        umbral = self.umbral
        for campo in campos:
            X.query("{} <= @umbral".format(campo), inplace=True)
        return X
		
class FilterCategorical_v2(TransformerMixin, BaseEstimator):
    """
    In-place variant of the categorical whitelist filter.

    ``transform`` removes rows from the frame it receives and returns that
    same object, so the caller's DataFrame is modified.

    Parameters
    ----------
    campos_categoricos : list of str, optional
        Names of the columns to filter on. ``None`` (the default) or an
        empty list means that no column is filtered.
    valores : list-like, default ("a", "c", "g")
        Values every listed column is allowed to hold. The default is an
        immutable tuple so that it cannot be shared and mutated across
        instances.
    """

    def __init__(self, campos_categoricos=None, valores=("a", "c", "g")):
        self.campos_categoricos = campos_categoricos
        self.valores = valores

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """
        Drop from ``X``, in place, the rows holding a value outside ``valores``.

        Prints ``id(X)`` and returns ``X`` itself (not a copy).
        """
        # Parenthesised form works as a statement on Python 2 and as a
        # function call on Python 3.
        print(id(X))
        # None means "no columns configured": iterate over nothing instead
        # of raising TypeError.
        campos = [] if self.campos_categoricos is None else self.campos_categoricos
        # Local name referenced from the query expression as @valores.
        valores = list(self.valores)
        for campo in campos:
            X.query("{} in @valores".format(campo), inplace=True)
        return X
		
# Same two-step pipeline, built from the in-place transformers.
pipeline_v2 = Pipeline([
    ("numerico", FilterNumericos_v2()),
    ("categorical", FilterCategorical_v2()),
])

pipeline_v2.set_params(
    numerico__campos_numericos=["campo1", "campo3"],
    numerico__umbral=8,
    categorical__campos_categoricos=["campo2", "campo4"],
    categorical__valores=["a", "c", "f"],
)

# The v2 steps filter their input with inplace=True and return it, so this
# call removes rows from `data` itself.
resultado_v2 = pipeline_v2.fit_transform(data)


140704998787536
140704998787536


In [15]:
# Display both ids side by side; equal ids mean `resultado_v2` is the very
# same object as `data`.
id(data),  id(resultado_v2)

(140704998787536, 140704998787536)