In [1]:
#https://towardsdatascience.com/creating-custom-transformers-for-sklearn-pipelines-d3d51852ecc1
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
# The ColumnsSelector class inherits from the sklearn.base classes 
# (BaseEstimator, TransformerMixin). This makes it compatible with 
# scikit-learn’s Pipelines
class ColumnsSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only a chosen subset of columns.

    Parameters
    ----------
    columns
        Stored unchanged on the instance and used to index ``X`` in
        ``transform`` (this notebook passes a list of column names).
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Do nothing and return ``self``; ``X`` and ``y`` are not inspected."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.columns]``; ``y`` is accepted but ignored."""
        selected = X[self.columns]
        return selected

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Load the training CSV and keep the target plus the five predictors, target first.
df = pd.read_csv('train.csv')[
    ['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked']
]

# 'Survived' (the first column) is the target; everything after it is a feature.
y = df['Survived']
X = df.drop(columns=['Survived'])

# 70/30 split, stratified on the target, with a fixed seed so reruns match.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=0
)
X_train

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked
231,3,male,29.0,7.7750,S
836,3,male,21.0,8.6625,S
639,3,male,,16.1000,S
389,2,female,17.0,12.0000,C
597,3,male,49.0,0.0000,S
...,...,...,...,...,...
131,3,male,20.0,7.0500,S
490,3,male,,19.9667,S
838,3,male,32.0,56.4958,S
48,3,male,,21.6792,C


In [21]:
from sklearn.pipeline import Pipeline
# Start with a one-step pipeline that only selects the 'Age' and 'Fare' columns.
numeric_transformer = Pipeline(
    steps=[("columns selector", ColumnsSelector(columns=["Age", "Fare"]))]
)

In [22]:
# Fit the pipeline on the training features; the cell displays the returned object.
numeric_transformer.fit(X_train)

Pipeline(steps=[('columns selector', ColumnsSelector(columns=['Age', 'Fare']))])

In [23]:
numeric_transformer.transform(X_train)  # fit first, then transform

Unnamed: 0,Age,Fare
231,29.0,7.7750
836,21.0,8.6625
639,,16.1000
389,17.0,12.0000
597,49.0,0.0000
...,...,...
131,20.0,7.0500
490,,19.9667
838,32.0,56.4958
48,,21.6792


In [26]:
numeric_transformer.fit_transform(X_train, y_train)  # y_train is not processed, so the output has no such column


Unnamed: 0,Age,Fare
231,29.0,7.7750
836,21.0,8.6625
639,,16.1000
389,17.0,12.0000
597,49.0,0.0000
...,...,...
131,20.0,7.0500
490,,19.9667
838,32.0,56.4958
48,,21.6792


In [27]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# Extend the pipeline: after selecting the columns, fill missing values
# with the per-column median.
numeric_transformer = Pipeline(
    steps=[
        ("columns selector", ColumnsSelector(columns=["Age", "Fare"])),
        ("imputer", SimpleImputer(strategy="median")),
    ]
)
numeric_transformer.fit_transform(X_train)

array([[29.    ,  7.775 ],
       [21.    ,  8.6625],
       [28.75  , 16.1   ],
       ...,
       [32.    , 56.4958],
       [28.75  , 21.6792],
       [22.    ,  9.    ]])

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
# Full numeric pipeline: select columns -> impute the median -> standardise.
numeric_transformer = Pipeline(
    steps=[
        ("columns selector", ColumnsSelector(columns=["Age", "Fare"])),
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)
numeric_transformer.fit_transform(X_train)

array([[-0.02863633, -0.47911875],
       [-0.65142052, -0.46270324],
       [-0.04809833, -0.32513665],
       ...,
       [ 0.20490775,  0.42203815],
       [-0.04809833, -0.22194182],
       [-0.5735725 , -0.45646073]])

In [29]:
# Apply the pipeline fitted above to the held-out features; only transform is called here.
numeric_transformer.transform(X_test)

array([[-0.65142052, -0.47989005],
       [-1.97483693,  0.09842973],
       [ 3.20205667, -0.47958116],
       [-0.41787645,  0.91519704],
       [-0.06756034, -0.48921406],
       [-0.80711657, -0.06803741],
       [-0.04809833, -0.47958116],
       [ 2.30680439, -0.37322716],
       [-0.34002842,  2.18019377],
       [ 1.60617218,  3.95530314],
       [-1.04066064,  0.11114597],
       [ 1.13908403, -0.47403226],
       [-1.58559681,  0.01288412],
       [-0.80711657, -0.38247533],
       [-2.208381  , -0.33176389],
       [-0.49572447, -0.47688439],
       [-0.88496459,  3.58545385],
       [-1.8969889 , -0.04237372],
       [-0.04809833, -0.49252861],
       [-0.49572447, -0.38247533],
       [ 0.12705972,  2.42650409],
       [-0.04809833, -0.47688439],
       [ 0.12705972, -0.13739871],
       [-0.88496459, -0.38247533],
       [-2.208381  , -0.41700247],
       [-0.49572447, -0.38247533],
       [-0.34002842, -0.1420228 ],
       [-2.208381  , -0.24236549],
       [ 1.21693206,

In [30]:
class MyStandardScaler(BaseEstimator, TransformerMixin):
    """Hand-written standardiser: ``(X - mean) / std`` computed per column.

    ``fit`` stores the column means in ``self.means`` and the column standard
    deviations in ``self.stds``; ``transform`` applies them to new data.
    """

    def __init__(self):
        # No hyper-parameters to store.
        pass

    def fit(self, X, y=None):
        """Learn the per-column mean and standard deviation of ``X``.

        ``X`` may be a DataFrame or a NumPy array, depending on the previous
        step in the pipeline. ``y`` is accepted but ignored. Returns ``self``.
        """
        # Kept on purpose: shows which container type the previous step handed over.
        print(type(X))
        self.means = np.mean(X, axis=0)
        stds = np.std(X, axis=0)
        # A constant column has std == 0, which would make transform() divide
        # by zero and return NaN/inf. Adding the boolean mask turns exactly
        # those zeros into 1 (0 + True == 1) and leaves every other value
        # unchanged, so constant columns are centred but not rescaled.
        self.stds = stds + (stds == 0)
        return self

    def transform(self, X, y=None):
        """Return ``X`` centred by ``self.means`` and divided by ``self.stds``."""
        return (X - self.means) / self.stds

In [35]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
# Feed the selected columns straight into the custom scaler (no imputer),
# so the scaler receives whatever the column selector returns.
numeric_transformer = Pipeline(
    steps=[
        ("columns selector", ColumnsSelector(columns=["Age", "Fare"])),
        ("my scaler", MyStandardScaler()),
    ]
)
numeric_transformer.fit_transform(X_train)

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Age,Fare
231,-0.036551,-0.479119
836,-0.592408,-0.462703
639,,-0.325137
389,-0.870336,-0.400972
597,1.353091,-0.622928
...,...,...
131,-0.661890,-0.492529
490,,-0.253617
838,0.171895,0.422038
48,,-0.221942


In [36]:
from sklearn.utils.validation import check_is_fitted
class MyStandardScaler(BaseEstimator, TransformerMixin):
    """Hand-written standardiser that refuses to transform before fitting.

    ``fit`` stores the column means in ``self.means`` and the column standard
    deviations in ``self.stds``; ``transform`` first verifies that both
    attributes exist and then applies ``(X - mean) / std``.
    """

    def __init__(self):
        # No hyper-parameters to store.
        pass

    def fit(self, X, y=None):
        """Learn the per-column mean and standard deviation of ``X``.

        ``X`` may be a DataFrame or a NumPy array, depending on the previous
        step in the pipeline. ``y`` is accepted but ignored. Returns ``self``.
        """
        # Kept on purpose: shows which container type the previous step handed over.
        print(type(X))
        self.means = np.mean(X, axis=0)
        stds = np.std(X, axis=0)
        # A constant column has std == 0, which would make transform() divide
        # by zero and return NaN/inf. Adding the boolean mask turns exactly
        # those zeros into 1 (0 + True == 1) and leaves every other value
        # unchanged, so constant columns are centred but not rescaled.
        self.stds = stds + (stds == 0)
        return self

    def transform(self, X, y=None):
        """Standardise ``X`` with the statistics learned in ``fit``.

        ``check_is_fitted`` is called first so that using the scaler before
        ``fit`` is reported through scikit-learn's own fitted-state check.
        """
        check_is_fitted(self, ['means', 'stds'])
        return (X - self.means) / self.stds
# Build a fresh, unfitted pipeline and call transform() straight away to show
# that fitting is mandatory. The error shown below is reported by the
# SimpleImputer step.
numeric_transformer = Pipeline(
    steps=[
        ("columns selector", ColumnsSelector(columns=["Age", "Fare"])),
        ("imputer", SimpleImputer(strategy="median")),
        ("my scaler", MyStandardScaler()),
    ]
)
numeric_transformer.transform(X_train)  # fails on purpose: the pipeline must be fitted first

NotFittedError: This SimpleImputer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.