In [1]:
import numpy as np
import pandas as pd

# Import libraries and download example data
from sklearn.preprocessing import StandardScaler, OneHotEncoder

dataset = pd.read_csv("https://stats.idre.ucla.edu/stat/data/binary.csv")

# Define which columns should be encoded vs scaled
columns_to_encode = ['rank']
columns_to_scale  = ['gre', 'gpa']

# Instantiate encoder/scaler
scaler = StandardScaler()
ohe    = OneHotEncoder(sparse=False)

In [2]:
dataset.head()

Unnamed: 0,admit,gre,gpa,rank
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion


class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
    def fit(self, x, y=None):
        return self
    def transform(self, df):
        return df[self.key]

pipe_encoder = Pipeline([("union", FeatureUnion(
    transformer_list=[
        ("assessments", Pipeline([
            ("selector", ItemSelector(key=columns_to_scale)),
            ("scale", scaler)
            ]),
        ),
        ("ranks", Pipeline([
            ("selector", ItemSelector(key=columns_to_encode)),
            ("encode", ohe)
            ]),
        ),
    ]))
])

print(pd.DataFrame(pipe_encoder.fit_transform(dataset)).head())

          0         1    2    3    4    5
0 -1.800263  0.579072  0.0  0.0  1.0  0.0
1  0.626668  0.736929  0.0  0.0  1.0  0.0
2  1.840134  1.605143  1.0  0.0  0.0  0.0
3  0.453316 -0.525927  0.0  0.0  0.0  1.0
4 -0.586797 -1.209974  0.0  0.0  0.0  1.0


In [28]:
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)


columns = ['sex', 'age', 'fare', 'pclass', 'embarked']
df = pd.DataFrame(X, columns=columns)

df.head()

Unnamed: 0,sex,age,fare,pclass,embarked
0,female,29.0,211.3375,1.0,S
1,male,0.9167,151.55,1.0,S
2,female,2.0,151.55,1.0,S
3,male,30.0,151.55,1.0,S
4,female,25.0,151.55,1.0,S


In [37]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer


num_cols = ['age', 'fare']

ct = make_column_transformer(
        #( SimpleImputer(strategy='median'), num_cols ),
        (StandardScaler(), num_cols), 
        (OneHotEncoder(), ['embarked', 'sex']), 
         remainder = "passthrough")

ct.fit_transform(df)

array([[-0.06116183,  3.44116502,  0.        , ...,  1.        ,
         0.        ,  1.        ],
       [-2.01049644,  2.28560268,  0.        , ...,  0.        ,
         1.        ,  1.        ],
       [-1.93530178,  2.28560268,  0.        , ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [-0.23469331, -0.50388566,  1.        , ...,  0.        ,
         1.        ,  3.        ],
       [-0.19998701, -0.50388566,  1.        , ...,  0.        ,
         1.        ,  3.        ],
       [-0.06116183, -0.49132258,  0.        , ...,  0.        ,
         1.        ,  3.        ]])

In [38]:
ct.get_feature_names()

print(pd.DataFrame(ct.fit_transform(df), columns = ct.get_feature_names()).head())



AttributeError: Transformer standardscaler (type StandardScaler) does not provide get_feature_names.

In [42]:
from sklearn.pipeline import make_pipeline

SimpleImputer.get_feature_names_out = (lambda self, names=None:
                                       self.feature_names_in_)

num_pipeline = make_pipeline(SimpleImputer(), StandardScaler())
transformer = make_column_transformer(
    (num_pipeline, ["age", "height"]),
    (OneHotEncoder(), ["city"]))


pipeline = make_pipeline(transformer, LinearRegression())



dx = pd.DataFrame({"city": ["Rabat", "Tokyo", "Paris", "Auckland"],
                   "age": [32, 65, 18, 24],
                   "height": [172, 163, 169, 190],
                   "weight": [65, 62, 54, 95]},
                  index=["Alice", "Bunji", "Cécile", "Dave"])



pipeline.fit(dx, dx["weight"])


## get pipeline feature names
pipeline[:-1].get_feature_names_out()



array(['pipeline__age', 'pipeline__height',
       'onehotencoder__city_Auckland', 'onehotencoder__city_Paris',
       'onehotencoder__city_Rabat', 'onehotencoder__city_Tokyo'],
      dtype=object)