In [1]:
import numpy as np
import pandas as pd

# Import libraries and download example data
from sklearn.preprocessing import StandardScaler, OneHotEncoder

dataset = pd.read_csv("https://stats.idre.ucla.edu/stat/data/binary.csv")

# Define which columns should be encoded vs scaled
columns_to_encode = ['rank']
columns_to_scale  = ['gre', 'gpa']

# Instantiate encoder/scaler
scaler = StandardScaler()
ohe    = OneHotEncoder(sparse=False)

In [2]:
dataset.head()

Unnamed: 0,admit,gre,gpa,rank
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion


class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
    def fit(self, x, y=None):
        return self
    def transform(self, df):
        return df[self.key]

pipe_encoder = Pipeline([("union", FeatureUnion(
    transformer_list=[
        ("assessments", Pipeline([
            ("selector", ItemSelector(key=columns_to_scale)),
            ("scale", scaler)
            ]),
        ),
        ("ranks", Pipeline([
            ("selector", ItemSelector(key=columns_to_encode)),
            ("encode", ohe)
            ]),
        ),
    ]))
])

print(pd.DataFrame(pipe_encoder.fit_transform(dataset)).head())

          0         1    2    3    4    5
0 -1.800263  0.579072  0.0  0.0  1.0  0.0
1  0.626668  0.736929  0.0  0.0  1.0  0.0
2  1.840134  1.605143  1.0  0.0  0.0  0.0
3  0.453316 -0.525927  0.0  0.0  0.0  1.0
4 -0.586797 -1.209974  0.0  0.0  0.0  1.0


In [28]:
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)


columns = ['sex', 'age', 'fare', 'pclass', 'embarked']
df = pd.DataFrame(X, columns=columns)

df.head()

Unnamed: 0,sex,age,fare,pclass,embarked
0,female,29.0,211.3375,1.0,S
1,male,0.9167,151.55,1.0,S
2,female,2.0,151.55,1.0,S
3,male,30.0,151.55,1.0,S
4,female,25.0,151.55,1.0,S


In [30]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer

ct = make_column_transformer(
        (OneHotEncoder(), ['embarked', 'sex']), 
         remainder = "passthrough")

ct.fit_transform(df)

array([[  0.    ,   0.    ,   1.    , ...,  29.    , 211.3375,   1.    ],
       [  0.    ,   0.    ,   1.    , ...,   0.9167, 151.55  ,   1.    ],
       [  0.    ,   0.    ,   1.    , ...,   2.    , 151.55  ,   1.    ],
       ...,
       [  1.    ,   0.    ,   0.    , ...,  26.5   ,   7.225 ,   3.    ],
       [  1.    ,   0.    ,   0.    , ...,  27.    ,   7.225 ,   3.    ],
       [  0.    ,   0.    ,   1.    , ...,  29.    ,   7.875 ,   3.    ]])

In [32]:
ct.get_feature_names()

print(pd.DataFrame(ct.fit_transform(df), columns = ct.get_feature_names()).head())

   onehotencoder__x0_C  onehotencoder__x0_Q  onehotencoder__x0_S  \
0                  0.0                  0.0                  1.0   
1                  0.0                  0.0                  1.0   
2                  0.0                  0.0                  1.0   
3                  0.0                  0.0                  1.0   
4                  0.0                  0.0                  1.0   

   onehotencoder__x0_nan  onehotencoder__x1_female  onehotencoder__x1_male  \
0                    0.0                       1.0                     0.0   
1                    0.0                       0.0                     1.0   
2                    0.0                       1.0                     0.0   
3                    0.0                       0.0                     1.0   
4                    0.0                       1.0                     0.0   

       age      fare  pclass  
0  29.0000  211.3375     1.0  
1   0.9167  151.5500     1.0  
2   2.0000  151.5500     1.0 



In [12]:
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])



In [13]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))


model score: 0.790


In [18]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

df = pd.DataFrame({'brand': ['aaaa', 'asdfasdf', 'sadfds', 'NaN'],
                   'category': ['asdf', 'asfa', 'asdfas', 'as'],
                   'num1': [1, 1, 0, 0],
                   'target': [0.2, 0.11, 1.34, 1.123]})

numeric_features = ['num1']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['brand', 'category']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

clf = Pipeline(steps=[('preprocessor', preprocessor),
                    #  ('regressor',  LinearRegression())
                     ])


print(clf.fit(df.drop('target', 1), df['target']))

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['num1']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                                                                   OneHotEncoder(hand

