## Data generation

This creates the fake data for the colorblind example (JamesStein Encoder)

In [1]:
import numpy as np
import pandas as pd
import numpy as np

In [2]:
np.random.seed(42)

numbers = {
    ('female', 'white'): (10, 2000),
    ('male', 'white'): (150, 2000),
    ('female', 'asian'): (10, 2000),
    ('male', 'asian'): (90, 2000),
    ('female', 'black'): (5, 975),
    ('male', 'black'): (37, 975),
    ('male', 'hispanic'): (2, 50)
}

frames = []

for gender, race in numbers:
    num_colorblind, num_total = numbers[(gender, race)]
    weights = np.round(np.random.normal(loc=160, scale=8, size=num_total), 2)
    ages = np.random.binomial(90, 0.25)

    df = pd.DataFrame({
        'gender': gender,
        'race': race,
        'age': ages,
        'height': weights,
        'Colorblind': [True]*num_colorblind + [False]*(num_total - num_colorblind)
    })
    frames.append(df)
    
dataset = pd.concat(frames).sample(frac=1.0).reset_index(drop=True)

In [3]:
dataset.head()

Unnamed: 0,gender,race,age,height,Colorblind
0,female,black,29,145.95,False
1,male,black,28,161.83,False
2,male,asian,21,160.87,False
3,female,white,21,164.64,False
4,male,asian,21,158.88,False


In [4]:
dataset.to_csv('colorblind.csv')

In [5]:
import category_encoders as ce
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import recall_score
from sklearn.pipeline import Pipeline

# Some fake data loaded from Github
colorblind = dataset

# Do the train test split
X_train, X_test, y_train, y_test = train_test_split(colorblind.drop('Colorblind', axis=1), colorblind['Colorblind'])

# Build the encoder
encoder = ce.JamesSteinEncoder(cols=['gender', 'race'], return_df=True)

# Build the model, including the encoder
model = Pipeline([
  ('encode_categorical', encoder),
  ('classifier', RandomForestClassifier())
])

# Here are the parameters we want to search over
# Review pipelines to see how to access the different 
# stages
params = {
  'classifier__n_estimators': [50, 100, 200],
  'classifier__max_depth': [4, 6, 8],
  'classifier__class_weight': [{0: 1, 1: 20}]
}

# build a grid search
grid = GridSearchCV(model, param_grid=params, cv=5).fit(X_train, y_train)

# How well did we do on the test set?
# Note that we don't need to explicitly transform the test
# set!
predict_test = grid.predict(X_test)
print(f"Recall on the test set is {recall_score(y_test, predict_test)}")

Recall on the test set is 0.34285714285714286


In [6]:
ce.__version__

'2.0.0'

In [7]:
import category_encoders as ce
import numpy as np
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import make_column_transformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.pipeline import Pipeline as skPipeline

# One-Hot Encoder for Pipeline
onehot_encoder = ce.OneHotEncoder(
     cols = ['listing_region',
             'engine_cylinder_layout',
             'engine_aspiration',
             'fuel_type',
             'drivetrain',
             'transmission_gearbox',
             'engine_type',
             'body_type',
             'buyer_source'],
     use_cat_names=True
)

# James-Stein Encoding for columns with many unique values
JS_encoding = ce.JamesSteinEncoder(
    cols=['trim', 'make', 'model',
          'exterior_color', 'interior_color']
)

class MyProcessor:
    def __init__(self, pipeline):
        self.pipeline = pipeline
        
    def fit(self, X, y):
        return self.pipeline.fit(X, y)
        
    def transform(self, X, y):
        return self.pipeline.transform(X, y)
    
    def fit_transform(self, X, y):
        return self.pipeline.fit_tranform(X, y)
    
    

# Encoders into Pipeline
categorical_pipeline = MyProcessor(
    skPipeline([
        ('one_hot_encode', onehot_encoder),
        ('JS_encode', JS_encoding)
    ])
)


# make_column_transfomer is a helper function
# to make a pipeline that only operates on some columns
process_numeric = make_column_transformer(
    (FunctionTransformer(np.log1p, validate=False), 
     ['is_lease', 'num_accidents', 'is_manual_transmission', 'percent_to_market',]),
    (FunctionTransformer(np.log, validate=False), 
     ['engine_displacement_liters', 'fuel_economy_city', 'quoted_list_price', 
      'engine_cylinder_count', 'num_owners', 'num_doors']),
    (StandardScaler(), ['quote_month', 'quote_year', 'fuel_economy_highway','years_old', 'odometer']),
    remainder='passthrough' 
    # must have 'passthrough', otherwise all the non-tranformed (i.e. categorical) features will be dropped!
)

# Full pipeline
preprocess_pipeline = Pipeline([
    ('transform_categorical', categorical_pipeline),
    ('transform_numeric', process_numeric),
    ('oversampling', RandomOverSampler())
])


In [8]:
def _validate_steps(self):
    names, estimators = zip(*self.steps)

    # validate names
    self._validate_names(names)

    # validate estimators
    transformers = estimators[:-1]
    estimator = estimators[-1]

    for t in transformers:
        if t is None or t == 'passthrough':
            continue
        if (not (hasattr(t, "fit") or
                 hasattr(t, "fit_transform") or
                 hasattr(t, "fit_resample")) or
                not (hasattr(t, "transform") or
                     hasattr(t, "fit_resample"))):
            raise TypeError(
                "All intermediate steps of the chain should "
                "be estimators that implement fit and transform or "
                "fit_resample (but not both) or be a string 'passthrough' "
                "'%s' (type %s) doesn't)" % (t, type(t)))

        if (hasattr(t, "fit_resample") and (hasattr(t, "fit_transform") or
                                            hasattr(t, "transform"))):
            raise TypeError(
                "All intermediate steps of the chain should "
                "be estimators that implement fit and transform or sample."
                " '%s' implements both)" % (t))

        if isinstance(t, pipeline.Pipeline):
            raise TypeError(
                "All intermediate steps of the chain should not be"
                " Pipelines")

    # We allow last estimator to be None as an identity transformation
    if (estimator is not None and estimator != 'passthrough'
            and not hasattr(estimator, "fit")):
        raise TypeError("Last step of Pipeline should implement fit or be "
                        "the string 'passthrough'. '%s' (type %s) doesn't"
                        % (estimator, type(estimator)))

In [9]:
the_transformer = preprocess_pipeline.steps[0][1]

In [10]:
isinstance(preprocess_pipeline, skPipeline)

True

In [11]:
!cat /anaconda3/envs/blog_env/lib/python3.7/site-packages/imblearn/pipeline.py

﻿"""
The :mod:`imblearn.pipeline` module implements utilities to build a
composite estimator, as a chain of transforms, samples and estimators.
"""
# Adapted from scikit-learn

# Author: Edouard Duchesnay
#         Gael Varoquaux
#         Virgile Fritsch
#         Alexandre Gramfort
#         Lars Buitinck
#         Christos Aridas
#         Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: BSD

from collections import defaultdict
from itertools import islice

from sklearn import pipeline
from sklearn.base import clone
from sklearn.utils.metaestimators import if_delegate_has_method
from sklearn.utils.validation import check_memory

__all__ = ['Pipeline', 'make_pipeline']


class Pipeline(pipeline.Pipeline):
    """Pipeline of transforms and resamples with a final estimator.

    Sequentially apply a list of transforms, sampling, and a final estimator.
    Intermediate steps of the pipeline must be transformers or resamplers,
    that is, they must i

In [12]:
encoder = ce.TargetEncoder(cols=['gender', 'race'], return_df=True)
encoder.fit(X_train, y_train)

TargetEncoder(cols=['gender', 'race'], drop_invariant=False,
              handle_missing='value', handle_unknown='value',
              min_samples_leaf=1, return_df=True, smoothing=1.0, verbose=0)

In [13]:
encoder.transform(X_train)

Unnamed: 0,gender,race,age,height
594,0.056454,0.038721,14,160.21
5401,0.056454,0.038721,14,158.17
375,0.056454,0.028477,21,169.52
7413,0.005635,0.038721,21,163.68
376,0.056454,0.038721,14,162.37
4602,0.056454,0.038721,14,165.62
9906,0.005635,0.021739,29,156.06
6603,0.056454,0.028477,21,152.55
6589,0.005635,0.038721,21,152.22
5838,0.005635,0.028477,19,147.33
