<a href="https://colab.research.google.com/github/jyotidabass/Encoders_performance_on_real_data/blob/main/Encoders_Performance_on_Real_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score
import category_encoders as ce

  import pandas.util.testing as tm


In [6]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.4.0-py2.py3-none-any.whl (86 kB)
[?25l[K     |███▉                            | 10 kB 19.4 MB/s eta 0:00:01[K     |███████▋                        | 20 kB 15.4 MB/s eta 0:00:01[K     |███████████▍                    | 30 kB 10.1 MB/s eta 0:00:01[K     |███████████████▏                | 40 kB 8.5 MB/s eta 0:00:01[K     |███████████████████             | 51 kB 3.9 MB/s eta 0:00:01[K     |██████████████████████▊         | 61 kB 4.5 MB/s eta 0:00:01[K     |██████████████████████████▌     | 71 kB 5.0 MB/s eta 0:00:01[K     |██████████████████████████████▎ | 81 kB 5.6 MB/s eta 0:00:01[K     |████████████████████████████████| 86 kB 1.7 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.4.0


In [2]:
url_data = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'educational-num','marital-status',
                'occupation', 'relationship', 'race', 'gender','capital-gain', 'capital-loss', 
                'hours-per-week', 'native-country','income']
adults_data = pd.read_csv(url_data, names=column_names)

In [3]:
adults_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [8]:
numeric_features = adults_data.select_dtypes(include=['int64', 'float64']).columns
categorical_features = adults_data.select_dtypes(include=['object']).drop(['income'], axis=1).columns

X = adults_data.drop('income', axis=1)
y = adults_data['income']

le = preprocessing.LabelEncoder()
label_encoder = le.fit(y)
y = label_encoder.transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [9]:
encoder_list = [ce.backward_difference.BackwardDifferenceEncoder, 
               ce.basen.BaseNEncoder,
               ce.binary.BinaryEncoder,
                ce.cat_boost.CatBoostEncoder,
                ce.hashing.HashingEncoder,
                ce.helmert.HelmertEncoder,
                ce.james_stein.JamesSteinEncoder,
                ce.one_hot.OneHotEncoder,
                ce.leave_one_out.LeaveOneOutEncoder,
                ce.m_estimate.MEstimateEncoder,
                ce.ordinal.OrdinalEncoder,
                ce.polynomial.PolynomialEncoder,
                ce.sum_coding.SumEncoder,
                ce.target_encoder.TargetEncoder,
                ce.woe.WOEEncoder
                ]

In [10]:
for encoder in encoder_list:
    
    numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
    categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('woe', encoder())])
    
    preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])
    
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(n_estimators=500))])
    
    model = pipe.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    print(encoder)
    print(f1_score(y_test, y_pred, average='macro'))

<class 'category_encoders.backward_difference.BackwardDifferenceEncoder'>
0.7829096752331086
<class 'category_encoders.basen.BaseNEncoder'>
0.7877160677160677
<class 'category_encoders.binary.BinaryEncoder'>
0.7891439740679139
<class 'category_encoders.cat_boost.CatBoostEncoder'>
0.8044754269241113
<class 'category_encoders.hashing.HashingEncoder'>
0.7578464933868457
<class 'category_encoders.helmert.HelmertEncoder'>
0.7846744918348345
<class 'category_encoders.james_stein.JamesSteinEncoder'>
0.7955457086243035
<class 'category_encoders.one_hot.OneHotEncoder'>
0.7881988263973747
<class 'category_encoders.leave_one_out.LeaveOneOutEncoder'>
0.42963926176084877
<class 'category_encoders.m_estimate.MEstimateEncoder'>
0.7945419849628383
<class 'category_encoders.ordinal.OrdinalEncoder'>
0.7885978285978286
<class 'category_encoders.polynomial.PolynomialEncoder'>
0.7830665751671906
<class 'category_encoders.sum_coding.SumEncoder'>
0.7892119552418568
<class 'category_encoders.target_encoder.Ta

You can see from the above that for this model the CatBoost Encoder gives the best score and the leave one out encoder gives the lowest.