One Hot Encoding — Existing Solutions

* pandas.get_dummies
    - can't easily apply encoding to new dataset
* sklearn.preprocessing LabelEncoder & OneHotEncoder
    - errors out if new levels in test data
* sklearn.feature_extraction.DictVectorizer
    - input is list of dicts, output is numpy array

# Custom Transformer

In [1]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

class CustomOrdinalEncoder(OrdinalEncoder):
    """``OrdinalEncoder`` whose (inverse-)transform results are DataFrames.

    ``transform`` and ``inverse_transform`` wrap the array produced by the
    parent class in a ``pandas.DataFrame``. Columns are labelled with the
    names seen during ``fit`` when the encoder recorded them
    (``feature_names_in_``), and the row index of a pandas input is carried
    over so the result stays aligned with the data it came from.

    The constructor is deliberately inherited unchanged. scikit-learn
    discovers an estimator's parameters from the explicit arguments of
    ``__init__``; an ``__init__(self, **kwargs)`` override hides all of them,
    so ``get_params()`` comes back empty and ``clone()`` (used by pipelines
    and model selection) silently falls back to the default settings.
    """

    def _to_frame(self, values, X):
        """Wrap *values* in a DataFrame labelled after the fitted columns.

        Parameters
        ----------
        values : array-like
            Result returned by the parent ``transform``/``inverse_transform``.
        X : object
            The data the caller passed in; its index is reused when it is a
            pandas object.
        """
        # feature_names_in_ is only present when fit saw named columns;
        # columns=None lets pandas fall back to positional labels.
        columns = getattr(self, "feature_names_in_", None)
        # isinstance check rather than getattr: a plain list also has an
        # ``index`` attribute (the list.index method).
        index = X.index if isinstance(X, (pd.DataFrame, pd.Series)) else None
        return pd.DataFrame(values, columns=columns, index=index)

    def transform(self, X, y=None):
        """Encode *X* and return the codes as a DataFrame.

        Parameters
        ----------
        X : array-like
            Data to encode; forwarded unchanged to ``OrdinalEncoder.transform``.
        y : ignored
            Accepted only for signature compatibility.

        Returns
        -------
        pandas.DataFrame
            Encoded values, one column per input feature.
        """
        return self._to_frame(super().transform(X), X)

    def get_feature_names_out(self, input_features=None):
        """Return the output column names (one per input feature).

        Parameters
        ----------
        input_features : array-like of str, optional
            Used only when the encoder was fitted on data without column
            names; otherwise the names recorded during ``fit`` win.

        Returns
        -------
        numpy.ndarray
            Array of feature names.
        """
        fitted_names = getattr(self, "feature_names_in_", None)
        if fitted_names is not None:
            return fitted_names

        # Local import: numpy is only needed on this fallback path.
        import numpy as np

        if input_features is not None:
            return np.asarray(input_features, dtype=object)
        # No names available anywhere: generate positional placeholders.
        return np.asarray(
            [f"x{i}" for i in range(self.n_features_in_)], dtype=object
        )

    def inverse_transform(self, X):
        """Map encoded values back to the original categories.

        Parameters
        ----------
        X : array-like
            Encoded data, e.g. the output of ``transform``.

        Returns
        -------
        pandas.DataFrame
            Decoded categories, one column per input feature.
        """
        return self._to_frame(super().inverse_transform(X), X)

# Small all-categorical frame used to exercise the encoder end to end.
data = pd.DataFrame(
    data={
        "fruits": ["Apple", "Pears", "Cherry"],
        "colors": ["Green", "Green", "Red"],
    },
)

# dtype=int asks the encoder for integer output.
enc = CustomOrdinalEncoder(dtype=int)
new_data = enc.fit_transform(data)

# Encoded frame, followed by the categories learned for each column.
print(new_data)
print("Categories: ", enc.categories_)

# Test get_feature_names_out
print("Feature names out: ", enc.get_feature_names_out(data.columns))

# Test inverse_transform: decoding the codes should give back the input values.
original_data = enc.inverse_transform(new_data)
print("Original data after inverse_transform:", original_data, sep="\n")


   fruits  colors
0       0       0
1       2       0
2       1       1
Categories:  [array(['Apple', 'Cherry', 'Pears'], dtype=object), array(['Green', 'Red'], dtype=object)]
Feature names out:  ['fruits' 'colors']
Original data after inverse_transform:
   fruits colors
0   Apple  Green
1   Pears  Green
2  Cherry    Red


In [9]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from typing import Optional, List, Union

class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    """Label-encode DataFrame columns, one ``LabelEncoder`` per column.

    Parameters
    ----------
    columns : list of str, optional
        Names of the columns to encode. ``None`` (the default) encodes every
        column of the frame passed to ``fit`` -- numeric columns included.

    Attributes
    ----------
    label_encoders : dict
        Fitted ``LabelEncoder`` for each encoded column. Created by ``fit``.
    categories_ : dict
        ``classes_`` of each fitted encoder, keyed by column name. Created by
        ``fit``.
    """

    def __init__(self, columns: Optional[List[str]] = None) -> None:
        # Store the constructor argument only. Fitted state is created in
        # fit(), so a second fit starts clean and the object does not carry a
        # trailing-underscore attribute before it has been fitted.
        self.columns = columns

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> 'MultiColumnLabelEncoder':
        """Fit one ``LabelEncoder`` per selected column of *X*.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data.
        y : ignored
            Accepted only for signature compatibility.

        Returns
        -------
        MultiColumnLabelEncoder
            ``self``, to allow chaining.

        Raises
        ------
        KeyError
            If a name in ``columns`` is not a column of *X*.
        """
        selected = list(X.columns) if self.columns is None else list(self.columns)

        # Rebuild both mappings from scratch so nothing from a previous fit
        # on different columns survives.
        self.label_encoders: dict[str, LabelEncoder] = {}
        self.categories_: dict[str, Union[None, List[str]]] = {}
        for col in selected:
            le = LabelEncoder()
            le.fit(X[col])
            self.label_encoders[col] = le
            self.categories_[col] = le.classes_
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *X* with every fitted column replaced by its codes.

        Columns that were not fitted are left untouched. The input frame is
        not modified.
        """
        X_transformed = X.copy()
        for col, le in self.label_encoders.items():
            X_transformed[col] = le.transform(X[col])
        return X_transformed

    def get_feature_names_out(self, input_features: Optional[List[str]] = None) -> List[str]:
        """Return ``"<column>__<category>"`` for every category of each column.

        Parameters
        ----------
        input_features : list of str, optional
            Columns to report on. ``None`` (the default) means every column
            that was encoded during ``fit``.

        Returns
        -------
        list of str
            One entry per (column, category) pair, in column order.

        Raises
        ------
        KeyError
            If a requested column was not encoded during ``fit``.
        """
        # NOTE(review): transform() emits one column per input column, while
        # this lists one name per category, so the names do not line up with
        # the transformed frame's columns. Kept as-is for existing callers.
        columns = list(self.categories_) if input_features is None else input_features
        return [
            f"{col}__{category}"
            for col in columns
            for category in self.categories_[col]
        ]

    def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *X* with every fitted column decoded to its labels.

        Columns that were not fitted are left untouched. The input frame is
        not modified.
        """
        X_inverse = X.copy()
        for col, le in self.label_encoders.items():
            X_inverse[col] = le.inverse_transform(X[col])
        return X_inverse

# Sample usage: two categorical columns plus one numeric column.
df = pd.DataFrame(
    {
        'col1': ['A', 'B', 'A', 'C', 'A', 'B', 'A'],
        'col2': ['X', 'Y', 'X', 'Z', 'X', 'Y', 'X'],
        'numeric_col': [10, 20, 30, 40, 50, 60, 70],
    }
)

encoder = MultiColumnLabelEncoder()
# fit() returns the encoder itself, so fitting and encoding can be chained.
transformed_data = encoder.fit(df).transform(df)
inverse_transformed_data = encoder.inverse_transform(transformed_data)

# Each report is a heading line followed by the value on the next line.
print("Transformed Data:", transformed_data, sep="\n")
print("\nInverse Transformed Data:", inverse_transformed_data, sep="\n")

print("Categories Out:", encoder.categories_, sep="\n")
print(
    "Feature Names Out:",
    encoder.get_feature_names_out(input_features=df.columns),
    sep="\n",
)

Transformed Data:
   col1  col2  numeric_col
0     0     0            0
1     1     1            1
2     0     0            2
3     2     2            3
4     0     0            4
5     1     1            5
6     0     0            6

Inverse Transformed Data:
  col1 col2  numeric_col
0    A    X           10
1    B    Y           20
2    A    X           30
3    C    Z           40
4    A    X           50
5    B    Y           60
6    A    X           70
Categories Out:
{'col1': array(['A', 'B', 'C'], dtype=object), 'col2': array(['X', 'Y', 'Z'], dtype=object), 'numeric_col': array([10, 20, 30, 40, 50, 60, 70], dtype=int64)}
Feature Names Out:
['col1__A', 'col1__B', 'col1__C', 'col2__X', 'col2__Y', 'col2__Z', 'numeric_col__10', 'numeric_col__20', 'numeric_col__30', 'numeric_col__40', 'numeric_col__50', 'numeric_col__60', 'numeric_col__70']
