In [1]:
# Import python packages
import pandas as pd
from sklearn import datasets
from feature_engine.encoding import (
    RareLabelEncoder
    )

In [2]:
# Fetch the house data set from OpenML (data id 42165) as a pandas DataFrame.
# return_X_y=True yields a (features, target) pair; the target is discarded into `_`.
house_df, _ = datasets.fetch_openml(
    return_X_y=True,
    as_frame=True,
    data_id=42165,
)
# Preview the first rows of the feature frame
house_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1.0,60.0,RL,65.0,8450.0,Pave,,Reg,Lvl,AllPub,...,0.0,0.0,,,,0.0,2.0,2008.0,WD,Normal
1,2.0,20.0,RL,80.0,9600.0,Pave,,Reg,Lvl,AllPub,...,0.0,0.0,,,,0.0,5.0,2007.0,WD,Normal
2,3.0,60.0,RL,68.0,11250.0,Pave,,IR1,Lvl,AllPub,...,0.0,0.0,,,,0.0,9.0,2008.0,WD,Normal
3,4.0,70.0,RL,60.0,9550.0,Pave,,IR1,Lvl,AllPub,...,0.0,0.0,,,,0.0,2.0,2006.0,WD,Abnorml
4,5.0,60.0,RL,84.0,14260.0,Pave,,IR1,Lvl,AllPub,...,0.0,0.0,,,,0.0,12.0,2008.0,WD,Normal


In [3]:
# Explore the cardinality of the "SaleType" column: the share of all rows
# taken by each category (category counts divided by the total row count)
house_df['SaleType'].value_counts().div(house_df.shape[0])

WD       0.867808
New      0.083562
COD      0.029452
ConLD    0.006164
ConLI    0.003425
ConLw    0.003425
CWD      0.002740
Oth      0.002055
Con      0.001370
Name: SaleType, dtype: float64

In [4]:
# Configure feature-engine's RareLabelEncoder to merge infrequent "SaleType" labels
cardinality_encoder = RareLabelEncoder(
    variables=['SaleType'],    # Column(s) whose infrequent labels get grouped
    tol=0.05,                  # Minimum frequency (5% of rows) a label needs to stay its own category
    n_categories=5,            # Only group when the column has at least this many distinct labels ("SaleType" has 9)
    replace_with='Rare'        # Label assigned to the grouped infrequent categories
)

In [5]:
# Learn the frequent "SaleType" labels from the data, then replace the
# infrequent ones with the grouping label in a transformed copy
encoded_df = cardinality_encoder.fit(house_df).transform(house_df)
# Row counts per category after grouping
encoded_df['SaleType'].value_counts()

WD      1267
New      122
Rare      71
Name: SaleType, dtype: int64

In [6]:
# Check the relative frequency of each "SaleType" category after grouping
# (category counts divided by the total row count)
encoded_df['SaleType'].value_counts().div(encoded_df.shape[0])

WD      0.867808
New     0.083562
Rare    0.048630
Name: SaleType, dtype: float64