Definition: A rare category is a category that appears only infrequently in the data. Many different infrequent categories can be grouped together and treated as a single "rare" category.




Ways to handle it:
1) Modify your model pipeline and add a new category to the existing categories.
2) Try to “predict” the unknown category by using a nearest neighbour model.

In [2]:
import pandas as pd
from sklearn import preprocessing 

In [4]:
# Read the train and test splits from the local resources folder.
train_path = "./resources/cat_train.csv"
test_path = "./resources/cat_test.csv"
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [5]:
# Give every test row a placeholder target of -1 so that train and test rows
# can be told apart again after they are concatenated.
# Note: `test.loc["target"] = -1` addresses a *row* label, so it would append
# one extra row filled with -1 instead of creating the column; assign the
# column directly.
test["target"] = -1

In [6]:
# Stack train on top of test and renumber the rows 0..n-1, so the encoders
# fitted on `data` see the values from both splits.
data = pd.concat([train, test], ignore_index=True)

In [7]:
# Every column of `train` other than "id" and "target" is used as a feature.
non_feature_cols = ("id", "target")
features = [col for col in train.columns if col not in non_feature_cols]

In [9]:
# Label-encode each feature column of the combined frame in place.
for feat in features:
    # Missing values become their own "NONE" level, and everything is cast to
    # str so each column is encoded as plain strings.
    filled = data[feat].fillna("NONE")
    as_text = filled.astype(str).values

    # A fresh encoder per column: each feature gets its own integer mapping.
    lbl_enc = preprocessing.LabelEncoder()
    data[feat] = lbl_enc.fit_transform(as_text)

In [11]:
# Split the combined frame back apart: rows whose target equals -1 go to
# `test`, all other rows go to `train`. Both get a fresh 0..n-1 index.
is_test = data["target"] == -1
train = data.loc[~is_test].reset_index(drop=True)
test = data.loc[is_test].reset_index(drop=True)

## Treat categories that occur fewer than a given number of times as rare

In [12]:
# Reload the training CSV into a separate frame so the rare-category example
# below works on the original (un-encoded) category values.
df = pd.read_csv("./resources/cat_train.csv")

In [13]:
# Frequency of each ord_4 level. value_counts() drops NaN by default, so
# missing values are not listed here.
df['ord_4'].value_counts()

ord_4
N    39978
P    37890
Y    36657
A    36633
R    33045
U    32897
M    32504
X    32347
C    32112
H    31189
Q    30145
T    29723
O    25610
B    25212
E    21871
K    21676
I    19805
D    17284
F    16721
W     8268
Z     5790
S     4595
G     3404
V     3107
J     1950
L     1657
Name: count, dtype: int64

In [24]:
# Turn missing ord_4 values into an explicit 'None' level, then show the
# per-level frequencies, which now include that level.
ord4_filled = df['ord_4'].fillna('None')
df['ord_4'] = ord4_filled
ord4_filled.value_counts()

ord_4
N       39978
P       37890
Y       36657
A       36633
R       33045
U       32897
M       32504
X       32347
C       32112
H       31189
Q       30145
T       29723
O       25610
B       25212
E       21871
K       21676
I       19805
None    17930
D       17284
F       16721
W        8268
Z        5790
S        4595
G        3404
V        3107
J        1950
L        1657
Name: count, dtype: int64

In [25]:
# Frequency of every ord_4 level across the frame.
ord4_counts = df["ord_4"].value_counts()

# Look up, for each row, how often that row's own level occurs.
per_row_count = ord4_counts[df["ord_4"]].values

# Collapse every level seen fewer than 2000 times into one "RARE" label.
df.loc[per_row_count < 2000, "ord_4"] = "RARE"

In [26]:
# Re-check the frequencies: levels that were below the threshold now appear
# together under the single "RARE" label.
df['ord_4'].value_counts()

ord_4
N       39978
P       37890
Y       36657
A       36633
R       33045
U       32897
M       32504
X       32347
C       32112
H       31189
Q       30145
T       29723
O       25610
B       25212
E       21871
K       21676
I       19805
None    17930
D       17284
F       16721
W        8268
Z        5790
S        4595
RARE     3607
G        3404
V        3107
Name: count, dtype: int64