# Entry 14 notebook - Encoding Categoricals

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('seaborn')

In [2]:
from sklearn.preprocessing import Binarizer, binarize, LabelBinarizer, label_binarize, LabelEncoder, OneHotEncoder, OrdinalEncoder
import category_encoders as ce

In [3]:
col_names = ['class', 'cap_shape', 'cap_surface', 'cap_color', 'bruises', 'odor',
             'gill_attachment', 'gill_spacing', 'gill_size', 'gill_color',
             'stalk_shape', 'stalk_root', 'stalk_surface_above_ring', 'stalk_surface_below_ring',
             'stalk_color_above_ring', 'stalk_color_below_ring',
             'veil_type', 'veil_color', 'ring_number', 'ring_type',
             'spore_print_color', 'population', 'habitat']
mushrooms_raw = pd.read_csv('../data/expanded.csv', header=None, names=col_names)

In [4]:
mushrooms_raw.head()

Unnamed: 0,class,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS
1,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS
2,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS
3,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS
4,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,BROWN,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS


So they didn't include the target values in the easy-to-load-straight-from-the-internet 'agaricus-lepiota.data' file. They did encode the values in that file, still as strings. Not sure why that was done, unless it was to make the file size smaller.

I tried to load the 'expanded.Z' file directly from the online Data Folder, but couldn't figure out how to uncompress the .Z file type (the pandas decompression didn't recognize it, the zlib module doesn't decompress .Z files, and the header intro text thwarted my other efforts). So I saved it to my repository's data folder.

There is a [version on Kaggle](https://www.kaggle.com/uciml/mushroom-classification), but I haven't found a way to load directly from Kaggle without using the Kernels on the Kaggle website. I want to keep all these entries in the same place, so I'd rather not switch specific entries to Kaggle's Kernels.

In [24]:
with open('/Users/julie.fisher/Downloads/expanded.Z') as f:
   print(f)

<_io.TextIOWrapper name='/Users/julie.fisher/Downloads/expanded.Z' mode='r' encoding='UTF-8'>


In [58]:
mushrooms_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8417 entries, 0 to 8416
Data columns (total 23 columns):
class                       8417 non-null object
cap_shape                   8416 non-null object
cap_surface                 8416 non-null object
cap_color                   8416 non-null object
bruises                     8416 non-null object
odor                        8416 non-null object
gill_attachment             8416 non-null object
gill_spacing                8416 non-null object
gill_size                   8416 non-null object
gill_color                  8416 non-null object
stalk_shape                 8416 non-null object
stalk_root                  8416 non-null object
stalk_surface_above_ring    8416 non-null object
stalk_surface_below_ring    8416 non-null object
stalk_color_above_ring      8416 non-null object
stalk_color_below_ring      8416 non-null object
veil_type                   8416 non-null object
veil_color                  8416 non-null object
ring_number

In [61]:
mushrooms_raw[mushrooms_raw.isnull().any(axis=1)]

Unnamed: 0,class,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
8416,----------------------------------------------...,,,,,,,,,,...,,,,,,,,,,


In [6]:
mushrooms = mushrooms_raw.dropna()

In [7]:
mushrooms.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8416 entries, 0 to 8415
Data columns (total 23 columns):
class                       8416 non-null object
cap_shape                   8416 non-null object
cap_surface                 8416 non-null object
cap_color                   8416 non-null object
bruises                     8416 non-null object
odor                        8416 non-null object
gill_attachment             8416 non-null object
gill_spacing                8416 non-null object
gill_size                   8416 non-null object
gill_color                  8416 non-null object
stalk_shape                 8416 non-null object
stalk_root                  8416 non-null object
stalk_surface_above_ring    8416 non-null object
stalk_surface_below_ring    8416 non-null object
stalk_color_above_ring      8416 non-null object
stalk_color_below_ring      8416 non-null object
veil_type                   8416 non-null object
veil_color                  8416 non-null object
ring_number

In [8]:
mushrooms['class'].value_counts()

EDIBLE       4488
POISONOUS    3928
Name: class, dtype: int64

### Number of categories per column

There is a column with only one value type: veil_type. I could probably get rid of this, it isn't contributing anything if everything has the same value.

In [9]:
mushrooms.nunique()

class                        2
cap_shape                    6
cap_surface                  4
cap_color                   10
bruises                      2
odor                         9
gill_attachment              2
gill_spacing                 2
gill_size                    2
gill_color                  12
stalk_shape                  2
stalk_root                   5
stalk_surface_above_ring     4
stalk_surface_below_ring     4
stalk_color_above_ring       9
stalk_color_below_ring       9
veil_type                    1
veil_color                   4
ring_number                  3
ring_type                    5
spore_print_color            9
population                   6
habitat                      7
dtype: int64

# Pandas methods

### Label encoding

Convert to 'category' datatype for reduced memory usage and keep underlying info, then convert to cat codes.

In [10]:
pd_labels = mushrooms.astype('category')
for cat in pd_labels.columns:
    pd_labels[cat] = pd_labels[cat].cat.codes
pd_labels.head()

Unnamed: 0,class,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,0,2,3,8,0,0,1,1,1,10,...,3,7,7,0,2,1,4,6,4,6
1,0,2,3,8,0,0,1,1,1,10,...,3,7,7,0,2,1,4,1,4,6
2,0,2,3,8,0,0,1,1,1,7,...,3,7,7,0,2,1,4,6,4,6
3,0,2,3,8,0,0,1,1,1,7,...,3,7,7,0,2,1,4,1,4,6
4,0,2,3,8,0,0,1,1,1,1,...,3,7,7,0,2,1,4,6,4,6


### Binary values

Basically the same thing as label encoding, but for features with only two values (Yes/No, True/False, etc). 

In [78]:
binary_cols = ['class', 'bruises', 'gill_attachment', 'gill_spacing', 'gill_size', 'stalk_shape']
pd_binary = pd.DataFrame()
for cat in binary_cols:
    pd_binary[cat] = mushrooms[cat].astype('category').cat.codes
pd_binary.head()

Unnamed: 0,class,bruises,gill_attachment,gill_spacing,gill_size,stalk_shape
0,0,0,1,1,1,1
1,0,0,1,1,1,1
2,0,0,1,1,1,1
3,0,0,1,1,1,1
4,0,0,1,1,1,1


There is also a way to do custom encoding.

In [80]:
np.where(mushrooms['class'].str.contains('ED'), 1, 0)[:5]

array([1, 1, 1, 1, 1])

### One Hot encoding / get_dummies

Binary values should probably be handled separately. As can be seen below, 'class' was turned into two columns: EDIBLE and POISONOUS.

In [67]:
pd_one_hot = pd.get_dummies(mushrooms)
pd_one_hot.head()

Unnamed: 0,class_EDIBLE,class_POISONOUS,cap_shape_BELL,cap_shape_CONICAL,cap_shape_CONVEX,cap_shape_FLAT,cap_shape_KNOBBED,cap_shape_SUNKEN,cap_surface_FIBROUS,cap_surface_GROOVES,...,population_SCATTERED,population_SEVERAL,population_SOLITARY,habitat_GRASSES,habitat_LEAVES,habitat_MEADOWS,habitat_PATHS,habitat_URBAN,habitat_WASTE,habitat_WOODS
0,1,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,1,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,1,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3,1,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,1,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


# Scikit Learn methods

### Ordinal Encoding

In [158]:
encoder = OrdinalEncoder()
sl_ord = encoder.fit_transform(mushrooms)
sl_ord = pd.DataFrame(sl_ord, columns=mushrooms.columns)
sl_ord.head()

Unnamed: 0,class,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,0.0,2.0,3.0,8.0,0.0,0.0,1.0,1.0,1.0,10.0,...,3.0,7.0,7.0,0.0,2.0,1.0,4.0,6.0,4.0,6.0
1,0.0,2.0,3.0,8.0,0.0,0.0,1.0,1.0,1.0,10.0,...,3.0,7.0,7.0,0.0,2.0,1.0,4.0,1.0,4.0,6.0
2,0.0,2.0,3.0,8.0,0.0,0.0,1.0,1.0,1.0,7.0,...,3.0,7.0,7.0,0.0,2.0,1.0,4.0,6.0,4.0,6.0
3,0.0,2.0,3.0,8.0,0.0,0.0,1.0,1.0,1.0,7.0,...,3.0,7.0,7.0,0.0,2.0,1.0,4.0,1.0,4.0,6.0
4,0.0,2.0,3.0,8.0,0.0,0.0,1.0,1.0,1.0,1.0,...,3.0,7.0,7.0,0.0,2.0,1.0,4.0,6.0,4.0,6.0


### Label Encoding

In [84]:
encoder = LabelEncoder()
sl_label = mushrooms.apply(encoder.fit_transform)
sl_label.head()

Unnamed: 0,class,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,0,2,3,8,0,0,1,1,1,10,...,3,7,7,0,2,1,4,6,4,6
1,0,2,3,8,0,0,1,1,1,10,...,3,7,7,0,2,1,4,1,4,6
2,0,2,3,8,0,0,1,1,1,7,...,3,7,7,0,2,1,4,6,4,6
3,0,2,3,8,0,0,1,1,1,7,...,3,7,7,0,2,1,4,1,4,6
4,0,2,3,8,0,0,1,1,1,1,...,3,7,7,0,2,1,4,6,4,6


### One Hot Encoding

In [97]:
encoder = OneHotEncoder()
sl_one_hot = encoder.fit_transform(mushrooms).toarray()
sl_one_hot = pd.DataFrame(sl_one_hot, columns=encoder.get_feature_names())
sl_one_hot.head()

Unnamed: 0,x0_EDIBLE,x0_POISONOUS,x1_BELL,x1_CONICAL,x1_CONVEX,x1_FLAT,x1_KNOBBED,x1_SUNKEN,x2_FIBROUS,x2_GROOVES,...,x21_SCATTERED,x21_SEVERAL,x21_SOLITARY,x22_GRASSES,x22_LEAVES,x22_MEADOWS,x22_PATHS,x22_URBAN,x22_WASTE,x22_WOODS
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### LabelBinarizer Encoding

In [150]:
encoder = LabelBinarizer()
sl_labelbin = mushrooms[[x for x in mushrooms.columns if mushrooms[x].nunique() > 2]].apply(encoder.fit_transform)
sl_labelbin.head()

ValueError: If using all scalar values, you must pass an index

In [122]:
encoder = LabelBinarizer()
sl_results = encoder.fit_transform(mushrooms['cap_color'])
sl_labelbin = pd.DataFrame(sl_results, columns = encoder.classes_)
sl_labelbin.head()

Unnamed: 0,BROWN,BUFF,CINNAMON,GRAY,GREEN,PINK,PURPLE,RED,WHITE,YELLOW
0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,1,0


In [115]:
sl_labelbin2 = pd.DataFrame()
sl_labelbin2.shape

(0, 0)

In [153]:
encoder = LabelBinarizer()
sl_labelbin2 = pd.DataFrame()
mushrooms_4_label = mushrooms[[x for x in mushrooms.columns if mushrooms[x].nunique() > 2]]

for cat in mushrooms_4_label.columns:
    print(cat)
    results = encoder.fit_transform(mushrooms[cat])
    print(results)
    results_df = pd.DataFrame(results, columns = encoder.classes_)
    print(results_df)
    if sl_labelbin2.empty:
        sl_labelbin2 = results_df
    else:
        sl_labelbin2 = sl_labelbin2.join(results_df)
    
sl_labelbin2.head()

cap_shape
[[0 0 1 0 0 0]
 [0 0 1 0 0 0]
 [0 0 1 0 0 0]
 ...
 [0 0 0 0 1 0]
 [0 0 0 0 1 0]
 [0 0 0 0 1 0]]
      BELL  CONICAL  CONVEX  FLAT  KNOBBED  SUNKEN
0        0        0       1     0        0       0
1        0        0       1     0        0       0
2        0        0       1     0        0       0
3        0        0       1     0        0       0
4        0        0       1     0        0       0
...    ...      ...     ...   ...      ...     ...
8411     0        0       0     0        1       0
8412     0        0       0     0        1       0
8413     0        0       0     0        1       0
8414     0        0       0     0        1       0
8415     0        0       0     0        1       0

[8416 rows x 6 columns]
cap_surface
[[0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 ...
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]]
      FIBROUS  GROOVES  SCALY  SMOOTH
0           0        0      0       1
1           0        0      0       1
2           0        0      0       1
3           0        

ValueError: columns overlap but no suffix specified: Index(['BROWN', 'BUFF', 'GRAY', 'GREEN', 'PINK', 'PURPLE', 'RED', 'WHITE',
       'YELLOW'],
      dtype='object')

In [149]:
mushrooms[[x for x in mushrooms.columns if mushrooms[x].nunique() > 2]]

Unnamed: 0,cap_shape,cap_surface,cap_color,odor,gill_color,stalk_root,stalk_surface_above_ring,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,CONVEX,SMOOTH,WHITE,ALMOND,WHITE,BULBOUS,SMOOTH,SMOOTH,WHITE,WHITE,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS
1,CONVEX,SMOOTH,WHITE,ALMOND,WHITE,BULBOUS,SMOOTH,SMOOTH,WHITE,WHITE,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS
2,CONVEX,SMOOTH,WHITE,ALMOND,PINK,BULBOUS,SMOOTH,SMOOTH,WHITE,WHITE,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS
3,CONVEX,SMOOTH,WHITE,ALMOND,PINK,BULBOUS,SMOOTH,SMOOTH,WHITE,WHITE,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS
4,CONVEX,SMOOTH,WHITE,ALMOND,BROWN,BULBOUS,SMOOTH,SMOOTH,WHITE,WHITE,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8411,KNOBBED,SMOOTH,BROWN,NONE,BROWN,?,SMOOTH,SMOOTH,ORANGE,ORANGE,BROWN,ONE,PENDANT,BROWN,CLUSTERED,LEAVES
8412,KNOBBED,SMOOTH,BROWN,NONE,BROWN,?,SMOOTH,SMOOTH,ORANGE,ORANGE,BROWN,ONE,PENDANT,ORANGE,SEVERAL,LEAVES
8413,KNOBBED,SMOOTH,BROWN,NONE,BROWN,?,SMOOTH,SMOOTH,ORANGE,ORANGE,BROWN,ONE,PENDANT,ORANGE,CLUSTERED,LEAVES
8414,KNOBBED,SMOOTH,BROWN,NONE,BROWN,?,SMOOTH,SMOOTH,ORANGE,ORANGE,BROWN,ONE,PENDANT,BUFF,SEVERAL,LEAVES


### label_binarize

Same as LabelBinarize, but if you know the set of classes ahead of time.

### MultiLabelBinarizer

Same as LabelBinarize, but if the values include lists. Ex: \[{'sci-fi', 'thriller'}, {'comedy'}], [(1, 2), (3,)]

# category-encoders

### Ordinal Encoding

In [162]:
mushrooms.loc[:, 'cap_shape':].columns

Index(['cap_shape', 'cap_surface', 'cap_color', 'bruises', 'odor',
       'gill_attachment', 'gill_spacing', 'gill_size', 'gill_color',
       'stalk_shape', 'stalk_root', 'stalk_surface_above_ring',
       'stalk_surface_below_ring', 'stalk_color_above_ring',
       'stalk_color_below_ring', 'veil_type', 'veil_color', 'ring_number',
       'ring_type', 'spore_print_color', 'population', 'habitat'],
      dtype='object')

In [170]:
cat_cols = mushrooms.loc[:, 'cap_shape':].columns.tolist()
index=mushrooms.index.tolist()
encoder = ce.OrdinalEncoder(cols=cat_cols)
ce_ord = pd.DataFrame(encoder.fit_transform(mushrooms[cat_cols]), columns=cat_cols, index=index)
ce_ord.head()

Unnamed: 0,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,stalk_shape,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,2,1,1
2,1,1,1,1,1,1,1,1,2,1,...,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,2,1,...,1,1,1,1,1,1,1,2,1,1
4,1,1,1,1,1,1,1,1,3,1,...,1,1,1,1,1,1,1,1,1,1


### One Hot Encoding

In [172]:
cat_cols = mushrooms.loc[:, 'cap_shape':].columns.tolist()
index=mushrooms.index.tolist()
encoder = ce.OneHotEncoder(cols=cat_cols)
ce_one_hot = pd.DataFrame(encoder.fit_transform(mushrooms[cat_cols]), index=index)
ce_one_hot.head()

Unnamed: 0,cap_shape_1,cap_shape_2,cap_shape_3,cap_shape_4,cap_shape_5,cap_shape_6,cap_surface_1,cap_surface_2,cap_surface_3,cap_surface_4,...,population_4,population_5,population_6,habitat_1,habitat_2,habitat_3,habitat_4,habitat_5,habitat_6,habitat_7
0,1,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,1,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0


### Binary Encoding

In [173]:
cat_cols = mushrooms.loc[:, 'cap_shape':].columns.tolist()
index=mushrooms.index.tolist()
encoder = ce.BinaryEncoder(cols=cat_cols)
ce_binary = pd.DataFrame(encoder.fit_transform(mushrooms[cat_cols]), index=index)
ce_binary.head()

Unnamed: 0,cap_shape_0,cap_shape_1,cap_shape_2,cap_shape_3,cap_surface_0,cap_surface_1,cap_surface_2,cap_color_0,cap_color_1,cap_color_2,...,spore_print_color_3,spore_print_color_4,population_0,population_1,population_2,population_3,habitat_0,habitat_1,habitat_2,habitat_3
0,0,0,0,1,0,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1
1,0,0,0,1,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,1
2,0,0,0,1,0,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1
3,0,0,0,1,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,1
4,0,0,0,1,0,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1


### Hashing Encoding

In [180]:
cat_cols = mushrooms.loc[:, 'cap_shape':].columns.tolist()
index=mushrooms.index.tolist()
encoder = ce.HashingEncoder(cols=cat_cols)
ce_hash = pd.DataFrame(encoder.fit_transform(mushrooms[cat_cols]), index=index)
ce_hash.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7
0,0,2,1,4,2,11,1,1
1,1,1,1,4,2,11,1,1
2,0,2,1,4,2,11,1,1
3,1,1,1,4,2,11,1,1
4,1,2,1,4,2,10,1,1


In [181]:
ce_hash.nunique()

col_0     6
col_1     7
col_2     8
col_3     4
col_4     5
col_5    11
col_6     8
col_7     5
dtype: int64

### Backward Difference Encoding

In [174]:
cat_cols = mushrooms.loc[:, 'cap_shape':].columns.tolist()
index=mushrooms.index.tolist()
encoder = ce.BackwardDifferenceEncoder(cols=cat_cols)
ce_back_diff = pd.DataFrame(encoder.fit_transform(mushrooms[cat_cols]), index=index)
ce_back_diff.head()

Unnamed: 0,intercept,cap_shape_0,cap_shape_1,cap_shape_2,cap_shape_3,cap_shape_4,cap_surface_0,cap_surface_1,cap_surface_2,cap_color_0,...,population_1,population_2,population_3,population_4,habitat_0,habitat_1,habitat_2,habitat_3,habitat_4,habitat_5
0,1,-0.833333,-0.666667,-0.5,-0.333333,-0.166667,-0.75,-0.5,-0.25,-0.9,...,-0.666667,-0.5,-0.333333,-0.166667,-0.857143,-0.714286,-0.571429,-0.428571,-0.285714,-0.142857
1,1,-0.833333,-0.666667,-0.5,-0.333333,-0.166667,-0.75,-0.5,-0.25,-0.9,...,-0.666667,-0.5,-0.333333,-0.166667,-0.857143,-0.714286,-0.571429,-0.428571,-0.285714,-0.142857
2,1,-0.833333,-0.666667,-0.5,-0.333333,-0.166667,-0.75,-0.5,-0.25,-0.9,...,-0.666667,-0.5,-0.333333,-0.166667,-0.857143,-0.714286,-0.571429,-0.428571,-0.285714,-0.142857
3,1,-0.833333,-0.666667,-0.5,-0.333333,-0.166667,-0.75,-0.5,-0.25,-0.9,...,-0.666667,-0.5,-0.333333,-0.166667,-0.857143,-0.714286,-0.571429,-0.428571,-0.285714,-0.142857
4,1,-0.833333,-0.666667,-0.5,-0.333333,-0.166667,-0.75,-0.5,-0.25,-0.9,...,-0.666667,-0.5,-0.333333,-0.166667,-0.857143,-0.714286,-0.571429,-0.428571,-0.285714,-0.142857


In [176]:
ce_back_diff.nunique()

intercept      1
cap_shape_0    2
cap_shape_1    2
cap_shape_2    2
cap_shape_3    2
              ..
habitat_1      2
habitat_2      2
habitat_3      2
habitat_4      2
habitat_5      2
Length: 96, dtype: int64

### Polynomial Encoding

In [175]:
cat_cols = mushrooms.loc[:, 'cap_shape':].columns.tolist()
index=mushrooms.index.tolist()
encoder = ce.PolynomialEncoder(cols=cat_cols)
ce_poly = pd.DataFrame(encoder.fit_transform(mushrooms[cat_cols]), index=index)
ce_poly.head()

Unnamed: 0,intercept,cap_shape_0,cap_shape_1,cap_shape_2,cap_shape_3,cap_shape_4,cap_surface_0,cap_surface_1,cap_surface_2,cap_color_0,...,population_1,population_2,population_3,population_4,habitat_0,habitat_1,habitat_2,habitat_3,habitat_4,habitat_5
0,1,-0.597614,0.545545,-0.372678,0.188982,-0.062994,-0.67082,0.5,-0.223607,-0.495434,...,0.545545,-0.372678,0.188982,-0.062994,-0.566947,0.545545,-0.408248,0.241747,-0.109109,0.032898
1,1,-0.597614,0.545545,-0.372678,0.188982,-0.062994,-0.67082,0.5,-0.223607,-0.495434,...,0.545545,-0.372678,0.188982,-0.062994,-0.566947,0.545545,-0.408248,0.241747,-0.109109,0.032898
2,1,-0.597614,0.545545,-0.372678,0.188982,-0.062994,-0.67082,0.5,-0.223607,-0.495434,...,0.545545,-0.372678,0.188982,-0.062994,-0.566947,0.545545,-0.408248,0.241747,-0.109109,0.032898
3,1,-0.597614,0.545545,-0.372678,0.188982,-0.062994,-0.67082,0.5,-0.223607,-0.495434,...,0.545545,-0.372678,0.188982,-0.062994,-0.566947,0.545545,-0.408248,0.241747,-0.109109,0.032898
4,1,-0.597614,0.545545,-0.372678,0.188982,-0.062994,-0.67082,0.5,-0.223607,-0.495434,...,0.545545,-0.372678,0.188982,-0.062994,-0.566947,0.545545,-0.408248,0.241747,-0.109109,0.032898


In [179]:
ce_poly.nunique()

intercept      1
cap_shape_0    6
cap_shape_1    5
cap_shape_2    6
cap_shape_3    6
              ..
habitat_1      6
habitat_2      7
habitat_3      7
habitat_4      7
habitat_5      7
Length: 96, dtype: int64

### Leave One Out Encoding

In [184]:
cat_cols = mushrooms.loc[:, 'cap_shape':].columns.tolist()
index=mushrooms.index.tolist()
encoder = ce.LeaveOneOutEncoder(cols=cat_cols)
ce_loo = pd.DataFrame(encoder.fit_transform(mushrooms[cat_cols], mushrooms['class'].astype('category').cat.codes), index=index)
ce_loo.head()

Unnamed: 0,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,stalk_shape,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,0.45112,0.526277,0.307988,0.184889,0.0,0.476156,0.070396,0.877318,0.204712,0.414559,...,0.30266,0.360953,0.362147,0.466786,0.477176,0.490279,0.205697,0.0,0.70096,0.405192
1,0.45112,0.526277,0.307988,0.184889,0.0,0.476156,0.070396,0.877318,0.204712,0.414559,...,0.30266,0.360953,0.362147,0.466786,0.477176,0.490279,0.205697,0.106921,0.70096,0.405192
2,0.45112,0.526277,0.307988,0.184889,0.0,0.476156,0.070396,0.877318,0.411576,0.414559,...,0.30266,0.360953,0.362147,0.466786,0.477176,0.490279,0.205697,0.0,0.70096,0.405192
3,0.45112,0.526277,0.307988,0.184889,0.0,0.476156,0.070396,0.877318,0.411576,0.414559,...,0.30266,0.360953,0.362147,0.466786,0.477176,0.490279,0.205697,0.106921,0.70096,0.405192
4,0.45112,0.526277,0.307988,0.184889,0.0,0.476156,0.070396,0.877318,0.10081,0.414559,...,0.30266,0.360953,0.362147,0.466786,0.477176,0.490279,0.205697,0.0,0.70096,0.405192


### Target Encoding

In [185]:
cat_cols = mushrooms.loc[:, 'cap_shape':].columns.tolist()
index=mushrooms.index.tolist()
encoder = ce.TargetEncoder(cols=cat_cols)
ce_target = pd.DataFrame(encoder.fit_transform(mushrooms[cat_cols], mushrooms['class'].astype('category').cat.codes), index=index)
ce_target.head()

Unnamed: 0,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,stalk_shape,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,0.451001,0.52608,0.307692,0.184834,0.0,0.476098,0.070352,0.876972,0.204545,0.414474,...,0.3026,0.360877,0.362069,0.46673,0.477118,0.490216,0.205645,0.0,0.700787,0.405063
1,0.451001,0.52608,0.307692,0.184834,0.0,0.476098,0.070352,0.876972,0.204545,0.414474,...,0.3026,0.360877,0.362069,0.46673,0.477118,0.490216,0.205645,0.10687,0.700787,0.405063
2,0.451001,0.52608,0.307692,0.184834,0.0,0.476098,0.070352,0.876972,0.411311,0.414474,...,0.3026,0.360877,0.362069,0.46673,0.477118,0.490216,0.205645,0.0,0.700787,0.405063
3,0.451001,0.52608,0.307692,0.184834,0.0,0.476098,0.070352,0.876972,0.411311,0.414474,...,0.3026,0.360877,0.362069,0.46673,0.477118,0.490216,0.205645,0.10687,0.700787,0.405063
4,0.451001,0.52608,0.307692,0.184834,0.0,0.476098,0.070352,0.876972,0.100719,0.414474,...,0.3026,0.360877,0.362069,0.46673,0.477118,0.490216,0.205645,0.0,0.700787,0.405063


### M-estimate Encoder

Estimate of likelihood.

In [186]:
cat_cols = mushrooms.loc[:, 'cap_shape':].columns.tolist()
index=mushrooms.index.tolist()
encoder = ce.MEstimateEncoder(cols=cat_cols)
ce_likeli = pd.DataFrame(encoder.fit_transform(mushrooms[cat_cols], mushrooms['class'].astype('category').cat.codes), index=index)
ce_likeli.head()

Unnamed: 0,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,stalk_shape,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,0.451005,0.526058,0.307845,0.184918,0.001164,0.476096,0.070601,0.87681,0.204758,0.414484,...,0.302633,0.360899,0.362092,0.46673,0.477117,0.490213,0.205711,0.009525,0.70073,0.405083
1,0.451005,0.526058,0.307845,0.184918,0.001164,0.476096,0.070601,0.87681,0.204758,0.414484,...,0.302633,0.360899,0.362092,0.46673,0.477117,0.490213,0.205711,0.107042,0.70073,0.405083
2,0.451005,0.526058,0.307845,0.184918,0.001164,0.476096,0.070601,0.87681,0.411347,0.414484,...,0.302633,0.360899,0.362092,0.46673,0.477117,0.490213,0.205711,0.009525,0.70073,0.405083
3,0.451005,0.526058,0.307845,0.184918,0.001164,0.476096,0.070601,0.87681,0.411347,0.414484,...,0.302633,0.360899,0.362092,0.46673,0.477117,0.490213,0.205711,0.107042,0.70073,0.405083
4,0.451005,0.526058,0.307845,0.184918,0.001164,0.476096,0.070601,0.87681,0.101048,0.414484,...,0.302633,0.360899,0.362092,0.46673,0.477117,0.490213,0.205711,0.009525,0.70073,0.405083
