<a href="https://colab.research.google.com/github/jsdysw/approaching_almost_any_mlp/blob/master/categorical_variables.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Label Encoding

In [2]:
# Integer code assigned to each `ord_2` category label; a label's code is
# simply its position in the tuple below.
mapping = {
    label: code
    for code, label in enumerate(
        ("Freezing", "Warm", "Cold", "Boiling Hot", "Hot", "Lava Hot")
    )
}

In [8]:
import pandas as pd

# download dataset from https://www.kaggle.com/competitions/cat-in-the-dat-ii/data

# Load the training data and show the raw category frequencies of `ord_2`.
df = pd.read_csv("./train.csv")
print(df["ord_2"].value_counts())

# Replace each category string with its integer code from `mapping`.
# Assign through df[...] rather than df.loc[:, ...]: since pandas 2.0 the
# .loc form writes into the existing object-dtype column in place, so the
# column would stay dtype=object instead of becoming numeric.
# NOTE: values that are not keys of `mapping` (including missing values)
# come out of .map() as NaN and are therefore excluded from value_counts().
df["ord_2"] = df["ord_2"].map(mapping)
print(df["ord_2"].value_counts())

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
Name: ord_2, dtype: int64
0.0    142726
1.0    124239
2.0     97822
3.0     84790
4.0     67508
5.0     64840
Name: ord_2, dtype: int64


In [9]:
import pandas as pd
from sklearn import preprocessing

# download dataset from https://www.kaggle.com/competitions/cat-in-the-dat-ii/data

# Reload the CSV so `ord_2` is read fresh from disk for this cell.
df = pd.read_csv("./train.csv")

# Fill missing values with the placeholder string "NONE" so the column
# contains only strings before it is passed to the encoder.
# Assign through df[...] rather than df.loc[:, ...]: since pandas 2.0 the
# .loc form sets values in place and keeps the column's existing dtype.
df["ord_2"] = df["ord_2"].fillna("NONE")

# Fit the encoder on the column and overwrite the column with the integer
# codes it returns. Column replacement (df[...] = ...) lets `ord_2` take the
# integer dtype of the encoder output instead of remaining dtype=object.
lbl_enc = preprocessing.LabelEncoder()
df["ord_2"] = lbl_enc.fit_transform(df["ord_2"].values)

In [15]:
# The label encoding above cannot be used directly in linear models, SVMs, or neural networks.
# For these types of models, we can binarize the data instead.

'''
  Freezing    -->   0 --> 0 0 0 
  Warm        -->   1 --> 0 0 1
  Cold        -->   2 --> 0 1 0
  Boiling Hot -->   3 --> 0 1 1
  Hot         -->   4 --> 1 0 0
  Lava Hot    -->   5 --> 1 0 1
               8 byte --> 72 bytes
'''
# To save memory, let's store them as a sparse matrix.

import numpy as np
from scipy import sparse

# A small 3x3 binarized feature matrix: one row per sample, one column per bit.
example = np.array([[0, 0, 1], [1, 0, 0], [1, 0, 1]])

# Dense storage holds every cell, zeros included.
print(f"Size of dense array: {example.nbytes}")

# Convert to CSR format; printing it lists the (row, col) positions of the
# non-zero entries together with their values.
sparse_example = sparse.csr_matrix(example)
print(sparse_example)
print(f"Size of sparse array: {sparse_example.data.nbytes}")

# The values array alone understates the footprint: the CSR matrix also
# carries the `indptr` and `indices` arrays, so add all three together.
full_size = sum(
    buf.nbytes
    for buf in (
        sparse_example.data,
        sparse_example.indptr,
        sparse_example.indices,
    )
)
print(f"Full size of sparse array: {full_size}")

Size of dense array: 72
  (0, 2)	1
  (1, 0)	1
  (2, 0)	1
  (2, 2)	1
Size of sparse array: 32
Full size of sparse array: 64
