# Handling categorical data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Toy dataset: each record is [price, size, color, class label]
data = [
    list(record)
    for record in (
        (10.1, 'M', 'green', 'class1'),
        (13.5, 'L', 'red', 'class2'),
        (15.3, 'XL', 'blue', 'class1'),
        (11.3, 'M', 'red', 'class1'),
        (13.2, 'L', 'blue', 'class2'),
    )
]

In [None]:
# Build the example frame and name its columns in a single step
df = pd.DataFrame(data, columns=['price', 'size', 'color', 'y'])
display(df)

## Mapping ordinal features (Fix 'size' column)

### SKLearn Implementation

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Create the encoder with the explicit category order M, L, XL and fit it
# on the 'size' column (fit() returns the encoder itself).
enc = OrdinalEncoder(
    categories=[['M', 'L', 'XL']],
    dtype=np.int16,
).fit(df[['size']])
print(enc.categories_)

# Encode the 'size' column with the fitted encoder
size_enc = enc.transform(df[['size']])
print(size_enc)

In [None]:
# New frame in which 'size' holds the encoded values instead of the labels.
# size_enc has one column, so it is flattened before being used as a column.
df2 = df.assign(size=size_enc.ravel())
display(df2)

In [None]:
# Map the encoded 'size' values back to their original labels
temp = df2.assign(size=enc.inverse_transform(df2[['size']]).ravel())
display(temp)

### Python Implementation

In [None]:
# Hand-written lookup from size label to its integer code
size_mapping = {'XL': 2, 'L': 1, 'M': 0}

# Transformation: replace each label by its code
temp = df.assign(size=df['size'].map(size_mapping))
display(temp)

# Inverse transformation: flip the lookup (code -> label) and map back
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
temp = temp.assign(size=temp['size'].map(inv_size_mapping))
display(temp)

## One-hot encoding on nominal features (Fixing 'color' column)

### SKLearn implementation

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Create the encoder with an explicit category order and fit it on the
# 'color' column (fit() returns the encoder itself).
ohe = OneHotEncoder(
    categories=[['green', 'red', 'blue']],
    dtype=np.int32,
).fit(df[['color']])
print(ohe.categories_)

# transform() returns a sparse matrix, so toarray() is needed to print it
# as a dense array.
color_ohe = ohe.transform(df[['color']])
print(color_ohe.toarray())

In [None]:
# Wrap the dense one-hot matrix in a DataFrame, labelling each column with
# the category it encodes.
cols = ohe.categories_[0]
temp = pd.DataFrame(
    data=color_ohe.toarray(),
    columns=cols,
)
display(temp)

In [None]:
# Build the one-hot columns straight from the encoder output rather than from
# the scratch variable `temp`, which other cells rebind to unrelated frames.
# Reusing df2's index makes the row alignment in concat explicit.
color_onehot = pd.DataFrame(
    color_ohe.toarray(),
    columns=ohe.categories_[0],
    index=df2.index,
)

# Replacing original column: drop 'color' and append its one-hot columns
df3 = pd.concat([df2.drop(columns=['color']), color_onehot], axis=1)

# Rearranging columns so that the target 'y' comes last
cols = [c for c in df3.columns if c != 'y'] + ['y']
df3 = df3[cols]
display(df3)

#### Drop column OHE
Drop one of the categories per feature. This is useful in situations where perfectly collinear features cause problems, such as when feeding the resulting data into a neural network or an unregularized regression.  However, dropping one category breaks the symmetry of the original representation and can therefore induce a bias in downstream models, for instance for penalized linear classification or regression models.


In [None]:
# One-hot encode 'color' while dropping the first category
ohe = OneHotEncoder(drop='first', categories=[['green','red','blue']], dtype=np.int32)
color_ohe = ohe.fit_transform(df[['color']])
print(color_ohe.toarray())

# All columns
cols = list(ohe.categories_[0])
print(cols)

# Remove the category the encoder actually dropped. The index is read from
# the fitted encoder (drop_idx_) instead of being hard-coded, so the labels
# stay correct if the `drop` setting above is changed.
dropped_idx = None if ohe.drop_idx_ is None else ohe.drop_idx_[0]
if dropped_idx is not None:
    cols.pop(int(dropped_idx))
print(cols)

temp = pd.DataFrame(color_ohe.toarray(), columns=cols)
display(temp)


### Pandas implementation

In [None]:
# Pandas equivalent of one-hot encoding the 'color' column.
# The dtype is pinned because get_dummies' default output dtype differs
# between pandas versions (uint8 before 2.0, bool from 2.0); int32 matches
# the dtype passed to OneHotEncoder in the cells above.
pd.get_dummies(df[['color']], dtype=np.int32)

## Encoding class labels (fix 'y' column)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fitting (fit() returns the encoder itself)
le = LabelEncoder().fit(df['y'])
print(le.classes_)

# Transformation: class labels -> integer codes
y_le = le.transform(df['y'])

# Replace original column with the encoded labels
df4 = df3.assign(y=y_le)
display(df4)

# Inverse transformation: integer codes -> class labels
temp = df4.assign(y=le.inverse_transform(df4['y']))
display(temp)

### Python implementation

In [None]:
# Lookup from class label to integer code, numbered in np.unique order
class_mapping = {name: code for code, name in enumerate(np.unique(df['y']))}

# Transformation: replace each label by its code
temp = df3.assign(y=df3['y'].map(class_mapping))
display(temp)

# Inverse transformation: flip the lookup (code -> label) and map back
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
temp = temp.assign(y=temp['y'].map(inv_class_mapping))
display(temp)