# Handling categorical data

In [None]:
import pandas as pd

In [None]:
# Toy dataset, one row per item: [color, size, price, class label].
data = [
    ['green', 'M', 10.1, 'class1'],
    ['red', 'L', 13.5, 'class2'],
    ['blue', 'XL', 15.3, 'class1'],
]

In [None]:
# Build the frame and label its four columns in a single constructor call.
df = pd.DataFrame(data, columns=['color', 'size', 'price', 'classlabel'])
display(df)

## Mapping ordinal features

In [None]:
# Ordinal encoding of the size labels: a larger size gets a larger integer.
size_mapping = {
    'XL': 3,
    'L': 2,
    'M': 1,
}

In [None]:
# Translate every size label to its integer rank, then store it back in place.
ordinal_sizes = df['size'].map(size_mapping)
df['size'] = ordinal_sizes
display(df)

In [None]:
# Reverse lookup (integer rank -> size label) to recover the original strings.
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
df['size'].map(inv_size_mapping)

## Encoding class labels

In [None]:
import numpy as np

In [None]:
# Assign each distinct class label an integer, numbered in np.unique's order.
unique_labels = np.unique(df['classlabel'])
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))
class_mapping

In [None]:
# Swap the class-label strings for their integer codes.
encoded_labels = df['classlabel'].map(class_mapping)
df['classlabel'] = encoded_labels
df

In [None]:
# Reverse lookup (integer code -> class label), applied to restore the strings.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
restored_labels = df['classlabel'].map(inv_class_mapping)
df['classlabel'] = restored_labels
df

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Let scikit-learn derive the label -> integer encoding instead of a hand-built dict.
class_labels = df['classlabel'].values
class_le = LabelEncoder()
class_le.fit(class_labels)
y = class_le.transform(class_labels)
y

In [None]:
# Map the integer codes in y back to the class labels the encoder was fitted on.
class_le.inverse_transform(y)

## Performing one-hot encoding on nominal features

In [None]:
# Pull the three feature columns out of the frame as a NumPy array.
feature_columns = ['color', 'size', 'price']
X = df[feature_columns].values
display(X)

#### This is not correct: label-encoding a nominal feature imposes an artificial order on it (here blue = 0 < green = 1 < red = 2), which a model may treat as meaningful.

In [None]:
# Integer-encode the color column (column 0 of X) and write the codes back in place.
color_le = LabelEncoder()
color_codes = color_le.fit_transform(X[:, 0])
X[:, 0] = color_codes
display(X)

In [None]:
# Start over with only the color column, kept 2-D by selecting with a list.
color_column = df[['color']]
X = color_column.values
display(X)

In [None]:
# One-hot encode the color column: one indicator column per category.
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
ohe.fit(X)
# Densify the encoder's output so it can be displayed as a plain array.
XOhe = ohe.transform(X).toarray()
for shown in (XOhe, ohe.categories_[0]):
    display(shown)

In [None]:
# Wrap the one-hot matrix in a DataFrame labelled with the encoder's category names.
color_names = ohe.categories_[0]
df2 = pd.DataFrame(XOhe, columns=color_names)
display(df2)

In [None]:
# Place the one-hot columns next to the original frame, aligned on the row index.
pd.concat([df, df2], axis='columns')

### One-hot encoding with a dropped column
Drop one of the categories per feature. This is useful in situations where perfectly collinear features cause problems, such as when feeding the resulting data into a neural network or an unregularized regression.  However, dropping one category breaks the symmetry of the original representation and can therefore induce a bias in downstream models, for instance for penalized linear classification or regression models.



In [None]:
# One-hot encode the color column again, this time dropping the first category
# of the feature so the remaining indicator columns are not perfectly collinear.
color_column = df[['color']]
X = color_column.values
ohe = OneHotEncoder(drop='first')
ohe.fit(X)
XOhe = ohe.transform(X).toarray()
# Show the encoded matrix, the full category list, and the dropped category's index.
for shown in (XOhe, ohe.categories_[0], ohe.drop_idx_[0]):
    display(shown)

### OHE from Pandas
An even more convenient way to create those dummy features via one-hot encoding is to use the `get_dummies` function implemented in pandas. Applied to a DataFrame, `get_dummies` converts only the string columns (object, string, or category dtype) and leaves all other columns unchanged:

In [None]:
# Only 'color' holds strings here, so it is the only column that gets expanded.
dummy_input = df[['price', 'color', 'size']]
pd.get_dummies(dummy_input)