# Categorical data encoding

## iris dataset

In [2]:
import sklearn
from sklearn.datasets import load_iris

In [3]:
iris = load_iris()
import pandas as pd
df_iris = pd.DataFrame(iris.data,columns = iris.feature_names)
df_iris
df_iris['species'] = iris.target_names[iris.target]


In [4]:
df_iris

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


## One-Hot encoding

Article : 
https://towardsdatascience.com/6-ways-to-encode-features-for-machine-learning-algorithms-21593f6238b0


In [29]:
ohe = pd.get_dummies(df_iris, columns = ['species'])
iris_df = pd.concat([df_iris,ohe],axis=1)
# axis = 1 to add concat columns
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,sepal length (cm).1,sepal width (cm).1,petal length (cm).1,petal width (cm).1,species_setosa,species_versicolor,species_virginica
0,5.1,3.5,1.4,0.2,setosa,5.1,3.5,1.4,0.2,1,0,0
1,4.9,3.0,1.4,0.2,setosa,4.9,3.0,1.4,0.2,1,0,0
2,4.7,3.2,1.3,0.2,setosa,4.7,3.2,1.3,0.2,1,0,0
3,4.6,3.1,1.5,0.2,setosa,4.6,3.1,1.5,0.2,1,0,0
4,5.0,3.6,1.4,0.2,setosa,5.0,3.6,1.4,0.2,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,6.7,3.0,5.2,2.3,0,0,1
146,6.3,2.5,5.0,1.9,virginica,6.3,2.5,5.0,1.9,0,0,1
147,6.5,3.0,5.2,2.0,virginica,6.5,3.0,5.2,2.0,0,0,1
148,6.2,3.4,5.4,2.3,virginica,6.2,3.4,5.4,2.3,0,0,1


## One Hot Encoding using sklearn

In [6]:
from sklearn import preprocessing

In [32]:
from sklearn.preprocessing import OneHotEncoder

In [53]:
encoder = OneHotEncoder()
encoded = encoder.fit_transform(df_iris[['species']])
encoded

<150x3 sparse matrix of type '<class 'numpy.float64'>'
	with 150 stored elements in Compressed Sparse Row format>

In [54]:
encoder.get_feature_names_out()

array(['species_setosa', 'species_versicolor', 'species_virginica'],
      dtype=object)

In [58]:
one_hot_df = pd.DataFrame(encoded.toarray(),columns = encoder.get_feature_names_out(['species']))
one_hot_df

Unnamed: 0,species_setosa,species_versicolor,species_virginica
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0
...,...,...,...
145,0.0,0.0,1.0
146,0.0,0.0,1.0
147,0.0,0.0,1.0
148,0.0,0.0,1.0


## Label / Ordinal Encoding

In [65]:
from sklearn.preprocessing import LabelEncoder

label_en = LabelEncoder()


In [71]:
df_iris['species'] = label_en.fit_transform(df_iris['species'])
df_iris

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [94]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Create a sample dataset
df = pd.DataFrame({'fruits': ['apple', 'banana', 'orange', 'banana', 'apple']})

# Initialize the encoder
encoder = OrdinalEncoder()

# Fit the encoder on the categorical variable and transform it
df_encoded = pd.DataFrame(encoder.fit_transform(df), columns=['fruits_encoded'])

print(df,"\n")
print(df_encoded.head())


   fruits
0   apple
1  banana
2  orange
3  banana
4   apple 

   fruits_encoded
0             0.0
1             1.0
2             2.0
3             1.0
4             0.0


## Binning

In [101]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import OrdinalEncoder

# Load the Iris dataset
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Create a categorical variable 'petal_width_cat' based on the 'petal width (cm)' feature
df['petal_width_cat'] = pd.cut(df['petal width (cm)'], bins=[0, 1.5, 2.5, 3.5], labels=['petal_narrow', 'petal_medium', 'petal_wide'])

df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,petal_width_cat
0,5.1,3.5,1.4,0.2,0,petal_narrow
1,4.9,3.0,1.4,0.2,0,petal_narrow
2,4.7,3.2,1.3,0.2,0,petal_narrow
3,4.6,3.1,1.5,0.2,0,petal_narrow
4,5.0,3.6,1.4,0.2,0,petal_narrow
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,petal_medium
146,6.3,2.5,5.0,1.9,2,petal_medium
147,6.5,3.0,5.2,2.0,2,petal_medium
148,6.2,3.4,5.4,2.3,2,petal_medium


### equal-width binning

In [103]:
import pandas as pd

# Create a sample dataset
df = pd.DataFrame({'values': [2.3, 4.5, 6.2, 8.7, 3.1, 5.6, 7.9, 1.8]})

# Perform equal-width binning on the 'values' column
df['values_binned'] = pd.cut(df['values'], bins=3, labels=['low', 'medium', 'high'])

print(df.head())


   values values_binned
0     2.3           low
1     4.5        medium
2     6.2        medium
3     8.7          high
4     3.1           low


### equal frequency binning

In [106]:
import pandas as pd

# Create a sample dataset
df = pd.DataFrame({'values': [2.3, 4.5, 6.2, 8.7, 3.1, 5.6, 7.9, 1.8]})

# Perform equal frequency binning on the 'values' column
df['values_binned'] = pd.qcut(df['values'], q=3, labels=['low', 'medium', 'high'])

print(df.head())


   values values_binned
0     2.3           low
1     4.5        medium
2     6.2          high
3     8.7          high
4     3.1           low
