# Encoding

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.preprocessing import label_binarize

In [3]:
# Load the Iris table from a CSV file; the path is relative to the
# current working directory.
iris = pd.read_csv(filepath_or_buffer="Iris.csv")

# Report the dtype pandas inferred for each column.
column_dtypes = iris.dtypes
print(column_dtypes)

Sepal_Length    float64
Sepal_Width     float64
Petal_Length    float64
Petal_Width     float64
Species          object
dtype: object


### 1. Encoding Features

In [4]:
# One-hot encode the categorical "Species" column: it is replaced by one
# indicator column per distinct value, each named "Species_<value>".
iris_encoded = pd.get_dummies(iris, columns=["Species"], prefix=["Species"])
print(iris_encoded.dtypes)

Sepal_Length           float64
Sepal_Width            float64
Petal_Length           float64
Petal_Width            float64
Species_Setosa           uint8
Species_Versicolour      uint8
Species_Virginica        uint8
dtype: object


### 2. Encoding Target
Multiclass classification assumes that each sample is assigned to one and only one label.<br>
All scikit-learn classifiers are capable of multiclass classification
with either string or numeric output labels.

In [5]:
# Integer code assigned to each species name.
targets = {"Setosa": 0, "Versicolour": 1, "Virginica": 2}

# Pick the rows labelled 0, 50 and 100 and show each species name
# side by side with its mapped integer code.
sample_species = iris.loc[[0, 50, 100], "Species"]
sample_codes = sample_species.map(targets)
print(pd.concat([sample_species, sample_codes], axis=1))

         Species  Species
0         Setosa        0
50   Versicolour        1
100    Virginica        2


In [6]:
# Class order for binarization: position i in each output row
# corresponds to targets[i].
targets = ["Setosa", "Versicolour", "Virginica"]

# Binarize the whole target column, then pair the species name with its
# indicator row for the rows labelled 0, 50 and 100.
sample_rows = [0, 50, 100]
species_indicators = label_binarize(iris["Species"], classes=targets)
print(list(zip(iris.loc[sample_rows, "Species"], species_indicators[sample_rows])))

[('Setosa', array([1, 0, 0])), ('Versicolour', array([0, 1, 0])), ('Virginica', array([0, 0, 1]))]


### 3. Binning (Discretization)

In [7]:
# Number of bins; also determines the integer labels 0 .. n_bins - 1,
# so the two stay in sync.
n_bins = 5

# Discretize sepal length into n_bins intervals labelled by bin index,
# then count the samples per bin in bin order (not sorted by frequency).
sepal_length_binned = pd.cut(
    iris["Sepal_Length"],
    bins=n_bins,
    labels=list(range(n_bins)),
)
print(sepal_length_binned.value_counts(sort=False))

0    32
1    41
2    42
3    24
4    11
Name: Sepal_Length, dtype: int64
