In [1]:
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, cohen_kappa_score
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

## Encode labels using LabelEncoder

In [2]:
# Load the iris table and shuffle it so the first rows mix several species.
# The shuffle is unseeded, so the rows printed below differ from run to run.
data = pd.read_csv('iris.data')
data = data.sample(frac=1) # shuffle rows
# Column 4 holds the species name; select it first instead of converting
# the whole frame to a single NumPy array.
labels = data.iloc[:, 4].to_numpy()

# Mapping each name to an integer by hand is VERY inconvenient:
# labels[labels[:]=='Iris-versicolor']=0
# labels[labels[:]=='Iris-setosa']=1
# labels[labels[:]=='Iris-virginica']=2

print("Original labels:",labels[:10])
# LabelEncoder assigns one integer per distinct label and can map them back.
# It is imported by name in the import cell rather than reached through the
# `sklearn` package attribute.
encoder = LabelEncoder()
intlabels = encoder.fit_transform(labels)
print("Encoded labels",intlabels[:10])
print("Decoded labels",encoder.inverse_transform(intlabels[:10]))


Original labels: ['Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-versicolor' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica']
Encoded labels [1 1 1 2 2 2 1 2 2 2]
Decoded labels ['Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-versicolor' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica']


## One-hot encoding using LabelBinarizer

In [3]:
# LabelBinarizer (imported in the import cell) maps each text label straight
# to a one-hot row; with the three iris classes that is one column per class.
lb = LabelBinarizer()
ohlabels = lb.fit_transform(labels)
print("Original labels:",labels[:10])
print("Encoded labels:",ohlabels[:10])

# Decoding: inverse_transform takes a 2-D array with one one-hot row per
# sample, hence the nested brackets (shape (1, 3)).
label = np.array([[0,1,0]])
print("Label shape:",label.shape)
print("One-hot label", label)
print("Decoded label:",lb.inverse_transform(label))


Original labels: ['Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-versicolor' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica']
Encoded labels: [[0 1 0]
 [0 1 0]
 [0 1 0]
 [0 0 1]
 [0 0 1]
 [0 0 1]
 [0 1 0]
 [0 0 1]
 [0 0 1]
 [0 0 1]]
Label shape: (1, 3)
One-hot label [[0 1 0]]
Decoded label: ['Iris-versicolor']


## One-hot for two classes with LabelBinarizer

In [4]:
# Reload and reshuffle the table, then remove every 'Iris-virginica' row so
# that only two classes remain.
data = pd.read_csv('iris.data').sample(frac=1)
data = data.drop(index=data.loc[data['iris'] == 'Iris-virginica'].index)
labels = data.to_numpy()[:, 4]

# Fit a fresh binarizer on the two-class labels.
lb = LabelBinarizer()
ohlabels = lb.fit_transform(labels)
print("Original labels:", labels[:10])
print("Encoded labels:", ohlabels[:10])

## Problem: it is not one-hot encoded!
## (with two classes the output is a single 0/1 column, not two columns)


Original labels: ['Iris-setosa' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa'
 'Iris-versicolor' 'Iris-versicolor' 'Iris-setosa' 'Iris-versicolor'
 'Iris-versicolor' 'Iris-versicolor']
Encoded labels: [[0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]]


### Use `tf.keras.utils.to_categorical`

In [5]:
# to_categorical is handed the raw text labels here on purpose, to show that
# it fails. The error is the demonstration, so it is caught and printed
# instead of aborting a "Restart & Run All" before the next cell.
try:
    tfohlabels = tf.keras.utils.to_categorical(labels)
except ValueError as err:
    ## Problem: it requires integers!
    print(f"{type(err).__name__}: {err}")

ValueError: invalid literal for int() with base 10: 'Iris-setosa'

### Solution: use LabelEncoder and to_categorical

In [6]:
# to_categorical needs integer class indices, so encode the text labels
# with LabelEncoder first (imported by name in the import cell).
encoder = LabelEncoder()
intlabels = encoder.fit_transform(labels)
tc_ohlabels = tf.keras.utils.to_categorical(intlabels)
print("One-hot labels",tc_ohlabels[:10])

## find text label for label
label = np.array([0,1])
print('Label:',label)
# WRONG! inverse_transform treats each element as its own class index, so
# the one-hot vector [0, 1] decodes to two labels instead of one.
print("Text label:",encoder.inverse_transform(label))
# The class index of a one-hot vector is the position of its 1.
print('argmax label:',label.argmax())
# encoder.inverse_transform(label.argmax()) does not work:
# the encoder expects a collection of values, not a bare scalar.
print("Text label:",encoder.inverse_transform([label.argmax()])) # works when packed into a collection


One-hot labels [[1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]]
Label: [0 1]
Text label: ['Iris-setosa' 'Iris-versicolor']
argmax label: 1
Text label: ['Iris-versicolor']
