## LABEL ENCODER

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

In [1]:
## Preliminaries
# Import required packages
from sklearn import preprocessing
import pandas as pd


In [2]:
# Build a small example DataFrame: one row per (patient, observation) pair,
# with a binary treatment flag and a categorical text score.
raw_data = dict(
    patient=[1, 1, 1, 2, 2],
    obs=[1, 2, 3, 1, 2],
    treatment=[0, 1, 0, 1, 0],
    score=['strong', 'weak', 'normal', 'weak', 'strong'],
)
# Select the columns explicitly so their order is fixed
df = pd.DataFrame(raw_data)[['patient', 'obs', 'treatment', 'score']]

In [3]:
# Preview the first rows of the example DataFrame
df.head()

Unnamed: 0,patient,obs,treatment,score
0,1,1,0,strong
1,1,2,1,weak
2,1,3,0,normal
3,2,1,1,weak
4,2,2,0,strong


In [4]:
# Sort the rows of the DataFrame by the 'score' column.
# The result is stored under a new name (dfObj); df is still used later in its original order.
dfObj = df.sort_values(by ='score' )

In [5]:
# Preview the rows after sorting by 'score'
dfObj.head()

Unnamed: 0,patient,obs,treatment,score
2,1,3,0,normal
0,1,1,0,strong
4,2,2,0,strong
1,1,2,1,weak
3,2,1,1,weak


In [6]:
# Create a label (category) encoder object; it is not fitted yet
le = preprocessing.LabelEncoder()

In [7]:
# Fit the encoder to the pandas column so it learns the distinct 'score' labels
le.fit(dfObj['score'])

LabelEncoder()

In [8]:
# View the labels learned by the encoder.
# The position of a label in this list is the integer code it is mapped to.
list(le.classes_)


['normal', 'strong', 'weak']

In [9]:
# Transform categories into integers:
# apply the fitted encoder to the 'score' column of df (original, unsorted row order)
le.transform(df['score'])

array([1, 2, 0, 2, 1])

In [10]:
# Transform integers back into categories:
# convert some integer codes into their category names
list(le.inverse_transform([2, 2, 1]))

['weak', 'weak', 'strong']

## ONE HOT ENCODER

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [11]:
# Load libraries
import numpy as np
import pandas as pd


In [12]:
# Create a NumPy column vector of state names: 5 samples, 1 categorical feature
x = np.array(['Texas', 'California', 'Texas', 'Delaware', 'Texas']).reshape(-1, 1)


In [13]:
from sklearn.preprocessing import OneHotEncoder

# Request a dense NumPy array instead of a SciPy sparse matrix.
# The keyword was renamed from `sparse` to `sparse_output` in scikit-learn 1.2 and the
# old spelling was removed in 1.4, so this cell requires scikit-learn >= 1.2.
onehot_encoder = OneHotEncoder(sparse_output=False)
# Learn the categories present in x and encode every row as a one-hot vector
onehot_encoded = onehot_encoder.fit_transform(x)
print("OneHot Encoder:", onehot_encoded)

OneHot Encoder: [[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [14]:
# Categories learned by the fitted encoder: one array per input feature (column)
onehot_encoder.categories_

[array(['California', 'Delaware', 'Texas'], dtype='<U10')]

In [15]:
# Same encoding, but returned as a sparse matrix that stores only the non-zero entries.
# `sparse_output` replaces the `sparse` keyword, which was renamed in scikit-learn 1.2
# and removed in 1.4 (requires scikit-learn >= 1.2).
onehot_encoder = OneHotEncoder(sparse_output=True)
onehot_encoded = onehot_encoder.fit_transform(x)
# Printing a sparse matrix lists the (row, column) position and value of each stored entry
print("OneHot Encoder:", onehot_encoded)

OneHot Encoder:   (0, 2)	1.0
  (1, 0)	1.0
  (2, 2)	1.0
  (3, 1)	1.0
  (4, 2)	1.0


## LABEL BINARIZER 


In [16]:
from sklearn.preprocessing import LabelBinarizer
# Create LabelBinarizer object
LabelBi = LabelBinarizer()

In [17]:
# One-hot encode data: fit the binarizer on x and return one indicator column per class
LabelBi.fit_transform(x)

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [18]:
# View the classes learned by the binarizer (one per output column)
LabelBi.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

Mirar la documentación: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html



In [19]:
# One-hot encode the single column of x with pandas.
# dtype=int pins 0/1 integer indicators: since pandas 2.0 the default dummy dtype is bool,
# which would display True/False instead of the 0/1 table produced by the encoders above.
pd.get_dummies(x[:, 0], dtype=int)

Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


Con pandas: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html

### ¿Pero cuál es la diferencia?

In [20]:
from numpy import array
from numpy import argmax
# LabelBinarizer is imported here as well so this cell does not depend on an earlier cell
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Side-by-side comparison of the three encoders on the same 1-D array of labels.
# define example
data = ['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold',
        'warm', 'hot']
values = array(data)
print("Data: ", values)

# integer encode: every distinct label is replaced by a single integer code
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print("Classes: ", label_encoder.classes_)

print("Label Encoder:", integer_encoded)

# onehot encode: one indicator column per distinct label, returned as a dense array.
# `sparse_output` replaces the `sparse` keyword, which was renamed in scikit-learn 1.2
# and removed in 1.4 (requires scikit-learn >= 1.2).
onehot_encoder = OneHotEncoder(sparse_output=False)
print(values.shape)
# Reshape the 1-D labels into a single column (n_samples, 1) before passing them to OneHotEncoder
values_encoded = values.reshape(len(values), 1)
print(values_encoded.shape)
onehot_encoded = onehot_encoder.fit_transform(values_encoded)
print("OneHot Encoder:", onehot_encoded)

# Binary encode: LabelBinarizer is given the 1-D labels directly (no reshape)
lb = LabelBinarizer()
print("Label Binarizer:", lb.fit_transform(values))

Data:  ['cold' 'cold' 'warm' 'cold' 'hot' 'hot' 'warm' 'cold' 'warm' 'hot']
Classes:  ['cold' 'hot' 'warm']
Label Encoder: [0 0 2 0 1 1 2 0 2 1]
(10,)
(10, 1)
OneHot Encoder: [[1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]
Label Binarizer: [[1 0 0]
 [1 0 0]
 [0 0 1]
 [1 0 0]
 [0 1 0]
 [0 1 0]
 [0 0 1]
 [1 0 0]
 [0 0 1]
 [0 1 0]]
