## LABEL ENCODER

In [1]:
## Preliminaries
# Import required packages
from sklearn import preprocessing
import pandas as pd


In [2]:
# Create the example DataFrame; 'score' is the categorical column that the
# label encoder is fitted on.
raw_data = dict(
    patient=[1, 1, 1, 2, 2],
    obs=[1, 2, 3, 1, 2],
    treatment=[0, 1, 0, 1, 0],
    score=['strong', 'weak', 'normal', 'weak', 'strong'],
)
df = pd.DataFrame(
    raw_data,
    columns=['patient', 'obs', 'treatment', 'score'],
)

In [3]:
# Fit the label encoder
# Create a label (category) encoder object; it knows no labels until it is fitted.
le = preprocessing.LabelEncoder()

In [4]:
# Fit the encoder to the pandas column so it learns the distinct score labels.
# fit() returns the encoder itself, which is why the cell displays LabelEncoder().
le.fit(df['score'])

LabelEncoder()

In [5]:
# View the labels
# classes_ holds the distinct labels learned by fit(); a label's position in
# this list is the integer code that transform() assigns to it.
list(le.classes_)

['normal', 'strong', 'weak']

In [6]:
# Transform categories into integers
# Apply the fitted encoder to the pandas column: each label is replaced by
# its index in le.classes_.
le.transform(df['score'])

array([1, 2, 0, 2, 1])

In [18]:
# Transform integers into categories
# Convert some integer codes back into their category names
list(le.inverse_transform([2, 2, 0]))

['weak', 'weak', 'normal']

## ONE HOT ENCODER

In [19]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer

In [20]:
# Create a NumPy column vector (one state name per row, a single column)
x = np.array(
    ['Texas', 'California', 'Texas', 'Delaware', 'Texas']
).reshape(-1, 1)

In [21]:
# Create LabelBinarizer object
one_hot = LabelBinarizer()

In [23]:
# One-hot encode data: learn the classes from x and return one indicator
# column per class
one_hot.fit_transform(x)

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [24]:
# View classes; their order matches the column order of the one-hot output
one_hot.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

In [25]:
# Same one-hot encoding done with pandas on the flattened column.
# dtype=int keeps the 0/1 indicator columns on every pandas version
# (pandas >= 2.0 returns booleans by default).
pd.get_dummies(x[:, 0], dtype=int)

Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


## Pero, ¿cuál es la diferencia?

In [26]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Define example: a 1-D array of categorical labels
data = ['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold',
        'warm', 'hot']
values = array(data)
print("Data: ", values)

# Integer encode: every distinct label is mapped to a single integer code
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print("Label Encoder:", integer_encoded)

# One-hot encode. OneHotEncoder expects 2-D input (n_samples, n_features),
# so the 1-D labels are reshaped into a single column first.
# The encoder keeps its default sparse output and the result is densified
# with .toarray(): the `sparse=False` keyword was renamed to `sparse_output`
# in scikit-learn 1.2 and removed in 1.4, while this form runs on all versions.
onehot_encoder = OneHotEncoder()
print(values.shape)
values_encoded = values.reshape(len(values), 1)
print(values_encoded.shape)
onehot_encoded = onehot_encoder.fit_transform(values_encoded).toarray()
print("OneHot Encoder:", onehot_encoded)

# Binary encode: LabelBinarizer takes the 1-D labels directly and returns
# an integer indicator matrix
lb = LabelBinarizer()
print("Label Binarizer:", lb.fit_transform(values))

Data:  ['cold' 'cold' 'warm' 'cold' 'hot' 'hot' 'warm' 'cold' 'warm' 'hot']
Label Encoder: [0 0 2 0 1 1 2 0 2 1]
(10,)
(10, 1)
OneHot Encoder: [[1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]
Label Binarizer: [[1 0 0]
 [1 0 0]
 [0 0 1]
 [1 0 0]
 [0 1 0]
 [0 1 0]
 [0 0 1]
 [1 0 0]
 [0 0 1]
 [0 1 0]]


## IMPUTER

In [27]:
# Load libraries
import numpy as np
from sklearn.preprocessing import Imputer


In [28]:
# Create feature matrix with categorical feature
X = np.array([[0, 2.10, 1.45], 
              [1, 1.18, 1.33], 
              [0, 1.22, 1.27],
              [0, -0.21, -1.19],
              [np.nan, 0.87, 1.31],
              [np.nan, -0.67, -0.22]])

In [29]:
# Create Imputer object
imputer = Imputer(strategy='most_frequent', axis=0)



In [30]:
# Learn the most frequent value of each column and use it to fill the NaNs
imputer.fit_transform(X)

array([[ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 0.  , -0.21, -1.19],
       [ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22]])

In [31]:
# X itself still contains the NaNs: fit_transform returned a new array
print(X)

[[ 0.    2.1   1.45]
 [ 1.    1.18  1.33]
 [ 0.    1.22  1.27]
 [ 0.   -0.21 -1.19]
 [  nan  0.87  1.31]
 [  nan -0.67 -0.22]]


 ## PIPELINE

In [32]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [33]:
# Create the DataFrame used by the pipeline: 'obs' and 'treatment' each
# contain one missing value, 'sex' and 'score' are categorical.
raw_data = dict(
    patient=[1, 1, 1, 2, 2],
    obs=[1, np.nan, 3, 1, 2],
    treatment=[0, 1, 0, 1, np.nan],
    sex=['F', 'M', 'F', 'F', 'M'],
    score=['strong', 'weak', 'normal', 'weak', 'strong'],
)
df = pd.DataFrame(
    raw_data,
    columns=['patient', 'obs', 'treatment', 'sex', 'score'],
)
# Summarize dtypes and non-null counts to confirm where values are missing
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
patient      5 non-null int64
obs          4 non-null float64
treatment    4 non-null float64
sex          5 non-null object
score        5 non-null object
dtypes: float64(2), int64(1), object(2)
memory usage: 280.0+ bytes


In [34]:
# Preprocessing pipelines, one for each kind of column.

# Numeric columns: fill missing entries with the column's most frequent value.
numeric_features = ['obs', 'treatment']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

# Categorical columns: replace missing entries with the constant 'missing',
# then one-hot encode (categories unseen during fit are ignored at transform).
categorical_features = ['score', 'sex']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])


In [35]:
# Route each group of columns through its own transformer; the outputs are
# concatenated in the order listed ('num' columns first, then 'cat').
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [37]:
# Wrap the preprocessor in a Pipeline. No estimator is appended here, so this
# pipeline only transforms data; a classifier could be added as a later step.
clf = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the preprocessing steps on df and display the transformed feature matrix
DATA_PIPELINE = clf.fit_transform(df)
DATA_PIPELINE


array([[1., 0., 0., 1., 0., 1., 0.],
       [1., 1., 0., 0., 1., 0., 1.],
       [3., 0., 1., 0., 0., 1., 0.],
       [1., 1., 0., 0., 1., 1., 0.],
       [2., 0., 0., 1., 0., 0., 1.]])

In [84]:
# One-hot encode the first column of the transformed matrix.
# dtype=int keeps the 0/1 indicator columns on every pandas version
# (pandas >= 2.0 returns booleans by default).
pd.get_dummies(DATA_PIPELINE[:, 0], dtype=int)

Unnamed: 0,1.0,2.0,3.0
0,1,0,0
1,1,0,0
2,0,0,1
3,1,0,0
4,0,1,0


In [45]:
# Display the source DataFrame: it still contains its missing values
df

Unnamed: 0,patient,obs,treatment,sex,score
0,1,1.0,0.0,F,strong
1,1,,1.0,M,weak
2,1,3.0,0.0,F,normal
3,2,1.0,1.0,F,weak
4,2,2.0,,M,strong
